From 63310467a3d1ed6a0460ec1f4268126cd1ceec2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 20 Jan 2011 11:12:26 -0600 Subject: mm: Remove support for kmem_cache_name() The last user was ext4 and Eric Sandeen removed the call in a recent patch. See the following URL for the discussion: http://marc.info/?l=linux-ext4&m=129546975702198&w=2 Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slab.c | 8 -------- mm/slob.c | 6 ------ mm/slub.c | 6 ------ 3 files changed, 20 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 37961d1f584f..4bab2d1a8291 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2147,8 +2147,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. - * Note that kmem_cache_name() is not guaranteed to return the same pointer, - * therefore applications must manage it themselves. * * The flags are * @@ -3840,12 +3838,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *cachep) -{ - return cachep->name; -} -EXPORT_SYMBOL_GPL(kmem_cache_name); - /* * This initializes kmem_list3 or resizes various caches for all nodes. */ diff --git a/mm/slob.c b/mm/slob.c index 3588eaaef726..46e0aee33a23 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -666,12 +666,6 @@ unsigned int kmem_cache_size(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *c) -{ - return c->name; -} -EXPORT_SYMBOL(kmem_cache_name); - int kmem_cache_shrink(struct kmem_cache *d) { return 0; diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..d2f343a54bad 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2399,12 +2399,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; -} -EXPORT_SYMBOL(kmem_cache_name); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { -- cgit v1.2.3 From 3ff84a7f36554b257cd57325b1a7c1fa4b49fbe3 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 14 Feb 2011 17:46:21 +0200 Subject: Revert "slab: Fix missing DEBUG_SLAB last user" This reverts commit 5c5e3b33b7cb959a401f823707bee006caadd76e. The commit breaks ARM thusly: | Mount-cache hash table entries: 512 | slab error in verify_redzone_free(): cache `idr_layer_cache': memory outside object was overwritten | Backtrace: | [] (dump_backtrace+0x0/0x110) from [] (dump_stack+0x18/0x1c) | [] (dump_stack+0x0/0x1c) from [] (__slab_error+0x28/0x30) | [] (__slab_error+0x0/0x30) from [] (cache_free_debugcheck+0x1c0/0x2b8) | [] (cache_free_debugcheck+0x0/0x2b8) from [] (kmem_cache_free+0x3c/0xc0) | [] (kmem_cache_free+0x0/0xc0) from [] (ida_get_new_above+0x19c/0x1c0) | [] (ida_get_new_above+0x0/0x1c0) from [] (alloc_vfsmnt+0x54/0x144) | [] (alloc_vfsmnt+0x0/0x144) from [] (vfs_kern_mount+0x30/0xec) | [] (vfs_kern_mount+0x0/0xec) from [] (kern_mount_data+0x1c/0x20) | [] (kern_mount_data+0x0/0x20) from [] (sysfs_init+0x68/0xc8) | [] (sysfs_init+0x0/0xc8) from [] (mnt_init+0x90/0x1b0) | [] (mnt_init+0x0/0x1b0) from [] (vfs_caches_init+0x100/0x140) | [] (vfs_caches_init+0x0/0x140) from [] (start_kernel+0x2e8/0x368) | [] (start_kernel+0x0/0x368) from [] (__enable_mmu+0x0/0x2c) | c0113268: redzone 1:0xd84156c5c032b3ac, redzone 2:0xd84156c5635688c0. | slab error in cache_alloc_debugcheck_after(): cache `idr_layer_cache': double free, or memory outside object was overwritten | ... | c011307c: redzone 1:0x9f91102ffffffff, redzone 2:0x9f911029d74e35b | slab: Internal list corruption detected in cache 'idr_layer_cache'(24), slabp c0113000(16). Hexdump: | | 000: 20 4f 10 c0 20 4f 10 c0 7c 00 00 00 7c 30 11 c0 | 010: 10 00 00 00 10 00 00 00 00 00 c9 17 fe ff ff ff | 020: fe ff ff ff fe ff ff ff fe ff ff ff fe ff ff ff | 030: fe ff ff ff fe ff ff ff fe ff ff ff fe ff ff ff | 040: fe ff ff ff fe ff ff ff fe ff ff ff fe ff ff ff | 050: fe ff ff ff fe ff ff ff fe ff ff ff 11 00 00 00 | 060: 12 00 00 00 13 00 00 00 14 00 00 00 15 00 00 00 | 070: 16 00 00 00 17 00 00 00 c0 88 56 63 | kernel BUG at /home/rmk/git/linux-2.6-rmk/mm/slab.c:2928! Reference: https://lkml.org/lkml/2011/2/7/238 Cc: # 2.6.35.y and later Reported-and-analyzed-by: Russell King Signed-off-by: Pekka Enberg --- mm/slab.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 37961d1f584f..4c6e2e31ced0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2288,8 +2288,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < align) { ralign = align; } - /* disable debug if not aligning with REDZONE_ALIGN */ - if (ralign & (__alignof__(unsigned long long) - 1)) + /* disable debug if necessary */ + if (ralign > __alignof__(unsigned long long)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. @@ -2315,8 +2315,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += align; - size += align + sizeof(unsigned long long); + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of -- cgit v1.2.3 From b3d41885d9cd0d9db31c8f49e362bae02c96fa3f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2011 18:35:22 +0100 Subject: slub: fix kmemcheck calls to match ksize() hints Recent use of ksize() in network stack (commit ca44ac38 : net: don't reallocate skb->head unless the current one hasn't the needed extra size or is shared) triggers kmemcheck warnings, because ksize() can return more space than kmemcheck is aware of. Pekka Enberg noticed SLAB+kmemcheck is doing the right thing, while SLUB +kmemcheck doesnt. Bugzilla reference #27212 Reported-by: Christian Casteyde Suggested-by: Pekka Enberg Signed-off-by: Eric Dumazet Acked-by: David S. Miller Acked-by: David Rientjes Acked-by: Christoph Lameter CC: Changli Gao CC: Andrew Morton Signed-off-by: Pekka Enberg --- mm/slub.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d2f343a54bad..217b5b5338a2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -797,10 +797,34 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) return should_failslab(s->objsize, flags, s->flags); } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); } @@ -2690,7 +2714,6 @@ EXPORT_SYMBOL(__kmalloc_node); size_t ksize(const void *object) { struct page *page; - struct kmem_cache *s; if (unlikely(object == ZERO_SIZE_PTR)) return 0; @@ -2701,28 +2724,8 @@ size_t ksize(const void *object) WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); } - s = page->slab; -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; + return slab_ksize(page->slab); } EXPORT_SYMBOL(ksize); -- cgit v1.2.3 From d71f606f687ef9d0cdddfd3619ca7cb9a0b3fb63 Mon Sep 17 00:00:00 2001 From: Mariusz Kozlowski Date: Sat, 26 Feb 2011 20:10:26 +0100 Subject: slub: fix ksize() build error mm/slub.c: In function 'ksize': mm/slub.c:2728: error: implicit declaration of function 'slab_ksize' slab_ksize() needs to go out of CONFIG_SLUB_DEBUG section. Acked-by: Randy Dunlap Acked-by: David Rientjes Signed-off-by: Mariusz Kozlowski Signed-off-by: Pekka Enberg --- mm/slub.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 217b5b5338a2..ea6f0390996f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -281,6 +281,30 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + static inline struct kmem_cache_order_objects oo_make(int order, unsigned long size) { @@ -797,30 +821,6 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) return should_failslab(s->objsize, flags, s->flags); } -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; -- cgit v1.2.3 From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- mm/backing-dev.c | 6 ----- mm/filemap.c | 67 +++++------------------------------------------------ mm/memory-failure.c | 8 +++---- mm/nommu.c | 4 ---- mm/page-writeback.c | 2 +- mm/readahead.c | 12 ---------- mm/shmem.c | 1 - mm/swap_state.c | 5 +--- mm/swapfile.c | 37 ----------------------------- mm/vmscan.c | 2 +- 10 files changed, 13 insertions(+), 131 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d30227..c91e139a652e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,17 +14,11 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); diff --git a/mm/filemap.c b/mm/filemap.c index 83a45d35468b..380776c2a9ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -155,45 +155,15 @@ void remove_from_page_cache(struct page *page) } EXPORT_SYMBOL(remove_from_page_cache); -static int sync_page(void *word) +static int sleep_on_page(void *word) { - struct address_space *mapping; - struct page *page; - - page = container_of((unsigned long *)word, struct page, flags); - - /* - * page_mapping() is being called without PG_locked held. - * Some knowledge of the state and use of the page is used to - * reduce the requirements down to a memory barrier. - * The danger here is of a stale page_mapping() return value - * indicating a struct address_space different from the one it's - * associated with when it is associated with one. - * After smp_mb(), it's either the correct page_mapping() for - * the page, or an old page_mapping() and the page's own - * page_mapping() has gone NULL. - * The ->sync_page() address_space operation must tolerate - * page_mapping() going NULL. By an amazing coincidence, - * this comes about because none of the users of the page - * in the ->sync_page() methods make essential use of the - * page_mapping(), merely passing the page down to the backing - * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page_private(page) is - * of interest. When page_mapping() does go NULL, the entire - * call stack gracefully ignores the page and returns. - * -- wli - */ - smp_mb(); - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->sync_page) - mapping->a_ops->sync_page(page); io_schedule(); return 0; } -static int sync_page_killable(void *word) +static int sleep_on_page_killable(void *word) { - sync_page(word); + sleep_on_page(word); return fatal_signal_pending(current) ? -EINTR : 0; } @@ -479,12 +449,6 @@ struct page *__page_cache_alloc(gfp_t gfp) EXPORT_SYMBOL(__page_cache_alloc); #endif -static int __sleep_on_page_lock(void *word) -{ - io_schedule(); - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -512,7 +476,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sync_page, + __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@ -576,17 +540,12 @@ EXPORT_SYMBOL(end_page_writeback); /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock - * - * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some - * random driver's requestfn sets TASK_RUNNING, we could busywait. However - * chances are that on the second loop, the block layer's plug list is empty, - * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -596,24 +555,10 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sync_page_killable, TASK_KILLABLE); + sleep_on_page_killable, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); -/** - * __lock_page_nosync - get a lock on the page, without calling sync_page() - * @page: the page to lock - * - * Variant of lock_page that does not require the caller to hold a reference - * on the page's mapping. - */ -void __lock_page_nosync(struct page *page) -{ - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, - TASK_UNINTERRUPTIBLE); -} - int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f6f8bd..bfba796d374d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -945,7 +945,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, collect_procs(ppage, &tokill); if (hpage != ppage) - lock_page_nosync(ppage); + lock_page(ppage); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) @@ -1038,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * Check "just unpoisoned", "filter hit", and * "race with other subpage." */ - lock_page_nosync(hpage); + lock_page(hpage); if (!PageHWPoison(hpage) || (hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { @@ -1088,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - lock_page_nosync(hpage); + lock_page(hpage); /* * unpoison always clear PG_hwpoison inside page lock @@ -1231,7 +1231,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - lock_page_nosync(page); + lock_page(page); /* * This test is racy because PG_hwpoison is set outside of page lock. * That's acceptable because that won't trigger kernel panic. Instead, diff --git a/mm/nommu.c b/mm/nommu.c index f59e1424d3db..fb6cbd6abe16 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} - unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2cb01f6ec5d0..cc0ede169e41 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1239,7 +1239,7 @@ int set_page_dirty_lock(struct page *page) { int ret; - lock_page_nosync(page); + lock_page(page); ret = set_page_dirty(page); unlock_page(page); return ret; diff --git a/mm/readahead.c b/mm/readahead.c index 77506a291a2d..cbddc3e17246 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -554,17 +554,5 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); - -#ifdef CONFIG_BLOCK - /* - * Normally the current page is !uptodate and lock_page() will be - * immediately called to implicitly unplug the device. However this - * is not always true for RAID conifgurations, where data arrives - * not strictly in their submission order. In this case we need to - * explicitly kick off the IO. - */ - if (PageUptodate(page)) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/shmem.c b/mm/shmem.c index 5ee67c990602..24d23f5bedf1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = default_unplug_io_fn, }; static LIST_HEAD(shmem_swaplist); diff --git a/mm/swap_state.c b/mm/swap_state.c index 5c8cfabbc9bc..46680461785b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -24,12 +24,10 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_page_list, to make sync_page look nicer, and to allow - * future use of radix_tree tags in the swap cache. + * vmscan's shrink_page_list. */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, .migratepage = migrate_page, }; @@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = { static struct backing_dev_info swap_backing_dev_info = { .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = swap_unplug_io_fn, }; struct address_space swapper_space = { diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..7ceea78ceb20 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -94,39 +94,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) return ret; } -/* - * We need this because the bdev->unplug_fn can sleep and we cannot - * hold swap_lock while calling the unplug_fn. And swap_lock - * cannot be turned into a mutex. - */ -static DECLARE_RWSEM(swap_unplug_sem); - -void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) -{ - swp_entry_t entry; - - down_read(&swap_unplug_sem); - entry.val = page_private(page); - if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)]->bdev; - struct backing_dev_info *bdi; - - /* - * If the page is removed from swapcache from under us (with a - * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page_private(page) above. - * If the WARN_ON triggers during a swapoff it maybe the race - * condition and it's harmless. However if it triggers without - * swapoff it signals a problem. - */ - WARN_ON(page_count(page) <= 1); - - bdi = bdev->bd_inode->i_mapping->backing_dev_info; - blk_run_backing_dev(bdi, page); - } - up_read(&swap_unplug_sem); -} - /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. @@ -1643,10 +1610,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } - /* wait for any unplug function to finish */ - down_write(&swap_unplug_sem); - up_write(&swap_unplug_sem); - destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); diff --git a/mm/vmscan.c b/mm/vmscan.c index 17497d0cd8b9..251bed73ac03 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -358,7 +358,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page_nosync(page); + lock_page(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); -- cgit v1.2.3 From 55602dd66f5353981b53f734e26c307f310ced37 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Jun 2010 15:05:37 +0200 Subject: fs: make generic file read/write functions plug Signed-off-by: Jens Axboe --- mm/filemap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 380776c2a9ac..f9a29c87a2cf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1243,12 +1243,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; + struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; + blk_start_plug(&plug); + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1321,6 +1324,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: + blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); @@ -2432,11 +2436,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); + blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2447,6 +2453,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(generic_file_aio_write); -- cgit v1.2.3 From 5b417b1873694ece3291d7f64a943304559a817b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Apr 2010 10:04:38 +0200 Subject: read-ahead: use plugging Signed-off-by: Jens Axboe --- mm/readahead.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index cbddc3e17246..2c0cc489e288 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, struct list_head *pages, unsigned nr_pages) { + struct blk_plug plug; unsigned page_idx; int ret; + blk_start_plug(&plug); + if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); /* Clean up the remaining pages */ @@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp, page_cache_release(page); } ret = 0; + out: + blk_finish_plug(&plug); + return ret; } -- cgit v1.2.3 From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2011 11:56:30 +0100 Subject: block: kill off REQ_UNPLUG With the plugging now being explicitly controlled by the submitter, callers need not pass down unplugging hints to the block layer. If they want to unplug, it's because they manually plugged on their own - in which case, they should just unplug at will. Signed-off-by: Jens Axboe --- mm/page_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf469..dc76b4d0611e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3 From d3f661d69a486db0e0e6343b452f45d91b4b3656 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 25 Feb 2011 11:38:52 -0600 Subject: slub: Get rid of slab_free_hook_irq() The following patch will make the fastpaths lockless and will no longer require interrupts to be disabled. Calling the free hook with irq disabled will no longer be possible. Move the slab_free_hook_irq() logic into slab_free_hook. Only disable interrupts if the features are selected that require callbacks with interrupts off and reenable after calls have been made. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..bae7a5c636f4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -807,14 +807,24 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) -{ - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + /* + * Trouble is that we may no longer disable interupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->objsize); + debug_check_no_locks_freed(x, s->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->objsize); + local_irq_restore(flags); + } +#endif } /* @@ -1101,9 +1111,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, static inline void slab_free_hook(struct kmem_cache *s, void *x) {} -static inline void slab_free_hook_irq(struct kmem_cache *s, - void *object) {} - #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1909,8 +1916,6 @@ static __always_inline void slab_free(struct kmem_cache *s, local_irq_save(flags); c = __this_cpu_ptr(s->cpu_slab); - slab_free_hook_irq(s, x); - if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); c->freelist = object; -- cgit v1.2.3 From 8a5ec0ba42c4919e2d8f4c3138cc8b987fdb0b79 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 25 Feb 2011 11:38:54 -0600 Subject: Lockless (and preemptless) fastpaths for slub Use the this_cpu_cmpxchg_double functionality to implement a lockless allocation algorithm on arches that support fast this_cpu_ops. Each of the per cpu pointers is paired with a transaction id that ensures that updates of the per cpu information can only occur in sequence on a certain cpu. A transaction id is a "long" integer that is comprised of an event number and the cpu number. The event number is incremented for every change to the per cpu state. This means that the cmpxchg instruction can verify for an update that nothing interfered and that we are updating the percpu structure for the processor where we picked up the information and that we are also currently on that processor when we update the information. This results in a significant decrease of the overhead in the fastpaths. It also makes it easy to adopt the fast path for realtime kernels since this is lockless and does not require the use of the current per cpu area over the critical section. It is only important that the per cpu area is current at the beginning of the critical section and at the end. So there is no need even to disable preemption. Test results show that the fastpath cycle count is reduced by up to ~ 40% (alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree adds a few cycles. Sadly this does nothing for the slowpath which is where the main issues with performance in slub are but the best case performance rises significantly. (For that see the more complex slub patches that require cmpxchg_double) Kmalloc: alloc/free test Before: 10000 times kmalloc(8)/kfree -> 134 cycles 10000 times kmalloc(16)/kfree -> 152 cycles 10000 times kmalloc(32)/kfree -> 144 cycles 10000 times kmalloc(64)/kfree -> 142 cycles 10000 times kmalloc(128)/kfree -> 142 cycles 10000 times kmalloc(256)/kfree -> 132 cycles 10000 times kmalloc(512)/kfree -> 132 cycles 10000 times kmalloc(1024)/kfree -> 135 cycles 10000 times kmalloc(2048)/kfree -> 135 cycles 10000 times kmalloc(4096)/kfree -> 135 cycles 10000 times kmalloc(8192)/kfree -> 144 cycles 10000 times kmalloc(16384)/kfree -> 754 cycles After: 10000 times kmalloc(8)/kfree -> 78 cycles 10000 times kmalloc(16)/kfree -> 78 cycles 10000 times kmalloc(32)/kfree -> 82 cycles 10000 times kmalloc(64)/kfree -> 88 cycles 10000 times kmalloc(128)/kfree -> 79 cycles 10000 times kmalloc(256)/kfree -> 79 cycles 10000 times kmalloc(512)/kfree -> 85 cycles 10000 times kmalloc(1024)/kfree -> 82 cycles 10000 times kmalloc(2048)/kfree -> 82 cycles 10000 times kmalloc(4096)/kfree -> 85 cycles 10000 times kmalloc(8192)/kfree -> 82 cycles 10000 times kmalloc(16384)/kfree -> 706 cycles Kmalloc: Repeatedly allocate then free test Before: 10000 times kmalloc(8) -> 211 cycles kfree -> 113 cycles 10000 times kmalloc(16) -> 174 cycles kfree -> 115 cycles 10000 times kmalloc(32) -> 235 cycles kfree -> 129 cycles 10000 times kmalloc(64) -> 222 cycles kfree -> 120 cycles 10000 times kmalloc(128) -> 343 cycles kfree -> 139 cycles 10000 times kmalloc(256) -> 827 cycles kfree -> 147 cycles 10000 times kmalloc(512) -> 1048 cycles kfree -> 272 cycles 10000 times kmalloc(1024) -> 2043 cycles kfree -> 528 cycles 10000 times kmalloc(2048) -> 4002 cycles kfree -> 571 cycles 10000 times kmalloc(4096) -> 7740 cycles kfree -> 628 cycles 10000 times kmalloc(8192) -> 8062 cycles kfree -> 850 cycles 10000 times kmalloc(16384) -> 8895 cycles kfree -> 1249 cycles After: 10000 times kmalloc(8) -> 190 cycles kfree -> 129 cycles 10000 times kmalloc(16) -> 76 cycles kfree -> 123 cycles 10000 times kmalloc(32) -> 126 cycles kfree -> 124 cycles 10000 times kmalloc(64) -> 181 cycles kfree -> 128 cycles 10000 times kmalloc(128) -> 310 cycles kfree -> 140 cycles 10000 times kmalloc(256) -> 809 cycles kfree -> 165 cycles 10000 times kmalloc(512) -> 1005 cycles kfree -> 269 cycles 10000 times kmalloc(1024) -> 1999 cycles kfree -> 527 cycles 10000 times kmalloc(2048) -> 3967 cycles kfree -> 570 cycles 10000 times kmalloc(4096) -> 7658 cycles kfree -> 637 cycles 10000 times kmalloc(8192) -> 8111 cycles kfree -> 859 cycles 10000 times kmalloc(16384) -> 8791 cycles kfree -> 1173 cycles Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 203 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index bae7a5c636f4..65030c7fd7e2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1494,6 +1494,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) } } +#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_PREEMPT +/* + * Calculate the next globally unique transaction for disambiguiation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus. + */ +#define TID_STEP 1 +#endif + +static inline unsigned long next_tid(unsigned long tid) +{ + return tid + TID_STEP; +} + +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} + +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; +} + +static inline unsigned int init_tid(int cpu) +{ + return cpu; +} + +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) +{ +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPT + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + printk("due to cpu change %d -> %d\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + printk("due to cpu running other code. Event %ld->%ld\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif +} + +#endif + +void init_kmem_cache_cpus(struct kmem_cache *s) +{ +#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT) + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); +#endif + +} /* * Remove the cpu slab */ @@ -1525,6 +1596,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) page->inuse--; } c->page = NULL; +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); +#endif unfreeze_slab(s, page, tail); } @@ -1659,6 +1733,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void **object; struct page *new; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif +#endif /* We handle __GFP_ZERO in the caller */ gfpflags &= ~__GFP_ZERO; @@ -1685,6 +1772,10 @@ load_freelist: c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); + local_irq_restore(flags); +#endif stat(s, ALLOC_SLOWPATH); return object; @@ -1746,23 +1837,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, { void **object; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif if (slab_pre_alloc_hook(s, gfpflags)) return NULL; +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); +#else +redo: +#endif + + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + */ c = __this_cpu_ptr(s->cpu_slab); + +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus they can be guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + tid = c->tid; + barrier(); +#endif + object = c->freelist; if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The cmpxchg will only match if there was no additonal + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only against + * code executing on this cpu *not* from access by other cpus. + */ + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + get_freepointer(s, object), next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } +#else c->freelist = get_freepointer(s, object); +#endif stat(s, ALLOC_FASTPATH); } + +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); @@ -1840,9 +1984,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; - stat(s, FREE_SLOWPATH); + local_irq_save(flags); +#endif slab_lock(page); + stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s)) goto debug; @@ -1872,6 +2020,9 @@ checks_ok: out_unlock: slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif return; slab_empty: @@ -1883,6 +2034,9 @@ slab_empty: stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1909,21 +2063,54 @@ static __always_inline void slab_free(struct kmem_cache *s, { void **object = (void *)x; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif slab_free_hook(s, x); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); +#endif + +redo: + /* + * Determine the currently cpus per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succedd. + */ c = __this_cpu_ptr(s->cpu_slab); +#ifdef CONFIG_CMPXCHG_LOCAL + tid = c->tid; + barrier(); +#endif + if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); + +#ifdef CONFIG_CMPXCHG_LOCAL + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } +#else c->freelist = object; +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, x, addr); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif } void kmem_cache_free(struct kmem_cache *s, void *x) @@ -2115,9 +2302,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * Must align to double word boundary for the double cmpxchg instructions + * to work. + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); +#else + /* Regular alignment is sufficient */ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); +#endif + + if (!s->cpu_slab) + return 0; + + init_kmem_cache_cpus(s); - return s->cpu_slab != NULL; + return 1; } static struct kmem_cache *kmem_cache_node; -- cgit v1.2.3 From ab9a0f196f2f4f080df54402493ea3dc31b5243e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:21:48 +0800 Subject: slub: automatically reserve bytes at the end of slab There is no "struct" for slub's slab, it shares with struct page. But struct page is very small, it is insufficient when we need to add some metadata for slab. So we add a field "reserved" to struct kmem_cache, when a slab is allocated, kmem_cache->reserved bytes are automatically reserved at the end of the slab for slab's metadata. Changed from v1: Export the reserved field via sysfs Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slub.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..d3d17677bab5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -281,11 +281,16 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -617,7 +622,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) @@ -698,7 +703,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -748,7 +753,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1988,13 +1993,13 @@ static int slub_nomerge; * the smallest order which will fit the object. */ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -2003,10 +2008,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2016,7 +2021,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2034,14 +2039,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2053,14 +2058,14 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -2311,7 +2316,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2329,8 +2334,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -2349,6 +2354,7 @@ static int kmem_cache_open(struct kmem_cache *s, s->objsize = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); + s->reserved = 0; if (!calculate_sizes(s, -1)) goto error; @@ -4017,6 +4023,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4303,6 +4315,7 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + &reserved_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, -- cgit v1.2.3 From da9a638c6f8fc0633fa94a334f1c053f5e307177 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:22:00 +0800 Subject: slub,rcu: don't assume the size of struct rcu_head The size of struct rcu_head may be changed. When it becomes larger, it will pollute the page array. We reserve some some bytes for struct rcu_head when a slab is allocated in this situation. Changed from V1: use VM_BUG_ON instead BUG_ON Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slub.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d3d17677bab5..ebba3eb19369 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1254,21 +1254,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -2356,6 +2373,9 @@ static int kmem_cache_open(struct kmem_cache *s, s->flags = kmem_cache_flags(size, flags, name, ctor); s->reserved = 0; + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); + if (!calculate_sizes(s, -1)) goto error; if (disable_higher_order_debug) { -- cgit v1.2.3 From 5bfe53a77e8a3ffce4a10003c75f464a138e272d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:22:24 +0800 Subject: slab,rcu: don't assume the size of struct rcu_head The size of struct rcu_head may be changed. When it becomes larger, it may pollute the data after struct slab. Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slab.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 37961d1f584f..52cf0b4634d4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -190,22 +190,6 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) -/* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; -}; - /* * struct slab_rcu * @@ -219,8 +203,6 @@ struct slab { * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. */ struct slab_rcu { struct rcu_head head; @@ -228,6 +210,27 @@ struct slab_rcu { void *addr; }; +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + union { + struct { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; + }; + struct slab_rcu __slab_cover_slab_rcu; + }; +}; + /* * struct array_cache * -- cgit v1.2.3 From 9b6096a65f99a89dfd8328c4e469e7b53b3ae04a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 17 Mar 2011 10:47:06 +0100 Subject: mm: make generic_writepages() use plugging This recovers a performance regression caused by the removal of the per-device plugging. Signed-off-by: Jens Axboe --- mm/page-writeback.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cc0ede169e41..24b7ac2bc36b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1039,11 +1039,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc, int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct blk_plug plug; + int ret; + /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; - return write_cache_pages(mapping, wbc, __writepage, mapping); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; } EXPORT_SYMBOL(generic_writepages); -- cgit v1.2.3 From 95f28604a65b1c40b6c6cd95e58439cd7ded3add Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Mar 2011 11:13:12 +0100 Subject: fs: assign sb->s_bdi to default_backing_dev_info if the bdi is going away We don't have proper reference counting for this yet, so we run into cases where the device is pulled and we OOPS on flushing the fs data. This happens even though the dirty inodes have already been migrated to the default_backing_dev_info. Reported-by: Torsten Hilbrich Tested-by: Torsten Hilbrich Cc: stable@kernel.org Signed-off-by: Jens Axboe --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c91e139a652e..8fe9d3407921 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -598,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdi == bdi) - sb->s_bdi = NULL; + sb->s_bdi = &default_backing_dev_info; } spin_unlock(&sb_lock); } -- cgit v1.2.3 From a24c5a0ea902bcda348f086bd909cc2d6e305bf8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 15 Mar 2011 12:45:21 -0500 Subject: slub: Dont define useless label in the !CONFIG_CMPXCHG_LOCAL case The redo label needs #ifdeffery. Fixes the following problem introduced by commit 8a5ec0ba42c4 ("Lockless (and preemptless) fastpaths for slub"): mm/slub.c: In function 'slab_free': mm/slub.c:2124: warning: label 'redo' defined but not used Reported-by: Stephen Rothwell Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 65030c7fd7e2..f32aee37840b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2073,9 +2073,11 @@ static __always_inline void slab_free(struct kmem_cache *s, #ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); -#endif +#else redo: +#endif + /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since -- cgit v1.2.3 From 2fd66c517d5e98de2528d86e0e62f5069ff99f59 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 22 Mar 2011 13:32:53 -0500 Subject: slub: Add missing irq restore for the OOM path OOM path is missing the irq restore in the CONFIG_CMPXCHG_LOCAL case. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7e4f835e32ab..e126cfbd3df2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1857,6 +1857,9 @@ new_slab: } if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) -- cgit v1.2.3 From 4fdccdfbb4652a7bbac8adbce7449eb093775118 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 22 Mar 2011 13:35:00 -0500 Subject: slub: Add statistics for this_cmpxchg_double failures Add some statistics for debugging. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e126cfbd3df2..93de30db95f5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -217,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) #endif -static inline void stat(struct kmem_cache *s, enum stat_item si) +static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS __this_cpu_inc(s->cpu_slab->stat[si]); @@ -1597,6 +1597,7 @@ static inline void note_cmpxchg_failure(const char *n, printk("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); #endif + stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } #endif -- cgit v1.2.3 From 52c50567d8ab0a0a87f12cceaa4194967854f0bd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 22 Mar 2011 16:30:08 -0700 Subject: mm: swap: unlock swapfile inode mutex before closing file on bad swapfiles If an administrator tries to swapon a file backed by NFS, the inode mutex is taken (as it is for any swapfile) but later identified to be a bad swapfile due to the lack of bmap and tries to cleanup. During cleanup, an attempt is made to close the file but with inode->i_mutex still held. Closing an NFS file syncs it which tries to acquire the inode mutex leading to deadlock. If lockdep is enabled the following appears on the console; ============================================= [ INFO: possible recursive locking detected ] 2.6.38-rc8-autobuild #1 --------------------------------------------- swapon/2192 is trying to acquire lock: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: vfs_fsync_range+0x47/0x7c but task is already holding lock: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: sys_swapon+0x28d/0xae7 other info that might help us debug this: 1 lock held by swapon/2192: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: sys_swapon+0x28d/0xae7 stack backtrace: Pid: 2192, comm: swapon Not tainted 2.6.38-rc8-autobuild #1 Call Trace: __lock_acquire+0x2eb/0x1623 find_get_pages_tag+0x14a/0x174 pagevec_lookup_tag+0x25/0x2e vfs_fsync_range+0x47/0x7c lock_acquire+0xd3/0x100 vfs_fsync_range+0x47/0x7c nfs_flush_one+0x0/0xdf [nfs] mutex_lock_nested+0x40/0x2b1 vfs_fsync_range+0x47/0x7c vfs_fsync_range+0x47/0x7c vfs_fsync+0x1c/0x1e nfs_file_flush+0x64/0x69 [nfs] filp_close+0x43/0x72 sys_swapon+0xa39/0xae7 sysret_check+0x2e/0x69 system_call_fastpath+0x16/0x1b This patch releases the mutex if its held before calling filep_close() so swapon fails as expected without deadlock when the swapfile is backed by NFS. If accepted for 2.6.39, it should also be considered a -stable candidate for 2.6.38 and 2.6.37. Signed-off-by: Mel Gorman Acked-by: Hugh Dickins Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 0341c5700e34..6d6d28c0a72f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2149,8 +2149,13 @@ bad_swap_2: p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); - if (swap_file) + if (swap_file) { + if (did_down) { + mutex_unlock(&inode->i_mutex); + did_down = 0; + } filp_close(swap_file, NULL); + } out: if (page && !IS_ERR(page)) { kunmap(page); -- cgit v1.2.3 From 3a5dda7a17cf3706f79b86293f29db02d61e0d48 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:09 -0700 Subject: oom: prevent unnecessary oom kills or kernel panics This patch prevents unnecessary oom kills or kernel panics by reverting two commits: 495789a5 (oom: make oom_score to per-process value) cef1d352 (oom: multi threaded process coredump don't make deadlock) First, 495789a5 (oom: make oom_score to per-process value) ignores the fact that all threads in a thread group do not necessarily exit at the same time. It is imperative that select_bad_process() detect threads that are in the exit path, specifically those with PF_EXITING set, to prevent needlessly killing additional tasks. If a process is oom killed and the thread group leader exits, select_bad_process() cannot detect the other threads that are PF_EXITING by iterating over only processes. Thus, it currently chooses another task unnecessarily for oom kill or panics the machine when nothing else is eligible. By iterating over threads instead, it is possible to detect threads that are exiting and nominate them for oom kill so they get access to memory reserves. Second, cef1d352 (oom: multi threaded process coredump don't make deadlock) erroneously avoids making the oom killer a no-op when an eligible thread other than current isfound to be exiting. We want to detect this situation so that we may allow that exiting thread time to exit and free its memory; if it is able to exit on its own, that should free memory so current is no loner oom. If it is not able to exit on its own, the oom killer will nominate it for oom kill which, in this case, only means it will get access to memory reserves. Without this change, it is easy for the oom killer to unnecessarily target tasks when all threads of a victim don't exit before the thread group leader or, in the worst case, panic the machine. Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrey Vagin Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7dcca55ede7c..b5a7b5f46e7a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -292,11 +292,11 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long totalpages, struct mem_cgroup *mem, const nodemask_t *nodemask) { - struct task_struct *p; + struct task_struct *g, *p; struct task_struct *chosen = NULL; *ppoints = 0; - for_each_process(p) { + do_each_thread(g, p) { unsigned int points; if (oom_unkillable_task(p, mem, nodemask)) @@ -324,7 +324,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * the process of exiting and releasing its resources. * Otherwise we could get an easy OOM deadlock. */ - if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { + if ((p->flags & PF_EXITING) && p->mm) { if (p != current) return ERR_PTR(-1UL); @@ -337,7 +337,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen = p; *ppoints = points; } - } + } while_each_thread(g, p); return chosen; } -- cgit v1.2.3 From 30e2b41f20b6238f51e7cffb879c7a0f0073f5fe Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 22 Mar 2011 16:30:11 -0700 Subject: oom: skip zombies when iterating tasklist We shouldn't defer oom killing if a thread has already detached its ->mm and still has TIF_MEMDIE set. Memory needs to be freed, so find kill other threads that pin the same ->mm or find another task to kill. Signed-off-by: Andrey Vagin Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b5a7b5f46e7a..d7f345e47e73 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -299,6 +299,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, do_each_thread(g, p) { unsigned int points; + if (!p->mm) + continue; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -324,7 +326,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * the process of exiting and releasing its resources. * Otherwise we could get an easy OOM deadlock. */ - if ((p->flags & PF_EXITING) && p->mm) { + if (p->flags & PF_EXITING) { if (p != current) return ERR_PTR(-1UL); -- cgit v1.2.3 From edd45544c6f09550df0a5491aa8a07af24767e73 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:12 -0700 Subject: oom: avoid deferring oom killer if exiting task is being traced The oom killer naturally defers killing anything if it finds an eligible task that is already exiting and has yet to detach its ->mm. This avoids unnecessarily killing tasks when one is already in the exit path and may free enough memory that the oom killer is no longer needed. This is detected by PF_EXITING since threads that have already detached its ->mm are no longer considered at all. The problem with always deferring when a thread is PF_EXITING, however, is that it may never actually exit when being traced, specifically if another task is tracing it with PTRACE_O_TRACEEXIT. The oom killer does not want to defer in this case since there is no guarantee that thread will ever exit without intervention. This patch will now only defer the oom killer when a thread is PF_EXITING and no ptracer has stopped its progress in the exit path. It also ensures that a child is sacrificed for the chosen parent only if it has a different ->mm as the comment implies: this ensures that the thread group leader is always targeted appropriately. Signed-off-by: David Rientjes Reported-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Andrey Vagin Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d7f345e47e73..33b58615072c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -31,6 +31,7 @@ #include #include #include +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -316,22 +317,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (test_tsk_thread_flag(p, TIF_MEMDIE)) return ERR_PTR(-1UL); - /* - * This is in the process of releasing memory so wait for it - * to finish before killing some other task by mistake. - * - * However, if p is the current task, we allow the 'kill' to - * go ahead if it is exiting: this will simply set TIF_MEMDIE, - * which will allow it to gain access to memory reserves in - * the process of exiting and releasing its resources. - * Otherwise we could get an easy OOM deadlock. - */ if (p->flags & PF_EXITING) { - if (p != current) - return ERR_PTR(-1UL); - - chosen = p; - *ppoints = 1000; + /* + * If p is the current task and is in the process of + * releasing memory, we allow the "kill" to set + * TIF_MEMDIE, which will allow it to gain access to + * memory reserves. Otherwise, it may stall forever. + * + * The loop isn't broken here, however, in case other + * threads are found to have already been oom killed. + */ + if (p == current) { + chosen = p; + *ppoints = 1000; + } else { + /* + * If this task is not being ptraced on exit, + * then wait for it to finish before killing + * some other task unnecessarily. + */ + if (!(task_ptrace(p->group_leader) & + PT_TRACE_EXIT)) + return ERR_PTR(-1UL); + } } points = oom_badness(p, mem, nodemask, totalpages); @@ -493,6 +501,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; + if (child->mm == p->mm) + continue; /* * oom_badness() returns 0 if the thread is unkillable */ -- cgit v1.2.3 From 89699605fe7cfd8611900346f61cb6cbf179b10a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 22 Mar 2011 16:30:36 -0700 Subject: mm: vmap area cache Provide a free area cache for the vmalloc virtual address allocator, based on the algorithm used by the user virtual memory allocator. This reduces the number of rbtree operations and linear traversals over the vmap extents in order to find a free area, by starting off at the last point that a free area was found. The free area cache is reset if areas are freed behind it, or if we are searching for a smaller area or alignment than last time. So allocation patterns are not changed (verified by corner-case and random test cases in userspace testing). This solves a regression caused by lazy vunmap TLB purging introduced in db64fe02 (mm: rewrite vmap layer). That patch will leave extents in the vmap allocator after they are vunmapped, and until a significant number accumulate that can be flushed in a single batch. So in a workload that vmalloc/vfree frequently, a chain of extents will build up from VMALLOC_START address, which have to be iterated over each time (giving an O(n) type of behaviour). After this patch, the search will start from where it left off, giving closer to an amortized O(1). This is verified to solve regressions reported Steven in GFS2, and Avi in KVM. Hugh's update: : I tried out the recent mmotm, and on one machine was fortunate to hit : the BUG_ON(first->va_start < addr) which seems to have been stalling : your vmap area cache patch ever since May. : I can get you addresses etc, I did dump a few out; but once I stared : at them, it was easier just to look at the code: and I cannot see how : you would be so sure that first->va_start < addr, once you've done : that addr = ALIGN(max(...), align) above, if align is over 0x1000 : (align was 0x8000 or 0x4000 in the cases I hit: ioremaps like Steve). : I originally got around it by just changing the : if (first->va_start < addr) { : to : while (first->va_start < addr) { : without thinking about it any further; but that seemed unsatisfactory, : why would we want to loop here when we've got another very similar : loop just below it? : I am never going to admit how long I've spent trying to grasp your : "while (n)" rbtree loop just above this, the one with the peculiar : if (!first && tmp->va_start < addr + size) : in. That's unfamiliar to me, I'm guessing it's designed to save a : subsequent rb_next() in a few circumstances (at risk of then setting : a wrong cached_hole_size?); but they did appear few to me, and I didn't : feel I could sign off something with that in when I don't grasp it, : and it seems responsible for extra code and mistaken BUG_ON below it. : I've reverted to the familiar rbtree loop that find_vma() does (but : with va_end >= addr as you had, to respect the additional guard page): : and then (given that cached_hole_size starts out 0) I don't see the : need for any complications below it. If you do want to keep that loop : as you had it, please add a comment to explain what it's trying to do, : and where addr is relative to first when you emerge from it. : Aren't your tests "size <= cached_hole_size" and : "addr + size > first->va_start" forgetting the guard page we want : before the next area? I've changed those. : I have not changed your many "addr + size - 1 < addr" overflow tests, : but have since come to wonder, shouldn't they be "addr + size < addr" : tests - won't the vend checks go wrong if addr + size is 0? : I have added a few comments - Wolfgang Wander's 2.6.13 description of : 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Avoiding mmap fragmentation : helped me a lot, perhaps a pointer to that would be good too. And I found : it easier to understand when I renamed cached_start slightly and moved the : overflow label down. : This patch would go after your mm-vmap-area-cache.patch in mmotm. : Trivially, nobody is going to get that BUG_ON with this patch, and it : appears to work fine on my machines; but I have not given it anything like : the testing you did on your original, and may have broken all the : performance you were aiming for. Please take a look and test it out : integrate with yours if you're satisfied - thanks. [akpm@linux-foundation.org: add locking comment] Signed-off-by: Nick Piggin Signed-off-by: Hugh Dickins Reviewed-by: Minchan Kim Reported-and-tested-by: Steven Whitehouse Reported-and-tested-by: Avi Kivity Tested-by: "Barry J. Marson" Cc: Prarit Bhargava Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 156 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f9b166732e70..fc77adabb5e3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -261,8 +261,15 @@ struct vmap_area { }; static DEFINE_SPINLOCK(vmap_area_lock); -static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static struct rb_root vmap_area_root = RB_ROOT; + +/* The vmap cache globals are protected by vmap_area_lock */ +static struct rb_node *free_vmap_cache; +static unsigned long cached_hole_size; +static unsigned long cached_vstart; +static unsigned long cached_align; + static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) @@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct rb_node *n; unsigned long addr; int purged = 0; + struct vmap_area *first; BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); + BUG_ON(!is_power_of_2(align)); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-ENOMEM); retry: - addr = ALIGN(vstart, align); - spin_lock(&vmap_area_lock); - if (addr + size - 1 < addr) - goto overflow; + /* + * Invalidate cache if we have more permissive parameters. + * cached_hole_size notes the largest hole noticed _below_ + * the vmap_area cached in free_vmap_cache: if size fits + * into that hole, we want to scan from vstart to reuse + * the hole instead of allocating above free_vmap_cache. + * Note that __free_vmap_area may update free_vmap_cache + * without updating cached_hole_size or cached_align. + */ + if (!free_vmap_cache || + size < cached_hole_size || + vstart < cached_vstart || + align < cached_align) { +nocache: + cached_hole_size = 0; + free_vmap_cache = NULL; + } + /* record if we encounter less permissive parameters */ + cached_vstart = vstart; + cached_align = align; + + /* find starting point for our search */ + if (free_vmap_cache) { + first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); + addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr < vstart) + goto nocache; + if (addr + size - 1 < addr) + goto overflow; + + } else { + addr = ALIGN(vstart, align); + if (addr + size - 1 < addr) + goto overflow; - /* XXX: could have a last_hole cache */ - n = vmap_area_root.rb_node; - if (n) { - struct vmap_area *first = NULL; + n = vmap_area_root.rb_node; + first = NULL; - do { + while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end >= addr) { - if (!first && tmp->va_start < addr + size) - first = tmp; - n = n->rb_left; - } else { first = tmp; + if (tmp->va_start <= addr) + break; + n = n->rb_left; + } else n = n->rb_right; - } - } while (n); + } if (!first) goto found; - - if (first->va_end < addr) { - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else - goto found; - } - - while (addr + size > first->va_start && addr + size <= vend) { - addr = ALIGN(first->va_end + PAGE_SIZE, align); - if (addr + size - 1 < addr) - goto overflow; - - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else - goto found; - } } -found: - if (addr + size > vend) { -overflow: - spin_unlock(&vmap_area_lock); - if (!purged) { - purge_vmap_area_lazy(); - purged = 1; - goto retry; - } - if (printk_ratelimit()) - printk(KERN_WARNING - "vmap allocation for size %lu failed: " - "use vmalloc= to increase size.\n", size); - kfree(va); - return ERR_PTR(-EBUSY); + + /* from the starting point, walk areas until a suitable hole is found */ + while (addr + size >= first->va_start && addr + size <= vend) { + if (addr + cached_hole_size < first->va_start) + cached_hole_size = first->va_start - addr; + addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr + size - 1 < addr) + goto overflow; + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; } - BUG_ON(addr & (align-1)); +found: + if (addr + size > vend) + goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; __insert_vmap_area(va); + free_vmap_cache = &va->rb_node; spin_unlock(&vmap_area_lock); + BUG_ON(va->va_start & (align-1)); + BUG_ON(va->va_start < vstart); + BUG_ON(va->va_end > vend); + return va; + +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING + "vmap allocation for size %lu failed: " + "use vmalloc= to increase size.\n", size); + kfree(va); + return ERR_PTR(-EBUSY); } static void rcu_free_va(struct rcu_head *head) @@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head) static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + + if (free_vmap_cache) { + if (va->va_end < cached_vstart) { + free_vmap_cache = NULL; + } else { + struct vmap_area *cache; + cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); + if (va->va_start <= cache->va_start) { + free_vmap_cache = rb_prev(&va->rb_node); + /* + * We don't try to update cached_hole_size or + * cached_align, but it won't go very wrong. + */ + } + } + } rb_erase(&va->rb_node, &vmap_area_root); RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); -- cgit v1.2.3 From d527caf22e48480b102c7c6ee5b9ba12170148f7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Mar 2011 16:30:38 -0700 Subject: mm: compaction: prevent kswapd compacting memory to reduce CPU usage This patch reverts 5a03b051 ("thp: use compaction in kswapd for GFP_ATOMIC order > 0") due to reports stating that kswapd CPU usage was higher and IRQs were being disabled more frequently. This was reported at http://www.spinics.net/linux/fedora/alsa-user/msg09885.html. Without this patch applied, CPU usage by kswapd hovers around the 20% mark according to the tester (Arthur Marsh: http://www.spinics.net/linux/fedora/alsa-user/msg09899.html). With this patch applied, it's around 2%. The problem is not related to THP which specifies __GFP_NO_KSWAPD but is triggered by high-order allocations hitting the low watermark for their order and waking kswapd on kernels with CONFIG_COMPACTION set. The most common trigger for this is network cards configured for jumbo frames but it's also possible it'll be triggered by fork-heavy workloads (order-1) and some wireless cards which depend on order-1 allocations. The symptoms for the user will be high CPU usage by kswapd in low-memory situations which could be confused with another writeback problem. While a patch like 5a03b051 may be reintroduced in the future, this patch plays it safe for now and reverts it. [mel@csn.ul.ie: Beefed up the changelog] Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reported-by: Arthur Marsh Tested-by: Arthur Marsh Cc: [2.6.38.1] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 24 +++--------------------- mm/vmscan.c | 18 +----------------- 2 files changed, 4 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 8be430b812de..dcb058bd76c4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,8 +42,6 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -397,10 +395,7 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ - if (cc->compact_mode != COMPACT_MODE_KSWAPD) - watermark = low_wmark_pages(zone); - else - watermark = high_wmark_pages(zone); + watermark = low_wmark_pages(zone); watermark += (1 << cc->order); if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) @@ -413,15 +408,6 @@ static int compact_finished(struct zone *zone, if (cc->order == -1) return COMPACT_CONTINUE; - /* - * Generating only one page of the right order is not enough - * for kswapd, we must continue until we're above the high - * watermark as a pool for high order GFP_ATOMIC allocations - * too. - */ - if (cc->compact_mode == COMPACT_MODE_KSWAPD) - return COMPACT_CONTINUE; - /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -543,8 +529,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync, - int compact_mode) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -553,7 +538,6 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, - .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -599,8 +583,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, - COMPACT_MODE_DIRECT_RECLAIM); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -631,7 +614,6 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, - .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6771ea70bfe7..3b4a41d72489 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2397,7 +2397,6 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { - int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2428,24 +2427,9 @@ loop_again: sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; - compaction = 0; - if (order && - zone_watermark_ok(zone, 0, - high_wmark_pages(zone), - end_zone, 0) && - !zone_watermark_ok(zone, order, - high_wmark_pages(zone), - end_zone, 0)) { - compact_zone_order(zone, - order, - sc.gfp_mask, false, - COMPACT_MODE_KSWAPD); - compaction = 1; - } - if (zone->all_unreclaimable) continue; - if (!compaction && nr_slab == 0 && + if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* -- cgit v1.2.3 From 9d502c1c8d47b337c378c2ac8eaeee7918ad16b1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:30:39 -0700 Subject: mm/compaction: check migrate_pages's return value instead of list_empty() Many migrate_page's caller check return value instead of list_empy by cf608ac19c ("mm: compaction: fix COMPACTPAGEFAILED counting"). This patch makes compaction's migrate_pages consistent with others. This patch should not change old behavior. Signed-off-by: Minchan Kim Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index dcb058bd76c4..38ce48805c08 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -494,12 +494,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { unsigned long nr_migrate, nr_remaining; + int err; if (!isolate_migratepages(zone, cc)) continue; nr_migrate = cc->nr_migratepages; - migrate_pages(&cc->migratepages, compaction_alloc, + err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, cc->sync); update_nr_listpages(cc); @@ -513,7 +514,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_remaining); /* Release LRU pages not migrated */ - if (!list_empty(&cc->migratepages)) { + if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; } -- cgit v1.2.3 From ddd588b5dd55f14320379961e47683db4e4c1d90 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:46 -0700 Subject: oom: suppress nodes that are not allowed from meminfo on oom kill The oom killer is extremely verbose for machines with a large number of cpus and/or nodes. This verbosity can often be harmful if it causes other important messages to be scrolled from the kernel log and incurs a signicant time delay, specifically for kernels with CONFIG_NODES_SHIFT > 8. This patch causes only memory information to be displayed for nodes that are allowed by current's cpuset when dumping the VM state. Information for all other nodes is irrelevant to the oom condition; we don't care if there's an abundance of memory elsewhere if we can't access it. This only affects the behavior of dumping memory information when an oom is triggered. Other dumps, such as for sysrq+m, still display the unfiltered form when using the existing show_mem() interface. Additionally, the per-cpu pageset statistics are extremely verbose in oom killer output, so it is now suppressed. This removes nodes_weight(current->mems_allowed) * (1 + nr_cpus) lines from the oom killer output. Callers may use __show_mem(SHOW_MEM_FILTER_NODES) to filter disallowed nodes. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- mm/page_alloc.c | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 33b58615072c..3100bc57036b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -406,7 +406,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); - show_mem(); + __show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(mem, nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7945247b1e53..36be3ba4bbed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2411,19 +2411,42 @@ void si_meminfo_node(struct sysinfo *val, int nid) } #endif +/* + * Determine whether the zone's node should be displayed or not, depending on + * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). + */ +static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) +{ + bool ret = false; + + if (!(flags & SHOW_MEM_FILTER_NODES)) + goto out; + + get_mems_allowed(); + ret = !node_isset(zone->zone_pgdat->node_id, + cpuset_current_mems_allowed); + put_mems_allowed(); +out: + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. + * Suppresses nodes that are not allowed by current's cpuset if + * SHOW_MEM_FILTER_NODES is passed. */ -void show_free_areas(void) +void __show_free_areas(unsigned int filter) { int cpu; struct zone *zone; for_each_populated_zone(zone) { + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -2465,6 +2488,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { int i; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s" " free:%lukB" @@ -2532,6 +2557,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s: ", zone->name); @@ -2551,6 +2578,11 @@ void show_free_areas(void) show_swap_cache_info(); } +void show_free_areas(void) +{ + __show_free_areas(0); +} + static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; -- cgit v1.2.3 From 29423e77c06cee7d4e335ef4a7cbd949da978c91 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:47 -0700 Subject: oom: suppress show_mem() for many nodes in irq context on page alloc failure When a page allocation failure occurs, show_mem() is called to dump the state of the VM so users may understand what happened to get into that condition. This output, however, can be extremely verbose. In irq context, it may result in significant delays that incur NMI watchdog timeouts when the machine is large (we use CONFIG_NODES_SHIFT > 8 here to define a "large" machine since the length of the show_mem() output is proportional to the number of possible nodes). This patch suppresses the show_mem() call in irq context when the kernel has CONFIG_NODES_SHIFT > 8. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36be3ba4bbed..2aaafe82f513 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1714,6 +1714,20 @@ try_next_zone: return page; } +/* + * Large machines with many possible nodes should not always dump per-node + * meminfo in irq context. + */ +static inline bool should_suppress_show_mem(void) +{ + bool ret = false; + +#if NODES_SHIFT > 8 + ret = in_interrupt(); +#endif + return ret; +} + static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed) @@ -2161,7 +2175,8 @@ nopage: " order:%d, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); - show_mem(); + if (!should_suppress_show_mem()) + show_mem(); } return page; got_pg: -- cgit v1.2.3 From cbf978bfb12d7deca97d7333f65eda0381a072de Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:48 -0700 Subject: oom: suppress nodes that are not allowed from meminfo on page alloc failure Displaying extremely verbose meminfo for all nodes on the system is overkill for page allocation failures when the context restricts that allocation to only a subset of nodes. We don't particularly care about the state of all nodes when some are not allowed in the current context, they can have an abundance of memory but we can't allocate from that part of memory. This patch suppresses disallowed nodes from the meminfo dump on a page allocation failure if the context requires it. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2aaafe82f513..36a168e383b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2171,12 +2171,25 @@ rebalance: nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - printk(KERN_WARNING "%s: page allocation failure." - " order:%d, mode:0x%x\n", + unsigned int filter = SHOW_MEM_FILTER_NODES; + + /* + * This documents exceptions given to allocations in certain + * contexts that are allowed to allocate outside current's set + * of allowed nodes. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (test_thread_flag(TIF_MEMDIE) || + (current->flags & (PF_MEMALLOC | PF_EXITING))) + filter &= ~SHOW_MEM_FILTER_NODES; + if (in_interrupt() || !wait) + filter &= ~SHOW_MEM_FILTER_NODES; + + pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); if (!should_suppress_show_mem()) - show_mem(); + __show_mem(filter); } return page; got_pg: -- cgit v1.2.3 From 5fda1bd5b8869574dad8e1f9f71e23bf0c186274 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 22 Mar 2011 16:30:49 -0700 Subject: mm: notifier_from_errno() cleanup While looking at some other notifier callbacks I noticed this code could use a simple cleanup. notifier_from_errno() no longer needs the if (ret)/else conditional. That same conditional is now done in notifier_from_errno(). Signed-off-by: Prarit Bhargava Cc: Paul Menage Cc: Li Zefan Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 7 +------ mm/slab.c | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5bffada7cde1..59a3cd4c799d 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -243,12 +243,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, break; } - if (ret) - ret = notifier_from_errno(ret); - else - ret = NOTIFY_OK; - - return ret; + return notifier_from_errno(ret); } #endif diff --git a/mm/slab.c b/mm/slab.c index a18ba57517af..568803f121a8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1390,7 +1390,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self, break; } out: - return ret ? notifier_from_errno(ret) : NOTIFY_OK; + return notifier_from_errno(ret); } #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3 From 318b275fbca1ab9ec0862de71420e0e92c3d1aa7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 22 Mar 2011 16:30:51 -0700 Subject: mm: allow GUP to fail instead of waiting on a page GUP user may want to try to acquire a reference to a page if it is already in memory, but not if IO, to bring it in, is needed. For example KVM may tell vcpu to schedule another guest process if current one is trying to access swapped out page. Meanwhile, the page will be swapped in and the guest process, that depends on it, will be able to run again. This patch adds FAULT_FLAG_RETRY_NOWAIT (suggested by Linus) and FOLL_NOWAIT follow_page flags. FAULT_FLAG_RETRY_NOWAIT, when used in conjunction with VM_FAULT_ALLOW_RETRY, indicates to handle_mm_fault that it shouldn't drop mmap_sem and wait on a page, but return VM_FAULT_RETRY instead. [akpm@linux-foundation.org: improve FOLL_NOWAIT comment] Signed-off-by: Gleb Natapov Cc: Linus Torvalds Cc: Hugh Dickins Acked-by: Rik van Riel Cc: Michel Lespinasse Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++++-- mm/memory.c | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 83a45d35468b..312b6eb78430 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -621,8 +621,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, __lock_page(page); return 1; } else { - up_read(&mm->mmap_sem); - wait_on_page_locked(page); + if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + } return 0; } } diff --git a/mm/memory.c b/mm/memory.c index e48945ab362b..615be5127ce1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1569,6 +1569,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, fault_flags |= FAULT_FLAG_WRITE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (foll_flags & FOLL_NOWAIT) + fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); ret = handle_mm_fault(mm, vma, start, fault_flags); @@ -1595,7 +1597,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, tsk->min_flt++; if (ret & VM_FAULT_RETRY) { - *nonblocking = 0; + if (nonblocking) + *nonblocking = 0; return i; } -- cgit v1.2.3 From ef6a3c63112e865d632ff7c478ba7c7160cad0d1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 22 Mar 2011 16:30:52 -0700 Subject: mm: add replace_page_cache_page() function This function basically does: remove_from_page_cache(old); page_cache_release(old); add_to_page_cache_locked(new); Except it does this atomically, so there's no possibility for the "add" to fail because of a race. If memory cgroups are enabled, then the memory cgroup charge is also moved from the old page to the new. This function is currently used by fuse to move pages into the page cache on read, instead of copying the page contents. [minchan.kim@gmail.com: add freepage() hook to replace_page_cache_page()] Signed-off-by: Miklos Szeredi Acked-by: Rik van Riel Acked-by: KAMEZAWA Hiroyuki Cc: Mel Gorman Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 4 ++-- mm/migrate.c | 2 +- 3 files changed, 73 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 312b6eb78430..c1459f2cdb5e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -386,6 +386,76 @@ int filemap_write_and_wait_range(struct address_space *mapping, } EXPORT_SYMBOL(filemap_write_and_wait_range); +/** + * replace_page_cache_page - replace a pagecache page with a new one + * @old: page to be replaced + * @new: page to replace with + * @gfp_mask: allocation mode + * + * This function replaces a page in the pagecache with a new one. On + * success it acquires the pagecache reference for the new page and + * drops it for the old page. Both the old and new pages must be + * locked. This function does not add the new page to the LRU, the + * caller must do that. + * + * The remove + add is atomic. The only way this function can fail is + * memory allocation failure. + */ +int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +{ + int error; + struct mem_cgroup *memcg = NULL; + + VM_BUG_ON(!PageLocked(old)); + VM_BUG_ON(!PageLocked(new)); + VM_BUG_ON(new->mapping); + + /* + * This is not page migration, but prepare_migration and + * end_migration does enough work for charge replacement. + * + * In the longer term we probably want a specialized function + * for moving the charge from old to new in a more efficient + * manner. + */ + error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); + if (error) + return error; + + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + if (!error) { + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *); + + pgoff_t offset = old->index; + freepage = mapping->a_ops->freepage; + + page_cache_get(new); + new->mapping = mapping; + new->index = offset; + + spin_lock_irq(&mapping->tree_lock); + __remove_from_page_cache(old); + error = radix_tree_insert(&mapping->page_tree, offset, new); + BUG_ON(error); + mapping->nrpages++; + __inc_zone_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(new)) + __inc_zone_page_state(new, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + radix_tree_preload_end(); + if (freepage) + freepage(old); + page_cache_release(old); + mem_cgroup_end_migration(memcg, old, new, true); + } else { + mem_cgroup_end_migration(memcg, old, new, false); + } + + return error; +} +EXPORT_SYMBOL_GPL(replace_page_cache_page); + /** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da53a252b259..6ef5c53dffcb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2883,7 +2883,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. */ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr) + struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; @@ -2940,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { diff --git a/mm/migrate.c b/mm/migrate.c index 352de555626c..8aacce3af8cd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -678,7 +678,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* charge against new page */ - charge = mem_cgroup_prepare_migration(page, newpage, &mem); + charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); if (charge == -ENOMEM) { rc = -ENOMEM; goto unlock; -- cgit v1.2.3 From 97cecb5a254fec22d28ef32235d888bfbfd7c783 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:30:53 -0700 Subject: mm: introduce delete_from_page_cache() Presently we increase the page refcount in add_to_page_cache() but don't decrease it in remove_from_page_cache(). Such asymmetry adds confusion, requiring that callers notice it and a comment explaining why they release a page reference. It's not a good API. A long time ago, Hugh tried it (http://lkml.org/lkml/2004/10/24/140) but gave up because reiser4's drop_page() had to unlock the page between removing it from page cache and doing the page_cache_release(). But now the situation is changed. I think at least things in current mainline don't have any obstacles. The problem is for out-of-mainline filesystems - if they have done such things as reiser4, this patch could be a problem but they will discover this at compile time since we remove remove_from_page_cache(). This patch: This function works as just wrapper remove_from_page_cache(). The difference is that it decreases page references in itself. So caller have to make sure it has a page reference before calling. This patch is ready for removing remove_from_page_cache(). Signed-off-by: Minchan Kim Cc: Christoph Hellwig Acked-by: Hugh Dickins Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Cc: Edward Shishkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c1459f2cdb5e..e7b59785ceb9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -155,6 +155,22 @@ void remove_from_page_cache(struct page *page) } EXPORT_SYMBOL(remove_from_page_cache); +/** + * delete_from_page_cache - delete page from page cache + * @page: the page which the kernel is trying to remove from page cache + * + * This must be called only on pages that have + * been verified to be in the page cache and locked. + * It will never put the page into the free list, + * the caller has a reference on the page. + */ +void delete_from_page_cache(struct page *page) +{ + remove_from_page_cache(page); + page_cache_release(page); +} +EXPORT_SYMBOL(delete_from_page_cache); + static int sync_page(void *word) { struct address_space *mapping; -- cgit v1.2.3 From 4c73b1bc6bb14aab7888ebe6bffe957cf7c07fa0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:40 -0700 Subject: mm: shmem: change remove_from_page_cache This patch series changes remove_from_page_cache()'s page ref counting rule. Page cache ref count is decreased in delete_from_page_cache(). So we don't need to decrease the page reference in callers. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 048a95a5244d..88593586bdb7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1081,7 +1081,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_recalc_inode(inode); if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - remove_from_page_cache(page); + delete_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); if (list_empty(&info->swaplist)) @@ -1091,7 +1091,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock(&info->lock); swap_shmem_alloc(swap); BUG_ON(page_mapped(page)); - page_cache_release(page); /* pagecache ref */ swap_writepage(page, wbc); if (inode) { mutex_lock(&shmem_swaplist_mutex); -- cgit v1.2.3 From 5adc7b518b54f7af2b8395d2035898340d96b1d5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:41 -0700 Subject: mm: truncate: change remove_from_page_cache This patch series changes remove_from_page_cache()'s page ref counting rule. Page cache ref count is decreased in delete_from_page_cache(). So we don't need to decrease the page reference in callers. Signed-off-by: Minchan Kim Cc: Dan Magenheimer Cc: Andi Kleen Cc: Nick Piggin Cc: Al Viro Acked-by: Hugh Dickins Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index d64296be00d3..7c617b5f03ee 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -106,9 +106,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); clear_page_mlock(page); - remove_from_page_cache(page); ClearPageMappedToDisk(page); - page_cache_release(page); /* pagecache ref */ + delete_from_page_cache(page); return 0; } -- cgit v1.2.3 From 702cfbf93aaf3a091b0c64c8766c1ade0a820c38 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:43 -0700 Subject: mm: goodbye remove_from_page_cache() Now delete_from_page_cache() replaces remove_from_page_cache(). So we remove remove_from_page_cache so fs or something out of mainline will notice it when compile time and can fix it. Signed-off-by: Minchan Kim Cc: Christoph Hellwig Acked-by: Hugh Dickins Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index e7b59785ceb9..1cfb8fd84b27 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -137,7 +137,15 @@ void __remove_from_page_cache(struct page *page) } } -void remove_from_page_cache(struct page *page) +/** + * delete_from_page_cache - delete page from page cache + * @page: the page which the kernel is trying to remove from page cache + * + * This must be called only on pages that have been verified to be in the page + * cache and locked. It will never put the page into the free list, the caller + * has a reference on the page. + */ +void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; void (*freepage)(struct page *); @@ -152,21 +160,6 @@ void remove_from_page_cache(struct page *page) if (freepage) freepage(page); -} -EXPORT_SYMBOL(remove_from_page_cache); - -/** - * delete_from_page_cache - delete page from page cache - * @page: the page which the kernel is trying to remove from page cache - * - * This must be called only on pages that have - * been verified to be in the page cache and locked. - * It will never put the page into the free list, - * the caller has a reference on the page. - */ -void delete_from_page_cache(struct page *page) -{ - remove_from_page_cache(page); page_cache_release(page); } EXPORT_SYMBOL(delete_from_page_cache); -- cgit v1.2.3 From e64a782fec684c29a8204c51b3cb554dce588592 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:44 -0700 Subject: mm: change __remove_from_page_cache() Now we renamed remove_from_page_cache with delete_from_page_cache. As consistency of __remove_from_swap_cache and remove_from_swap_cache, we change internal page cache handling function name, too. Signed-off-by: Minchan Kim Cc: Christoph Hellwig Acked-by: Hugh Dickins Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++---- mm/memory-failure.c | 2 +- mm/truncate.c | 2 +- mm/vmscan.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1cfb8fd84b27..cb476e70cf19 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -108,11 +108,11 @@ */ /* - * Remove a page from the page cache and free it. Caller has to make + * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __remove_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; @@ -154,7 +154,7 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -444,7 +444,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->index = offset; spin_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(old); + __delete_from_page_cache(old); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 99ccb4472623..e0af336530c6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1130,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) /* * Now take care of user space mappings. - * Abort on fail: __remove_from_page_cache() assumes unmapped page. + * Abort on fail: __delete_from_page_cache() assumes unmapped page. */ if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); diff --git a/mm/truncate.c b/mm/truncate.c index 7c617b5f03ee..3d2ae1f423dc 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -388,7 +388,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) clear_page_mlock(page); BUG_ON(page_has_private(page)); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3b4a41d72489..665b090b6c72 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) freepage = mapping->a_ops->freepage; - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); -- cgit v1.2.3 From 1d16871d8c96deadc5f9753b6b096074f2cbcbe1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 22 Mar 2011 16:32:45 -0700 Subject: mm: batch-free pcp list if possible free_pcppages_bulk() frees pages from pcp lists in a round-robin fashion by keeping batch_free counter. But it doesn't need to spin if there is only one non-empty list. This can be checked by batch_free == MIGRATE_PCPTYPES. [akpm@linux-foundation.org: fix comment] Signed-off-by: Namhyung Kim Acked-by: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36a168e383b5..426056aff12a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -614,6 +614,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, list = &pcp->lists[migratetype]; } while (list_empty(list)); + /* This is the only non-empty list. Free them all. */ + if (batch_free == MIGRATE_PCPTYPES) + batch_free = to_free; + do { page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ -- cgit v1.2.3 From 7bc32f6f90dae67730645da67bfd44304f810f93 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 22 Mar 2011 16:32:46 -0700 Subject: mm: debug-pagealloc: fix kconfig dependency warning Fix kconfig dependency warning to satisfy dependencies: warning: (PAGE_POISONING) selects DEBUG_PAGEALLOC which has unmet direct dependencies (DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC && (!HIBERNATION || !PPC && !SPARC) && !KMEMCHECK) Signed-off-by: Akinobu Mita Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index af7cfb43d2f0..8b1a477162dc 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,27 +1,24 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on !HIBERNATION || !PPC && !SPARC + depends on DEBUG_KERNEL + depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK + select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types of memory corruption. + For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, + fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). Additionally, + this option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. + config WANT_PAGE_DEBUG_FLAGS bool config PAGE_POISONING - bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on !HIBERNATION - select DEBUG_PAGEALLOC + bool select WANT_PAGE_DEBUG_FLAGS - ---help--- - Fill the pages with poison patterns after free_pages() and verify - the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruption. - - This option cannot be enabled in combination with hibernation as - that would result in incorrect warnings of memory corruption after - a resume because free pages are not saved to the suspend image. -- cgit v1.2.3 From 9e60109f125013b6c571f399a15a8b0fe1ffa4e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Mar 2011 16:32:46 -0700 Subject: mm: rename drop_anon_vma() to put_anon_vma() The normal code pattern used in the kernel is: get/put. Signed-off-by: Peter Zijlstra Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Reviewed-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 23 +++++------------------ mm/migrate.c | 4 ++-- mm/rmap.c | 4 ++-- 3 files changed, 9 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index c2b2a94f9d67..1bbe785aa559 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -301,20 +301,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) return rmap_item->address & STABLE_FLAG; } -static void hold_anon_vma(struct rmap_item *rmap_item, - struct anon_vma *anon_vma) -{ - rmap_item->anon_vma = anon_vma; - get_anon_vma(anon_vma); -} - -static void ksm_drop_anon_vma(struct rmap_item *rmap_item) -{ - struct anon_vma *anon_vma = rmap_item->anon_vma; - - drop_anon_vma(anon_vma); -} - /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -397,7 +383,7 @@ static void break_cow(struct rmap_item *rmap_item) * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) @@ -466,7 +452,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } @@ -554,7 +540,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) else ksm_pages_shared--; - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -949,7 +935,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, goto out; /* Must get reference to anon_vma while still holding mmap_sem */ - hold_anon_vma(rmap_item, vma->anon_vma); + rmap_item->anon_vma = vma->anon_vma; + get_anon_vma(vma->anon_vma); out: up_read(&mm->mmap_sem); return err; diff --git a/mm/migrate.c b/mm/migrate.c index 8aacce3af8cd..7d2983f3783e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -764,7 +764,7 @@ skip_unmap: /* Drop an anon_vma reference if we took one */ if (anon_vma) - drop_anon_vma(anon_vma); + put_anon_vma(anon_vma); uncharge: if (!charge) @@ -856,7 +856,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, remove_migration_ptes(hpage, hpage); if (anon_vma) - drop_anon_vma(anon_vma); + put_anon_vma(anon_vma); out: unlock_page(hpage); diff --git a/mm/rmap.c b/mm/rmap.c index 941bf82e8961..ad416afb2061 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -278,7 +278,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) if (empty) { /* We no longer need the root anon_vma */ if (anon_vma->root != anon_vma) - drop_anon_vma(anon_vma->root); + put_anon_vma(anon_vma->root); anon_vma_free(anon_vma); } } @@ -1493,7 +1493,7 @@ int try_to_munlock(struct page *page) * we know we are the last user, nobody else can get a reference and we * can do the freeing without the lock. */ -void drop_anon_vma(struct anon_vma *anon_vma) +void put_anon_vma(struct anon_vma *anon_vma) { BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { -- cgit v1.2.3 From 83813267c699ab11cc65a6d9d0f42db42f0862b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Mar 2011 16:32:48 -0700 Subject: mm: move anon_vma ref out from under CONFIG_foo We need the anon_vma refcount unconditionally to simplify the anon_vma lifetime rules. Signed-off-by: Peter Zijlstra Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index ad416afb2061..873cd9ef912c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -272,7 +272,7 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); + empty = list_empty(&anon_vma->head) && !atomic_read(&anon_vma->refcount); anon_vma_unlock(anon_vma); if (empty) { @@ -303,7 +303,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); - anonvma_external_refcount_init(anon_vma); + atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -1486,7 +1486,6 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } -#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) /* * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root * if necessary. Be careful to do all the tests under the lock. Once @@ -1495,8 +1494,8 @@ int try_to_munlock(struct page *page) */ void put_anon_vma(struct anon_vma *anon_vma) { - BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { + BUG_ON(atomic_read(&anon_vma->refcount) <= 0); + if (atomic_dec_and_lock(&anon_vma->refcount, &anon_vma->root->lock)) { struct anon_vma *root = anon_vma->root; int empty = list_empty(&anon_vma->head); int last_root_user = 0; @@ -1507,8 +1506,8 @@ void put_anon_vma(struct anon_vma *anon_vma) * the refcount on the root and check if we need to free it. */ if (empty && anon_vma != root) { - BUG_ON(atomic_read(&root->external_refcount) <= 0); - last_root_user = atomic_dec_and_test(&root->external_refcount); + BUG_ON(atomic_read(&root->refcount) <= 0); + last_root_user = atomic_dec_and_test(&root->refcount); root_empty = list_empty(&root->head); } anon_vma_unlock(anon_vma); @@ -1520,7 +1519,6 @@ void put_anon_vma(struct anon_vma *anon_vma) } } } -#endif #ifdef CONFIG_MIGRATION /* -- cgit v1.2.3 From 01d8b20dec5f4019283e244aba50ba86fe6ead6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Mar 2011 16:32:49 -0700 Subject: mm: simplify anon_vma refcounts This patch changes the anon_vma refcount to be 0 when the object is free. It does this by adding 1 ref to being in use in the anon_vma structure (iow. the anon_vma->head list is not empty). This allows a simpler release scheme without having to check both the refcount and the list as well as avoids taking a ref for each entry on the list. Signed-off-by: Peter Zijlstra Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 76 +++++++++++++++++++++++---------------------------------------- 1 file changed, 28 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 873cd9ef912c..4a8e99a0fb97 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -67,11 +67,24 @@ static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { - return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); + struct anon_vma *anon_vma; + + anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); + if (anon_vma) { + atomic_set(&anon_vma->refcount, 1); + /* + * Initialise the anon_vma root to point to itself. If called + * from fork, the root will be reset to the parents anon_vma. + */ + anon_vma->root = anon_vma; + } + + return anon_vma; } -void anon_vma_free(struct anon_vma *anon_vma) +static inline void anon_vma_free(struct anon_vma *anon_vma) { + VM_BUG_ON(atomic_read(&anon_vma->refcount)); kmem_cache_free(anon_vma_cachep, anon_vma); } @@ -133,11 +146,6 @@ int anon_vma_prepare(struct vm_area_struct *vma) if (unlikely(!anon_vma)) goto out_enomem_free_avc; allocated = anon_vma; - /* - * This VMA had no anon_vma yet. This anon_vma is - * the root of any anon_vma tree that might form. - */ - anon_vma->root = anon_vma; } anon_vma_lock(anon_vma); @@ -156,7 +164,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) anon_vma_unlock(anon_vma); if (unlikely(allocated)) - anon_vma_free(allocated); + put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); } @@ -241,9 +249,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) */ anon_vma->root = pvma->anon_vma->root; /* - * With KSM refcounts, an anon_vma can stay around longer than the - * process it belongs to. The root anon_vma needs to be pinned - * until this anon_vma is freed, because the lock lives in the root. + * With refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned until + * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ @@ -253,7 +261,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) return 0; out_error_free_anon_vma: - anon_vma_free(anon_vma); + put_anon_vma(anon_vma); out_error: unlink_anon_vmas(vma); return -ENOMEM; @@ -272,15 +280,11 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !atomic_read(&anon_vma->refcount); + empty = list_empty(&anon_vma->head); anon_vma_unlock(anon_vma); - if (empty) { - /* We no longer need the root anon_vma */ - if (anon_vma->root != anon_vma) - put_anon_vma(anon_vma->root); - anon_vma_free(anon_vma); - } + if (empty) + put_anon_vma(anon_vma); } void unlink_anon_vmas(struct vm_area_struct *vma) @@ -1486,38 +1490,14 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } -/* - * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root - * if necessary. Be careful to do all the tests under the lock. Once - * we know we are the last user, nobody else can get a reference and we - * can do the freeing without the lock. - */ -void put_anon_vma(struct anon_vma *anon_vma) +void __put_anon_vma(struct anon_vma *anon_vma) { - BUG_ON(atomic_read(&anon_vma->refcount) <= 0); - if (atomic_dec_and_lock(&anon_vma->refcount, &anon_vma->root->lock)) { - struct anon_vma *root = anon_vma->root; - int empty = list_empty(&anon_vma->head); - int last_root_user = 0; - int root_empty = 0; + struct anon_vma *root = anon_vma->root; - /* - * The refcount on a non-root anon_vma got dropped. Drop - * the refcount on the root and check if we need to free it. - */ - if (empty && anon_vma != root) { - BUG_ON(atomic_read(&root->refcount) <= 0); - last_root_user = atomic_dec_and_test(&root->refcount); - root_empty = list_empty(&root->head); - } - anon_vma_unlock(anon_vma); + if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + anon_vma_free(root); - if (empty) { - anon_vma_free(anon_vma); - if (root_empty && last_root_user) - anon_vma_free(root); - } - } + anon_vma_free(anon_vma); } #ifdef CONFIG_MIGRATION -- cgit v1.2.3 From 315601809d124d046abd6c3ffa346d0dbd7aa29d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:52 -0700 Subject: mm: deactivate invalidated pages Recently, there are reported problem about thrashing. (http://marc.info/?l=rsync&m=128885034930933&w=2) It happens by backup workloads(ex, nightly rsync). That's because the workload makes just use-once pages and touches pages twice. It promotes the page into active list so that it results in working set page eviction. Some app developer want to support POSIX_FADV_NOREUSE. But other OSes don't support it, either. (http://marc.info/?l=linux-mm&m=128928979512086&w=2) By other approach, app developers use POSIX_FADV_DONTNEED. But it has a problem. If kernel meets page is writing during invalidate_mapping_pages, it can't work. It makes for application programmer to use it since they always have to sync data before calling fadivse(..POSIX_FADV_DONTNEED) to make sure the pages could be discardable. At last, they can't use deferred write of kernel so that they could see performance loss. (http://insights.oetiker.ch/linux/fadvise.html) In fact, invalidation is very big hint to reclaimer. It means we don't use the page any more. So let's move the writing page into inactive list's head if we can't truncate it right now. Why I move page to head of lru on this patch, Dirty/Writeback page would be flushed sooner or later. It can prevent writeout of pageout which is less effective than flusher's writeout. Originally, I reused lru_demote of Peter with some change so added his Signed-off-by. Signed-off-by: Minchan Kim Reported-by: Ben Gamari Signed-off-by: Peter Zijlstra Acked-by: Rik van Riel Acked-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Acked-by: Johannes Weiner Cc: Nick Piggin Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/truncate.c | 17 +++++++++---- 2 files changed, 90 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index c02f93611a84..4aea806d0d44 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -39,6 +39,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -346,6 +347,60 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } +/* + * If the page can not be invalidated, it is moved to the + * inactive list to speed up its reclaim. It is moved to the + * head of the list, rather than the tail, to give the flusher + * threads some time to write it out, as this is much more + * effective than the single-page writeout from reclaim. + */ +static void lru_deactivate(struct page *page, struct zone *zone) +{ + int lru, file; + + if (!PageLRU(page) || !PageActive(page)) + return; + + /* Some processes are using the page */ + if (page_mapped(page)) + return; + + file = page_is_file_cache(page); + lru = page_lru_base_type(page); + del_page_from_lru_list(zone, page, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(zone, page, lru); + __count_vm_event(PGDEACTIVATE); + + update_page_reclaim_stat(zone, page, file, 0); +} + +static void ____pagevec_lru_deactivate(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + lru_deactivate(page, zone); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + + /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -372,6 +427,29 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + ____pagevec_lru_deactivate(pvec); +} + +/** + * deactivate_page - forcefully deactivate a page + * @page: page to deactivate + * + * This function hints the VM that @page is a good reclaim candidate, + * for example if its invalidation fails due to the page being dirty + * or under writeback. + */ +void deactivate_page(struct page *page) +{ + if (likely(get_page_unless_zero(page))) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + if (!pagevec_add(pvec, page)) + ____pagevec_lru_deactivate(pvec); + put_cpu_var(lru_deactivate_pvecs); + } } void lru_add_drain(void) diff --git a/mm/truncate.c b/mm/truncate.c index 3d2ae1f423dc..a95667529135 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -321,11 +321,12 @@ EXPORT_SYMBOL(truncate_inode_pages); * pagetables. */ unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; - unsigned long ret = 0; + unsigned long ret; + unsigned long count = 0; int i; pagevec_init(&pvec, 0); @@ -352,9 +353,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - ret += invalidate_inode_page(page); - + ret = invalidate_inode_page(page); unlock_page(page); + /* + * Invalidation is a hint that the page is no longer + * of interest and try to speed up its reclaim. + */ + if (!ret) + deactivate_page(page); + count += ret; if (next > end) break; } @@ -362,7 +369,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, mem_cgroup_uncharge_end(); cond_resched(); } - return ret; + return count; } EXPORT_SYMBOL(invalidate_mapping_pages); -- cgit v1.2.3 From 3f58a82943337fb6e79acfa5346719a97d3c0b98 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:53 -0700 Subject: memcg: move memcg reclaimable page into tail of inactive list The rotate_reclaimable_page function moves just written out pages, which the VM wanted to reclaim, to the end of the inactive list. That way the VM will find those pages first next time it needs to free memory. This patch applies the rule in memcg. It can help to prevent unnecessary working page eviction of memcg. Signed-off-by: Minchan Kim Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Cc: KOSAKI Motohiro Acked-by: Johannes Weiner Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 ++++++++++++++++++++++++++ mm/swap.c | 3 ++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ef5c53dffcb..9e0f05efd114 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -829,6 +829,32 @@ void mem_cgroup_del_lru(struct page *page) mem_cgroup_del_lru_list(page, page_lru(page)); } +/* + * Writeback is about to end against a page which has been marked for immediate + * reclaim. If it still appears to be reclaimable, move it to the tail of the + * inactive list. + */ +void mem_cgroup_rotate_reclaimable_page(struct page *page) +{ + struct mem_cgroup_per_zone *mz; + struct page_cgroup *pc; + enum lru_list lru = page_lru(page); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(page); + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + mz = page_cgroup_zoneinfo(pc); + list_move_tail(&pc->lru, &mz->lists[lru]); +} + void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) { struct mem_cgroup_per_zone *mz; diff --git a/mm/swap.c b/mm/swap.c index 4aea806d0d44..1b9e4ebaffc8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -200,8 +200,9 @@ static void pagevec_move_tail(struct pagevec *pvec) spin_lock(&zone->lru_lock); } if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); + enum lru_list lru = page_lru_base_type(page); list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); pgmoved++; } } -- cgit v1.2.3 From 278df9f451dc71dcd002246be48358a473504ad0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2011 16:32:54 -0700 Subject: mm: reclaim invalidated page ASAP invalidate_mapping_pages is very big hint to reclaimer. It means user doesn't want to use the page any more. So in order to prevent working set page eviction, this patch move the page into tail of inactive list by PG_reclaim. Please, remember that pages in inactive list are working set as well as active list. If we don't move pages into inactive list's tail, pages near by tail of inactive list can be evicted although we have a big clue about useless pages. It's totally bad. Now PG_readahead/PG_reclaim is shared. fe3cba17 added ClearPageReclaim into clear_page_dirty_for_io for preventing fast reclaiming readahead marker page. In this series, PG_reclaim is used by invalidated page, too. If VM find the page is invalidated and it's dirty, it sets PG_reclaim to reclaim asap. Then, when the dirty page will be writeback, clear_page_dirty_for_io will clear PG_reclaim unconditionally. It disturbs this serie's goal. I think it's okay to clear PG_readahead when the page is dirty, not writeback time. So this patch moves ClearPageReadahead. In v4, ClearPageReadahead in set_page_dirty has a problem which is reported by Steven Barrett. It's due to compound page. Some driver(ex, audio) calls set_page_dirty with compound page which isn't on LRU. but my patch does ClearPageRelcaim on compound page. In non-CONFIG_PAGEFLAGS_EXTENDED, it breaks PageTail flag. I think it doesn't affect THP and pass my test with THP enabling but Cced Andrea for double check. Signed-off-by: Minchan Kim Reported-by: Steven Barrett Reviewed-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: Wu Fengguang Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 12 +++++++++++- mm/swap.c | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2cb01f6ec5d0..b437fe6257b0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1211,6 +1211,17 @@ int set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; + /* + * readahead/lru_deactivate_page could remain + * PG_readahead/PG_reclaim due to race with end_page_writeback + * About readahead, if the page is written, the flags would be + * reset. So no problem. + * About lru_deactivate_page, if the page is redirty, the flag + * will be reset. So no problem. but if the page is used by readahead + * it will confuse readahead and make it restart the size rampup + * process. But it's a trivial problem. + */ + ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; @@ -1266,7 +1277,6 @@ int clear_page_dirty_for_io(struct page *page) BUG_ON(!PageLocked(page)); - ClearPageReclaim(page); if (mapping && mapping_cap_account_dirty(mapping)) { /* * Yes, Virginia, this is indeed insane. diff --git a/mm/swap.c b/mm/swap.c index 1b9e4ebaffc8..0a33714a7cba 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -354,26 +354,61 @@ void add_page_to_unevictable_list(struct page *page) * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. + * + * If the page isn't page_mapped and dirty/writeback, the page + * could reclaim asap using PG_reclaim. + * + * 1. active, mapped page -> none + * 2. active, dirty/writeback page -> inactive, head, PG_reclaim + * 3. inactive, mapped page -> none + * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim + * 5. inactive, clean -> inactive, tail + * 6. Others -> none + * + * In 4, why it moves inactive's head, the VM expects the page would + * be write it out by flusher threads as this is much more effective + * than the single-page writeout from reclaim. */ static void lru_deactivate(struct page *page, struct zone *zone) { int lru, file; + bool active; - if (!PageLRU(page) || !PageActive(page)) + if (!PageLRU(page)) return; /* Some processes are using the page */ if (page_mapped(page)) return; + active = PageActive(page); + file = page_is_file_cache(page); lru = page_lru_base_type(page); - del_page_from_lru_list(zone, page, lru + LRU_ACTIVE); + del_page_from_lru_list(zone, page, lru + active); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGDEACTIVATE); + if (PageWriteback(page) || PageDirty(page)) { + /* + * PG_reclaim could be raced with end_page_writeback + * It can make readahead confusing. But race window + * is _really_ small and it's non-critical problem. + */ + SetPageReclaim(page); + } else { + /* + * The page's writeback ends up during pagevec + * We moves tha page into tail of inactive. + */ + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + __count_vm_event(PGROTATED); + } + + if (active) + __count_vm_event(PGDEACTIVATE); update_page_reclaim_stat(zone, page, file, 0); } -- cgit v1.2.3 From 033193275b3ffcfe7f3fde7b569f3d207f6cd6a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Mar 2011 16:32:56 -0700 Subject: pagewalk: only split huge pages when necessary Right now, if a mm_walk has either ->pte_entry or ->pmd_entry set, it will unconditionally split any transparent huge pages it runs in to. In practice, that means that anyone doing a cat /proc/$pid/smaps will unconditionally break down every huge page in the process and depend on khugepaged to re-collapse it later. This is fairly suboptimal. This patch changes that behavior. It teaches each ->pmd_entry handler (there are five) that they must break down the THPs themselves. Also, the _generic_ code will never break down a THP unless a ->pte_entry handler is actually set. This means that the ->pmd_entry handlers can now choose to deal with THPs without breaking them down. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Hansen Acked-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Eric B Munson Tested-by: Eric B Munson Cc: Michael J Wolf Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Matt Mackall Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- mm/pagewalk.c | 24 ++++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e0f05efd114..e1ee6ad9c971 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4763,7 +4763,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - VM_BUG_ON(pmd_trans_huge(*pmd)); + split_huge_page_pmd(walk->mm, pmd); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4925,8 +4926,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + split_huge_page_pmd(walk->mm, pmd); retry: - VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = *(pte++); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7cfa6ae02303..c3450d533611 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -33,19 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { +again: next = pmd_addr_end(addr, end); - split_huge_page_pmd(walk->mm, pmd); - if (pmd_none_or_clear_bad(pmd)) { + if (pmd_none(*pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; continue; } + /* + * This implies that each ->pmd_entry() handler + * needs to know about pmd_trans_huge() pmds + */ if (walk->pmd_entry) err = walk->pmd_entry(pmd, addr, next, walk); - if (!err && walk->pte_entry) - err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; + + /* + * Check this here so we only break down trans_huge + * pages when we _need_ to + */ + if (!walk->pte_entry) + continue; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_none_or_clear_bad(pmd)) + goto again; + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); -- cgit v1.2.3 From 7571966189e54adf0a8bc1384d6f13f44052ba63 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 22 Mar 2011 16:33:02 -0700 Subject: mempolicy: remove redundant check in __mpol_equal() The 'flags' field is already checked, no need to do it again. Signed-off-by: Namhyung Kim Cc: Bob Liu Cc: Lee Schermerhorn Reviewed-by: Minchan Kim Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78062ab641ff..959a8b8c7350 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1979,8 +1979,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - return a->v.preferred_node == b->v.preferred_node && - a->flags == b->flags; + return a->v.preferred_node == b->v.preferred_node; default: BUG(); return 0; -- cgit v1.2.3 From 8afdcece4911e51cfff2b50a269418914cab8a3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 22 Mar 2011 16:33:04 -0700 Subject: mm: vmscan: kswapd should not free an excessive number of pages when balancing small zones When reclaiming for order-0 pages, kswapd requires that all zones be balanced. Each cycle through balance_pgdat() does background ageing on all zones if necessary and applies equal pressure on the inactive zone unless a lot of pages are free already. A "lot of free pages" is defined as a "balance gap" above the high watermark which is currently 7*high_watermark. Historically this was reasonable as min_free_kbytes was small. However, on systems using huge pages, it is recommended that min_free_kbytes is higher and it is tuned with hugeadm --set-recommended-min_free_kbytes. With the introduction of transparent huge page support, this recommended value is also applied. On X86-64 with 4G of memory, min_free_kbytes becomes 67584 so one would expect around 68M of memory to be free. The Normal zone is approximately 35000 pages so under even normal memory pressure such as copying a large file, it gets exhausted quickly. As it is getting exhausted, kswapd applies pressure equally to all zones, including the DMA32 zone. DMA32 is approximately 700,000 pages with a high watermark of around 23,000 pages. In this situation, kswapd will reclaim around (23000*8 where 8 is the high watermark + balance gap of 7 * high watermark) pages or 718M of pages before the zone is ignored. What the user sees is that free memory far higher than it should be. To avoid an excessive number of pages being reclaimed from the larger zones, explicitely defines the "balance gap" to be either 1% of the zone or the low watermark for the zone, whichever is smaller. While kswapd will check all zones to apply pressure, it'll ignore zones that meets the (high_wmark + balance_gap) watermark. To test this, 80G were copied from a partition and the amount of memory being used was recorded. A comparison of a patch and unpatched kernel can be seen at http://www.csn.ul.ie/~mel/postings/minfree-20110222/memory-usage-hydra.ps and shows that kswapd is not reclaiming as much memory with the patch applied. Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Shaohua Li Cc: "Chen, Tim C" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 665b090b6c72..060e4c191403 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2399,6 +2399,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2415,11 +2416,20 @@ loop_again: mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); /* - * We put equal pressure on every zone, unless one - * zone has way too many pages free already. + * We put equal pressure on every zone, unless + * one zone has way too many pages free + * already. The "too many pages" is defined + * as the high wmark plus a "gap" where the + * gap is either the low watermark or 1% + * of the zone, whichever is smaller. */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); if (!zone_watermark_ok_safe(zone, order, - 8*high_wmark_pages(zone), end_zone, 0)) + high_wmark_pages(zone) + balance_gap, + end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, -- cgit v1.2.3 From c033a93c0d961fc7ec5b0872649143e061d97dd4 Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Tue, 22 Mar 2011 16:33:05 -0700 Subject: hugetlbfs: correct handling of negative input to /proc/sys/vm/nr_hugepages When the user inserts a negative value into /proc/sys/vm/nr_hugepages it will cause the kernel to allocate as many hugepages as possible and to then update /proc/meminfo to reflect this. This changes the behavior so that the negative input will result in nr_hugepages value being unchanged. Signed-off-by: Petr Holasek Signed-off-by: Anton Arapov Reviewed-by: Naoya Horiguchi Acked-by: David Rientjes Acked-by: Mel Gorman Acked-by: Eric B Munson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb0b7c128015..06de5aa4d644 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1872,8 +1872,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; - if (!write) - tmp = h->max_huge_pages; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) return -EINVAL; @@ -1938,8 +1937,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; - if (!write) - tmp = h->nr_overcommit_huge_pages; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) return -EINVAL; -- cgit v1.2.3 From 9d8aa4ea855e0d64bb6926acb5618e6d1e2ed344 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Mar 2011 16:33:06 -0700 Subject: mm: remove worrying dead code from find_get_pages() The radix_tree_deref_retry() case in find_get_pages() has a strange little excrescence, not seen in the other gang lookups: it looks like the start of an abandoned attempt to guarantee forward progress in a case that cannot arise. ret should always be 0 here: if it isn't, then going back to restart will leak references to pages already gotten. There used to be a comment saying nr_found is necessarily 1 here: that's not quite true, but the radix_tree_deref_retry() case is peculiar to the entry at index 0, when we race with it being moved out of the radix_tree root or back. Remove the worrisome two lines, add a brief comment here and in find_get_pages_contig() and find_get_pages_tag(), and a WARN_ON in find_get_pages() should it ever be seen elsewhere than at 0. Signed-off-by: Hugh Dickins Cc: Nick Piggin Acked-by: Peter Zijlstra Cc: Wu Fengguang Cc: Salman Qazi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index cb476e70cf19..a29318147365 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -863,9 +863,13 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; + + /* + * This can only trigger when the entry at index 0 moves out + * of or back to the root: none yet gotten, safe to restart. + */ if (radix_tree_deref_retry(page)) { - if (ret) - start = pages[ret-1]->index; + WARN_ON(start | i); goto restart; } @@ -915,6 +919,11 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; + + /* + * This can only trigger when the entry at index 0 moves out + * of or back to the root: none yet gotten, safe to restart. + */ if (radix_tree_deref_retry(page)) goto restart; @@ -975,6 +984,11 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; + + /* + * This can only trigger when the entry at index 0 moves out + * of or back to the root: none yet gotten, safe to restart. + */ if (radix_tree_deref_retry(page)) goto restart; -- cgit v1.2.3 From 5b280c0cc70062967bb9d630b216375b18db3a0b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Mar 2011 16:33:07 -0700 Subject: mm: don't return 0 too early from find_get_pages() Callers of find_get_pages(), or its wrapper pagevec_lookup() - notably truncate_inode_pages_range() - stop looking further when it returns 0. But if an interrupt comes just after its radix_tree_gang_lookup_slot(), especially if we have preemptible RCU enabled, isn't it conceivable that all 14 pages returned could be removed from the page cache by shrink_page_list(), before find_get_pages() gets to process them? So causing it to return 0 although there may be plenty more pages beyond. Make find_get_pages() and find_get_pages_tag() check for this unlikely case, and restart should it occur; but callers of find_get_pages_contig() have no such expectation, it's okay for that to return 0 early. I have not seen this in practice, just worried by the possibility. Signed-off-by: Hugh Dickins Cc: Nick Piggin Acked-by: Peter Zijlstra Cc: Wu Fengguang Cc: Salman Qazi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index a29318147365..f807afda86f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -885,6 +885,13 @@ repeat: pages[ret] = page; ret++; } + + /* + * If all entries were removed before we could secure them, + * try again, because callers stop trying once 0 is returned. + */ + if (unlikely(!ret && nr_found)) + goto restart; rcu_read_unlock(); return ret; } @@ -1004,6 +1011,13 @@ repeat: pages[ret] = page; ret++; } + + /* + * If all entries were removed before we could secure them, + * try again, because callers stop trying once 0 is returned. + */ + if (unlikely(!ret && nr_found)) + goto restart; rcu_read_unlock(); if (ret) -- cgit v1.2.3 From 602605a42ea4c299aeed4d806c49fb9dd18cd204 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 22 Mar 2011 16:33:08 -0700 Subject: mm: compaction: minimise the time IRQs are disabled while isolating free pages compaction_alloc() isolates free pages to be used as migration targets. While its scanning, IRQs are disabled on the mistaken assumption the scanning should be short. Analysis showed that IRQs were in fact being disabled for substantial time. A simple test was run using large anonymous mappings with transparent hugepage support enabled to trigger frequent compactions. A monitor sampled what the worst IRQ-off latencies were and a post-processing tool found the following; Total sampled time IRQs off (not real total time): 22355 Event compaction_alloc..compaction_alloc 8409 us count 1 Event compaction_alloc..compaction_alloc 7341 us count 1 Event compaction_alloc..compaction_alloc 2463 us count 1 Event compaction_alloc..compaction_alloc 2054 us count 1 Event shrink_inactive_list..shrink_zone 1864 us count 1 Event shrink_inactive_list..shrink_zone 88 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __make_request..__blk_run_queue 24 us count 1 Event __alloc_pages_nodemask..__alloc_pages_nodemask 6 us count 1 i.e. compaction is disabled IRQs for a prolonged period of time - 8ms in one instance. The full report generated by the tool can be found at http://www.csn.ul.ie/~mel/postings/minfree-20110225/irqsoff-vanilla-micro.report This patch reduces the time IRQs are disabled by simply disabling IRQs at the last possible minute. An updated IRQs-off summary report then looks like; Total sampled time IRQs off (not real total time): 5493 Event shrink_inactive_list..shrink_zone 1596 us count 1 Event shrink_inactive_list..shrink_zone 1530 us count 1 Event shrink_inactive_list..shrink_zone 956 us count 1 Event shrink_inactive_list..shrink_zone 541 us count 1 Event shrink_inactive_list..shrink_zone 531 us count 1 Event split_huge_page..add_to_swap 232 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __wake_up..__wake_up 1 us count 1 A full report is again available at http://www.csn.ul.ie/~mel/postings/minfree-20110225/irqsoff-minimiseirq-free-v1r4-micro.report As should be obvious, IRQ disabled latencies due to compaction are almost elimimnated for this particular test. [aarcange@redhat.com: Fix initialisation of isolated] Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Acked-by: Andrea Arcangeli Cc: Arthur Marsh Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 38ce48805c08..b27802e04b91 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -153,7 +153,6 @@ static void isolate_freepages(struct zone *zone, * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - spin_lock_irqsave(&zone->lock, flags); for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; @@ -176,9 +175,19 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* Found a block suitable for isolating free pages from */ - isolated = isolate_freepages_block(zone, pfn, freelist); - nr_freepages += isolated; + /* + * Found a block suitable for isolating free pages from. Now + * we disabled interrupts, double check things are ok and + * isolate the pages. This is to minimise the time IRQs + * are disabled + */ + isolated = 0; + spin_lock_irqsave(&zone->lock, flags); + if (suitable_migration_target(page)) { + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + } + spin_unlock_irqrestore(&zone->lock, flags); /* * Record the highest PFN we isolated pages from. When next @@ -188,7 +197,6 @@ static void isolate_freepages(struct zone *zone, if (isolated) high_pfn = max(high_pfn, pfn); } - spin_unlock_irqrestore(&zone->lock, flags); /* split_free_page does not map the pages */ list_for_each_entry(page, freelist, lru) { -- cgit v1.2.3 From b2eef8c0d09101bbbff2531c097543aedde0b525 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Mar 2011 16:33:10 -0700 Subject: mm: compaction: minimise the time IRQs are disabled while isolating pages for migration compaction_alloc() isolates pages for migration in isolate_migratepages. While it's scanning, IRQs are disabled on the mistaken assumption the scanning should be short. Tests show this to be true for the most part but contention times on the LRU lock can be increased. Before this patch, the IRQ disabled times for a simple test looked like Total sampled time IRQs off (not real total time): 5493 Event shrink_inactive_list..shrink_zone 1596 us count 1 Event shrink_inactive_list..shrink_zone 1530 us count 1 Event shrink_inactive_list..shrink_zone 956 us count 1 Event shrink_inactive_list..shrink_zone 541 us count 1 Event shrink_inactive_list..shrink_zone 531 us count 1 Event split_huge_page..add_to_swap 232 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __wake_up..__wake_up 1 us count 1 This patch reduces the worst-case IRQs-disabled latencies by releasing the lock every SWAP_CLUSTER_MAX pages that are scanned and releasing the CPU if necessary. The cost of this is that the processing performing compaction will be slower but IRQs being disabled for too long a time has worse consequences as the following report shows; Total sampled time IRQs off (not real total time): 4367 Event shrink_inactive_list..shrink_zone 881 us count 1 Event shrink_inactive_list..shrink_zone 875 us count 1 Event shrink_inactive_list..shrink_zone 868 us count 1 Event shrink_inactive_list..shrink_zone 555 us count 1 Event split_huge_page..add_to_swap 495 us count 1 Event compact_zone..compact_zone_order 269 us count 1 Event split_huge_page..add_to_swap 266 us count 1 Event shrink_inactive_list..shrink_zone 85 us count 1 Event save_args..call_softirq 36 us count 2 Event __wake_up..__wake_up 1 us count 1 [akpm@linux-foundation.org: simplify with s/unlocked/locked/] Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Arthur Marsh Cc: Clemens Ladisch Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b27802e04b91..021a2960ef9e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -277,9 +277,27 @@ static unsigned long isolate_migratepages(struct zone *zone, } /* Time to isolate some pages for migration */ + cond_resched(); spin_lock_irq(&zone->lru_lock); for (; low_pfn < end_pfn; low_pfn++) { struct page *page; + bool locked = true; + + /* give a chance to irqs before checking need_resched() */ + if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { + spin_unlock_irq(&zone->lru_lock); + locked = false; + } + if (need_resched() || spin_is_contended(&zone->lru_lock)) { + if (locked) + spin_unlock_irq(&zone->lru_lock); + cond_resched(); + spin_lock_irq(&zone->lru_lock); + if (fatal_signal_pending(current)) + break; + } else if (!locked) + spin_lock_irq(&zone->lru_lock); + if (!pfn_valid_within(low_pfn)) continue; nr_scanned++; -- cgit v1.2.3 From 11bc82d67d1150767901bca54a24466621d763d7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Mar 2011 16:33:11 -0700 Subject: mm: compaction: Use async migration for __GFP_NO_KSWAPD and enforce no writeback __GFP_NO_KSWAPD allocations are usually very expensive and not mandatory to succeed as they have graceful fallback. Waiting for I/O in those, tends to be overkill in terms of latencies, so we can reduce their latency by disabling sync migrate. Unfortunately, even with async migration it's still possible for the process to be blocked waiting for a request slot (e.g. get_request_wait in the block layer) when ->writepage is called. To prevent __GFP_NO_KSWAPD blocking, this patch prevents ->writepage being called on dirty page cache for asynchronous migration. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=31142 [mel@csn.ul.ie: Avoid writebacks for NFS, retry locked pages, use bool] Signed-off-by: Andrea Arcangeli Signed-off-by: Mel Gorman Cc: Arthur Marsh Cc: Clemens Ladisch Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Reported-by: Alex Villacis Lasso Tested-by: Alex Villacis Lasso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 48 +++++++++++++++++++++++++++++++++--------------- mm/page_alloc.c | 2 +- 2 files changed, 34 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 7d2983f3783e..89e5c3fe8bbc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -564,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache) + int remap_swapcache, bool sync) { struct address_space *mapping; int rc; @@ -586,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page); - else if (mapping->a_ops->migratepage) + else { /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. + * Do not writeback pages if !sync and migratepage is + * not pointing to migrate_page() which is nonblocking + * (swapcache/tmpfs uses migratepage = migrate_page). */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); + if (PageDirty(page) && !sync && + mapping->a_ops->migratepage != migrate_page) + rc = -EBUSY; + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + } if (rc) { newpage->mapping = NULL; @@ -641,7 +651,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, rc = -EAGAIN; if (!trylock_page(page)) { - if (!force) + if (!force || !sync) goto move_newpage; /* @@ -686,7 +696,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force || !sync) + /* + * For !sync, there is no point retrying as the retry loop + * is expected to be too short for PageWriteback to be cleared + */ + if (!sync) { + rc = -EBUSY; + goto uncharge; + } + if (!force) goto uncharge; wait_on_page_writeback(page); } @@ -757,7 +775,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache); + rc = move_to_new_page(newpage, page, remap_swapcache, sync); if (rc && remap_swapcache) remove_migration_ptes(page, page); @@ -850,7 +868,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1); + rc = move_to_new_page(new_hpage, hpage, 1, sync); if (rc) remove_migration_ptes(hpage, hpage); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 426056aff12a..6d0032bdb5d8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2103,7 +2103,7 @@ rebalance: sync_migration); if (page) goto got_pg; - sync_migration = true; + sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, -- cgit v1.2.3 From 78afd5612deb8268bafc8b6507d72341d5ed9aac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 22 Mar 2011 16:33:12 -0700 Subject: mm: add __GFP_OTHER_NODE flag Add a new __GFP_OTHER_NODE flag to tell the low level numa statistics in zone_statistics() that an allocation is on behalf of another thread. This way the local and remote counters can be still correct, even when background daemons like khugepaged are changing memory mappings. This only affects the accounting, but I think it's worth doing that right to avoid confusing users. I first tried to just pass down the right node, but this required a lot of changes to pass down this parameter and at least one addition of a 10th argument to a 9 argument function. Using the flag is a lot less intrusive. Open: should be also used for migration? [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andi Kleen Cc: Andrea Arcangeli Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/vmstat.c | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d0032bdb5d8..136a547262a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1337,7 +1337,7 @@ again: } __count_zone_vm_events(PGALLOC, zone, 1 << order); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); VM_BUG_ON(bad_range(zone, page)); diff --git a/mm/vmstat.c b/mm/vmstat.c index 0c3b5048773e..772b39b87d95 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -500,8 +500,12 @@ void refresh_cpu_vm_stats(int cpu) * z = the zone from which the allocation occurred. * * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. */ -void zone_statistics(struct zone *preferred_zone, struct zone *z) +void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) { if (z->zone_pgdat == preferred_zone->zone_pgdat) { __inc_zone_state(z, NUMA_HIT); @@ -509,7 +513,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } - if (z->node == numa_node_id()) + if (z->node == ((flags & __GFP_OTHER_NODE) ? + preferred_zone->node : numa_node_id())) __inc_zone_state(z, NUMA_LOCAL); else __inc_zone_state(z, NUMA_OTHER); -- cgit v1.2.3 From cc5d462f7777c06c5cf0b55d736be325cda747b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 22 Mar 2011 16:33:13 -0700 Subject: mm: use __GFP_OTHER_NODE for transparent huge pages Pass __GFP_OTHER_NODE for transparent hugepages NUMA allocations done by the hugepages daemon. This way the low level accounting for local versus remote pages works correctly. Contains improvements from Andrea Arcangeli [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andi Kleen Cc: Andrea Arcangeli Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 113e35c47502..0a619e0e2e0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -643,23 +643,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return ret; } -static inline gfp_t alloc_hugepage_gfpmask(int defrag) +static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr, int nd) + unsigned long haddr, int nd, + gfp_t extra_gfp) { - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(alloc_hugepage_gfpmask(defrag), + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); } #endif @@ -678,7 +679,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id()); + vma, haddr, numa_node_id(), 0); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -799,7 +800,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | + __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, @@ -902,7 +904,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id()); + vma, haddr, numa_node_id(), 0); else new_page = NULL; @@ -1779,7 +1781,7 @@ static void collapse_huge_page(struct mm_struct *mm, * scalability. */ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node); + node, __GFP_OTHER_NODE); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); *hpage = ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 803d0c8351b47b72b8b018457a33b342557b90a2 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:14 -0700 Subject: sys_swapon: use vzalloc() instead of vmalloc/memset This patch series refactors the sys_swapon function. sys_swapon is currently a very large function, with 313 lines (more than 12 25-line screens), which can make it a bit hard to read. This patch series reduces this size by half, by extracting large chunks of related code to new helper functions. One of these chunks of code was nearly identical to the part of sys_swapoff which is used in case of a failure return from try_to_unuse(), so this patch series also makes both share the same code. As a side effect of all this refactoring, the compiled code gets a bit smaller (from v1 of this patch series): text data bss dec hex filename 14012 944 276 15232 3b80 mm/swapfile.o.before 13941 944 276 15161 3b39 mm/swapfile.o.after This patch: Use vzalloc() instead of vmalloc/memset. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: Jesper Juhl Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6d6d28c0a72f..99eb5048b7a8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2047,13 +2047,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages); + swap_map = vzalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap; } - memset(swap_map, 0, maxpages); nr_good_pages = maxpages - 1; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { -- cgit v1.2.3 From 80b0df12b808bf8e8391afae1b43f5e529f76d89 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:15 -0700 Subject: sys_swapon: remove changelog from function comment Changelogs belong in the git history instead of in the source code. Also, "The swapon system call" is redundant with "SYSCALL_DEFINE2(swapon, ...)". Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: Jesper Juhl Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton [ Gaah. That's a _historical_ comment. But the patch-series depends on removal ] Signed-off-by: Linus Torvalds --- mm/swapfile.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 99eb5048b7a8..971464470938 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1844,11 +1844,6 @@ static int __init max_swapfiles_check(void) late_initcall(max_swapfiles_check); #endif -/* - * Written 01/25/92 by Simmule Turner, heavily changed by Linus. - * - * The swapon system call - */ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; -- cgit v1.2.3 From e8e6c2ec403ecfaa226857d8204344c98fe12b7b Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:16 -0700 Subject: sys_swapon: do not depend on "type" after allocation Within sys_swapon, after the swap_info entry has been allocated, we always have type == p->type and swap_info[type] == p. Use this fact to reduce the dependency on the "type" local variable within the function, as a preparation to move the allocation of the swap_info entry to a separate function. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 971464470938..5238d8d15d78 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1927,7 +1927,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) for (i = 0; i < nr_swapfiles; i++) { struct swap_info_struct *q = swap_info[i]; - if (i == type || !q->swap_file) + if (q == p || !q->swap_file) continue; if (mapping == q->swap_file->f_mapping) goto bad_swap; @@ -2062,7 +2062,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = swap_cgroup_swapon(type, maxpages); + error = swap_cgroup_swapon(p->type, maxpages); if (error) goto bad_swap; @@ -2120,9 +2120,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } p->next = i; if (prev < 0) - swap_list.head = swap_list.next = type; + swap_list.head = swap_list.next = p->type; else - swap_info[prev]->next = type; + swap_info[prev]->next = p->type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -2136,7 +2136,7 @@ bad_swap: blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); - swap_cgroup_swapoff(type); + swap_cgroup_swapoff(p->type); bad_swap_2: spin_lock(&swap_lock); p->swap_file = NULL; -- cgit v1.2.3 From 53cbb2435f161f2a8b36af8f6d2c46dc59d0d757 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:17 -0700 Subject: sys_swapon: separate swap_info allocation Move the swap_info allocation to its own function. Only code movement, no functional changes. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 57 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 5238d8d15d78..6d1c3c67ae65 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1844,33 +1844,15 @@ static int __init max_swapfiles_check(void) late_initcall(max_swapfiles_check); #endif -SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; - char *name = NULL; - struct block_device *bdev = NULL; - struct file *swap_file = NULL; - struct address_space *mapping; unsigned int type; - int i, prev; int error; - union swap_header *swap_header; - unsigned int nr_good_pages; - int nr_extents = 0; - sector_t span; - unsigned long maxpages; - unsigned long swapfilepages; - unsigned char *swap_map = NULL; - struct page *page = NULL; - struct inode *inode = NULL; - int did_down = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { @@ -1906,6 +1888,41 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->next = -1; spin_unlock(&swap_lock); + return p; + +out: + return ERR_PTR(error); +} + +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +{ + struct swap_info_struct *p; + char *name = NULL; + struct block_device *bdev = NULL; + struct file *swap_file = NULL; + struct address_space *mapping; + int i, prev; + int error; + union swap_header *swap_header; + unsigned int nr_good_pages; + int nr_extents = 0; + sector_t span; + unsigned long maxpages; + unsigned long swapfilepages; + unsigned char *swap_map = NULL; + struct page *page = NULL; + struct inode *inode = NULL; + int did_down = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = alloc_swap_info(); + if (IS_ERR(p)) { + error = PTR_ERR(p); + goto out; + } + name = getname(specialfile); error = PTR_ERR(name); if (IS_ERR(name)) { -- cgit v1.2.3 From 2542e5134d2c19a9e6a4e641ef78cac6bccebd9b Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:18 -0700 Subject: sys_swapon: simplify error return from swap_info allocation At this point in sys_swapon, there is nothing to free. Return directly instead of jumping to the cleanup block at the end of the function. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6d1c3c67ae65..4d457d699c1f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1918,10 +1918,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) return -EPERM; p = alloc_swap_info(); - if (IS_ERR(p)) { - error = PTR_ERR(p); - goto out; - } + if (IS_ERR(p)) + return PTR_ERR(p); name = getname(specialfile); error = PTR_ERR(name); -- cgit v1.2.3 From 730c0581c82dbc5be2f41a2d85bde6bad11bc8a4 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:19 -0700 Subject: sys_swapon: simplify error flow in alloc_swap_info() Since there is no cleanup to do, there is no reason to jump to a label. Return directly instead. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 4d457d699c1f..4d89c4c9336c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1848,7 +1848,6 @@ static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; unsigned int type; - int error; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) @@ -1859,11 +1858,10 @@ static struct swap_info_struct *alloc_swap_info(void) if (!(swap_info[type]->flags & SWP_USED)) break; } - error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); kfree(p); - goto out; + return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; @@ -1889,9 +1887,6 @@ static struct swap_info_struct *alloc_swap_info(void) spin_unlock(&swap_lock); return p; - -out: - return ERR_PTR(error); } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) -- cgit v1.2.3 From 28b36bd741bd44db30e12b1048a659aa346e9b76 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:20 -0700 Subject: sys_swapon: remove initial value of name variable Now there is nothing which jumps to the cleanup blocks before the name variable is set. There is no need to set it initially to NULL anymore. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 4d89c4c9336c..6f25ece88c4a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1892,7 +1892,7 @@ static struct swap_info_struct *alloc_swap_info(void) SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; - char *name = NULL; + char *name; struct block_device *bdev = NULL; struct file *swap_file = NULL; struct address_space *mapping; -- cgit v1.2.3 From 83ef99befc324803a54cf2a5fab5a322df3a99d6 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:21 -0700 Subject: sys_swapon: remove did_down variable Since mutex_lock(&inode->i_mutex) is called just after setting inode, did_down is always equivalent to (inode && S_ISREG(inode->i_mode)). Use this fact to remove the did_down variable. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6f25ece88c4a..b261e55e0b82 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1907,7 +1907,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned char *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; - int did_down = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1962,7 +1961,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; mutex_lock(&inode->i_mutex); - did_down = 1; if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap; @@ -2154,10 +2152,8 @@ bad_swap_2: spin_unlock(&swap_lock); vfree(swap_map); if (swap_file) { - if (did_down) { + if (inode && S_ISREG(inode->i_mode)) mutex_unlock(&inode->i_mutex); - did_down = 0; - } filp_close(swap_file, NULL); } out: @@ -2167,7 +2163,7 @@ out: } if (name) putname(name); - if (did_down) { + if (inode && S_ISREG(inode->i_mode)) { if (!error) inode->i_flags |= S_SWAPFILE; mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 7de7fb6b3422e6d1dac00666a992834085e745dc Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:22 -0700 Subject: sys_swapon: move setting of error nearer use Move the setting of the error variable nearer the goto in a few places. Avoids calling PTR_ERR() if not IS_ERR() in two places, and makes the error condition more explicit in two other places. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Jesper Juhl Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index b261e55e0b82..e4dc94996c58 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1916,14 +1916,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) return PTR_ERR(p); name = getname(specialfile); - error = PTR_ERR(name); if (IS_ERR(name)) { + error = PTR_ERR(name); name = NULL; goto bad_swap_2; } swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); - error = PTR_ERR(swap_file); if (IS_ERR(swap_file)) { + error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap_2; } @@ -1932,17 +1932,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - error = -EBUSY; for (i = 0; i < nr_swapfiles; i++) { struct swap_info_struct *q = swap_info[i]; if (q == p || !q->swap_file) continue; - if (mapping == q->swap_file->f_mapping) + if (mapping == q->swap_file->f_mapping) { + error = -EBUSY; goto bad_swap; + } } - error = -EINVAL; if (S_ISBLK(inode->i_mode)) { bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, @@ -1966,6 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } } else { + error = -EINVAL; goto bad_swap; } -- cgit v1.2.3 From f2090d2df51d7cdb2f952dcfdcd8baaac0aec444 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:23 -0700 Subject: sys_swapon: remove bdev variable The bdev variable is always equivalent to (S_ISBLK(inode->i_mode) ? p->bdev : NULL), as long as it being set is moved to a bit earlier. Use this fact to remove the bdev variable. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e4dc94996c58..a314d42c0fa5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1893,7 +1893,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; char *name; - struct block_device *bdev = NULL; struct file *swap_file = NULL; struct address_space *mapping; int i, prev; @@ -1944,19 +1943,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } if (S_ISBLK(inode->i_mode)) { - bdev = bdgrab(I_BDEV(inode)); - error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, + p->bdev = bdgrab(I_BDEV(inode)); + error = blkdev_get(p->bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { - bdev = NULL; + p->bdev = NULL; error = -EINVAL; goto bad_swap; } - p->old_block_size = block_size(bdev); - error = set_blocksize(bdev, PAGE_SIZE); + p->old_block_size = block_size(p->bdev); + error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) goto bad_swap; - p->bdev = bdev; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; @@ -2140,9 +2139,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; bad_swap: - if (bdev) { - set_blocksize(bdev, p->old_block_size); - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (S_ISBLK(inode->i_mode) && p->bdev) { + set_blocksize(p->bdev, p->old_block_size); + blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); swap_cgroup_swapoff(p->type); -- cgit v1.2.3 From 9b01c350af4fb00fe2ab66ff9bf16058c50b69bd Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:24 -0700 Subject: sys_swapon: do only cleanup in the cleanup blocks The only way error is 0 in the cleanup blocks is when the function is returning successfully. In this case, the cleanup blocks were setting S_SWAPFILE in the S_ISREG case. But this is not a cleanup. Move the setting of S_SWAPFILE to just before the "goto out;" to make this more clear. At this point, we do not need to test for inode because it will never be NULL. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index a314d42c0fa5..e356e5e70313 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2136,6 +2136,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); + if (S_ISREG(inode->i_mode)) + inode->i_flags |= S_SWAPFILE; error = 0; goto out; bad_swap: @@ -2163,11 +2165,8 @@ out: } if (name) putname(name); - if (inode && S_ISREG(inode->i_mode)) { - if (!error) - inode->i_flags |= S_SWAPFILE; + if (inode && S_ISREG(inode->i_mode)) mutex_unlock(&inode->i_mutex); - } return error; } -- cgit v1.2.3 From bd69010b042a60ca41a890df1b10019e94746c2f Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:25 -0700 Subject: sys_swapon: use a single error label sys_swapon currently has two error labels, bad_swap and bad_swap_2. bad_swap does the same as bad_swap_2 plus destroy_swap_extents() and swap_cgroup_swapoff(); both are noops in the places where bad_swap_2 is jumped to. With a single extra test for inode (matching the one in the S_ISREG case below), all the error paths in the function can go to bad_swap. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e356e5e70313..14590775212d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1918,13 +1918,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (IS_ERR(name)) { error = PTR_ERR(name); name = NULL; - goto bad_swap_2; + goto bad_swap; } swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; - goto bad_swap_2; + goto bad_swap; } p->swap_file = swap_file; @@ -2141,13 +2141,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; bad_swap: - if (S_ISBLK(inode->i_mode) && p->bdev) { + if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); swap_cgroup_swapoff(p->type); -bad_swap_2: spin_lock(&swap_lock); p->swap_file = NULL; p->flags = 0; -- cgit v1.2.3 From 4d0e1e10752ca487d83d1ab2a1d4ae6d664e1cc0 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:26 -0700 Subject: sys_swapon: separate bdev claim and inode lock Move the code which claims the bdev (S_ISBLK) or locks the inode (S_ISREG) to a separate function. Only code movement, no functional changes. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 64 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 14590775212d..fc687b234eb5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1889,6 +1889,43 @@ static struct swap_info_struct *alloc_swap_info(void) return p; } +static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) +{ + int error; + + if (S_ISBLK(inode->i_mode)) { + p->bdev = bdgrab(I_BDEV(inode)); + error = blkdev_get(p->bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); + if (error < 0) { + p->bdev = NULL; + error = -EINVAL; + goto bad_swap; + } + p->old_block_size = block_size(p->bdev); + error = set_blocksize(p->bdev, PAGE_SIZE); + if (error < 0) + goto bad_swap; + p->flags |= SWP_BLKDEV; + } else if (S_ISREG(inode->i_mode)) { + p->bdev = inode->i_sb->s_bdev; + mutex_lock(&inode->i_mutex); + if (IS_SWAPFILE(inode)) { + error = -EBUSY; + goto bad_swap; + } + } else { + error = -EINVAL; + goto bad_swap; + } + + return 0; + +bad_swap: + return error; +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -1942,32 +1979,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - if (S_ISBLK(inode->i_mode)) { - p->bdev = bdgrab(I_BDEV(inode)); - error = blkdev_get(p->bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - sys_swapon); - if (error < 0) { - p->bdev = NULL; - error = -EINVAL; - goto bad_swap; - } - p->old_block_size = block_size(p->bdev); - error = set_blocksize(p->bdev, PAGE_SIZE); - if (error < 0) - goto bad_swap; - p->flags |= SWP_BLKDEV; - } else if (S_ISREG(inode->i_mode)) { - p->bdev = inode->i_sb->s_bdev; - mutex_lock(&inode->i_mutex); - if (IS_SWAPFILE(inode)) { - error = -EBUSY; - goto bad_swap; - } - } else { - error = -EINVAL; + error = claim_swapfile(p, inode); + if (unlikely(error)) goto bad_swap; - } swapfilepages = i_size_read(inode) >> PAGE_SHIFT; -- cgit v1.2.3 From 87ade72a799a9a895b4a60918c32b43d3dfc3324 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:27 -0700 Subject: sys_swapon: simplify error flow in claim_swapfile() Since there is no cleanup to do, there is no reason to jump to a label. Return directly instead. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index fc687b234eb5..61a604c12a96 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1900,30 +1900,22 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) sys_swapon); if (error < 0) { p->bdev = NULL; - error = -EINVAL; - goto bad_swap; + return -EINVAL; } p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) - goto bad_swap; + return error; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; mutex_lock(&inode->i_mutex); - if (IS_SWAPFILE(inode)) { - error = -EBUSY; - goto bad_swap; - } - } else { - error = -EINVAL; - goto bad_swap; - } + if (IS_SWAPFILE(inode)) + return -EBUSY; + } else + return -EINVAL; return 0; - -bad_swap: - return error; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) -- cgit v1.2.3 From 5de771e41f0fc2243c39585357b73f0ff757b280 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:28 -0700 Subject: sys_swapon: move setting of swapfilepages near use There is no reason I can see to read inode->i_size long before it is needed. Move its read to just before it is needed, to reduce the variable lifetime. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Jesper Juhl Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 61a604c12a96..4dade515b086 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1975,8 +1975,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (unlikely(error)) goto bad_swap; - swapfilepages = i_size_read(inode) >> PAGE_SHIFT; - /* * Read the swap header. */ @@ -2045,6 +2043,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (!maxpages) goto bad_swap; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); -- cgit v1.2.3 From ca8bd38bf6f05481c4155fc444178151884f65d0 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:29 -0700 Subject: sys_swapon: separate parsing of swapfile header Move the code which parses and checks the swapfile header (except for the bad block list) to a separate function. Only code movement, no functional changes. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 140 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 78 insertions(+), 62 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 4dade515b086..5e13bff1764c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1918,6 +1918,82 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) return 0; } +static unsigned long read_swap_header(struct swap_info_struct *p, + union swap_header *swap_header, + struct inode *inode) +{ + int i; + unsigned long maxpages; + unsigned long swapfilepages; + + if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { + printk(KERN_ERR "Unable to find swap-space signature\n"); + goto bad_swap; + } + + /* swap partition endianess hack... */ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } + /* Check the swap header's sub-version */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); + goto bad_swap; + } + + p->lowest_bit = 1; + p->cluster_next = 1; + p->cluster_nr = 0; + + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number of + * bits for the swap offset in the swp_entry_t type and + * 2) the number of bits in the a swap pte as defined by + * the different architectures. In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + if (maxpages > swap_header->info.last_page) { + maxpages = swap_header->info.last_page + 1; + /* p->max is an unsigned int: don't overflow it */ + if ((unsigned int)maxpages == 0) + maxpages = UINT_MAX; + } + p->highest_bit = maxpages - 1; + + if (!maxpages) + goto bad_swap; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; + if (swapfilepages && maxpages > swapfilepages) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + goto bad_swap; + } + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; + + return maxpages; + +bad_swap: + return 0; +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -1931,7 +2007,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int nr_extents = 0; sector_t span; unsigned long maxpages; - unsigned long swapfilepages; unsigned char *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -1989,71 +2064,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } swap_header = kmap(page); - if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { - printk(KERN_ERR "Unable to find swap-space signature\n"); + maxpages = read_swap_header(p, swap_header, inode); + if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap; } - /* swap partition endianess hack... */ - if (swab32(swap_header->info.version) == 1) { - swab32s(&swap_header->info.version); - swab32s(&swap_header->info.last_page); - swab32s(&swap_header->info.nr_badpages); - for (i = 0; i < swap_header->info.nr_badpages; i++) - swab32s(&swap_header->info.badpages[i]); - } - /* Check the swap header's sub-version */ - if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); - error = -EINVAL; - goto bad_swap; - } - - p->lowest_bit = 1; - p->cluster_next = 1; - p->cluster_nr = 0; - - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number of - * bits for the swap offset in the swp_entry_t type and - * 2) the number of bits in the a swap pte as defined by - * the different architectures. In order to find the - * largest possible bit mask a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again and finally the swap - * offset is extracted. This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. - */ - maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; - if (maxpages > swap_header->info.last_page) { - maxpages = swap_header->info.last_page + 1; - /* p->max is an unsigned int: don't overflow it */ - if ((unsigned int)maxpages == 0) - maxpages = UINT_MAX; - } - p->highest_bit = maxpages - 1; - - error = -EINVAL; - if (!maxpages) - goto bad_swap; - swapfilepages = i_size_read(inode) >> PAGE_SHIFT; - if (swapfilepages && maxpages > swapfilepages) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - goto bad_swap; - } - if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; - if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; - /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { -- cgit v1.2.3 From 38719025384cf7121331bd6d41c062d3c5f7bb91 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:30 -0700 Subject: sys_swapon: simplify error flow in read_swap_header() Since there is no cleanup to do, there is no reason to jump to a label. Return directly instead. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 5e13bff1764c..058a9dfefefd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1928,7 +1928,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); - goto bad_swap; + return 0; } /* swap partition endianess hack... */ @@ -1944,7 +1944,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, printk(KERN_WARNING "Unable to handle swap header version %d\n", swap_header->info.version); - goto bad_swap; + return 0; } p->lowest_bit = 1; @@ -1976,22 +1976,19 @@ static unsigned long read_swap_header(struct swap_info_struct *p, p->highest_bit = maxpages - 1; if (!maxpages) - goto bad_swap; + return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); - goto bad_swap; + return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; + return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + return 0; return maxpages; - -bad_swap: - return 0; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) -- cgit v1.2.3 From 1421ef3cd15b87ef949e965efeb1e527479d3f75 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:31 -0700 Subject: sys_swapon: call swap_cgroup_swapon() earlier The call to swap_cgroup_swapon is in the middle of loading the swap map and extents. As it only does memory allocation and does not depend on the swapfile layout (map/extents), it can be called earlier (or later). Move it to just after the allocation of swap_map, since it is conceptually similar (allocates a map). Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 058a9dfefefd..10f2b33805f6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2074,6 +2074,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } + error = swap_cgroup_swapon(p->type, maxpages); + if (error) + goto bad_swap; + nr_good_pages = maxpages - 1; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { @@ -2088,10 +2092,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = swap_cgroup_swapon(p->type, maxpages); - if (error) - goto bad_swap; - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; -- cgit v1.2.3 From 915d4d7bc0d719f2f0907273c01967d38751c625 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:32 -0700 Subject: sys_swapon: separate parsing of bad blocks and extents Move the code which parses the bad block list and the extents to a separate function. Only code movement, no functional changes. This change uses the fact that, after the success path, nr_good_pages == p->pages. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 83 ++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 10f2b33805f6..a0f39f9dc5a5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1991,6 +1991,54 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return maxpages; } +static int setup_swap_map_and_extents(struct swap_info_struct *p, + union swap_header *swap_header, + unsigned char *swap_map, + unsigned long maxpages, + sector_t *span) +{ + int i; + int error; + unsigned int nr_good_pages; + int nr_extents; + + nr_good_pages = maxpages - 1; /* omit header page */ + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) { + error = -EINVAL; + goto bad_swap; + } + if (page_nr < maxpages) { + swap_map[page_nr] = SWAP_MAP_BAD; + nr_good_pages--; + } + } + + if (nr_good_pages) { + swap_map[0] = SWAP_MAP_BAD; + p->max = maxpages; + p->pages = nr_good_pages; + nr_extents = setup_swap_extents(p, span); + if (nr_extents < 0) { + error = nr_extents; + goto bad_swap; + } + nr_good_pages = p->pages; + } + if (!nr_good_pages) { + printk(KERN_WARNING "Empty swap-file\n"); + error = -EINVAL; + goto bad_swap; + } + + return nr_extents; + +bad_swap: + return error; +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -2001,7 +2049,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int error; union swap_header *swap_header; unsigned int nr_good_pages; - int nr_extents = 0; + int nr_extents; sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; @@ -2078,36 +2126,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap; - nr_good_pages = maxpages - 1; /* omit header page */ - - for (i = 0; i < swap_header->info.nr_badpages; i++) { - unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr == 0 || page_nr > swap_header->info.last_page) { - error = -EINVAL; - goto bad_swap; - } - if (page_nr < maxpages) { - swap_map[page_nr] = SWAP_MAP_BAD; - nr_good_pages--; - } - } - - if (nr_good_pages) { - swap_map[0] = SWAP_MAP_BAD; - p->max = maxpages; - p->pages = nr_good_pages; - nr_extents = setup_swap_extents(p, &span); - if (nr_extents < 0) { - error = nr_extents; - goto bad_swap; - } - nr_good_pages = p->pages; - } - if (!nr_good_pages) { - printk(KERN_WARNING "Empty swap-file\n"); - error = -EINVAL; + nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, + maxpages, &span); + if (unlikely(nr_extents < 0)) { + error = nr_extents; goto bad_swap; } + nr_good_pages = p->pages; if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { -- cgit v1.2.3 From bdb8e3f68320f897de3f3a4c363fe2802037f21d Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:33 -0700 Subject: sys_swapon: simplify error flow in setup_swap_map_and_extents() Since there is no cleanup to do, there is no reason to jump to a label. Return directly instead. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index a0f39f9dc5a5..be0d0a28690e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1998,7 +1998,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, sector_t *span) { int i; - int error; unsigned int nr_good_pages; int nr_extents; @@ -2006,10 +2005,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr == 0 || page_nr > swap_header->info.last_page) { - error = -EINVAL; - goto bad_swap; - } + if (page_nr == 0 || page_nr > swap_header->info.last_page) + return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; @@ -2021,22 +2018,16 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, span); - if (nr_extents < 0) { - error = nr_extents; - goto bad_swap; - } + if (nr_extents < 0) + return nr_extents; nr_good_pages = p->pages; } if (!nr_good_pages) { printk(KERN_WARNING "Empty swap-file\n"); - error = -EINVAL; - goto bad_swap; + return -EINVAL; } return nr_extents; - -bad_swap: - return error; } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) -- cgit v1.2.3 From 9c8100ef26ba9012b8677a383179a0cf169fc7f3 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:34 -0700 Subject: sys_swapon: remove nr_good_pages variable It still exists within setup_swap_map_and_extents(), but after it nr_good_pages == p->pages. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index be0d0a28690e..9e3613b91612 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2039,7 +2039,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) int i, prev; int error; union swap_header *swap_header; - unsigned int nr_good_pages; int nr_extents; sector_t span; unsigned long maxpages; @@ -2123,7 +2122,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = nr_extents; goto bad_swap; } - nr_good_pages = p->pages; if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2143,12 +2141,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->prio = --least_priority; p->swap_map = swap_map; p->flags |= SWP_WRITEOK; - nr_swap_pages += nr_good_pages; - total_swap_pages += nr_good_pages; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; printk(KERN_INFO "Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk %s%s\n", - nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, + p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : ""); -- cgit v1.2.3 From c69dbfb84e88503468b6c481aecdb48d76ad5bc6 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:35 -0700 Subject: sys_swapon: move printk outside lock The block in sys_swapon which does the final adjustments to the swap_info_struct and to swap_list is the same as the block which re-inserts it again at sys_swapoff on failure of try_to_unuse(). To be able to make both share the same code, move the printk() call in the middle of it to just after it. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 9e3613b91612..df3dc7a3c2e2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2144,13 +2144,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) nr_swap_pages += p->pages; total_swap_pages += p->pages; - printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", - p->pages<<(PAGE_SHIFT-10), name, p->prio, - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), - (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); - /* insert swap space into swap_list: */ prev = -1; for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { @@ -2164,6 +2157,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) else swap_info[prev]->next = p->type; spin_unlock(&swap_lock); + + printk(KERN_INFO "Adding %uk swap on %s. " + "Priority:%d extents:%d across:%lluk %s%s\n", + p->pages<<(PAGE_SHIFT-10), name, p->prio, + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + (p->flags & SWP_SOLIDSTATE) ? "SS" : "", + (p->flags & SWP_DISCARDABLE) ? "D" : ""); + mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); -- cgit v1.2.3 From c6a2b64ba5d09a1e281e85988ffd650655fa0f39 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:36 -0700 Subject: sys_swapoff: change order to match sys_swapon The block in sys_swapon which does the final adjustments to the swap_info_struct and to swap_list is the same as the block which re-inserts it again at sys_swapoff on failure of try_to_unuse(), except for the order of the operations within the lock. Since the order should not matter, arbitrarily change sys_swapoff to match sys_swapon, in preparation to making both share the same code. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index df3dc7a3c2e2..465d972f4c7c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1625,6 +1625,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_lock(&swap_lock); if (p->prio < 0) p->prio = --least_priority; + p->flags |= SWP_WRITEOK; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + prev = -1; for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { if (p->prio >= swap_info[i]->prio) @@ -1636,9 +1640,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_list.head = swap_list.next = type; else swap_info[prev]->next = type; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - p->flags |= SWP_WRITEOK; spin_unlock(&swap_lock); goto out_dput; } -- cgit v1.2.3 From 40531542e2832419566c997af0808513f6f2815d Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:37 -0700 Subject: sys_swapon: separate final enabling of the swapfile The block in sys_swapon which does the final adjustments to the swap_info_struct and to swap_list is the same as the block which re-inserts it again at sys_swapoff on failure of try_to_unuse(). Move this code to a separate function, and use it both in sys_swapon and sys_swapoff. Signed-off-by: Cesar Eduardo Barros Tested-by: Eric B Munson Acked-by: Eric B Munson Reviewed-by: Pekka Enberg Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 84 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 465d972f4c7c..7243044c4139 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1550,6 +1550,36 @@ bad_bmap: goto out; } +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map) +{ + int i, prev; + + spin_lock(&swap_lock); + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; + p->swap_map = swap_map; + p->flags |= SWP_WRITEOK; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) + break; + prev = i; + } + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p->type; + else + swap_info[prev]->next = p->type; + spin_unlock(&swap_lock); +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -1621,26 +1651,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) current->flags &= ~PF_OOM_ORIGIN; if (err) { + /* + * reading p->prio and p->swap_map outside the lock is + * safe here because only sys_swapon and sys_swapoff + * change them, and there can be no other sys_swapon or + * sys_swapoff for this swap_info_struct at this point. + */ /* re-insert swap space back into swap_list */ - spin_lock(&swap_lock); - if (p->prio < 0) - p->prio = --least_priority; - p->flags |= SWP_WRITEOK; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = type; - else - swap_info[prev]->next = type; - spin_unlock(&swap_lock); + enable_swap_info(p, p->prio, p->swap_map); goto out_dput; } @@ -2037,7 +2055,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) char *name; struct file *swap_file = NULL; struct address_space *mapping; - int i, prev; + int i; + int prio; int error; union swap_header *swap_header; int nr_extents; @@ -2134,30 +2153,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } mutex_lock(&swapon_mutex); - spin_lock(&swap_lock); + prio = -1; if (swap_flags & SWAP_FLAG_PREFER) - p->prio = + prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - else - p->prio = --least_priority; - p->swap_map = swap_map; - p->flags |= SWP_WRITEOK; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p->type; - else - swap_info[prev]->next = p->type; - spin_unlock(&swap_lock); + enable_swap_info(p, prio, swap_map); printk(KERN_INFO "Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk %s%s\n", -- cgit v1.2.3 From 24b8ff7c27d9e975540656e377de44a2a181a01f Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 16:33:38 -0700 Subject: mm: remove inline from scan_swap_map() scan_swap_map() is a large function (224 lines), with several loops and a complex control flow involving several gotos. Given all that, it is a bit silly that it is marked as inline. The compiler agrees with me: on a x86-64 compile, it did not inline the function. Remove the "inline" and let the compiler decide instead. Signed-off-by: Cesar Eduardo Barros Reviewed-by: Pekka Enberg Reviewed-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 7243044c4139..aafcf3611b31 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -212,8 +212,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) { unsigned long offset; unsigned long scan_base; -- cgit v1.2.3 From cf15b07cf448e19dcb31a19f0cbaf898b08ce975 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Tue, 22 Mar 2011 16:33:40 -0700 Subject: writeback: make mapping->writeback_index to point to the last written page For range-cyclic writeback (e.g. kupdate), the writeback code sets a continuation point of the next writeback to mapping->writeback_index which is set the page after the last written page. This happens so that we evenly write the whole file even if pages in it get continuously redirtied. However, in some cases, sequential writer is writing in the middle of the page and it just redirties the last written page by continuing from that. For example with an application which uses a file as a big ring buffer we see: [1st writeback session] ... flush-8:0-2743 4571: block_bio_queue: 8,0 W 94898514 + 8 flush-8:0-2743 4571: block_bio_queue: 8,0 W 94898522 + 8 flush-8:0-2743 4571: block_bio_queue: 8,0 W 94898530 + 8 flush-8:0-2743 4571: block_bio_queue: 8,0 W 94898538 + 8 flush-8:0-2743 4571: block_bio_queue: 8,0 W 94898546 + 8 kworker/0:1-11 4571: block_rq_issue: 8,0 W 0 () 94898514 + 40 >> flush-8:0-2743 4571: block_bio_queue: 8,0 W 94898554 + 8 >> flush-8:0-2743 4571: block_rq_issue: 8,0 W 0 () 94898554 + 8 [2nd writeback session after 35sec] flush-8:0-2743 4606: block_bio_queue: 8,0 W 94898562 + 8 flush-8:0-2743 4606: block_bio_queue: 8,0 W 94898570 + 8 flush-8:0-2743 4606: block_bio_queue: 8,0 W 94898578 + 8 ... kworker/0:1-11 4606: block_rq_issue: 8,0 W 0 () 94898562 + 640 kworker/0:1-11 4606: block_rq_issue: 8,0 W 0 () 94899202 + 72 ... flush-8:0-2743 4606: block_bio_queue: 8,0 W 94899962 + 8 flush-8:0-2743 4606: block_bio_queue: 8,0 W 94899970 + 8 flush-8:0-2743 4606: block_bio_queue: 8,0 W 94899978 + 8 flush-8:0-2743 4606: block_bio_queue: 8,0 W 94899986 + 8 flush-8:0-2743 4606: block_bio_queue: 8,0 W 94899994 + 8 kworker/0:1-11 4606: block_rq_issue: 8,0 W 0 () 94899962 + 40 >> flush-8:0-2743 4606: block_bio_queue: 8,0 W 94898554 + 8 >> flush-8:0-2743 4606: block_rq_issue: 8,0 W 0 () 94898554 + 8 So we seeked back to 94898554 after we wrote all the pages at the end of the file. This extra seek seems unnecessary. If we continue writeback from the last written page, we can avoid it and do not cause harm to other cases. The original intent of even writeout over the whole file is preserved and if the page does not get redirtied pagevec_lookup_tag() just skips it. As an exceptional case, when I/O error happens, set done_index to the next page as the comment in the code suggests. Tested-by: Wu Fengguang Signed-off-by: Jun'ichi Nomura Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b437fe6257b0..632b46479c94 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -927,7 +927,7 @@ retry: break; } - done_index = page->index + 1; + done_index = page->index; lock_page(page); @@ -977,6 +977,7 @@ continue_unlock: * not be suitable for data integrity * writeout). */ + done_index = page->index + 1; done = 1; break; } -- cgit v1.2.3 From a42931bf9c02fbf3628a27a2a5c55d2b83e4ff20 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 22 Mar 2011 16:33:41 -0700 Subject: vmalloc: remove confusing comment on vwrite() KM_USER1 is never used for vwrite() path so the caller doesn't need to guarantee it is not used. Only the caller should guarantee is KM_USER0 and it is commented already. Signed-off-by: Namhyung Kim Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fc77adabb5e3..5d6030235d7a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2003,8 +2003,6 @@ finished: * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any informaion, as /dev/kmem. - * - * The caller should guarantee KM_USER1 is not used. */ long vwrite(char *buf, char *addr, unsigned long count) -- cgit v1.2.3 From 84be48d84a53044e13aa8816aab201ab5480815d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 22 Mar 2011 16:33:41 -0700 Subject: mm/page_alloc.c: use list_move() instead of list_del()/list_add() combination Signed-off-by: Kirill A. Shutemov Cc: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 136a547262a0..3a58221f4c22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -867,9 +867,8 @@ static int move_freepages(struct zone *zone, } order = page_order(page); - list_del(&page->lru); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + list_move(&page->lru, + &zone->free_area[order].free_list[migratetype]); page += 1 << order; pages_moved += 1 << order; } -- cgit v1.2.3 From 8f7a66051b7523108c5aefb08c6a637e54aedc47 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 22 Mar 2011 16:33:43 -0700 Subject: mm/memblock: properly handle overlaps and fix error path Currently memblock_reserve() or memblock_free() don't handle overlaps of any kind. There is some special casing for coalescing exactly adjacent regions but that's about it. This is annoying because typically memblock_reserve() is used to mark regions passed by the firmware as reserved and we all know how much we can trust our firmwares... Also, with the current code, if we do something it doesn't handle right such as trying to memblock_reserve() a large range spanning multiple existing smaller reserved regions for example, or doing overlapping reservations, it can silently corrupt the internal region array, causing odd errors much later on, such as allocations returning reserved regions etc... This patch rewrites the underlying functions that add or remove a region to the arrays. The new code is a lot more robust as it fully handles overlapping regions. It's also, imho, simpler than the previous implementation. In addition, while doing so, I found a bug where if we fail to double the array while adding a region, we would remove the last region of the array rather than the region we just allocated. This fixes it too. Signed-off-by: Benjamin Herrenschmidt Acked-by: Yinghai Lu Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 241 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 132 insertions(+), 109 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 4618fda975a0..a0562d1a6ad4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } -static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) -{ - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; - - return 0; -} - -static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, - unsigned long r1, unsigned long r2) -{ - phys_addr_t base1 = type->regions[r1].base; - phys_addr_t size1 = type->regions[r1].size; - phys_addr_t base2 = type->regions[r2].base; - phys_addr_t size2 = type->regions[r2].size; - - return memblock_addrs_adjacent(base1, size1, base2, size2); -} - long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long i; @@ -206,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->regions[i].size = type->regions[i + 1].size; } type->cnt--; -} -/* Assumption: base addr of region 1 < base addr of region 2 */ -static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, - unsigned long r1, unsigned long r2) -{ - type->regions[r1].size += type->regions[r2].size; - memblock_remove_region(type, r2); + /* Special case for empty arrays */ + if (type->cnt == 0) { + type->cnt = 1; + type->regions[0].base = 0; + type->regions[0].size = 0; + } } /* Defined below but needed now */ @@ -276,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); /* If the array wasn't our static init one, then free it. We only do * that before SLAB is available as later on, we don't know whether @@ -296,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1 return 1; } -static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock memblock_add_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { - unsigned long coalesced = 0; - long adjacent, i; - - if ((type->cnt == 1) && (type->regions[0].size == 0)) { - type->regions[0].base = base; - type->regions[0].size = size; - return 0; - } + phys_addr_t end = base + size; + int i, slot = -1; - /* First try and coalesce this MEMBLOCK with another. */ + /* First try and coalesce this MEMBLOCK with others */ for (i = 0; i < type->cnt; i++) { - phys_addr_t rgnbase = type->regions[i].base; - phys_addr_t rgnsize = type->regions[i].size; + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rend = rgn->base + rgn->size; + + /* Exit if there's no possible hits */ + if (rgn->base > end || rgn->size == 0) + break; - if ((rgnbase == base) && (rgnsize == size)) - /* Already have this region, so we're done */ + /* Check if we are fully enclosed within an existing + * block + */ + if (rgn->base <= base && rend >= end) return 0; - adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); - /* Check if arch allows coalescing */ - if (adjacent != 0 && type == &memblock.memory && - !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) - break; - if (adjacent > 0) { - type->regions[i].base -= size; - type->regions[i].size += size; - coalesced++; - break; - } else if (adjacent < 0) { - type->regions[i].size += size; - coalesced++; - break; + /* Check if we overlap or are adjacent with the bottom + * of a block. + */ + if (base < rgn->base && end >= rgn->base) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(base, size, + rgn->base, + rgn->size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(end != rgn->base); + goto new_block; + } + /* We extend the bottom of the block down to our + * base + */ + rgn->base = base; + rgn->size = rend - base; + + /* Return if we have nothing else to allocate + * (fully coalesced) + */ + if (rend >= end) + return 0; + + /* We continue processing from the end of the + * coalesced block. + */ + base = rend; + size = end - base; + } + + /* Now check if we overlap or are adjacent with the + * top of a block + */ + if (base <= rend && end >= rend) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(rgn->base, + rgn->size, + base, size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(rend != base); + goto new_block; + } + /* We adjust our base down to enclose the + * original block and destroy it. It will be + * part of our new allocation. Since we've + * freed an entry, we know we won't fail + * to allocate one later, so we won't risk + * losing the original block allocation. + */ + size += (base - rgn->base); + base = rgn->base; + memblock_remove_region(type, i--); } } - /* If we plugged a hole, we may want to also coalesce with the - * next region + /* If the array is empty, special case, replace the fake + * filler region and return */ - if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && - ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, - type->regions[i].size, - type->regions[i+1].base, - type->regions[i+1].size)))) { - memblock_coalesce_regions(type, i, i+1); - coalesced++; + if ((type->cnt == 1) && (type->regions[0].size == 0)) { + type->regions[0].base = base; + type->regions[0].size = size; + return 0; } - if (coalesced) - return coalesced; - + new_block: /* If we are out of space, we fail. It's too late to resize the array * but then this shouldn't have happened in the first place. */ @@ -362,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys } else { type->regions[i+1].base = base; type->regions[i+1].size = size; + slot = i + 1; break; } } - if (base < type->regions[0].base) { type->regions[0].base = base; type->regions[0].size = size; + slot = 0; } type->cnt++; @@ -376,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys * our allocation and return an error */ if (type->cnt == type->max && memblock_double_array(type)) { - type->cnt--; + BUG_ON(slot < 0); + memblock_remove_region(type, slot); return -1; } @@ -389,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) } -static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock __memblock_remove(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { - phys_addr_t rgnbegin, rgnend; phys_addr_t end = base + size; int i; - rgnbegin = rgnend = 0; /* supress gcc warnings */ - - /* Find the region where (base, size) belongs to */ - for (i=0; i < type->cnt; i++) { - rgnbegin = type->regions[i].base; - rgnend = rgnbegin + type->regions[i].size; + /* Walk through the array for collisions */ + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rend = rgn->base + rgn->size; - if ((rgnbegin <= base) && (end <= rgnend)) + /* Nothing more to do, exit */ + if (rgn->base > end || rgn->size == 0) break; - } - /* Didn't find the region */ - if (i == type->cnt) - return -1; + /* If we fully enclose the block, drop it */ + if (base <= rgn->base && end >= rend) { + memblock_remove_region(type, i--); + continue; + } - /* Check to see if we are removing entire region */ - if ((rgnbegin == base) && (rgnend == end)) { - memblock_remove_region(type, i); - return 0; - } + /* If we are fully enclosed within a block + * then we need to split it and we are done + */ + if (base > rgn->base && end < rend) { + rgn->size = base - rgn->base; + if (!memblock_add_region(type, end, rend - end)) + return 0; + /* Failure to split is bad, we at least + * restore the block before erroring + */ + rgn->size = rend - rgn->base; + WARN_ON(1); + return -1; + } - /* Check to see if region is matching at the front */ - if (rgnbegin == base) { - type->regions[i].base = end; - type->regions[i].size -= size; - return 0; - } + /* Check if we need to trim the bottom of a block */ + if (rgn->base < end && rend > end) { + rgn->size -= end - rgn->base; + rgn->base = end; + break; + } - /* Check to see if the region is matching at the end */ - if (rgnend == end) { - type->regions[i].size -= size; - return 0; - } + /* And check if we need to trim the top of a block */ + if (base < rend) + rgn->size -= rend - base; - /* - * We need to split the entry - adjust the current one to the - * beginging of the hole and add the region after hole. - */ - type->regions[i].size = base - type->regions[i].base; - return memblock_add_region(type, end, rgnend - end); + } + return 0; } long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) @@ -467,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph found = memblock_find_base(size, align, 0, max_addr); if (found != MEMBLOCK_ERROR && - memblock_add_region(&memblock.reserved, found, size) >= 0) + !memblock_add_region(&memblock.reserved, found, size)) return found; return 0; @@ -548,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, if (this_nid == nid) { phys_addr_t ret = memblock_find_region(start, this_end, size, align); if (ret != MEMBLOCK_ERROR && - memblock_add_region(&memblock.reserved, ret, size) >= 0) + !memblock_add_region(&memblock.reserved, ret, size)) return ret; } start = this_end; -- cgit v1.2.3 From bee4c36a5cf5c9f63ce1d7372aa62045fbd16d47 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Mar 2011 16:33:43 -0700 Subject: shmem: let shared anonymous be nonlinear again Up to 2.6.22, you could use remap_file_pages(2) on a tmpfs file or a shared mapping of /dev/zero or a shared anonymous mapping. In 2.6.23 we disabled it by default, but set VM_CAN_NONLINEAR to enable it on safe mappings. We made sure to set it in shmem_mmap() for tmpfs files, but missed it in shmem_zero_setup() for the others. Fix that at last. Reported-by: Kenny Simpson Signed-off-by: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 88593586bdb7..91ce9a1024d7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2793,5 +2793,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } -- cgit v1.2.3 From 3dd7ae8ec0ef399bfea347f297d2a95504d35571 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 22 Mar 2011 16:33:45 -0700 Subject: mm: simplify code of swap.c Clean up code and remove duplicate code. Next patch will use pagevec_lru_move_fn introduced here too. Signed-off-by: Shaohua Li Cc: KOSAKI Motohiro Cc: Hiroyuki Kamezawa Cc: Andi Kleen Reviewed-by: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 133 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 58 insertions(+), 75 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 0a33714a7cba..a448db377cb0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -179,15 +179,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -195,30 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - mem_cgroup_rotate_reclaimable_page(page); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); + spin_unlock_irqrestore(&zone->lru_lock, flags); release_pages(pvec->pages, pvec->nr, pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + enum lru_list lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. */ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -369,10 +387,11 @@ void add_page_to_unevictable_list(struct page *page) * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ -static void lru_deactivate(struct page *page, struct zone *zone) +static void lru_deactivate_fn(struct page *page, void *arg) { int lru, file; bool active; + struct zone *zone = page_zone(page); if (!PageLRU(page)) return; @@ -412,31 +431,6 @@ static void lru_deactivate(struct page *page, struct zone *zone) update_page_reclaim_stat(zone, page, file, 0); } -static void ____pagevec_lru_deactivate(struct pagevec *pvec) -{ - int i; - struct zone *zone = NULL; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - lru_deactivate(page, zone); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); -} - - /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -466,7 +460,7 @@ static void drain_cpu_pagevecs(int cpu) pvec = &per_cpu(lru_deactivate_pvecs, cpu); if (pagevec_count(pvec)) - ____pagevec_lru_deactivate(pvec); + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); } /** @@ -483,7 +477,7 @@ void deactivate_page(struct page *page) struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); if (!pagevec_add(pvec, page)) - ____pagevec_lru_deactivate(pvec); + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); put_cpu_var(lru_deactivate_pvecs); } } @@ -630,44 +624,33 @@ void lru_add_page_tail(struct zone* zone, } } +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); -- cgit v1.2.3 From 2130781e2aaab66e5a9f2fdc8af35da0153f405c Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 22 Mar 2011 23:03:13 -0300 Subject: sys_swapon: fix inode locking A conflict between 52c50567d8ab ("mm: swap: unlock swapfile inode mutex before closing file on bad swapfiles") and 83ef99befc32 ("sys_swapon: remove did_down variable") caused a double unlock of the inode mutex (once in bad_swap: before the filp_close, once at the end just before returning). The patch which added the extra unlock cleared did_down to avoid unlocking twice, but the other patch removed the did_down variable. To fix, set inode to NULL after the first unlock, since it will be used after that point only for the final unlock. While checking this patch, I found a path which could unlock without locking, in case the same inode was added as a swapfile twice. To fix, move the setting of the inode variable further down, to just before claim_swapfile, which will lock the inode before doing anything else. Cc: Mel Gorman Cc: Hugh Dickins Cc: Eric B Munson Cc: KAMEZAWA Hiroyuki Cc: Andrew Morton Signed-off-by: Cesar Eduardo Barros Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index aafcf3611b31..71b42ec55b78 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2088,7 +2088,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; - inode = mapping->host; for (i = 0; i < nr_swapfiles; i++) { struct swap_info_struct *q = swap_info[i]; @@ -2101,6 +2100,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } + inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; @@ -2187,8 +2188,10 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); if (swap_file) { - if (inode && S_ISREG(inode->i_mode)) + if (inode && S_ISREG(inode->i_mode)) { mutex_unlock(&inode->i_mutex); + inode = NULL; + } filp_close(swap_file, NULL); } out: -- cgit v1.2.3 From 31db58b3ab432f72ea76be58b12e6ffaf627d5db Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:15 -0400 Subject: mm: arch: make get_gate_vma take an mm_struct instead of a task_struct Morally, the presence of a gate vma is more an attribute of a particular mm than a particular task. Moreover, dropping the dependency on task_struct will help make both existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- mm/memory.c | 4 ++-- mm/mlock.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e48945ab362b..b6dc37097433 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1488,7 +1488,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); + struct vm_area_struct *gate_vma = get_gate_vma(tsk->mm); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -3496,7 +3496,7 @@ static int __init gate_vma_init(void) __initcall(gate_vma_init); #endif -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef AT_SYSINFO_EHDR return &gate_vma; diff --git a/mm/mlock.c b/mm/mlock.c index c3924c7f00be..2689a08c79af 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -237,7 +237,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current))) { + vma == get_gate_vma(current->mm))) { __mlock_vma_pages_range(vma, start, end, NULL); @@ -332,7 +332,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int lock = newflags & VM_LOCKED; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || - is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) goto out; /* don't set VM_LOCKED, don't count */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); -- cgit v1.2.3 From 83b964bbf82eb13a8f31bb49ca420787fe01f7a6 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:16 -0400 Subject: mm: arch: make in_gate_area take an mm_struct instead of a task_struct Morally, the question of whether an address lies in a gate vma should be asked with respect to an mm, not a particular task. Moreover, dropping the dependency on task_struct will help make existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index b6dc37097433..931d479b80c2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1486,7 +1486,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { + if (!vma && in_gate_area(tsk->mm, start)) { unsigned long pg = start & PAGE_MASK; struct vm_area_struct *gate_vma = get_gate_vma(tsk->mm); pgd_t *pgd; -- cgit v1.2.3 From cae5d39032acf26c265f6b1dc73d7ce6ff4bc387 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:17 -0400 Subject: mm: arch: rename in_gate_area_no_task to in_gate_area_no_mm Now that gate vma's are referenced with respect to a particular mm and not a particular task it only makes sense to propagate the change to this predicate as well. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- mm/memory.c | 2 +- mm/nommu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 931d479b80c2..5f5b5de5a40e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3505,7 +3505,7 @@ struct vm_area_struct *get_gate_vma(struct mm_struct *mm) #endif } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) diff --git a/mm/nommu.c b/mm/nommu.c index f59e1424d3db..e629143f9440 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1963,7 +1963,7 @@ error: return -ENOMEM; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } -- cgit v1.2.3 From e7f22e207bacdba5b73f2893a3abe935a5373e2e Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:18 -0400 Subject: mm: use mm_struct to resolve gate vma's in __get_user_pages We now check if a requested user page overlaps a gate vma using the supplied mm instead of the supplied task. The given task is now used solely for accounting purposes and may be NULL. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- mm/memory.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5f5b5de5a40e..5f585b65d734 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1486,9 +1486,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk->mm, start)) { + if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk->mm); + struct vm_area_struct *gate_vma = get_gate_vma(mm); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -1589,10 +1589,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? i : -EFAULT; BUG(); } - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } if (ret & VM_FAULT_RETRY) { *nonblocking = 0; @@ -1638,7 +1641,8 @@ EXPORT_SYMBOL(__get_user_pages); /** * get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin -- cgit v1.2.3 From 206cb636576b969e9b471cdedeaea7752e6acb33 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:19 -0400 Subject: mm: factor out main logic of access_process_vm Introduce an internal helper __access_remote_vm and base access_process_vm on top of it. This new method may be called with a NULL task_struct if page fault accounting is not desired. This code will be shared with a new address space accessor that is independent of task_struct. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- mm/memory.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5f585b65d734..820b4c4810f0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3650,20 +3650,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, #endif /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space as given in mm. If non-NULL, use the + * given task for page fault accounting. */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { - struct mm_struct *mm; struct vm_area_struct *vma; void *old_buf = buf; - mm = get_task_mm(tsk); - if (!mm) - return 0; - down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ while (len) { @@ -3712,11 +3707,31 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in addr += bytes; } up_read(&mm->mmap_sem); - mmput(mm); return buf - old_buf; } +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, int write) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); + + return ret; +} + /* * Print the name of a VMA. */ -- cgit v1.2.3 From 5ddd36b9c59887c6416e21daf984fbdd9b1818df Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:20 -0400 Subject: mm: implement access_remote_vm Provide an alternative to access_process_vm that allows the caller to obtain a reference to the supplied mm_struct. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- mm/memory.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 820b4c4810f0..468f5076754c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3711,6 +3711,22 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, return buf - old_buf; } +/** + * @access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + /* * Access another process' address space. * Source/target buffer must be kernel space, -- cgit v1.2.3 From 56039efa18f2530fc23e8ef19e716b65ee2a1d1e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 23 Mar 2011 16:42:19 -0700 Subject: memcg: fix ugly initialization of return value is in caller Remove initialization of vaiable in caller of memory cgroup function. Actually, it's return value of memcg function but it's initialized in caller. Some memory cgroup uses following style to bring the result of start function to the end function for avoiding races. mem_cgroup_start_A(&(*ptr)) /* Something very complicated can happen here. */ mem_cgroup_end_A(*ptr) In some calls, *ptr should be initialized to NULL be caller. But it's ugly. This patch fixes that *ptr is initialized by _start function. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++++-- mm/memory.c | 2 +- mm/migrate.c | 2 +- mm/swapfile.c | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e1ee6ad9c971..b56bd74b486f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2475,7 +2475,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, /* shmem */ if (PageSwapCache(page)) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) @@ -2501,6 +2501,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct mem_cgroup *mem; int ret; + *ptr = NULL; + if (mem_cgroup_disabled()) return 0; @@ -2916,6 +2918,8 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + *ptr = NULL; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -3058,7 +3062,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; int ret; if (mem_cgroup_disabled()) diff --git a/mm/memory.c b/mm/memory.c index 615be5127ce1..20d5f7499ce2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2767,7 +2767,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; diff --git a/mm/migrate.c b/mm/migrate.c index 89e5c3fe8bbc..b0406d739ea7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -633,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; int charge = 0; - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; if (!newpage) diff --git a/mm/swapfile.c b/mm/swapfile.c index 71b42ec55b78..039e61677635 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -880,7 +880,7 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; spinlock_t *ptl; pte_t *pte; int ret = 1; -- cgit v1.2.3 From b7c6167848fa36e32f1874b95c1edc02881cd040 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:20 -0700 Subject: memcg: soft limit reclaim should end at limit not below Soft limit reclaim continues until the usage is below the current soft limit, but the documented semantics are actually that soft limit reclaim will push usage back until the soft limits are met again. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Acked-by: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b56bd74b486f..13de53fe0108 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1477,7 +1477,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; total += ret; if (check_soft) { - if (res_counter_check_under_soft_limit(&root_mem->res)) + if (res_counter_check_within_soft_limit(&root_mem->res)) return total; } else if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; -- cgit v1.2.3 From 9d11ea9f163a14920487bdda77461e64d600fd48 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:21 -0700 Subject: memcg: simplify the way memory limits are checked Since transparent huge pages, checking whether memory cgroups are below their limits is no longer enough, but the actual amount of chargeable space is important. To not have more than one limit-checking interface, replace memory_cgroup_check_under_limit() and memory_cgroup_check_margin() with a single memory_cgroup_margin() that returns the chargeable space and leaves the comparison to the callsite. Soft limits are now checked the other way round, by using the already existing function that returns the amount by which soft limits are exceeded: res_counter_soft_limit_excess(). Also remove all the corresponding functions on the res_counter side that are now no longer used. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Acked-by: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 49 +++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13de53fe0108..62bbb48980e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -504,11 +504,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) } } -static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) -{ - return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; -} - static struct mem_cgroup_per_zone * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { @@ -1127,33 +1122,21 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) -{ - if (do_swap_account) { - if (res_counter_check_under_limit(&mem->res) && - res_counter_check_under_limit(&mem->memsw)) - return true; - } else - if (res_counter_check_under_limit(&mem->res)) - return true; - return false; -} - /** - * mem_cgroup_check_margin - check if the memory cgroup allows charging - * @mem: memory cgroup to check - * @bytes: the number of bytes the caller intends to charge + * mem_cgroup_margin - calculate chargeable space of a memory cgroup + * @mem: the memory cgroup * - * Returns a boolean value on whether @mem can be charged @bytes or - * whether this would exceed the limit. + * Returns the maximum amount of memory @mem can be charged with, in + * bytes. */ -static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +static unsigned long long mem_cgroup_margin(struct mem_cgroup *mem) { - if (!res_counter_check_margin(&mem->res, bytes)) - return false; - if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) - return false; - return true; + unsigned long long margin; + + margin = res_counter_margin(&mem->res); + if (do_swap_account) + margin = min(margin, res_counter_margin(&mem->memsw)); + return margin; } static unsigned int get_swappiness(struct mem_cgroup *memcg) @@ -1420,7 +1403,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - unsigned long excess = mem_cgroup_get_excess(root_mem); + unsigned long excess; + + excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) @@ -1477,9 +1462,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; total += ret; if (check_soft) { - if (res_counter_check_within_soft_limit(&root_mem->res)) + if (!res_counter_soft_limit_excess(&root_mem->res)) return total; - } else if (mem_cgroup_check_under_limit(root_mem)) + } else if (mem_cgroup_margin(root_mem)) return 1 + total; } return total; @@ -1898,7 +1883,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, gfp_mask, flags); - if (mem_cgroup_check_margin(mem_over_limit, csize)) + if (mem_cgroup_margin(mem_over_limit) >= csize) return CHARGE_RETRY; /* * Even though the limit is exceeded at this point, reclaim -- cgit v1.2.3 From 3403968d7a7dc373901cad0cad56b3afcb09cc50 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:22 -0700 Subject: memcg: remove unused page flag bitfield defines These definitions have been unused since '4b3bde4 memcg: remove the overhead associated with the root cgroup'. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 62bbb48980e5..0595e2b184a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -327,13 +327,6 @@ enum charge_type { NR_CHARGE_TYPE, }; -/* only for here (for easy reading.) */ -#define PCGF_CACHE (1UL << PCG_CACHE) -#define PCGF_USED (1UL << PCG_USED) -#define PCGF_LOCK (1UL << PCG_LOCK) -/* Not used, but added here for completeness */ -#define PCGF_ACCT (1UL << PCG_ACCT) - /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) -- cgit v1.2.3 From c14f35c70e068392ccae0b2d6f755baea5eed4d6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:23 -0700 Subject: memcg: remove impossible conditional when committing No callsite ever passes a NULL pointer for a struct mem_cgroup * to the committing function. There is no need to check for it. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0595e2b184a2..b94cd24c5baa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2118,10 +2118,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, { int nr_pages = page_size >> PAGE_SHIFT; - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); -- cgit v1.2.3 From af4a662144884a7dbb19acbef70878b3b955f928 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:24 -0700 Subject: memcg: remove NULL check from lookup_page_cgroup() result The page_cgroup array is set up before even fork is initialized. I seriously doubt that this code executes before the array is alloc'd. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b94cd24c5baa..3a2d54bdf076 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2369,10 +2369,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } pc = lookup_page_cgroup(page); - /* can happen at boot */ - if (unlikely(!pc)) - return 0; - prefetchw(pc); + BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) -- cgit v1.2.3 From f212ad7cf9c73f8a7fa160e223dcb3f074441a72 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 23 Mar 2011 16:42:25 -0700 Subject: memcg: add memcg sanity checks at allocating and freeing pages Add checks at allocating or freeing a page whether the page is used (iow, charged) from the view point of memcg. This check may be useful in debugging a problem and we did similar checks before the commit 52d4b9ac(memcg: allocate all page_cgroup at boot). This patch adds some overheads at allocating or freeing memory, so it's enabled only when CONFIG_DEBUG_VM is enabled. Signed-off-by: Daisuke Nishimura Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 8 ++++++-- 2 files changed, 52 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3a2d54bdf076..0356cb6c9504 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3046,6 +3046,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, return ret; } +#ifdef CONFIG_DEBUG_VM +static struct page_cgroup *lookup_page_cgroup_used(struct page *page) +{ + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); + if (likely(pc) && PageCgroupUsed(pc)) + return pc; + return NULL; +} + +bool mem_cgroup_bad_page_check(struct page *page) +{ + if (mem_cgroup_disabled()) + return false; + + return lookup_page_cgroup_used(page) != NULL; +} + +void mem_cgroup_print_bad_page(struct page *page) +{ + struct page_cgroup *pc; + + pc = lookup_page_cgroup_used(page); + if (pc) { + int ret = -1; + char *path; + + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + pc, pc->flags, pc->mem_cgroup); + + path = kmalloc(PATH_MAX, GFP_KERNEL); + if (path) { + rcu_read_lock(); + ret = cgroup_path(pc->mem_cgroup->css.cgroup, + path, PATH_MAX); + rcu_read_unlock(); + } + + printk(KERN_CONT "(%s)\n", + (ret < 0) ? "cannot get the path" : path); + kfree(path); + } +} +#endif + static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a58221f4c22..8e5726ab0d85 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { + (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -754,7 +756,8 @@ static inline int check_new_page(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { + (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -5684,4 +5687,5 @@ void dump_page(struct page *page) page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); + mem_cgroup_print_bad_page(page); } -- cgit v1.2.3 From ad324e94475a04cfcdfdb11ad20f8ea81268e411 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:26 -0700 Subject: memcg: no uncharged pages reach page_cgroup_zoneinfo This patch series removes the direct page pointer from struct page_cgroup, which saves 20% of per-page memcg memory overhead (Fedora and Ubuntu enable memcg per default, openSUSE apparently too). The node id or section number is encoded in the remaining free bits of pc->flags which allows calculating the corresponding page without the extra pointer. I ran, what I think is, a worst-case microbenchmark that just cats a large sparse file to /dev/null, because it means that walking the LRU list on behalf of per-cgroup reclaim and looking up pages from page_cgroups is happening constantly and at a high rate. But it made no measurable difference. A profile reported a 0.11% share of the new lookup_cgroup_page() function in this benchmark. This patch: All callsites check PCG_USED before passing pc->mem_cgroup, so the latter is never NULL. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Acked-by: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0356cb6c9504..5f7b0e1d789c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -370,9 +370,6 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) int nid = page_cgroup_nid(pc); int zid = page_cgroup_zid(pc); - if (!mem) - return NULL; - return mem_cgroup_zoneinfo(mem, nid, zid); } -- cgit v1.2.3 From 97a6c37b34f46feed2544bd40891ee6dd0fd1554 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:27 -0700 Subject: memcg: change page_cgroup_zoneinfo signature Instead of passing a whole struct page_cgroup to this function, let it take only what it really needs from it: the struct mem_cgroup and the page. This has the advantage that reading pc->mem_cgroup is now done at the same place where the ordering rules for this pointer are enforced and explained. It is also in preparation for removing the pc->page backpointer. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5f7b0e1d789c..2881c9ef969a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -364,11 +364,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) +page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) { - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); + int nid = page_to_nid(page); + int zid = page_zonenum(page); return mem_cgroup_zoneinfo(mem, nid, zid); } @@ -800,7 +799,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) @@ -836,7 +835,7 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page) smp_rmb(); if (mem_cgroup_is_root(pc->mem_cgroup)) return; - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move_tail(&pc->lru, &mz->lists[lru]); } @@ -856,7 +855,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) smp_rmb(); if (mem_cgroup_is_root(pc->mem_cgroup)) return; - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move(&pc->lru, &mz->lists[lru]); } @@ -873,7 +872,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); @@ -1043,7 +1042,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); - mz = page_cgroup_zoneinfo(pc); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); if (!mz) return NULL; @@ -2192,7 +2191,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) * We hold lru_lock, then, reduce counter directly. */ lru = page_lru(head); - mz = page_cgroup_zoneinfo(head_pc); + mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); MEM_CGROUP_ZSTAT(mz, lru) -= 1; } tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; -- cgit v1.2.3 From de3638d9cdc89ac899225996b8dcedbcbc53bdd2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:28 -0700 Subject: memcg: fold __mem_cgroup_move_account into caller It is one logical function, no need to have it split up. Also, get rid of some checks from the inner function that ensured the sanity of the outer function. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 66 +++++++++++++++++++++++++-------------------------------- 1 file changed, 29 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2881c9ef969a..e9d33dc151a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2200,33 +2200,49 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) #endif /** - * __mem_cgroup_move_account - move account of the page + * mem_cgroup_move_account - move account of the page * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @uncharge: whether we should call uncharge and css_put against @from. + * @charge_size: number of bytes to charge (regular or huge page) * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) - * - the pc is locked, used, and ->mem_cgroup points to @from. + * - compound_lock is held when charge_size > PAGE_SIZE * * This function doesn't do "charge" nor css_get to new cgroup. It should be * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is * true, this function does "uncharge" from old cgroup, but it doesn't if * @uncharge is false, so a caller should do "uncharge". */ - -static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, - int charge_size) +static int mem_cgroup_move_account(struct page_cgroup *pc, + struct mem_cgroup *from, struct mem_cgroup *to, + bool uncharge, int charge_size) { int nr_pages = charge_size >> PAGE_SHIFT; + unsigned long flags; + int ret; VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!page_is_cgroup_locked(pc)); - VM_BUG_ON(!PageCgroupUsed(pc)); - VM_BUG_ON(pc->mem_cgroup != from); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (charge_size > PAGE_SIZE && !PageTransHuge(pc->page)) + goto out; + + lock_page_cgroup(pc); + + ret = -EINVAL; + if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + goto unlock; + + move_lock_page_cgroup(pc, &flags); if (PageCgroupFileMapped(pc)) { /* Update mapped_file data for mem_cgroup */ @@ -2250,40 +2266,16 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * garanteed that "to" is never removed. So, we don't check rmdir * status here. */ -} - -/* - * check whether the @pc is valid for moving account and call - * __mem_cgroup_move_account() - */ -static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, - bool uncharge, int charge_size) -{ - int ret = -EINVAL; - unsigned long flags; - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) - return -EBUSY; - - lock_page_cgroup(pc); - if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { - move_lock_page_cgroup(pc, &flags); - __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); - move_unlock_page_cgroup(pc, &flags); - ret = 0; - } + move_unlock_page_cgroup(pc, &flags); + ret = 0; +unlock: unlock_page_cgroup(pc); /* * check events */ memcg_check_events(to, pc->page); memcg_check_events(from, pc->page); +out: return ret; } -- cgit v1.2.3 From 5564e88ba6fd2f6dcd83a592771810cd84b5ae80 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:29 -0700 Subject: memcg: condense page_cgroup-to-page lookup points The per-cgroup LRU lists string up 'struct page_cgroup's. To get from those structures to the page they represent, a lookup is required. Currently, the lookup is done through a direct pointer in struct page_cgroup, so a lot of functions down the callchain do this lookup by themselves instead of receiving the page pointer from their callers. The next patch removes this pointer, however, and the lookup is no longer that straight-forward. In preparation for that, this patch only leaves the non-optional lookups when coming directly from the LRU list and passes the page down the stack. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e9d33dc151a5..e286e1603e4f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1077,9 +1077,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (scan >= nr_to_scan) break; - page = pc->page; if (unlikely(!PageCgroupUsed(pc))) continue; + + page = pc->page; + if (unlikely(!PageLRU(page))) continue; @@ -2108,6 +2110,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) } static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page *page, struct page_cgroup *pc, enum charge_type ctype, int page_size) @@ -2154,7 +2157,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ - memcg_check_events(mem, pc->page); + memcg_check_events(mem, page); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2201,6 +2204,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) /** * mem_cgroup_move_account - move account of the page + * @page: the page * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. @@ -2216,7 +2220,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) * true, this function does "uncharge" from old cgroup, but it doesn't if * @uncharge is false, so a caller should do "uncharge". */ -static int mem_cgroup_move_account(struct page_cgroup *pc, +static int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, int charge_size) { @@ -2225,7 +2229,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, int ret; VM_BUG_ON(from == to); - VM_BUG_ON(PageLRU(pc->page)); + VM_BUG_ON(PageLRU(page)); /* * The page is isolated from LRU. So, collapse function * will not handle this page. But page splitting can happen. @@ -2233,7 +2237,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, * hold it. */ ret = -EBUSY; - if (charge_size > PAGE_SIZE && !PageTransHuge(pc->page)) + if (charge_size > PAGE_SIZE && !PageTransHuge(page)) goto out; lock_page_cgroup(pc); @@ -2273,8 +2277,8 @@ unlock: /* * check events */ - memcg_check_events(to, pc->page); - memcg_check_events(from, pc->page); + memcg_check_events(to, page); + memcg_check_events(from, page); out: return ret; } @@ -2283,11 +2287,11 @@ out: * move charges to its parent. */ -static int mem_cgroup_move_parent(struct page_cgroup *pc, +static int mem_cgroup_move_parent(struct page *page, + struct page_cgroup *pc, struct mem_cgroup *child, gfp_t gfp_mask) { - struct page *page = pc->page; struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; @@ -2317,7 +2321,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (page_size > PAGE_SIZE) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(pc, child, parent, true, page_size); + ret = mem_cgroup_move_account(page, pc, child, parent, true, page_size); if (ret) mem_cgroup_cancel_charge(parent, page_size); @@ -2363,7 +2367,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype, page_size); + __mem_cgroup_commit_charge(mem, page, pc, ctype, page_size); return 0; } @@ -2501,7 +2505,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); + __mem_cgroup_commit_charge(ptr, page, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2956,7 +2960,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); + __mem_cgroup_commit_charge(mem, page, pc, ctype, PAGE_SIZE); return ret; } @@ -3323,6 +3327,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, loop += 256; busy = NULL; while (loop--) { + struct page *page; + ret = 0; spin_lock_irqsave(&zone->lru_lock, flags); if (list_empty(list)) { @@ -3338,7 +3344,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } spin_unlock_irqrestore(&zone->lru_lock, flags); - ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); + page = pc->page; + + ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); if (ret == -ENOMEM) break; @@ -4956,7 +4964,7 @@ retry: if (isolate_lru_page(page)) goto put; pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(pc, + if (!mem_cgroup_move_account(page, pc, mc.from, mc.to, false, PAGE_SIZE)) { mc.precharge--; /* we uncharge from mc.from later. */ -- cgit v1.2.3 From 6b3ae58efca06623c197fd6d91ded4aa3a8fe039 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:30 -0700 Subject: memcg: remove direct page_cgroup-to-page pointer In struct page_cgroup, we have a full word for flags but only a few are reserved. Use the remaining upper bits to encode, depending on configuration, the node or the section, to enable page_cgroup-to-page lookups without a direct pointer. This saves a full word for every page in a system with memory cgroups enabled. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +-- mm/page_cgroup.c | 91 ++++++++++++++++++++++++++++++++++---------------------- 2 files changed, 57 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e286e1603e4f..660dfc27d971 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1080,7 +1080,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (unlikely(!PageCgroupUsed(pc))) continue; - page = pc->page; + page = lookup_cgroup_page(pc); if (unlikely(!PageLRU(page))) continue; @@ -3344,7 +3344,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } spin_unlock_irqrestore(&zone->lru_lock, flags); - page = pc->page; + page = lookup_cgroup_page(pc); ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); if (ret == -ENOMEM) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 59a3cd4c799d..6c3f7a6a481a 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,12 +11,11 @@ #include #include -static void __meminit -__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) { pc->flags = 0; + set_page_cgroup_array_id(pc, id); pc->mem_cgroup = NULL; - pc->page = pfn_to_page(pfn); INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; @@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return base + offset; } +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + unsigned long pfn; + struct page *page; + pg_data_t *pgdat; + + pgdat = NODE_DATA(page_cgroup_array_id(pc)); + pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; + page = pfn_to_page(pfn); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + static int __init alloc_node_page_cgroup(int nid) { struct page_cgroup *base, *pc; @@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid) return -ENOMEM; for (index = 0; index < nr_pages; index++) { pc = base + index; - __init_page_cgroup(pc, start_pfn + index); + init_page_cgroup(pc, nid); } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; @@ -105,46 +117,53 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + struct mem_section *section; + struct page *page; + unsigned long nr; + + nr = page_cgroup_array_id(pc); + section = __nr_to_section(nr); + page = pfn_to_page(pc - section->page_cgroup); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + /* __alloc_bootmem...() is protected by !slab_available() */ static int __init_refok init_section_page_cgroup(unsigned long pfn) { - struct mem_section *section = __pfn_to_section(pfn); struct page_cgroup *base, *pc; + struct mem_section *section; unsigned long table_size; + unsigned long nr; int nid, index; - if (!section->page_cgroup) { - nid = page_to_nid(pfn_to_page(pfn)); - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - VM_BUG_ON(!slab_is_available()); - if (node_state(nid, N_HIGH_MEMORY)) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); - if (!base) - base = vmalloc(table_size); - } - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); + nr = pfn_to_section_nr(pfn); + section = __nr_to_section(nr); + + if (section->page_cgroup) + return 0; + + nid = page_to_nid(pfn_to_page(pfn)); + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + VM_BUG_ON(!slab_is_available()); + if (node_state(nid, N_HIGH_MEMORY)) { + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); + if (!base) + base = vmalloc_node(table_size, nid); } else { - /* - * We don't have to allocate page_cgroup again, but - * address of memmap may be changed. So, we have to initialize - * again. - */ - base = section->page_cgroup + pfn; - table_size = 0; - /* check address of memmap is changed or not. */ - if (base->page == pfn_to_page(pfn)) - return 0; + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); + if (!base) + base = vmalloc(table_size); } + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); if (!base) { printk(KERN_ERR "page cgroup allocation failure\n"); @@ -153,7 +172,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) for (index = 0; index < PAGES_PER_SECTION; index++) { pc = base + index; - __init_page_cgroup(pc, pfn + index); + init_page_cgroup(pc, nr); } section->page_cgroup = base - pfn; -- cgit v1.2.3 From 4dc03de1b29901b61cb27e4cab44a7f578dc0fc9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:31 -0700 Subject: memcg: charged pages always have valid per-memcg zone info page_cgroup_zoneinfo() will never return NULL for a charged page, remove the check for it in mem_cgroup_get_reclaim_stat_from_page(). Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 660dfc27d971..b35a28d80ab5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1043,9 +1043,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ smp_rmb(); mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - if (!mz) - return NULL; - return &mz->reclaim_stat; } -- cgit v1.2.3 From bf1ff2635a5fda207fc870df348bfc766e8dcd4d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:32 -0700 Subject: memcg: remove memcg->reclaim_param_lock The reclaim_param_lock is only taken around single reads and writes to integer variables and is thus superfluous. Drop it. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b35a28d80ab5..8f9381976c59 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -218,12 +218,6 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - - /* - protect against reclaim related member. - */ - spinlock_t reclaim_param_lock; - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. @@ -1130,17 +1124,12 @@ static unsigned long long mem_cgroup_margin(struct mem_cgroup *mem) static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; - unsigned int swappiness; /* root ? */ if (cgrp->parent == NULL) return vm_swappiness; - spin_lock(&memcg->reclaim_param_lock); - swappiness = memcg->swappiness; - spin_unlock(&memcg->reclaim_param_lock); - - return swappiness; + return memcg->swappiness; } static void mem_cgroup_start_move(struct mem_cgroup *mem) @@ -1356,13 +1345,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) rcu_read_unlock(); /* Updates scanning parameter */ - spin_lock(&root_mem->reclaim_param_lock); if (!css) { /* this means start scan from ID:1 */ root_mem->last_scanned_child = 0; } else root_mem->last_scanned_child = found; - spin_unlock(&root_mem->reclaim_param_lock); } return ret; @@ -3868,9 +3855,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; } - spin_lock(&memcg->reclaim_param_lock); memcg->swappiness = val; - spin_unlock(&memcg->reclaim_param_lock); cgroup_unlock(); @@ -4526,7 +4511,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->memsw, NULL); } mem->last_scanned_child = 0; - spin_lock_init(&mem->reclaim_param_lock); INIT_LIST_HEAD(&mem->oom_notify); if (parent) -- cgit v1.2.3 From e7018b8d27e0c9aa2200e5b393e0fe9093c6565c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:33 -0700 Subject: memcg: keep only one charge cancelling function We have two charge cancelling functions: one takes a page count, the other a page size. The second one just divides the parameter by PAGE_SIZE and then calls the first one. This is trivial, no need for an extra function. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8f9381976c59..09a450684f70 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2031,21 +2031,17 @@ bypass: * gotten by try_charge(). */ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, - unsigned long count) + unsigned int nr_pages) { if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE * count); + unsigned long bytes = nr_pages * PAGE_SIZE; + + res_counter_uncharge(&mem->res, bytes); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); + res_counter_uncharge(&mem->memsw, bytes); } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, - int page_size) -{ - __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); -} - /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if @@ -2104,7 +2100,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); + __mem_cgroup_cancel_charge(mem, nr_pages); return; } /* @@ -2242,7 +2238,7 @@ static int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc, mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from, charge_size); + __mem_cgroup_cancel_charge(from, nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2307,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page *page, ret = mem_cgroup_move_account(page, pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent, page_size); + __mem_cgroup_cancel_charge(parent, page_size >> PAGE_SHIFT); if (page_size > PAGE_SIZE) compound_unlock_irqrestore(page, flags); @@ -2538,7 +2534,7 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem, PAGE_SIZE); + __mem_cgroup_cancel_charge(mem, 1); } static void -- cgit v1.2.3 From 11c9ea4e80fc3be83485667204c68d0a732f3757 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:34 -0700 Subject: memcg: convert per-cpu stock from bytes to page granularity We never keep subpage quantities in the per-cpu stock. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09a450684f70..91120a04f935 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1650,14 +1650,14 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat); #define CHARGE_SIZE (32 * PAGE_SIZE) struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ - int charge; + unsigned int nr_pages; struct work_struct work; }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static atomic_t memcg_drain_count; /* - * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed + * Try to consume stocked charge on this cpu. If success, one page is consumed * from local stock and true is returned. If the stock is 0 or charges from a * cgroup which is not current target, returns false. This stock will be * refilled. @@ -1668,8 +1668,8 @@ static bool consume_stock(struct mem_cgroup *mem) bool ret = true; stock = &get_cpu_var(memcg_stock); - if (mem == stock->cached && stock->charge) - stock->charge -= PAGE_SIZE; + if (mem == stock->cached && stock->nr_pages) + stock->nr_pages--; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -1683,13 +1683,15 @@ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; - if (stock->charge) { - res_counter_uncharge(&old->res, stock->charge); + if (stock->nr_pages) { + unsigned long bytes = stock->nr_pages * PAGE_SIZE; + + res_counter_uncharge(&old->res, bytes); if (do_swap_account) - res_counter_uncharge(&old->memsw, stock->charge); + res_counter_uncharge(&old->memsw, bytes); + stock->nr_pages = 0; } stock->cached = NULL; - stock->charge = 0; } /* @@ -1706,7 +1708,7 @@ static void drain_local_stock(struct work_struct *dummy) * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *mem, int val) +static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) { struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); @@ -1714,7 +1716,7 @@ static void refill_stock(struct mem_cgroup *mem, int val) drain_stock(stock); stock->cached = mem; } - stock->charge += val; + stock->nr_pages += nr_pages; put_cpu_var(memcg_stock); } @@ -2012,7 +2014,7 @@ again: } while (ret != CHARGE_OK); if (csize > page_size) - refill_stock(mem, csize - page_size); + refill_stock(mem, (csize - page_size) >> PAGE_SHIFT); css_put(&mem->css); done: *memcg = mem; -- cgit v1.2.3 From 7ffd4ca7a2cdd7a18f0b499a4e9e0e7cf36ba018 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:35 -0700 Subject: memcg: convert uncharge batching from bytes to page granularity We never uncharge subpage quantities. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 91120a04f935..9dfbed2aacc9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2579,9 +2579,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, if (batch->memcg != mem) goto direct_uncharge; /* remember freed charge and uncharge it later */ - batch->bytes += PAGE_SIZE; + batch->nr_pages++; if (uncharge_memsw) - batch->memsw_bytes += PAGE_SIZE; + batch->memsw_nr_pages++; return; direct_uncharge: res_counter_uncharge(&mem->res, page_size); @@ -2708,8 +2708,8 @@ void mem_cgroup_uncharge_start(void) /* We can do nest. */ if (current->memcg_batch.do_batch == 1) { current->memcg_batch.memcg = NULL; - current->memcg_batch.bytes = 0; - current->memcg_batch.memsw_bytes = 0; + current->memcg_batch.nr_pages = 0; + current->memcg_batch.memsw_nr_pages = 0; } } @@ -2730,10 +2730,12 @@ void mem_cgroup_uncharge_end(void) * This "batch->memcg" is valid without any css_get/put etc... * bacause we hide charges behind us. */ - if (batch->bytes) - res_counter_uncharge(&batch->memcg->res, batch->bytes); - if (batch->memsw_bytes) - res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + if (batch->nr_pages) + res_counter_uncharge(&batch->memcg->res, + batch->nr_pages * PAGE_SIZE); + if (batch->memsw_nr_pages) + res_counter_uncharge(&batch->memcg->memsw, + batch->memsw_nr_pages * PAGE_SIZE); memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; -- cgit v1.2.3 From 7ec99d6213b579a84c85ad37f2aa8ded4857c53c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:36 -0700 Subject: memcg: unify charge/uncharge quantities to units of pages There is no clear pattern when we pass a page count and when we pass a byte count that is a multiple of PAGE_SIZE. We never charge or uncharge subpage quantities, so convert it all to page counts. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 135 +++++++++++++++++++++++++++----------------------------- 1 file changed, 65 insertions(+), 70 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9dfbed2aacc9..bc02218eab01 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1109,16 +1109,16 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * @mem: the memory cgroup * * Returns the maximum amount of memory @mem can be charged with, in - * bytes. + * pages. */ -static unsigned long long mem_cgroup_margin(struct mem_cgroup *mem) +static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) { unsigned long long margin; margin = res_counter_margin(&mem->res); if (do_swap_account) margin = min(margin, res_counter_margin(&mem->memsw)); - return margin; + return margin >> PAGE_SHIFT; } static unsigned int get_swappiness(struct mem_cgroup *memcg) @@ -1647,7 +1647,7 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat); * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. */ -#define CHARGE_SIZE (32 * PAGE_SIZE) +#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1822,9 +1822,10 @@ enum { CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; -static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, - int csize, bool oom_check) +static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, + unsigned int nr_pages, bool oom_check) { + unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; unsigned long flags = 0; @@ -1845,14 +1846,13 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * csize can be either a huge page (HPAGE_SIZE), a batch of - * regular pages (CHARGE_SIZE), or a single regular page - * (PAGE_SIZE). + * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch + * of regular pages (CHARGE_BATCH), or a single regular page (1). * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (csize == CHARGE_SIZE) + if (nr_pages == CHARGE_BATCH) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) @@ -1860,7 +1860,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= csize) + if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* * Even though the limit is exceeded at this point, reclaim @@ -1871,7 +1871,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (csize == PAGE_SIZE && ret) + if (nr_pages == 1 && ret) return CHARGE_RETRY; /* @@ -1897,13 +1897,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, - struct mem_cgroup **memcg, bool oom, - int page_size) + unsigned int nr_pages, + struct mem_cgroup **memcg, + bool oom) { + unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1928,7 +1929,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (page_size == PAGE_SIZE && consume_stock(mem)) + if (nr_pages == 1 && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1951,7 +1952,7 @@ again: rcu_read_unlock(); goto done; } - if (page_size == PAGE_SIZE && consume_stock(mem)) { + if (nr_pages == 1 && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -1986,13 +1987,12 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); - + ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); switch (ret) { case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = page_size; + batch = nr_pages; css_put(&mem->css); mem = NULL; goto again; @@ -2013,8 +2013,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > page_size) - refill_stock(mem, (csize - page_size) >> PAGE_SHIFT); + if (batch > nr_pages) + refill_stock(mem, batch - nr_pages); css_put(&mem->css); done: *memcg = mem; @@ -2093,12 +2093,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, struct page *page, + unsigned int nr_pages, struct page_cgroup *pc, - enum charge_type ctype, - int page_size) + enum charge_type ctype) { - int nr_pages = page_size >> PAGE_SHIFT; - lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); @@ -2187,26 +2185,28 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) /** * mem_cgroup_move_account - move account of the page * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @uncharge: whether we should call uncharge and css_put against @from. - * @charge_size: number of bytes to charge (regular or huge page) * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when charge_size > PAGE_SIZE + * - compound_lock is held when nr_pages > 1 * * This function doesn't do "charge" nor css_get to new cgroup. It should be * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is * true, this function does "uncharge" from old cgroup, but it doesn't if * @uncharge is false, so a caller should do "uncharge". */ -static int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, - bool uncharge, int charge_size) +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct page_cgroup *pc, + struct mem_cgroup *from, + struct mem_cgroup *to, + bool uncharge) { - int nr_pages = charge_size >> PAGE_SHIFT; unsigned long flags; int ret; @@ -2219,7 +2219,7 @@ static int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc, * hold it. */ ret = -EBUSY; - if (charge_size > PAGE_SIZE && !PageTransHuge(page)) + if (nr_pages > 1 && !PageTransHuge(page)) goto out; lock_page_cgroup(pc); @@ -2277,7 +2277,7 @@ static int mem_cgroup_move_parent(struct page *page, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - int page_size = PAGE_SIZE; + unsigned int nr_pages; unsigned long flags; int ret; @@ -2291,23 +2291,21 @@ static int mem_cgroup_move_parent(struct page *page, if (isolate_lru_page(page)) goto put; - if (PageTransHuge(page)) - page_size = HPAGE_SIZE; + nr_pages = hpage_nr_pages(page); parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, - &parent, false, page_size); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); if (ret || !parent) goto put_back; - if (page_size > PAGE_SIZE) + if (nr_pages > 1) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(page, pc, child, parent, true, page_size); + ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); if (ret) - __mem_cgroup_cancel_charge(parent, page_size >> PAGE_SHIFT); + __mem_cgroup_cancel_charge(parent, nr_pages); - if (page_size > PAGE_SIZE) + if (nr_pages > 1) compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); @@ -2327,13 +2325,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; - int page_size = PAGE_SIZE; + unsigned int nr_pages = 1; struct page_cgroup *pc; bool oom = true; int ret; if (PageTransHuge(page)) { - page_size <<= compound_order(page); + nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); /* * Never OOM-kill a process for a huge page. The @@ -2345,11 +2343,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, pc = lookup_page_cgroup(page); BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, page, pc, ctype, page_size); + __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); return 0; } @@ -2465,13 +2463,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); + return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); } static void @@ -2487,7 +2485,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, page, pc, ctype, PAGE_SIZE); + __mem_cgroup_commit_charge(ptr, page, 1, pc, ctype); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -2539,12 +2537,13 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) __mem_cgroup_cancel_charge(mem, 1); } -static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, - int page_size) +static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, + unsigned int nr_pages, + const enum charge_type ctype) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; + /* If swapout, usage of swap doesn't decrease */ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) uncharge_memsw = false; @@ -2568,7 +2567,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; - if (page_size != PAGE_SIZE) + if (nr_pages > 1) goto direct_uncharge; /* @@ -2584,9 +2583,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, batch->memsw_nr_pages++; return; direct_uncharge: - res_counter_uncharge(&mem->res, page_size); + res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, page_size); + res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2598,10 +2597,9 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - int count; - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; - int page_size = PAGE_SIZE; + unsigned int nr_pages = 1; + struct page_cgroup *pc; if (mem_cgroup_disabled()) return NULL; @@ -2610,11 +2608,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) return NULL; if (PageTransHuge(page)) { - page_size <<= compound_order(page); + nr_pages <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); } - - count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2647,7 +2643,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); ClearPageCgroupUsed(pc); /* @@ -2668,7 +2664,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype, page_size); + mem_cgroup_do_uncharge(mem, nr_pages, ctype); return mem; @@ -2860,8 +2856,8 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) { - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + struct page_cgroup *pc; enum charge_type ctype; int ret = 0; @@ -2917,7 +2913,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, ptr, false, PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2944,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, page, pc, ctype, PAGE_SIZE); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); return ret; } @@ -4598,8 +4594,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, - PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4945,8 +4940,8 @@ retry: if (isolate_lru_page(page)) goto put; pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(page, pc, - mc.from, mc.to, false, PAGE_SIZE)) { + if (!mem_cgroup_move_account(page, 1, pc, + mc.from, mc.to, false)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; -- cgit v1.2.3 From e9f8974f2f559b00c87ccfba67bca3903f913d50 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:37 -0700 Subject: memcg: break out event counters from other stats For increasing and decreasing per-cpu cgroup usage counters it makes sense to use signed types, as single per-cpu values might go negative during updates. But this is not the case for only-ever-increasing event counters. All the counters have been signed 64-bit so far, which was enough to count events even with the sign bit wasted. This patch: - divides s64 counters into signed usage counters and unsigned monotonically increasing event counters. - converts unsigned event counters into 'unsigned long' rather than 'u64'. This matches the type used by the /proc/vmstat event counters. The next patch narrows the signed usage counters type (on 32-bit CPUs, that is). Signed-off-by: Johannes Weiner Signed-off-by: Greg Thelen Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 49 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc02218eab01..d884f758c0e3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -93,19 +93,22 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ - MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ - /* incremented at every pagein/pageout */ - MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ - MEM_CGROUP_STAT_NSTATS, }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_NSTATS, +}; + struct mem_cgroup_stat_cpu { s64 count[MEM_CGROUP_STAT_NSTATS]; + unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; }; /* @@ -577,6 +580,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } +static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, + enum mem_cgroup_events_index idx) +{ + unsigned long val = 0; + int cpu; + + for_each_online_cpu(cpu) + val += per_cpu(mem->stat->events[idx], cpu); +#ifdef CONFIG_HOTPLUG_CPU + spin_lock(&mem->pcp_counter_lock); + val += mem->nocpu_base.events[idx]; + spin_unlock(&mem->pcp_counter_lock); +#endif + return val; +} + static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, bool file, int nr_pages) { @@ -589,13 +608,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); else { - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); + __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); preempt_enable(); } @@ -617,9 +636,9 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) { - s64 val; + unsigned long val; - val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); return !(val & ((1 << event_mask_shift) - 1)); } @@ -1773,6 +1792,12 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) per_cpu(mem->stat->count[i], cpu) = 0; mem->nocpu_base.count[i] += x; } + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long x = per_cpu(mem->stat->events[i], cpu); + + per_cpu(mem->stat->events[i], cpu) = 0; + mem->nocpu_base.events[i] += x; + } /* need to clear ON_MOVE value, works as a kind of lock. */ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; spin_unlock(&mem->pcp_counter_lock); @@ -3725,9 +3750,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) s->stat[MCS_RSS] += val * PAGE_SIZE; val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); -- cgit v1.2.3 From 7a159cc9d7987cdb4853f8711f5f89e01cfffe42 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:38 -0700 Subject: memcg: use native word page statistics counters The statistic counters are in units of pages, there is no reason to make them 64-bit wide on 32-bit machines. Make them native words. Since they are signed, this leaves 31 bit on 32-bit machines, which can represent roughly 8TB assuming a page size of 4k. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Johannes Weiner Signed-off-by: Greg Thelen Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 88 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d884f758c0e3..e5759b51f37e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0; #define do_swap_account (0) #endif -/* - * Per memcg event counter is incremented at every pagein/pageout. This counter - * is used for trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - * - * These values will be used as !((event) & ((1 <<(thresh)) - 1)) - */ -#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ -#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ /* * Statistics for memory cgroup. @@ -105,10 +96,24 @@ enum mem_cgroup_events_index { MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ MEM_CGROUP_EVENTS_NSTATS, }; +/* + * Per memcg event counter is incremented at every pagein/pageout. With THP, + * it will be incremated by the number of pages. This counter is used for + * for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + */ +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_NTARGETS, +}; +#define THRESHOLDS_EVENTS_TARGET (128) +#define SOFTLIMIT_EVENTS_TARGET (1024) struct mem_cgroup_stat_cpu { - s64 count[MEM_CGROUP_STAT_NSTATS]; + long count[MEM_CGROUP_STAT_NSTATS]; unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long targets[MEM_CGROUP_NTARGETS]; }; /* @@ -546,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * common workload, threashold and synchonization as vmstat[] should be * implemented. */ -static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static long mem_cgroup_read_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { + long val = 0; int cpu; - s64 val = 0; get_online_cpus(); for_each_online_cpu(cpu) @@ -564,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, return val; } -static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) +static long mem_cgroup_local_usage(struct mem_cgroup *mem) { - s64 ret; + long ret; ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); @@ -634,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, return total; } -static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) +static bool __memcg_event_check(struct mem_cgroup *mem, int target) +{ + unsigned long val, next; + + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = this_cpu_read(mem->stat->targets[target]); + /* from time_after() in jiffies.h */ + return ((long)next - (long)val < 0); +} + +static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) { - unsigned long val; + unsigned long val, next; val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); - return !(val & ((1 << event_mask_shift) - 1)); + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + return; + } + + this_cpu_write(mem->stat->targets[target], next); } /* @@ -650,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) static void memcg_check_events(struct mem_cgroup *mem, struct page *page) { /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { + if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { mem_cgroup_threshold(mem); - if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) + __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); + if (unlikely(__memcg_event_check(mem, + MEM_CGROUP_TARGET_SOFTLIMIT))){ mem_cgroup_update_tree(mem, page); + __mem_cgroup_target_update(mem, + MEM_CGROUP_TARGET_SOFTLIMIT); + } } } @@ -1787,7 +1818,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) spin_lock(&mem->pcp_counter_lock); for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { - s64 x = per_cpu(mem->stat->count[i], cpu); + long x = per_cpu(mem->stat->count[i], cpu); per_cpu(mem->stat->count[i], cpu) = 0; mem->nocpu_base.count[i] += x; @@ -3499,13 +3530,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, } -static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; - s64 val = 0; + long val = 0; - /* each per cpu's value can be minus.Then, use s64 */ + /* Per-cpu values can be negative, use a signed accumulator */ for_each_mem_cgroup_tree(iter, mem) val += mem_cgroup_read_stat(iter, idx); @@ -3525,12 +3556,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) return res_counter_read_u64(&mem->memsw, RES_USAGE); } - val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } -- cgit v1.2.3 From 4be4489feae6da890765cc1bdc1af5e4f8c4b75f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Mar 2011 16:42:39 -0700 Subject: mm/memcontrol.c: suppress uninitialized-var warning with older gcc's mm/memcontrol.c: In function 'mem_cgroup_force_empty': mm/memcontrol.c:2280: warning: 'flags' may be used uninitialized in this function It's a false positive. Cc: Balbir Singh Cc: Daisuke Nishimura Cc: Greg Thelen Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e5759b51f37e..61ffe712afe0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2334,7 +2334,7 @@ static int mem_cgroup_move_parent(struct page *page, struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; unsigned int nr_pages; - unsigned long flags; + unsigned long uninitialized_var(flags); int ret; /* Is ROOT ? */ -- cgit v1.2.3 From dde79e005a769d800166687c9e00d50d93e411ff Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 23 Mar 2011 16:42:40 -0700 Subject: page_cgroup: reduce allocation overhead for page_cgroup array for CONFIG_SPARSEMEM Currently we are allocating a single page_cgroup array per memory section (stored in mem_section->base) when CONFIG_SPARSEMEM is selected. This is correct but memory inefficient solution because the allocated memory (unless we fall back to vmalloc) is not kmalloc friendly: - 32b - 16384 entries (20B per entry) fit into 327680B so the 524288B slab cache is used - 32b with PAE - 131072 entries with 2621440B fit into 4194304B - 64b - 32768 entries (40B per entry) fit into 2097152 cache This is ~37% wasted space per memory section and it sumps up for the whole memory. On a x86_64 machine it is something like 6MB per 1GB of RAM. We can reduce the internal fragmentation by using alloc_pages_exact which allocates PAGE_SIZE aligned blocks so we will get down to <4kB wasted memory per section which is much better. We still need a fallback to vmalloc because we have no guarantees that we will have a continuous memory of that size (order-10) later on during the hotplug events. [hannes@cmpxchg.org: do not define unused free_page_cgroup() without memory hotplug] Signed-off-by: Michal Hocko Cc: Dave Hansen Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 58 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6c3f7a6a481a..8e7577cdf5e4 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -130,7 +130,38 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) return page; } -/* __alloc_bootmem...() is protected by !slab_available() */ +static void *__init_refok alloc_page_cgroup(size_t size, int nid) +{ + void *addr = NULL; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); + if (addr) + return addr; + + if (node_state(nid, N_HIGH_MEMORY)) + addr = vmalloc_node(size, nid); + else + addr = vmalloc(size); + + return addr; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + if (!PageReserved(page)) { /* Is bootmem ? */ + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + free_pages_exact(addr, table_size); + } + } +} +#endif + static int __init_refok init_section_page_cgroup(unsigned long pfn) { struct page_cgroup *base, *pc; @@ -147,17 +178,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - VM_BUG_ON(!slab_is_available()); - if (node_state(nid, N_HIGH_MEMORY)) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); - if (!base) - base = vmalloc(table_size); - } + base = alloc_page_cgroup(table_size, nid); + /* * The value stored in section->page_cgroup is (base - pfn) * and it does not point to the memory block allocated above, @@ -189,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn) if (!ms || !ms->page_cgroup) return; base = ms->page_cgroup + pfn; - if (is_vmalloc_addr(base)) { - vfree(base); - ms->page_cgroup = NULL; - } else { - struct page *page = virt_to_page(base); - if (!PageReserved(page)) { /* Is bootmem ? */ - kfree(base); - ms->page_cgroup = NULL; - } - } + free_page_cgroup(base); + ms->page_cgroup = NULL; } int __meminit online_page_cgroup(unsigned long start_pfn, -- cgit v1.2.3 From 6cfddb261555dd0c0529a5fb7cf8bc5b85ad95a5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 23 Mar 2011 16:42:41 -0700 Subject: memcg: page_cgroup array is never stored on reserved pages KAMEZAWA Hiroyuki noted that free_pages_cgroup doesn't have to check for PageReserved because we never store the array on reserved pages (neither alloc_pages_exact nor vmalloc use those pages). So we can replace the check by a BUG_ON. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 8e7577cdf5e4..a12cc3fa9859 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -153,11 +153,11 @@ static void free_page_cgroup(void *addr) vfree(addr); } else { struct page *page = virt_to_page(addr); - if (!PageReserved(page)) { /* Is bootmem ? */ - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - free_pages_exact(addr, table_size); - } + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); } } #endif -- cgit v1.2.3 From 5a6475a4e162200f43855e2d42bbf55bcca1a9f2 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 23 Mar 2011 16:42:42 -0700 Subject: memcg: fix leak on wrong LRU with FUSE fs/fuse/dev.c::fuse_try_move_page() does (1) remove a page by ->steal() (2) re-add the page to page cache (3) link the page to LRU if it was not on LRU at (1) This implies the page is _on_ LRU when it's added to radix-tree. So, the page is added to memory cgroup while it's on LRU. because LRU is lazy and no one flushs it. This is the same behavior as SwapCache and needs special care as - remove page from LRU before overwrite pc->mem_cgroup. - add page to LRU after overwrite pc->mem_cgroup. And we need to taking care of pagevec. If PageLRU(page) is set before we add PCG_USED bit, the page will not be added to memcg's LRU (in short period). So, regardlress of PageLRU(page) value before commit_charge(), we need to check PageLRU(page) after commit_charge(). Addresses https://bugzilla.kernel.org/show_bug.cgi?id=30432 Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Miklos Szeredi Cc: Balbir Singh Reported-by: Daniel Poelzleithner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 70 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61ffe712afe0..1f0b460fe58c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -926,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) } /* - * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to - * lru because the page may.be reused after it's fully uncharged (because of - * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge - * it again. This function is only used to charge SwapCache. It's done under - * lock_page and expected that zone->lru_lock is never held. + * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed + * while it's linked to lru because the page may be reused after it's fully + * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. + * It's done under lock_page and expected that zone->lru_lock isnever held. */ -static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) +static void mem_cgroup_lru_del_before_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Doing this check without taking ->lru_lock seems wrong but this + * is safe. Because if page_cgroup's USED bit is unset, the page + * will not be added to any memcg's LRU. If page_cgroup's USED bit is + * set, the commit after this will fail, anyway. + * This all charge/uncharge is done under some mutual execustion. + * So, we don't need to taking care of changes in USED bit. + */ + if (likely(!PageLRU(page))) + return; + spin_lock_irqsave(&zone->lru_lock, flags); /* * Forget old LRU when this page_cgroup is *not* used. This Used bit @@ -948,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) spin_unlock_irqrestore(&zone->lru_lock, flags); } -static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) +static void mem_cgroup_lru_add_after_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* taking care of that the page is added to LRU while we commit it */ + if (likely(!PageLRU(page))) + return; spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ if (PageLRU(page) && !PageCgroupAcctLRU(pc)) @@ -2431,9 +2444,26 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); +static void +__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, + enum charge_type ctype) +{ + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * In some case, SwapCache, FUSE(splice_buf->radixtree), the page + * is already on LRU. It means the page may on some other page_cgroup's + * LRU. Take care of it. + */ + mem_cgroup_lru_del_before_commit(page); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + mem_cgroup_lru_add_after_commit(page); + return; +} + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + struct mem_cgroup *mem = NULL; int ret; if (mem_cgroup_disabled()) @@ -2468,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; - if (page_is_file_cache(page)) - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + if (page_is_file_cache(page)) { + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); + if (ret || !mem) + return ret; + /* + * FUSE reuses pages without going through the final + * put that would remove them from the LRU list, make + * sure that they get relinked properly. + */ + __mem_cgroup_commit_charge_lrucare(page, mem, + MEM_CGROUP_CHARGE_TYPE_CACHE); + return ret; + } /* shmem */ if (PageSwapCache(page)) { - struct mem_cgroup *mem; - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) __mem_cgroup_commit_charge_swapin(page, mem, @@ -2532,17 +2570,13 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; if (!ptr) return; cgroup_exclude_rmdir(&ptr->css); - pc = lookup_page_cgroup(page); - mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, page, 1, pc, ctype); - mem_cgroup_lru_add_after_commit_swapcache(page); + + __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. -- cgit v1.2.3 From f9434ad1552427fab49336e1a6e3ef121895b9d1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 23 Mar 2011 16:42:44 -0700 Subject: memcg: give current access to memory reserves if it's trying to die When a memcg is oom and current has already received a SIGKILL, then give it access to memory reserves with a higher scheduling priority so that it may quickly exit and free its memory. This is identical to the global oom killer and is done even before checking for panic_on_oom: a pending SIGKILL here while panic_on_oom is selected is guaranteed to have come from userspace; the thread only needs access to memory reserves to exit and thus we don't unnecessarily panic the machine until the kernel has no last resort to free memory. Signed-off-by: David Rientjes Cc: Balbir Singh Cc: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3100bc57036b..62a5cec08a17 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -549,6 +549,17 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned int points = 0; struct task_struct *p; + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. + */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + boost_dying_task_prio(current, NULL); + return; + } + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); -- cgit v1.2.3 From 93a72052be81823fa1584b9be037d51924f9efa4 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 23 Mar 2011 16:43:29 -0700 Subject: crash_dump: export is_kdump_kernel to modules, consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn The Xen PV drivers in a crashed HVM guest can not connect to the dom0 backend drivers because both frontend and backend drivers are still in connected state. To run the connection reset function only in case of a crashdump, the is_kdump_kernel() function needs to be available for the PV driver modules. Consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn into kernel/crash_dump.c Also export elfcorehdr_addr to make is_kdump_kernel() usable for modules. Leave 'elfcorehdr' as early_param(). This changes powerpc from __setup() to early_param(). It adds an address range check from x86 also on ia64 and powerpc. [akpm@linux-foundation.org: additional #includes] [akpm@linux-foundation.org: remove elfcorehdr_addr export] [akpm@linux-foundation.org: fix for Tejun's mm/nobootmem.c changes] Signed-off-by: Olaf Hering Cc: Russell King Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 8 -------- mm/nobootmem.c | 8 -------- 2 files changed, 16 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 07aeb89e396e..01d5a4b3dd0c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -34,14 +34,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e2bdb07079ce..e99f6cd1da1f 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -32,14 +32,6 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { -- cgit v1.2.3 From f9b615de4663c4b852e07257e9f967df6a0161c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2011 21:26:46 +0200 Subject: slub: Fix debugobjects with lockless fastpath On Thu, 24 Mar 2011, Ingo Molnar wrote: > RIP: 0010:[] [] get_next_timer_interrupt+0x119/0x260 That's a typical timer crash, but you were unable to debug it with debugobjects because commit d3f661d6 broke those. Cc: Christoph Lameter Tested-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 93de30db95f5..a6a783594ad4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -849,11 +849,11 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) local_irq_save(flags); kmemcheck_slab_free(s, x, s->objsize); debug_check_no_locks_freed(x, s->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->objsize); local_irq_restore(flags); } #endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->objsize); } /* -- cgit v1.2.3 From b8c4c96ed4cdecf5ae51fc6f4c006658e873047f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 24 Mar 2011 14:51:38 -0500 Subject: SLUB: Write to per cpu data when allocating it It turns out that the cmpxchg16b emulation has to access vmalloced percpu memory with interrupts disabled. If the memory has never been touched before then the fault necessary to establish the mapping will not to occur and the kernel will fail on boot. Fix that by reusing the CONFIG_PREEMPT code that writes the cpu number into a field on every cpu. Writing to the per cpu area before causes the mapping to be established before we get to a cmpxchg16b emulation. Tested-by: Ingo Molnar Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a6a783594ad4..f881874843a5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1604,7 +1604,7 @@ static inline void note_cmpxchg_failure(const char *n, void init_kmem_cache_cpus(struct kmem_cache *s) { -#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT) +#ifdef CONFIG_CMPXCHG_LOCAL int cpu; for_each_possible_cpu(cpu) -- cgit v1.2.3 From b2b755b5f10eb32fbdc73a9907c07006b17f714b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 24 Mar 2011 15:18:15 -0700 Subject: lib, arch: add filter argument to show_mem and fix private implementations Commit ddd588b5dd55 ("oom: suppress nodes that are not allowed from meminfo on oom kill") moved lib/show_mem.o out of lib/lib.a, which resulted in build warnings on all architectures that implement their own versions of show_mem(): lib/lib.a(show_mem.o): In function `show_mem': show_mem.c:(.text+0x1f4): multiple definition of `show_mem' arch/sparc/mm/built-in.o:(.text+0xd70): first defined here The fix is to remove __show_mem() and add its argument to show_mem() in all implementations to prevent this breakage. Architectures that implement their own show_mem() actually don't do anything with the argument yet, but they could be made to filter nodes that aren't allowed in the current context in the future just like the generic implementation. Reported-by: Stephen Rothwell Reported-by: James Bottomley Suggested-by: Andrew Morton Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 62a5cec08a17..6a819d1b2c7d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -406,7 +406,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); - __show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(mem, nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e5726ab0d85..d6e7ba7373be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2195,7 +2195,7 @@ nopage: current->comm, order, gfp_mask); dump_stack(); if (!should_suppress_show_mem()) - __show_mem(filter); + show_mem(filter); } return page; got_pg: -- cgit v1.2.3 From 250df6ed274d767da844a5d9f05720b804240197 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:36 +1100 Subject: fs: protect inode->i_state with inode->i_lock Protect inode state transitions and validity checks with the inode->i_lock. This enables us to make inode state transitions independently of the inode_lock and is the first step to peeling away the inode_lock from the code. This requires that __iget() is done atomically with i_state checks during list traversals so that we don't race with another thread marking the inode I_FREEING between the state check and grabbing the reference. Also remove the unlock_new_inode() memory barrier optimisation required to avoid taking the inode_lock when clearing I_NEW. Simplify the code by simply taking the inode->i_lock around the state change and wakeup. Because the wakeup is no longer tricky, remove the wake_up_inode() function and open code the wakeup where necessary. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- mm/filemap.c | 2 ++ mm/rmap.c | 1 + 2 files changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f807afda86f2..499e9aa91450 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -99,7 +99,9 @@ * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) + * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (zap_pte_range->set_page_dirty) + * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * * (code doesn't rely on that order, so you could switch it around) diff --git a/mm/rmap.c b/mm/rmap.c index 4a8e99a0fb97..7dada0456448 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,6 +32,7 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * inode_lock (in set_page_dirty's __mark_inode_dirty) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, -- cgit v1.2.3 From a66979abad090b2765a6c6790c9fdeab996833f2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:41 +1100 Subject: fs: move i_wb_list out from under inode_lock Protect the inode writeback list with a new global lock inode_wb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- mm/backing-dev.c | 8 ++++---- mm/filemap.c | 8 ++++---- mm/rmap.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d30227..4b3e9f17ee21 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -73,14 +73,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_wb = nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -682,11 +682,11 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } bdi_unregister(bdi); diff --git a/mm/filemap.c b/mm/filemap.c index 499e9aa91450..d8b34d1a1071 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -80,8 +80,8 @@ * ->i_mutex * ->i_alloc_sem (various) * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * inode_wb_list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * * ->i_mmap_lock @@ -98,9 +98,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * inode_wb_list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * diff --git a/mm/rmap.c b/mm/rmap.c index 7dada0456448..8da044a1db0f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -31,12 +31,12 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * within inode_wb_list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock -- cgit v1.2.3 From ae91dbfc9949cf042c45798557b48d3b83bc3635 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 26 Mar 2011 13:27:01 -0700 Subject: mm: fix memory.c incorrect kernel-doc Fix mm/memory.c incorrect kernel-doc function notation: Warning(mm/memory.c:3718): Cannot understand * @access_remote_vm - access another process' address space on line 3718 - I thought it was a doc line Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 51a5c23704af..9da8cab1b1b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3715,7 +3715,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, } /** - * @access_remote_vm - access another process' address space + * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer -- cgit v1.2.3 From eac522ef438f8ea173569fd0469371bc5d317947 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 28 Mar 2011 12:53:29 +0100 Subject: NOMMU: percpu should use is_vmalloc_addr(). per_cpu_ptr_to_phys() uses VMALLOC_START and VMALLOC_END to determine if an address is in the vmalloc() region or not. This is incorrect on NOMMU as there is no real vmalloc() capability (vmalloc() is emulated by kmalloc()). The correct way to do this is to use is_vmalloc_addr(). This encapsulates the vmalloc() region test in MMU mode and just returns 0 in NOMMU mode. On FRV in NOMMU mode, the percpu compilation fails without this patch: mm/percpu.c: In function 'per_cpu_ptr_to_phys': mm/percpu.c:1011: error: 'VMALLOC_START' undeclared (first use in this function) mm/percpu.c:1011: error: (Each undeclared identifier is reported only once mm/percpu.c:1011: error: for each function it appears in.) mm/percpu.c:1012: error: 'VMALLOC_END' undeclared (first use in this function) mm/percpu.c:1018: warning: control reaches end of non-void function Signed-off-by: David Howells --- mm/percpu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 3f930018aa60..55d4d113fbd3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1008,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) } if (in_first_chunk) { - if ((unsigned long)addr < VMALLOC_START || - (unsigned long)addr >= VMALLOC_END) + if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)); -- cgit v1.2.3 From f55f199b7d76a01e7ce9d1c3bb004327e075c327 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 29 Mar 2011 14:05:12 +0100 Subject: NOMMU: implement access_remote_vm Recent vm changes brought in a new function which the core procfs code utilizes. So implement it for nommu systems too to avoid link failures. Signed-off-by: Mike Frysinger Signed-off-by: David Howells Tested-by: Simon Horman Tested-by: Ithamar Adema Acked-by: Greg Ungerer --- mm/nommu.c | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index cb86e7d5e7f5..c4c542c736a9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1971,21 +1971,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -/* - * Access another process' address space. - * - source/target buffer must be kernel space - */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { struct vm_area_struct *vma; - struct mm_struct *mm; - - if (addr + len < addr) - return 0; - - mm = get_task_mm(tsk); - if (!mm) - return 0; down_read(&mm->mmap_sem); @@ -2010,6 +1999,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in } up_read(&mm->mmap_sem); + + return len; +} + +/** + * @access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. + * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + len = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); return len; } -- cgit v1.2.3 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. Signed-off-by: Lucas De Marchi --- mm/backing-dev.c | 2 +- mm/hugetlb.c | 10 +++++----- mm/hwpoison-inject.c | 2 +- mm/internal.h | 2 +- mm/kmemleak.c | 6 +++--- mm/ksm.c | 2 +- mm/memcontrol.c | 8 ++++---- mm/memory-failure.c | 6 +++--- mm/memory_hotplug.c | 2 +- mm/migrate.c | 2 +- mm/nobootmem.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_cgroup.c | 2 +- mm/percpu.c | 10 +++++----- mm/slab.c | 4 ++-- mm/slub.c | 8 ++++---- mm/sparse.c | 2 +- mm/util.c | 2 +- mm/vmscan.c | 4 ++-- 19 files changed, 40 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0d9a036ada66..befc87531e4f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -787,7 +787,7 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absense of zone congestion, cond_resched() is called to yield + * In the absence of zone congestion, cond_resched() is called to yield * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. Otherwise, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 06de5aa4d644..8ee3bd8ec5b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t) if (rg->from > t) return chg; - /* We overlap with this area, if it extends futher than + /* We overlap with this area, if it extends further than * us then we must extend ourselves. Account for its * existing reservation. */ if (rg->to > t) { @@ -842,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) } /* - * Increase the hugetlb pool such that it can accomodate a reservation + * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, int delta) @@ -890,7 +890,7 @@ retry: /* * The surplus_list now contains _at_least_ the number of extra pages - * needed to accomodate the reservation. Add the appropriate number + * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but @@ -2043,7 +2043,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA - * has a reference to the reservation map it cannot dissappear until + * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ @@ -2490,7 +2490,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed - * COW. Warn that such a situation has occured as it may not be obvious + * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { printk(KERN_WARNING diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 0948f1072d6b..c7fc7fd00e32 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -1,4 +1,4 @@ -/* Inject a hwpoison memory failure on a arbitary pfn */ +/* Inject a hwpoison memory failure on a arbitrary pfn */ #include #include #include diff --git a/mm/internal.h b/mm/internal.h index 3438dd43a062..9d0ced8e505e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -162,7 +162,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset) } /* - * Iterator over all subpages withing the maximally aligned gigantic + * Iterator over all subpages within the maximally aligned gigantic * page 'base'. Handle any discontiguity in the mem_map. */ static inline struct page *mem_map_next(struct page *iter, diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 84225f3b7190..c1d5867543e4 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -265,7 +265,7 @@ static void kmemleak_disable(void); } while (0) /* - * Macro invoked when a serious kmemleak condition occured and cannot be + * Macro invoked when a serious kmemleak condition occurred and cannot be * recovered from. Kmemleak will be disabled and further allocation/freeing * tracing no longer available. */ @@ -1006,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object) /* * Memory scanning is a long process and it needs to be interruptable. This - * function checks whether such interrupt condition occured. + * function checks whether such interrupt condition occurred. */ static int scan_should_stop(void) { @@ -1733,7 +1733,7 @@ static int __init kmemleak_late_init(void) if (atomic_read(&kmemleak_error)) { /* - * Some error occured and kmemleak was disabled. There is a + * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately * after setting kmemleak_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. diff --git a/mm/ksm.c b/mm/ksm.c index 1bbe785aa559..942dfc73a2ff 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -720,7 +720,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, swapped = PageSwapCache(page); flush_cache_page(vma, addr, page_to_pfn(page)); /* - * Ok this is tricky, when get_user_pages_fast() run it doesnt + * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racey and * O_DIRECT can happen right after the check. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f0b460fe58c..010f9166fa6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1466,7 +1466,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, break; } /* - * We want to do more targetted reclaim. + * We want to do more targeted reclaim. * excess >> 2 is not to excessive so as to * reclaim too much, nor too less that we keep * coming back to reclaim from this cgroup @@ -2265,7 +2265,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) * - compound_lock is held when nr_pages > 1 * * This function doesn't do "charge" nor css_get to new cgroup. It should be - * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is + * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is * true, this function does "uncharge" from old cgroup, but it doesn't if * @uncharge is false, so a caller should do "uncharge". */ @@ -2318,7 +2318,7 @@ static int mem_cgroup_move_account(struct page *page, * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of * this function is just force_empty() and move charge, so it's - * garanteed that "to" is never removed. So, we don't check rmdir + * guaranteed that "to" is never removed. So, we don't check rmdir * status here. */ move_unlock_page_cgroup(pc, &flags); @@ -2648,7 +2648,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, batch->memcg = mem; /* * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continously can be expected to be in + * In those cases, all pages freed continuously can be expected to be in * the same cgroup and we have chance to coalesce uncharges. * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) * because we want to do uncharge as soon as possible. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 37feb9fec228..2b9a5eef39e0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -208,7 +208,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, * Don't use force here, it's convenient if the signal * can be temporarily blocked. * This could cause a loop when the user sets SIGBUS - * to SIG_IGN, but hopefully noone will do that? + * to SIG_IGN, but hopefully no one will do that? */ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ if (ret < 0) @@ -634,7 +634,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * when the page is reread or dropped. If an * application assumes it will always get error on * fsync, but does other operations on the fd before - * and the page is dropped inbetween then the error + * and the page is dropped between then the error * will not be properly reported. * * This can already happen even without hwpoisoned @@ -728,7 +728,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) * The table matches them in order and calls the right handler. * * This is quite tricky because we can access page at any time - * in its live cycle, so all accesses have to be extremly careful. + * in its live cycle, so all accesses have to be extremely careful. * * This is not complete. More states could be added. * For any missing state don't attempt recovery. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 321fc7455df7..a2acaf820fe5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -724,7 +724,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn); dump_page(page); #endif - /* Becasue we don't have big zone->lock. we should + /* Because we don't have big zone->lock. we should check this again here. */ if (page_count(page)) { not_managed++; diff --git a/mm/migrate.c b/mm/migrate.c index b0406d739ea7..34132f8e9109 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -375,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) * redo the accounting that clear_page_dirty_for_io undid, * but we can't use set_page_dirty because that function * is actually a signal that all of the page has become dirty. - * Wheras only part of our page may be dirty. + * Whereas only part of our page may be dirty. */ __set_page_dirty_nobuffers(newpage); } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e99f6cd1da1f..9109049f0bbc 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -150,7 +150,7 @@ unsigned long __init free_all_bootmem(void) { /* * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesnt have RAM installed + * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 * Use MAX_NUMNODES will make sure all ranges in early_node_map[] * will be used instead of only Node0 related diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d6e7ba7373be..2747f5e5abc1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -942,7 +942,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * If breaking a large block of pages, move all free * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more - * agressive about taking ownership of free pages + * aggressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || start_migratetype == MIGRATE_RECLAIMABLE || @@ -3926,7 +3926,7 @@ static void __init find_usable_zone_for_movable(void) /* * The zone ranges provided by the architecture do not include ZONE_MOVABLE - * because it is sized independant of architecture. Unlike the other zones, + * because it is sized independent of architecture. Unlike the other zones, * the starting point for ZONE_MOVABLE is not fixed. It may be different * in each node depending on the size of each node and how evenly kernelcore * is distributed. This helper function adjusts the zone ranges diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index a12cc3fa9859..99055010cece 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -377,7 +377,7 @@ not_enough_page: * @new: new id * * Returns old id at success, 0 at failure. - * (There is no mem_cgroup useing 0 as its id) + * (There is no mem_cgroup using 0 as its id) */ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) diff --git a/mm/percpu.c b/mm/percpu.c index 55d4d113fbd3..a160db39b810 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -342,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * @chunk: chunk of interest * * Determine whether area map of @chunk needs to be extended to - * accomodate a new allocation. + * accommodate a new allocation. * * CONTEXT: * pcpu_lock. @@ -431,7 +431,7 @@ out_unlock: * depending on @head, is reduced by @tail bytes and @tail byte block * is inserted after the target block. * - * @chunk->map must have enough free slots to accomodate the split. + * @chunk->map must have enough free slots to accommodate the split. * * CONTEXT: * pcpu_lock. @@ -1435,7 +1435,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest - * which can accomodate 4k aligned segments which are equal to + * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size. */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); @@ -1550,7 +1550,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @alloc_fn: function to allocate percpu page - * @free_fn: funtion to free percpu page + * @free_fn: function to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1678,7 +1678,7 @@ out_free: * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE - * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @free_fn: function to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu diff --git a/mm/slab.c b/mm/slab.c index 568803f121a8..46a9c163a92f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -878,7 +878,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transfered to another + * However, when such objects are allocated or transferred to another * cache the pointers are not cleared and they could be counted as * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. @@ -2606,7 +2606,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * * The cache must be empty before calling this function. * - * The caller must guarantee that noone will allocate memory from the cache + * The caller must guarantee that no one will allocate memory from the cache * during the kmem_cache_destroy(). */ void kmem_cache_destroy(struct kmem_cache *cachep) diff --git a/mm/slub.c b/mm/slub.c index f881874843a5..94d2a33a866e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -64,7 +64,7 @@ * we must stay away from it for a while since we may cause a bouncing * cacheline if we try to acquire the lock. So go onto the next slab. * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has noone operating on it and thus there is + * a partial slab. A new slab has no one operating on it and thus there is * no danger of cacheline contention. * * Interrupts are disabled during allocation and deallocation in order to @@ -1929,7 +1929,7 @@ redo: else { #ifdef CONFIG_CMPXCHG_LOCAL /* - * The cmpxchg will only match if there was no additonal + * The cmpxchg will only match if there was no additional * operation and if we are on the right processor. * * The cmpxchg does the following atomically (without lock semantics!) @@ -3547,7 +3547,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); - /* Honor the call site pointer we recieved. */ + /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); return ret; @@ -3577,7 +3577,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, ret = slab_alloc(s, gfpflags, node, caller); - /* Honor the call site pointer we recieved. */ + /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); return ret; diff --git a/mm/sparse.c b/mm/sparse.c index 93250207c5cf..aa64b12831a2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -500,7 +500,7 @@ void __init sparse_init(void) * so alloc 2M (with 2M align) and 24 bytes in turn will * make next 2M slip to one more 2M later. * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continously. + * here try to allocate 2M pages continuously. * * powerpc need to call sparse_init_one_section right after each * sparse_early_mem_map_alloc, so allocate usemap_map at first. diff --git a/mm/util.c b/mm/util.c index f126975ef23e..e7b103a6fd21 100644 --- a/mm/util.c +++ b/mm/util.c @@ -227,7 +227,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. - * If the architecture not support this fucntion, simply return with no + * If the architecture not support this function, simply return with no * page pinned */ int __attribute__((weak)) __get_user_pages_fast(unsigned long start, diff --git a/mm/vmscan.c b/mm/vmscan.c index f73b8657c2d0..c7f5a6d4b75b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1065,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * surrounding the tag page. Only take those pages of * the same active state as that tag page. We may safely * round the target page pfn down to the requested order - * as the mem_map is guarenteed valid out to MAX_ORDER, + * as the mem_map is guaranteed valid out to MAX_ORDER, * where that page is in a different zone we will detect * it from its zone id and abort this block scan. */ @@ -2224,7 +2224,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * o a 16M DMA zone that is balanced will not balance a zone on any * reasonable sized machine * o On all other machines, the top zone must be at least a reasonable - * precentage of the middle zones. For example, on 32-bit x86, highmem + * percentage of the middle zones. For example, on 32-bit x86, highmem * would need to be at least 256M for it to be balance a whole node. * Similarly, on x86-64 the Normal zone would need to be at least 1G * to balance a node on its own. These seemed like reasonable ratios. -- cgit v1.2.3 From 982134ba62618c2d69fbbbd166d0a11ee3b7e3d8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Apr 2011 07:35:50 -0700 Subject: mm: avoid wrapping vm_pgoff in mremap() The normal mmap paths all avoid creating a mapping where the pgoff inside the mapping could wrap around due to overflow. However, an expanding mremap() can take such a non-wrapping mapping and make it bigger and cause a wrapping condition. Noticed by Robert Swiecki when running a system call fuzzer, where it caused a BUG_ON() due to terminally confusing the vma_prio_tree code. A vma dumping patch by Hugh then pinpointed the crazy wrapped case. Reported-and-tested-by: Robert Swiecki Acked-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mremap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 1de98d492ddc..a7c1f9f9b941 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -277,9 +277,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) goto Efault; - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { - if (new_len > old_len) + /* Need to be careful about a growing mapping */ + if (new_len > old_len) { + unsigned long pgoff; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) goto Efault; + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + goto Einval; } if (vma->vm_flags & VM_LOCKED) { -- cgit v1.2.3 From 95042f9eb78a8d9a17455e2ef263f2f310ecef15 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Apr 2011 14:15:51 -0700 Subject: vm: fix mlock() on stack guard page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 53a7706d5ed8 ("mlock: do not hold mmap_sem for extended periods of time") changed mlock() to care about the exact number of pages that __get_user_pages() had brought it. Before, it would only care about errors. And that doesn't work, because we also handled one page specially in __mlock_vma_pages_range(), namely the stack guard page. So when that case was handled, the number of pages that the function returned was off by one. In particular, it could be zero, and then the caller would end up not making any progress at all. Rather than try to fix up that off-by-one error for the mlock case specially, this just moves the logic to handle the stack guard page into__get_user_pages() itself, thus making all the counts come out right automatically. Reported-by: Robert Święcki Cc: Hugh Dickins Cc: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 26 ++++++++++++++++++-------- mm/mlock.c | 13 ------------- 2 files changed, 18 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9da8cab1b1b0..b623a249918c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1410,6 +1410,13 @@ no_page_table: return page; } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_stack_continue(vma->vm_prev, addr); +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -1488,7 +1495,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(mm); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -1513,10 +1519,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pte_unmap(pte); return i ? : -EFAULT; } + vma = get_gate_vma(mm); if (pages) { struct page *page; - page = vm_normal_page(gate_vma, start, *pte); + page = vm_normal_page(vma, start, *pte); if (!page) { if (!(gup_flags & FOLL_DUMP) && is_zero_pfn(pte_pfn(*pte))) @@ -1530,12 +1537,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, get_page(page); } pte_unmap(pte); - if (vmas) - vmas[i] = gate_vma; - i++; - start += PAGE_SIZE; - nr_pages--; - continue; + goto next_page; } if (!vma || @@ -1549,6 +1551,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } + /* + * If we don't actually want the page itself, + * and it's the stack guard page, just skip it. + */ + if (!pages && stack_guard_page(vma, start)) + goto next_page; + do { struct page *page; unsigned int foll_flags = gup_flags; @@ -1631,6 +1640,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, flush_anon_page(vma, page, start); flush_dcache_page(page); } +next_page: if (vmas) vmas[i] = vma; i++; diff --git a/mm/mlock.c b/mm/mlock.c index 2689a08c79af..6b55e3efe0df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page) } } -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return (vma->vm_flags & VM_GROWSDOWN) && - (vma->vm_start == addr) && - !vma_stack_continue(vma->vm_prev, addr); -} - /** * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma @@ -188,12 +181,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) gup_flags |= FOLL_MLOCK; - /* We don't try to access the guard page of a stack vma */ - if (stack_guard_page(vma, start)) { - addr += PAGE_SIZE; - nr_pages--; - } - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, NULL, NULL, nonblocking); } -- cgit v1.2.3 From a626ca6a656450e9f4df91d0dda238fff23285f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 Apr 2011 08:07:28 -0700 Subject: vm: fix vm_pgoff wrap in stack expansion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 982134ba6261 ("mm: avoid wrapping vm_pgoff in mremap()") fixed the case of a expanding mapping causing vm_pgoff wrapping when you used mremap. But there was another case where we expand mappings hiding in plain sight: the automatic stack expansion. This fixes that case too. This one also found by Robert Święcki, using his nasty system call fuzzer tool. Good job. Reported-and-tested-by: Robert Święcki Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2ec8eb5a9cdd..8c05e5b43b69 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1814,11 +1814,14 @@ static int expand_downwards(struct vm_area_struct *vma, size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; - error = acct_stack_growth(vma, size, grow); - if (!error) { - vma->vm_start = address; - vma->vm_pgoff -= grow; - perf_event_mmap(vma); + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_start = address; + vma->vm_pgoff -= grow; + perf_event_mmap(vma); + } } } vma_unlock_anon_vma(vma); -- cgit v1.2.3 From 584208e6b4103d2cfb08a7889c9fa3540826e0d5 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Thu, 14 Apr 2011 15:21:53 -0700 Subject: mm: optimize pfn calculation in online_page() If CONFIG_FLATMEM is enabled pfn is calculated in online_page() more than once. It is possible to optimize that and use value established at beginning of that function. Signed-off-by: Daniel Kiper Acked-by: Dave Hansen Cc: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Mel Gorman Cc: Christoph Lameter Acked-by: David Rientjes Reviewed-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2acaf820fe5..9ca1d604f7cd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -375,7 +375,7 @@ void online_page(struct page *page) #endif #ifdef CONFIG_FLATMEM - max_mapnr = max(page_to_pfn(page), max_mapnr); + max_mapnr = max(pfn, max_mapnr); #endif ClearPageReserved(page); -- cgit v1.2.3 From 9f6ae448bfc6cdf40279f43bb0b4fd159edc4e0a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 14 Apr 2011 15:21:57 -0700 Subject: mm/page_alloc.c: silence build_all_zonelists() section mismatch The memory hotplug case involves calling to build_all_zonelists() which in turns calls in to setup_zone_pageset(). The latter is marked __meminit while build_all_zonelists() itself has no particular annotation. build_all_zonelists() is only handed a non-NULL pointer in the case of memory hotplug through an existing __meminit path, so the setup_zone_pageset() reference is always safe. The options as such are either to flag build_all_zonelists() as __ref (as per __build_all_zonelists()), or to simply discard the __meminit annotation from setup_zone_pageset(). Signed-off-by: Paul Mundt Acked-by: Mel Gorman Cc: KAMEZAWA Hiroyuki Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2747f5e5abc1..9f8a97b9a350 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3176,7 +3176,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. */ -void build_all_zonelists(void *data) +void __ref build_all_zonelists(void *data) { set_zonelist_order(); -- cgit v1.2.3 From d3bc2367180f7ee6afe4ee6e886bfba3ad4eb290 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Apr 2011 15:21:58 -0700 Subject: vmstat: update comment regarding stat_threshold Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 772b39b87d95..8cb0f0a703e5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -321,9 +321,12 @@ static inline void mod_state(struct zone *zone, /* * The fetching of the stat_threshold is racy. We may apply * a counter threshold to the wrong the cpu if we get - * rescheduled while executing here. However, the following - * will apply the threshold again and therefore bring the - * counter under the threshold. + * rescheduled while executing here. However, the next + * counter update will apply the threshold again and + * therefore bring the counter under the threshold again. + * + * Most of the time the thresholds are the same anyways + * for all cpus in a zone. */ t = this_cpu_read(pcp->stat_threshold); -- cgit v1.2.3 From 81ab4201fb7d91d6b0cd9ad5b4b16776e4bed145 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 14 Apr 2011 15:22:06 -0700 Subject: mm: add VM counters for transparent hugepages I found it difficult to make sense of transparent huge pages without having any counters for its actions. Add some counters to vmstat for allocation of transparent hugepages and fallback to smaller pages. Optional patch, but useful for development and understanding the system. Contains improvements from Andrea Arcangeli and Johannes Weiner [akpm@linux-foundation.org: coding-style fixes] [hannes@cmpxchg.org: fix vmstat_text[] entries] Signed-off-by: Andi Kleen Acked-by: Andrea Arcangeli Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 25 +++++++++++++++++++++---- mm/vmstat.c | 9 +++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0a619e0e2e0b..1722683bde23 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -680,8 +680,11 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), vma, haddr, numa_node_id(), 0); - if (unlikely(!page)) + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); goto out; + } + count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { put_page(page); goto out; @@ -909,11 +912,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = NULL; if (unlikely(!new_page)) { + count_vm_event(THP_FAULT_FALLBACK); ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); put_page(page); goto out; } + count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); @@ -1390,6 +1395,7 @@ int split_huge_page(struct page *page) BUG_ON(!PageSwapBacked(page)); __split_huge_page(page, anon_vma); + count_vm_event(THP_SPLIT); BUG_ON(PageCompound(page)); out_unlock: @@ -1784,9 +1790,11 @@ static void collapse_huge_page(struct mm_struct *mm, node, __GFP_OTHER_NODE); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); return; } + count_vm_event(THP_COLLAPSE_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { up_read(&mm->mmap_sem); put_page(new_page); @@ -2151,8 +2159,11 @@ static void khugepaged_do_scan(struct page **hpage) #ifndef CONFIG_NUMA if (!*hpage) { *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); break; + } + count_vm_event(THP_COLLAPSE_ALLOC); } #else if (IS_ERR(*hpage)) @@ -2192,8 +2203,11 @@ static struct page *khugepaged_alloc_hugepage(void) do { hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); } while (unlikely(!hpage) && likely(khugepaged_enabled())); return hpage; @@ -2210,8 +2224,11 @@ static void khugepaged_loop(void) while (likely(khugepaged_enabled())) { #ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); - if (unlikely(!hpage)) + if (unlikely(!hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); break; + } + count_vm_event(THP_COLLAPSE_ALLOC); #else if (IS_ERR(hpage)) { khugepaged_alloc_sleep(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 8cb0f0a703e5..897ea9e88238 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -948,7 +948,16 @@ static const char * const vmstat_text[] = { "unevictable_pgs_cleared", "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "thp_fault_alloc", + "thp_fault_fallback", + "thp_collapse_alloc", + "thp_collapse_alloc_failed", + "thp_split", #endif + +#endif /* CONFIG_VM_EVENTS_COUNTERS */ }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, -- cgit v1.2.3 From fc5da22ae35d4720be59af8787a8a6d5e4da9517 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 14 Apr 2011 15:22:07 -0700 Subject: tmpfs: fix off-by-one in max_blocks checks If you fill up a tmpfs, df was showing tmpfs 460800 - - - /tmp because of an off-by-one in the max_blocks checks. Fix it so df shows tmpfs 460800 460800 0 100% /tmp Signed-off-by: Hugh Dickins Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 58da7c150ba6..8fa27e4e582a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -421,7 +421,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long * a waste to allocate index if we cannot allocate data. */ if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - 1) >= 0) return ERR_PTR(-ENOSPC); percpu_counter_inc(&sbinfo->used_blocks); spin_lock(&inode->i_lock); @@ -1397,7 +1398,8 @@ repeat: shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0 || shmem_acct_block(info->flags)) { spin_unlock(&info->lock); error = -ENOSPC; -- cgit v1.2.3 From 4471a675dfc7ca676c165079e91c712b09dc9ce4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 14 Apr 2011 15:22:09 -0700 Subject: brk: COMPAT_BRK: fix detection of randomized brk 5520e89 ("brk: fix min_brk lower bound computation for COMPAT_BRK") tried to get the whole logic of brk randomization for legacy (libc5-based) applications finally right. It turns out that the way to detect whether brk has actually been randomized in the end or not introduced by that patch still doesn't work for those binaries, as reported by Geert: : /sbin/init from my old m68k ramdisk exists prematurely. : : Before the patch: : : | brk(0x80005c8e) = 0x80006000 : : After the patch: : : | brk(0x80005c8e) = 0x80005c8e : : Old libc5 considers brk() to have failed if the return value is not : identical to the requested value. I don't like it, but currently see no better option than a bit flag in task_struct to catch the CONFIG_COMPAT_BRK && randomize_va_space == 2 case. Signed-off-by: Jiri Kosina Tested-by: Geert Uytterhoeven Reported-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 8c05e5b43b69..e27e0cf0de03 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -259,7 +259,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ - if (mm->start_brk > PAGE_ALIGN(mm->end_data)) + if (current->brk_randomized) min_brk = mm->start_brk; else min_brk = mm->end_data; -- cgit v1.2.3 From fe936dfc23fed3475b11067e8d9b70553eafcd9e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Apr 2011 15:22:10 -0700 Subject: mm: check that we have the right vma in __access_remote_vm() In __access_remote_vm() we need to check that we have found the right vma, not the following vma before we try to access it. Otherwise we might call the vma's access routine with an address which does not fall inside the vma. It was discovered on a current kernel but with an unreleased driver, from memory it was strace leading to a kernel bad access, but it obviously depends on what the access implementation does. Looking at other access implementations I only see: $ git grep -A 5 vm_operations|grep access arch/powerpc/platforms/cell/spufs/file.c- .access = spufs_mem_mmap_access, arch/x86/pci/i386.c- .access = generic_access_phys, drivers/char/mem.c- .access = generic_access_phys fs/sysfs/bin.c- .access = bin_access, The spufs one looks like it might behave badly given the wrong vma, it assumes vma->vm_file->private_data is a spu_context, and looks like it would probably blow up pretty quickly if it wasn't. generic_access_phys() only uses the vma to check vm_flags and get the mm, and then walks page tables using the address. So it should bail on the vm_flags check, or at worst let you access some other VM_IO mapping. And bin_access() just proxies to another access implementation. Signed-off-by: Michael Ellerman Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index b623a249918c..ce22a250926f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3688,7 +3688,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, */ #ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); - if (!vma) + if (!vma || vma->vm_start > addr) break; if (vma->vm_ops && vma->vm_ops->access) ret = vma->vm_ops->access(vma, addr, buf, -- cgit v1.2.3 From 929bea7c714220fc76ce3f75bef9056477c28e74 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 14 Apr 2011 15:22:12 -0700 Subject: vmscan: all_unreclaimable() use zone->all_unreclaimable as a name all_unreclaimable check in direct reclaim has been introduced at 2.6.19 by following commit. 2006 Sep 25; commit 408d8544; oom: use unreclaimable info And it went through strange history. firstly, following commit broke the logic unintentionally. 2008 Apr 29; commit a41f24ea; page allocator: smarter retry of costly-order allocations Two years later, I've found obvious meaningless code fragment and restored original intention by following commit. 2010 Jun 04; commit bb21c7ce; vmscan: fix do_try_to_free_pages() return value when priority==0 But, the logic didn't works when 32bit highmem system goes hibernation and Minchan slightly changed the algorithm and fixed it . 2010 Sep 22: commit d1908362: vmscan: check all_unreclaimable in direct reclaim path But, recently, Andrey Vagin found the new corner case. Look, struct zone { .. int all_unreclaimable; .. unsigned long pages_scanned; .. } zone->all_unreclaimable and zone->pages_scanned are neigher atomic variables nor protected by lock. Therefore zones can become a state of zone->page_scanned=0 and zone->all_unreclaimable=1. In this case, current all_unreclaimable() return false even though zone->all_unreclaimabe=1. This resulted in the kernel hanging up when executing a loop of the form 1. fork 2. mmap 3. touch memory 4. read memory 5. munmmap as described in http://www.gossamer-threads.com/lists/linux/kernel/1348725#1348725 Is this ignorable minor issue? No. Unfortunately, x86 has very small dma zone and it become zone->all_unreclamble=1 easily. and if it become all_unreclaimable=1, it never restore all_unreclaimable=0. Why? if all_unreclaimable=1, vmscan only try DEF_PRIORITY reclaim and a-few-lru-pages>>DEF_PRIORITY always makes 0. that mean no page scan at all! Eventually, oom-killer never works on such systems. That said, we can't use zone->pages_scanned for this purpose. This patch restore all_unreclaimable() use zone->all_unreclaimable as old. and in addition, to add oom_killer_disabled check to avoid reintroduce the issue of commit d1908362 ("vmscan: check all_unreclaimable in direct reclaim path"). Reported-by: Andrey Vagin Signed-off-by: KOSAKI Motohiro Cc: Nick Piggin Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c7f5a6d4b75b..f6b435c80079 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1988,17 +1989,12 @@ static bool zone_reclaimable(struct zone *zone) return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; } -/* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. It can't handle OOM during hibernation. - * So let's check zone's unreclaimable in direct reclaim as well as kswapd. - */ +/* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - bool all_unreclaimable = true; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2006,13 +2002,11 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone_reclaimable(zone)) { - all_unreclaimable = false; - break; - } + if (!zone->all_unreclaimable) + return false; } - return all_unreclaimable; + return true; } /* @@ -2108,6 +2102,14 @@ out: if (sc->nr_reclaimed) return sc->nr_reclaimed; + /* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. Thus bypassing all_unreclaimable + * check. + */ + if (oom_killer_disabled) + return 0; + /* top priority shrink_zones still had more to do? don't OOM, then */ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) return 1; -- cgit v1.2.3 From 341aea2bc48bf652777fb015cc2b3dfa9a451817 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 14 Apr 2011 15:22:13 -0700 Subject: oom-kill: remove boost_dying_task_prio() This is an almost-revert of commit 93b43fa ("oom: give the dying task a higher priority"). That commit dramatically improved oom killer logic when a fork-bomb occurs. But I've found that it has nasty corner case. Now cpu cgroup has strange default RT runtime. It's 0! That said, if a process under cpu cgroup promote RT scheduling class, the process never run at all. If an admin inserts a !RT process into a cpu cgroup by setting rtruntime=0, usually it runs perfectly because a !RT task isn't affected by the rtruntime knob. But if it promotes an RT task via an explicit setscheduler() syscall or an OOM, the task can't run at all. In short, the oom killer doesn't work at all if admins are using cpu cgroup and don't touch the rtruntime knob. Eventually, kernel may hang up when oom kill occur. I and the original author Luis agreed to disable this logic. Signed-off-by: KOSAKI Motohiro Acked-by: Luis Claudio R. Goncalves Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6a819d1b2c7d..83fb72c108b7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -83,24 +83,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, } #endif /* CONFIG_NUMA */ -/* - * If this is a system OOM (not a memcg OOM) and the task selected to be - * killed is not already running at high (RT) priorities, speed up the - * recovery by boosting the dying task to the lowest FIFO priority. - * That helps with the recovery and avoids interfering with RT tasks. - */ -static void boost_dying_task_prio(struct task_struct *p, - struct mem_cgroup *mem) -{ - struct sched_param param = { .sched_priority = 1 }; - - if (mem) - return; - - if (!rt_task(p)) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); -} - /* * The process p may have detached its own ->mm while exiting or through * use_mm(), but one or more of its subthreads may still have a valid @@ -452,13 +434,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); - /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... - */ - boost_dying_task_prio(p, mem); - return 0; } #undef K @@ -482,7 +457,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ if (p->flags & PF_EXITING) { set_tsk_thread_flag(p, TIF_MEMDIE); - boost_dying_task_prio(p, mem); return 0; } @@ -556,7 +530,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) */ if (fatal_signal_pending(current)) { set_thread_flag(TIF_MEMDIE); - boost_dying_task_prio(current, NULL); return; } @@ -712,7 +685,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (fatal_signal_pending(current)) { set_thread_flag(TIF_MEMDIE); - boost_dying_task_prio(current, NULL); return; } -- cgit v1.2.3 From e27e6151b154ff6e5e8162efa291bc60196d29ea Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 14 Apr 2011 15:22:21 -0700 Subject: mm/thp: use conventional format for boolean attributes The conventional format for boolean attributes in sysfs is numeric ("0" or "1" followed by new-line). Any boolean attribute can then be read and written using a generic function. Using the strings "yes [no]", "[yes] no" (read), "yes" and "no" (write) will frustrate this. [akpm@linux-foundation.org: use kstrtoul()] [akpm@linux-foundation.org: test_bit() doesn't return 1/0, per Neil] Signed-off-by: Ben Hutchings Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Johannes Weiner Cc: Rik van Riel Cc: Hugh Dickins Tested-by: David Rientjes Cc: NeilBrown Cc: [2.6.38.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1722683bde23..470dcda10add 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { - if (test_bit(flag, &transparent_hugepage_flags)) - return sprintf(buf, "[yes] no\n"); - else - return sprintf(buf, "yes [no]\n"); + return sprintf(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); } + static ssize_t single_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) { - if (!memcmp("yes", buf, - min(sizeof("yes")-1, count))) { + unsigned long value; + int ret; + + ret = kstrtoul(buf, 10, &value); + if (ret < 0) + return ret; + if (value > 1) + return -EINVAL; + + if (value) set_bit(flag, &transparent_hugepage_flags); - } else if (!memcmp("no", buf, - min(sizeof("no")-1, count))) { + else clear_bit(flag, &transparent_hugepage_flags); - } else - return -EINVAL; return count; } -- cgit v1.2.3