From fb993fa1a2f669215fa03a09eed7848f2663e336 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Tue, 2 Dec 2014 15:59:25 -0800 Subject: mm: frontswap: invalidate expired data on a dup-store failure If a frontswap dup-store fails, the expired page must be invalidated in the backend; otherwise it can trigger data corruption. For example: 1. use zswap as the frontswap backend with the writeback feature 2. store a swap page (version_1) to entry A: success 3. dup-store a newer page (version_2) to the same entry A: fail 4. use __swap_writepage() to write the version_2 page to the swapfile: success 5. zswap shrinks and writes back the version_1 page to the swapfile 6. the version_2 page is overwritten by version_1: data corruption. This patch fixes the issue by invalidating the expired data immediately when a dup-store failure occurs. Signed-off-by: Weijie Yang Cc: Konrad Rzeszutek Wilk Cc: Seth Jennings Cc: Dan Streetman Cc: Minchan Kim Cc: Bob Liu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index c30eec536f03..f2a3571c6e22 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -244,8 +244,10 @@ int __frontswap_store(struct page *page) the (older) page from frontswap */ inc_frontswap_failed_stores(); - if (dup) + if (dup) { __frontswap_clear(sis, offset); + frontswap_ops->invalidate_page(type, offset); + } } if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ -- cgit v1.2.3 From 91b57191cfd152c02ded0745250167d0263084f8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 2 Dec 2014 15:59:28 -0800 Subject: mm/vmpressure.c: fix race in vmpressure_work_fn() On some Android devices, there will be a "divide by zero" exception. vmpr->scanned could be zero before spin_lock(&vmpr->sr_lock). 
Addresses https://bugzilla.kernel.org/show_bug.cgi?id=88051 [akpm@linux-foundation.org: neaten] Reported-by: ji_ang Cc: Anton Vorontsov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index d4042e75f7c7..c5afd573d7da 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -165,6 +165,7 @@ static void vmpressure_work_fn(struct work_struct *work) unsigned long scanned; unsigned long reclaimed; + spin_lock(&vmpr->sr_lock); /* * Several contexts might be calling vmpressure(), so it is * possible that the work was rescheduled again before the old @@ -173,11 +174,12 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ - if (!vmpr->scanned) + scanned = vmpr->scanned; + if (!scanned) { + spin_unlock(&vmpr->sr_lock); return; + } - spin_lock(&vmpr->sr_lock); - scanned = vmpr->scanned; reclaimed = vmpr->reclaimed; vmpr->scanned = 0; vmpr->reclaimed = 0; -- cgit v1.2.3 From 2022b4d18a491a578218ce7a4eca8666db895a73 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 2 Dec 2014 15:59:39 -0800 Subject: mm: fix swapoff hang after page migration and fork I've been seeing swapoff hangs in recent testing: it's cycling around trying unsuccessfully to find an mm for some remaining pages of swap. I have been exercising swap and page migration more heavily recently, and now notice a long-standing error in copy_one_pte(): it's trying to add dst_mm to swapoff's mmlist when it finds a swap entry, but is doing so even when it's a migration entry or an hwpoison entry. Which wouldn't matter much, except it adds dst_mm next to src_mm, assuming src_mm is already on the mmlist: which may not be so. Then if pages are later swapped out from dst_mm, swapoff won't be able to find where to replace them. 
There's already a !non_swap_entry() test for stats: move that up before the swap_duplicate() and the addition to mmlist. Signed-off-by: Hugh Dickins Cc: Kelley Nielsen Cc: [2.6.18+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 3e503831e042..d5f2ae9c4a23 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -815,20 +815,20 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (likely(!non_swap_entry(entry))) + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } rss[MM_SWAPENTS]++; - else if (is_migration_entry(entry)) { + } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); if (PageAnon(page)) -- cgit v1.2.3 From c4ea95d7cd08d9ffd7fa75e6c5e0332d596dd11e Mon Sep 17 00:00:00 2001 From: Daniel Forrest Date: Tue, 2 Dec 2014 15:59:42 -0800 Subject: mm: fix anon_vma_clone() error treatment Andrew Morton noticed that the error return from anon_vma_clone() was being dropped and replaced with -ENOMEM (which is not itself a bug because the only error return value from anon_vma_clone() is -ENOMEM). I did an audit of callers of anon_vma_clone() and discovered an actual bug where the error return was being lost. 
In __split_vma(), between Linux 3.11 and 3.12 the code was changed so the err variable is used before the call to anon_vma_clone() and the default initial value of -ENOMEM is overwritten. So a failure of anon_vma_clone() will return success since err at this point is now zero. Below is a patch which fixes this bug and also propagates the error return value from anon_vma_clone() in all cases. Fixes: ef0855d334e1 ("mm: mempolicy: turn vma_set_policy() into vma_dup_policy()") Signed-off-by: Daniel Forrest Reviewed-by: Michal Hocko Cc: Konstantin Khlebnikov Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Tim Hartrick Cc: Hugh Dickins Cc: Michel Lespinasse Cc: Vlastimil Babka Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 +++++++--- mm/rmap.c | 6 ++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 87e82b38453c..ae919891a087 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -776,8 +776,11 @@ again: remove_next = 1 + (end > next->vm_end); * shrinking vma had, to cover any anon pages imported. */ if (exporter && exporter->anon_vma && !importer->anon_vma) { - if (anon_vma_clone(importer, exporter)) - return -ENOMEM; + int error; + + error = anon_vma_clone(importer, exporter); + if (error) + return error; importer->anon_vma = exporter->anon_vma; } } @@ -2469,7 +2472,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (err) goto out_free_vma; - if (anon_vma_clone(new, vma)) + err = anon_vma_clone(new, vma); + if (err) goto out_free_mpol; if (new->vm_file) diff --git a/mm/rmap.c b/mm/rmap.c index 19886fb2f13a..3e4c7213210c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -274,6 +274,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; + int error; /* Don't bother if the parent process has no anon_vma here. 
*/ if (!pvma->anon_vma) @@ -283,8 +284,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - if (anon_vma_clone(vma, pvma)) - return -ENOMEM; + error = anon_vma_clone(vma, pvma); + if (error) + return error; /* Then add our own anon_vma. */ anon_vma = anon_vma_alloc(); -- cgit v1.2.3 From 7c3fbbdd04a681a1992ad6a3d7a36a63ff668753 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 2 Dec 2014 15:59:48 -0800 Subject: slab: fix nodeid bounds check for non-contiguous node IDs The bounds check for nodeid in ____cache_alloc_node gives false positives on machines where the node IDs are not contiguous, leading to a panic at boot time. For example, on a POWER8 machine the node IDs are typically 0, 1, 16 and 17. This means that num_online_nodes() returns 4, so when ____cache_alloc_node is called with nodeid = 16 the VM_BUG_ON triggers, like this: kernel BUG at /home/paulus/kernel/kvm/mm/slab.c:3079! Call Trace: .____cache_alloc_node+0x5c/0x270 (unreliable) .kmem_cache_alloc_node_trace+0xdc/0x360 .init_list+0x3c/0x128 .kmem_cache_init+0x1dc/0x258 .start_kernel+0x2a0/0x568 start_here_common+0x20/0xa8 To fix this, we instead compare the nodeid with MAX_NUMNODES, and additionally make sure it isn't negative (since nodeid is an int). The check is there mainly to protect the array dereference in the get_node() call in the next line, and the array being dereferenced is of size MAX_NUMNODES. If the nodeid is in range but invalid (for example if the node is off-line), the BUG_ON in the next line will catch that. 
Fixes: 14e50c6a9bc2 ("mm: slab: Verify the nodeid passed to ____cache_alloc_node") Signed-off-by: Paul Mackerras Reviewed-by: Yasuaki Ishimatsu Reviewed-by: Pekka Enberg Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index eb2b2ea30130..f34e053ec46e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3076,7 +3076,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, void *obj; int x; - VM_BUG_ON(nodeid > num_online_nodes()); + VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); BUG_ON(!n); -- cgit v1.2.3