From 1dd49bfa3465756b3ce72214b58a33e4afb67aa3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:42 +0100 Subject: mm: numa: Do not account for a hinting fault if we raced If another task handled a hinting fault in parallel then do not double account for it. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-5-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 610e3df2768a..33ee637648ba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1325,8 +1325,11 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, check_same: spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) + if (unlikely(!pmd_same(pmd, *pmdp))) { + /* Someone else took our fault */ + current_nid = -1; goto out_unlock; + } clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); -- cgit v1.2.3 From 42836f5f8baa33085f547098b74aa98991ee9216 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:43 +0100 Subject: mm: Wait for THP migrations to complete during NUMA hinting faults The locking for migrating THP is unusual. While normal page migration prevents parallel accesses using a migration PTE, THP migration relies on a combination of the page_table_lock, the page lock and the existance of the NUMA hinting PTE to guarantee safety but there is a bug in the scheme. If a THP page is currently being migrated and another thread traps a fault on the same page it checks if the page is misplaced. If it is not, then pmd_numa is cleared. The problem is that it checks if the page is misplaced without holding the page lock meaning that the racing thread can be migrating the THP when the second thread clears the NUMA bit and faults a stale page. This patch checks if the page is potentially being migrated and stalls using the lock_page if it is potentially being migrated before checking if the page is misplaced or not. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-6-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33ee637648ba..e10d780c4781 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1295,13 +1295,14 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (current_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - put_page(page); - goto clear_pmdnuma; - } + /* + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ + if (trylock_page(page)) + goto got_lock; - /* Acquire the page lock to serialise THP migrations */ + /* Serialise against migrationa and check placement check placement */ spin_unlock(&mm->page_table_lock); lock_page(page); @@ -1312,9 +1313,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); goto out_unlock; } - spin_unlock(&mm->page_table_lock); + +got_lock: + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + unlock_page(page); + put_page(page); + goto clear_pmdnuma; + } /* Migrate the THP to the requested node */ + spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (!migrated) -- cgit v1.2.3 From 587fe586f44a48f9691001ba6c45b86c8e4ba21f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:44 +0100 Subject: mm: Prevent parallel splits during THP migration THP migrations are serialised by the page lock but on its own that does not prevent THP splits. If the page is split during THP migration then the pmd_same checks will prevent page table corruption but the unlock page and other fix-ups potentially will cause corruption. This patch takes the anon_vma lock to prevent parallel splits during migration. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-7-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e10d780c4781..d8534b3630e4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1278,18 +1278,18 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; - bool migrated; + bool migrated, page_locked; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); - get_page(page); current_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (current_nid == numa_node_id()) @@ -1299,12 +1299,29 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Acquire the page lock to serialise THP migrations but avoid dropping * page_table_lock if at all possible */ - if (trylock_page(page)) - goto got_lock; + page_locked = trylock_page(page); + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + /* If the page was locked, there are no parallel migrations */ + if (page_locked) { + unlock_page(page); + goto clear_pmdnuma; + } - /* Serialise against migrationa and check placement check placement */ + /* Otherwise wait for potential migrations and retry fault */ + spin_unlock(&mm->page_table_lock); + wait_on_page_locked(page); + goto out; + } + + /* Page is misplaced, serialise migrations and parallel THP splits */ + get_page(page); spin_unlock(&mm->page_table_lock); - lock_page(page); + if (!page_locked) { + lock_page(page); + page_locked = true; + } + anon_vma = page_lock_anon_vma_read(page); /* Confirm the PTE did not while locked */ spin_lock(&mm->page_table_lock); @@ -1314,14 +1331,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } -got_lock: - target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) { - unlock_page(page); - put_page(page); - goto clear_pmdnuma; - } - /* Migrate the THP to the requested node */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1330,6 +1339,8 @@ got_lock: goto check_same; task_numa_fault(target_nid, HPAGE_PMD_NR, true); + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); return 0; check_same: @@ -1346,6 +1357,11 @@ clear_pmdnuma: update_mmu_cache_pmd(vma, addr, pmdp); out_unlock: spin_unlock(&mm->page_table_lock); + +out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + if (current_nid != -1) task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; -- cgit v1.2.3 From c61109e34f60f6e85bb43c5a1cd51c0e3db40847 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:45 +0100 Subject: mm: numa: Sanitize task_numa_fault() callsites There are three callers of task_numa_fault(): - do_huge_pmd_numa_page(): Accounts against the current node, not the node where the page resides, unless we migrated, in which case it accounts against the node we migrated to. - do_numa_page(): Accounts against the current node, not the node where the page resides, unless we migrated, in which case it accounts against the node we migrated to. - do_pmd_numa_page(): Accounts not at all when the page isn't migrated, otherwise accounts against the node we migrated towards. This seems wrong to me; all three sites should have the same sementaics, furthermore we should accounts against where the page really is, we already know where the task is. So modify all three sites to always account; we did after all receive the fault; and always account to where the page is after migration, regardless of success. They all still differ on when they clear the PTE/PMD; ideally that would get sorted too. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-8-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 25 +++++++++++++------------ mm/memory.c | 53 +++++++++++++++++++++-------------------------------- 2 files changed, 34 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d8534b3630e4..00ddfcdd810e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1281,18 +1281,19 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; + int page_nid = -1, this_nid = numa_node_id(); int target_nid; - int current_nid = -1; - bool migrated, page_locked; + bool page_locked; + bool migrated = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); - current_nid = page_to_nid(page); + page_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); /* @@ -1335,19 +1336,18 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); - if (!migrated) + if (migrated) + page_nid = target_nid; + else goto check_same; - task_numa_fault(target_nid, HPAGE_PMD_NR, true); - if (anon_vma) - page_unlock_anon_vma_read(anon_vma); - return 0; + goto out; check_same: spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) { /* Someone else took our fault */ - current_nid = -1; + page_nid = -1; goto out_unlock; } clear_pmdnuma: @@ -1362,8 +1362,9 @@ out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); - if (current_nid != -1) - task_numa_fault(current_nid, HPAGE_PMD_NR, false); + if (page_nid != -1) + task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); + return 0; } diff --git a/mm/memory.c b/mm/memory.c index 1311f26497e6..d176154c243f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3521,12 +3521,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int current_nid) + unsigned long addr, int page_nid) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (current_nid == numa_node_id()) + if (page_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); return mpol_misplaced(page, vma, addr); @@ -3537,7 +3537,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid = -1; + int page_nid = -1; int target_nid; bool migrated = false; @@ -3567,15 +3567,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - current_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, current_nid); + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { - /* - * Account for the fault against the current node if it not - * being replaced regardless of where the page is located. - */ - current_nid = numa_node_id(); put_page(page); goto out; } @@ -3583,11 +3578,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ migrated = migrate_misplaced_page(page, target_nid); if (migrated) - current_nid = target_nid; + page_nid = target_nid; out: - if (current_nid != -1) - task_numa_fault(current_nid, 1, migrated); + if (page_nid != -1) + task_numa_fault(page_nid, 1, migrated); return 0; } @@ -3602,7 +3597,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long offset; spinlock_t *ptl; bool numa = false; - int local_nid = numa_node_id(); spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3625,9 +3619,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { pte_t pteval = *pte; struct page *page; - int curr_nid = local_nid; + int page_nid = -1; int target_nid; - bool migrated; + bool migrated = false; + if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3649,25 +3644,19 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(page_mapcount(page) != 1)) continue; - /* - * Note that the NUMA fault is later accounted to either - * the node that is currently running or where the page is - * migrated to. - */ - curr_nid = local_nid; - target_nid = numa_migrate_prep(page, vma, addr, - page_to_nid(page)); - if (target_nid == -1) { + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); + pte_unmap_unlock(pte, ptl); + if (target_nid != -1) { + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + page_nid = target_nid; + } else { put_page(page); - continue; } - /* Migrate to the requested node */ - pte_unmap_unlock(pte, ptl); - migrated = migrate_misplaced_page(page, target_nid); - if (migrated) - curr_nid = target_nid; - task_numa_fault(curr_nid, 1, migrated); + if (page_nid != -1) + task_numa_fault(page_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3 From 3f926ab945b60a5824369d21add7710622a2eac0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:46 +0100 Subject: mm: Close races between THP migration and PMD numa clearing THP migration uses the page lock to guard against parallel allocations but there are cases like this still open Task A Task B --------------------- --------------------- do_huge_pmd_numa_page do_huge_pmd_numa_page lock_page mpol_misplaced == -1 unlock_page goto clear_pmdnuma lock_page mpol_misplaced == 2 migrate_misplaced_transhuge pmd = pmd_mknonnuma set_pmd_at During hours of testing, one crashed with weird errors and while I have no direct evidence, I suspect something like the race above happened. This patch extends the page lock to being held until the pmd_numa is cleared to prevent migration starting in parallel while the pmd_numa is being cleared. It also flushes the old pmd entry and orders pagetable insertion before rmap insertion. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 33 +++++++++++++++------------------ mm/migrate.c | 19 +++++++++++-------- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 00ddfcdd810e..cca80d96e509 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1304,24 +1304,25 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) { /* If the page was locked, there are no parallel migrations */ - if (page_locked) { - unlock_page(page); + if (page_locked) goto clear_pmdnuma; - } - /* Otherwise wait for potential migrations and retry fault */ + /* + * Otherwise wait for potential migrations and retry. We do + * relock and check_same as the page may no longer be mapped. + * As the fault is being retried, do not account for it. + */ spin_unlock(&mm->page_table_lock); wait_on_page_locked(page); + page_nid = -1; goto out; } /* Page is misplaced, serialise migrations and parallel THP splits */ get_page(page); spin_unlock(&mm->page_table_lock); - if (!page_locked) { + if (!page_locked) lock_page(page); - page_locked = true; - } anon_vma = page_lock_anon_vma_read(page); /* Confirm the PTE did not while locked */ @@ -1329,32 +1330,28 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); + page_nid = -1; goto out_unlock; } - /* Migrate the THP to the requested node */ + /* + * Migrate the THP to the requested node, returns with page unlocked + * and pmd_numa cleared. + */ spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); if (migrated) page_nid = target_nid; - else - goto check_same; goto out; - -check_same: - spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) { - /* Someone else took our fault */ - page_nid = -1; - goto out_unlock; - } clear_pmdnuma: + BUG_ON(!PageLocked(page)); pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); diff --git a/mm/migrate.c b/mm/migrate.c index 7a7325ee1d08..c04692774e88 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1715,12 +1715,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unlock_page(new_page); put_page(new_page); /* Free it */ - unlock_page(page); + /* Retake the callers reference and putback on LRU */ + get_page(page); putback_lru_page(page); - - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); - isolated = 0; - goto out; + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); + goto out_fail; } /* @@ -1737,9 +1737,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); entry = pmd_mkhuge(entry); - page_add_new_anon_rmap(new_page, vma, haddr); - + pmdp_clear_flush(vma, haddr, pmd); set_pmd_at(mm, haddr, pmd, entry); + page_add_new_anon_rmap(new_page, vma, haddr); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(page); /* @@ -1758,7 +1758,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); -out: mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); @@ -1767,6 +1766,10 @@ out: out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); out_dropref: + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + unlock_page(page); put_page(page); return 0; -- cgit v1.2.3 From 0255d491848032f6c601b6410c3b8ebded3a37b1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:47 +0100 Subject: mm: Account for a THP NUMA hinting update as one PTE update A THP PMD update is accounted for as 512 pages updated in vmstat. This is large difference when estimating the cost of automatic NUMA balancing and can be misleading when comparing results that had collapsed versus split THP. This patch addresses the accounting issue. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Cc: Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-10-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index a3af058f68e4..412ba2b7326a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -148,7 +148,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, split_huge_page_pmd(vma, addr, pmd); else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { - pages += HPAGE_PMD_NR; + pages++; continue; } /* fall through */ -- cgit v1.2.3 From c56b097af26cb11c1f49a4311ba538c825666fed Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 30 Oct 2013 14:16:16 +0000 Subject: mm: list_lru: fix almost infinite loop causing effective livelock I've seen a fair number of issues with kswapd and other processes appearing to get stuck in v3.12-rc. Using sysrq-p many times seems to indicate that it gets stuck somewhere in list_lru_walk_node(), called from prune_icache_sb() and super_cache_scan(). I never seem to be able to trigger a calltrace for functions above that point. So I decided to add the following to super_cache_scan(): @@ -81,10 +81,14 @@ static unsigned long super_cache_scan(struct shrinker *shrink, inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); total_objects = dentries + inodes + fs_objects + 1; +printk("%s:%u: %s: dentries %lu inodes %lu total %lu\n", current->comm, current->pid, __func__, dentries, inodes, total_objects); /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); +printk("%s:%u: %s: dentries %lu inodes %lu\n", current->comm, current->pid, __func__, dentries, inodes); +BUG_ON(dentries == 0); +BUG_ON(inodes == 0); /* * prune the dcache first as the icache is pinned by it, then @@ -99,7 +103,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, freed += sb->s_op->free_cached_objects(sb, fs_objects, sc->nid); } - +printk("%s:%u: %s: dentries %lu inodes %lu freed %lu\n", current->comm, current->pid, __func__, dentries, inodes, freed); drop_super(sb); return freed; } and shortly thereafter, having applied some pressure, I got this: update-apt-xapi:1616: super_cache_scan: dentries 25632 inodes 2 total 25635 update-apt-xapi:1616: super_cache_scan: dentries 1023 inodes 0 ------------[ cut here ]------------ Kernel BUG at c0101994 [verbose debug info unavailable] Internal error: Oops - BUG: 0 [#3] SMP ARM Modules linked in: fuse rfcomm bnep bluetooth hid_cypress CPU: 0 PID: 1616 Comm: update-apt-xapi Tainted: G D 3.12.0-rc7+ #154 task: daea1200 ti: c3bf8000 task.ti: c3bf8000 PC is at super_cache_scan+0x1c0/0x278 LR is at trace_hardirqs_on+0x14/0x18 Process update-apt-xapi (pid: 1616, stack limit = 0xc3bf8240) ... Backtrace: (super_cache_scan) from [] (shrink_slab+0x254/0x4c8) (shrink_slab) from [] (try_to_free_pages+0x3a0/0x5e0) (try_to_free_pages) from [] (__alloc_pages_nodemask+0x5) (__alloc_pages_nodemask) from [] (__pte_alloc+0x2c/0x13) (__pte_alloc) from [] (handle_mm_fault+0x84c/0x914) (handle_mm_fault) from [] (do_page_fault+0x1f0/0x3bc) (do_page_fault) from [] (do_translation_fault+0xac/0xb8) (do_translation_fault) from [] (do_DataAbort+0x38/0xa0) (do_DataAbort) from [] (__dabt_usr+0x38/0x40) Notice that we had a very low number of inodes, which were reduced to zero my mult_frac(). Now, prune_icache_sb() calls list_lru_walk_node() passing that number of inodes (0) into that as the number of objects to scan: long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, int nid) { LIST_HEAD(freeable); long freed; freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, &freeable, &nr_to_scan); which does: unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { struct list_lru_node *nlru = &lru->node[nid]; struct list_head *item, *n; unsigned long isolated = 0; spin_lock(&nlru->lock); restart: list_for_each_safe(item, n, &nlru->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbesr of LRU_RETRY items */ if (--(*nr_to_walk) == 0) break; So, if *nr_to_walk was zero when this function was entered, that means we're wanting to operate on (~0UL)+1 objects - which might as well be infinite. Clearly this is not correct behaviour. If we think about the behaviour of this function when *nr_to_walk is 1, then clearly it's wrong - we decrement first and then test for zero - which results in us doing nothing at all. A post-decrement would give the desired behaviour - we'd try to walk one object and one object only if *nr_to_walk were one. It also gives the correct behaviour for zero - we exit at this point. Fixes: 5cedf721a7cd ("list_lru: fix broken LRU_RETRY behaviour") Signed-off-by: Russell King Cc: Dave Chinner Cc: Al Viro Cc: Andrew Morton [ Modified to make sure we never underflow the count: this function gets called in a loop, so the 0 -> ~0ul transition is dangerous - Linus ] Signed-off-by: Linus Torvalds --- mm/list_lru.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/list_lru.c b/mm/list_lru.c index 72467914b856..72f9decb0104 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -81,8 +81,9 @@ restart: * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbesr of LRU_RETRY items */ - if (--(*nr_to_walk) == 0) + if (!*nr_to_walk) break; + --*nr_to_walk; ret = isolate(item, &nlru->lock, cb_arg); switch (ret) { -- cgit v1.2.3 From 3017f079efd6af199b0852b5c425364513db460e Mon Sep 17 00:00:00 2001 From: Chen LinX Date: Wed, 30 Oct 2013 13:56:18 -0700 Subject: mm/pagewalk.c: fix walk_page_range() access of wrong PTEs When walk_page_range walk a memory map's page tables, it'll skip VM_PFNMAP area, then variable 'next' will to assign to vma->vm_end, it maybe larger than 'end'. In next loop, 'addr' will be larger than 'next'. Then in /proc/XXXX/pagemap file reading procedure, the 'addr' will growing forever in pagemap_pte_range, pte_to_pagemap_entry will access the wrong pte. BUG: Bad page map in process procrank pte:8437526f pmd:785de067 addr:9108d000 vm_flags:00200073 anon_vma:f0d99020 mapping: (null) index:9108d CPU: 1 PID: 4974 Comm: procrank Tainted: G B W O 3.10.1+ #1 Call Trace: dump_stack+0x16/0x18 print_bad_pte+0x114/0x1b0 vm_normal_page+0x56/0x60 pagemap_pte_range+0x17a/0x1d0 walk_page_range+0x19e/0x2c0 pagemap_read+0x16e/0x200 vfs_read+0x84/0x150 SyS_read+0x4a/0x80 syscall_call+0x7/0xb Signed-off-by: Liu ShuoX Signed-off-by: Chen LinX Acked-by: Kirill A. Shutemov Reviewed-by: Naoya Horiguchi Cc: [3.10.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 5da2cbcfdbb5..2beeabf502c5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -242,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end, if (err) break; pgd++; - } while (addr = next, addr != end); + } while (addr = next, addr < end); return err; } -- cgit v1.2.3 From 5e8cfc3c75b3e43497389896c0ecda62fc311ce9 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 30 Oct 2013 13:56:21 -0700 Subject: memcg: use __this_cpu_sub() to dec stats to avoid incorrect subtrahend casting As of commit 3ea67d06e467 ("memcg: add per cgroup writeback pages accounting") memcg counter errors are possible when moving charged memory to a different memcg. Charge movement occurs when processing writes to memory.force_empty, moving tasks to a memcg with memcg.move_charge_at_immigrate=1, or memcg deletion. An example showing error after memory.force_empty: $ cd /sys/fs/cgroup/memory $ mkdir x $ rm /data/tmp/file $ (echo $BASHPID >> x/tasks && exec mmap_writer /data/tmp/file 1M) & [1] 13600 $ grep ^mapped x/memory.stat mapped_file 1048576 $ echo 13600 > tasks $ echo 1 > x/memory.force_empty $ grep ^mapped x/memory.stat mapped_file 4503599627370496 mapped_file should end with 0. 4503599627370496 == 0x10,0000,0000,0000 == 0x100,0000,0000 pages 1048576 == 0x10,0000 == 0x100 pages This issue only affects the source memcg on 64 bit machines; the destination memcg counters are correct. So the rmdir case is not too important because such counters are soon disappearing with the entire memcg. But the memcg.force_empty and memory.move_charge_at_immigrate=1 cases are larger problems as the bogus counters are visible for the (possibly long) remaining life of the source memcg. The problem is due to memcg use of __this_cpu_from(.., -nr_pages), which is subtly wrong because it subtracts the unsigned int nr_pages (either -1 or -512 for THP) from a signed long percpu counter. When nr_pages=-1, -nr_pages=0xffffffff. On 64 bit machines stat->count[idx] is signed 64 bit. So memcg's attempt to simply decrement a count (e.g. from 1 to 0) boils down to: long count = 1 unsigned int nr_pages = 1 count += -nr_pages /* -nr_pages == 0xffff,ffff */ count is now 0x1,0000,0000 instead of 0 The fix is to subtract the unsigned page count rather than adding its negation. This only works once "percpu: fix this_cpu_sub() subtrahend casting for unsigneds" is applied to fix this_cpu_sub(). Signed-off-by: Greg Thelen Acked-by: Tejun Heo Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 34d3ca9572d6..497ec33ff22d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3774,7 +3774,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, /* Update stat data for mem_cgroup */ preempt_disable(); WARN_ON_ONCE(from->stat->count[idx] < nr_pages); - __this_cpu_add(from->stat->count[idx], -nr_pages); + __this_cpu_sub(from->stat->count[idx], nr_pages); __this_cpu_add(to->stat->count[idx], nr_pages); preempt_enable(); } -- cgit v1.2.3 From 3168ecbe1c04ec3feb7cb42388a17d7f047fe1a2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 31 Oct 2013 16:34:13 -0700 Subject: mm: memcg: use proper memcg in limit bypass Commit 84235de394d9 ("fs: buffer: move allocation failure loop into the allocator") allowed __GFP_NOFAIL allocations to bypass the limit if they fail to reclaim enough memory for the charge. But because the main test case was on a 3.2-based system, the patch missed the fact that on newer kernels the charge function needs to return root_mem_cgroup when bypassing the limit, and not NULL. This will corrupt whatever memory is at NULL + percpu pointer offset. Fix this quickly before problems are reported. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 497ec33ff22d..623d5c8bb1e1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2765,10 +2765,10 @@ done: *ptr = memcg; return 0; nomem: - *ptr = NULL; - if (gfp_mask & __GFP_NOFAIL) - return 0; - return -ENOMEM; + if (!(gfp_mask & __GFP_NOFAIL)) { + *ptr = NULL; + return -ENOMEM; + } bypass: *ptr = root_mem_cgroup; return -EINTR; -- cgit v1.2.3 From 0056f4e66a1b8f00245248877e80386af36af14c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 31 Oct 2013 16:34:14 -0700 Subject: mm: memcg: lockdep annotation for memcg OOM lock The memcg OOM lock is a mutex-type lock that is open-coded due to memcg's special needs. Add annotations for lockdep coverage. Signed-off-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 623d5c8bb1e1..7e11cb7d75b1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -54,6 +54,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -2046,6 +2047,12 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, return total; } +#ifdef CONFIG_LOCKDEP +static struct lockdep_map memcg_oom_lock_dep_map = { + .name = "memcg_oom_lock", +}; +#endif + static DEFINE_SPINLOCK(memcg_oom_lock); /* @@ -2083,7 +2090,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) } iter->oom_lock = false; } - } + } else + mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); spin_unlock(&memcg_oom_lock); @@ -2095,6 +2103,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) struct mem_cgroup *iter; spin_lock(&memcg_oom_lock); + mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; spin_unlock(&memcg_oom_lock); -- cgit v1.2.3 From 696ac172fffa653dca401bb2b0cad91cf2ce453f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 31 Oct 2013 16:34:15 -0700 Subject: mm: memcg: fix test for child groups When memcg code needs to know whether any given memcg has children, it uses the cgroup child iteration primitives and returns true/false depending on whether the iteration loop is executed at least once or not. Because a cgroup's list of children is RCU protected, these primitives require the RCU read-lock to be held, which is not the case for all memcg callers. This results in the following splat when e.g. enabling hierarchy mode: WARNING: CPU: 3 PID: 1 at kernel/cgroup.c:3043 css_next_child+0xa3/0x160() CPU: 3 PID: 1 Comm: systemd Not tainted 3.12.0-rc5-00117-g83f11a9-dirty #18 Hardware name: LENOVO 3680B56/3680B56, BIOS 6QET69WW (1.39 ) 04/26/2012 Call Trace: dump_stack+0x54/0x74 warn_slowpath_common+0x78/0xa0 warn_slowpath_null+0x1a/0x20 css_next_child+0xa3/0x160 mem_cgroup_hierarchy_write+0x5b/0xa0 cgroup_file_write+0x108/0x2a0 vfs_write+0xbd/0x1e0 SyS_write+0x4c/0xa0 system_call_fastpath+0x16/0x1b In the memcg case, we only care about children when we are attempting to modify inheritable attributes interactively. Racing with deletion could mean a spurious -EBUSY, no problem. Racing with addition is handled just fine as well through the memcg_create_mutex: if the child group is not on the list after the mutex is acquired, it won't be initialized from the parent's attributes until after the unlock. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e11cb7d75b1..e63278222be5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4959,31 +4959,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) } while (usage > 0); } -/* - * This mainly exists for tests during the setting of set of use_hierarchy. - * Since this is the very setting we are changing, the current hierarchy value - * is meaningless - */ -static inline bool __memcg_has_children(struct mem_cgroup *memcg) -{ - struct cgroup_subsys_state *pos; - - /* bounce at first found */ - css_for_each_child(pos, &memcg->css) - return true; - return false; -} - -/* - * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed - * to be already dead (as in mem_cgroup_force_empty, for instance). This is - * from mem_cgroup_count_children(), in the sense that we don't really care how - * many children we have; we only need to know if we have any. It also counts - * any memcg without hierarchy as infertile. - */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { - return memcg->use_hierarchy && __memcg_has_children(memcg); + lockdep_assert_held(&memcg_create_mutex); + /* + * The lock does not prevent addition or deletion to the list + * of children, but it prevents a new child from being + * initialized based on this parent in css_online(), so it's + * enough to decide whether hierarchically inherited + * attributes can still be changed or not. + */ + return memcg->use_hierarchy && + !list_empty(&memcg->css.cgroup->children); } /* @@ -5063,7 +5050,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (!__memcg_has_children(memcg)) + if (list_empty(&memcg->css.cgroup->children)) memcg->use_hierarchy = val; else retval = -EBUSY; -- cgit v1.2.3 From 6920a1bd037374a632d585de127b6f945199dcb8 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 1 Nov 2013 12:16:59 -0700 Subject: memcg: remove incorrect underflow check When a memcg is deleted mem_cgroup_reparent_charges() moves charged memory to the parent memcg. As of v3.11-9444-g3ea67d0 "memcg: add per cgroup writeback pages accounting" there's bad pointer read. The goal was to check for counter underflow. The counter is a per cpu counter and there are two problems with the code: (1) per cpu access function isn't used, instead a naked pointer is used which easily causes oops. (2) the check doesn't sum all cpus Test: $ cd /sys/fs/cgroup/memory $ mkdir x $ echo 3 > /proc/sys/vm/drop_caches $ (echo $BASHPID >> x/tasks && exec cat) & [1] 7154 $ grep ^mapped x/memory.stat mapped_file 53248 $ echo 7154 > tasks $ rmdir x The fix is to remove the check. It's currently dangerous and isn't worth fixing it to use something expensive, such as percpu_counter_sum(), for each reparented page. __this_cpu_read() isn't enough to fix this because there's no guarantees of the current cpus count. The only guarantees is that the sum of all per-cpu counter is >= nr_pages. Fixes: 3ea67d06e467 ("memcg: add per cgroup writeback pages accounting") Reported-and-tested-by: Flavio Leitner Signed-off-by: Greg Thelen Reviewed-by: Sha Zhengju Acked-by: Johannes Weiner Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e63278222be5..13b9d0f221b8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3782,7 +3782,6 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, { /* Update stat data for mem_cgroup */ preempt_disable(); - WARN_ON_ONCE(from->stat->count[idx] < nr_pages); __this_cpu_sub(from->stat->count[idx], nr_pages); __this_cpu_add(to->stat->count[idx], nr_pages); preempt_enable(); -- cgit v1.2.3