From 7da4d641c58d201c3cc1835c05ca1a7fa26f0856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Nov 2012 03:14:23 +0100 Subject: mm: Count the number of pages affected in change_protection() This will be used for three kinds of purposes: - to optimize mprotect() - to speed up working set scanning for working set areas that have not been touched - to more accurately scan per real working set No change in functionality from this patch. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- mm/mprotect.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 14 deletions(-) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab6..1e265be25f85 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,12 +35,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { pte_t *pte, oldpte; spinlock_t *ptl; + unsigned long pages = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -60,6 +61,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); + pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -72,18 +74,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); } + pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); + + return pages; } -static inline void 
change_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { pmd_t *pmd; unsigned long next; + unsigned long pages = 0; pmd = pmd_offset(pud, addr); do { @@ -91,35 +97,42 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) + else if (change_huge_pmd(vma, pmd, addr, newprot)) { + pages += HPAGE_PMD_NR; continue; + } /* fall through */ } if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + pages += change_pte_range(vma->vm_mm, pmd, addr, next, newprot, dirty_accountable); } while (pmd++, addr = next, addr != end); + + return pages; } -static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { pud_t *pud; unsigned long next; + unsigned long pages = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(vma, pud, addr, next, newprot, + pages += change_pmd_range(vma, pud, addr, next, newprot, dirty_accountable); } while (pud++, addr = next, addr != end); + + return pages; } -static void change_protection(struct vm_area_struct *vma, +static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -127,6 +140,7 @@ static void change_protection(struct vm_area_struct *vma, pgd_t *pgd; unsigned long next; unsigned long start = addr; + unsigned long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -135,10 +149,30 @@ static void 
change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(vma, pgd, addr, next, newprot, + pages += change_pud_range(vma, pgd, addr, next, newprot, dirty_accountable); } while (pgd++, addr = next, addr != end); + flush_tlb_range(vma, start, end); + + return pages; +} + +unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + int dirty_accountable) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long pages; + + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + pages = hugetlb_change_protection(vma, start, end, newprot); + else + pages = change_protection_range(vma, start, end, newprot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); + + return pages; } int @@ -213,12 +247,8 @@ success: dirty_accountable = 1; } - mmu_notifier_invalidate_range_start(mm, start, end); - if (is_vm_hugetlb_page(vma)) - hugetlb_change_protection(vma, start, end, vma->vm_page_prot); - else - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); - mmu_notifier_invalidate_range_end(mm, start, end); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); -- cgit v1.2.3 From 1233d588210737ed3696b44c26e71dfa44a6995a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 19 Nov 2012 03:14:24 +0100 Subject: mm: Optimize the TLB flush of sys_mprotect() and change_protection() users Reuse the NUMA code's 'modified page protections' count that change_protection() computes and skip the TLB flush if there's no changes to a range that sys_mprotect() modifies. 
Given that mprotect() already optimizes the same-flags case I expected this optimization to predominantly trigger on CONFIG_NUMA_BALANCING=y kernels - but even with that feature disabled it triggers rather often. There are two reasons for that: 1) sys_mprotect() already optimizes the same-flag case: if (newflags == oldflags) { *pprev = vma; return 0; } This test works in many cases, but it is too sharp in some others, where it differentiates between protection values that the underlying PTE format makes no distinction about, such as PROT_EXEC == PROT_READ on x86. 2) Even where the vma flag change does necessitate a modification of the pagetables in the pte format, there might be no pagetables to modify: they might not be instantiated yet. During a regular desktop bootup this optimization hits a couple of hundred times. During a Java test I measured thousands of hits. So this optimization improves sys_mprotect() in general, not just CONFIG_NUMA_BALANCING=y kernels. [ We could further increase the efficiency of this optimization if change_pte_range() and change_huge_pmd() were a bit smarter about recognizing exact-same-value protection masks - when the hardware can do that safely. This would probably further speed up mprotect(). 
] Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Mel Gorman Cc: Hugh Dickins Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- mm/mprotect.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index 1e265be25f85..7c3628a8b486 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -153,7 +153,9 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, dirty_accountable); } while (pgd++, addr = next, addr != end); - flush_tlb_range(vma, start, end); + /* Only flush the TLB if we actually modified any entries: */ + if (pages) + flush_tlb_range(vma, start, end); return pages; } -- cgit v1.2.3 From 4b10e7d562c90d0a72f324832c26653947a07381 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Implement change_prot_numa() in terms of change_protection() This patch converts change_prot_numa() to use change_protection(). As pte_numa and friends check the PTE bits directly it is necessary for change_protection() to use pmd_mknuma(). Hence the required modifications to change_protection() are a little clumsy but the end result is that most of the numa page table helpers are just one or two instructions. 
Signed-off-by: Mel Gorman --- mm/mprotect.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 16 deletions(-) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index 7c3628a8b486..7ef6ae964e8f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,10 +35,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; @@ -49,19 +50,39 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool updated = false; ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); + if (!prot_numa) { + ptent = pte_modify(ptent, newprot); + updated = true; + } else { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (page) { + /* only check non-shared pages */ + if (!pte_numa(oldpte) && + page_mapcount(page) == 1) { + ptent = pte_mknuma(ptent); + updated = true; + } + } + } /* * Avoid taking write faults for pages we know to be * dirty. 
*/ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent)) { ptent = pte_mkwrite(ptent); + updated = true; + } + + if (updated) + pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); - pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -83,9 +104,25 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, return pages; } +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -97,7 +134,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -105,8 +142,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * } if (pmd_none_or_clear_bad(pmd)) continue; - pages += change_pte_range(vma->vm_mm, pmd, addr, next, newprot, - dirty_accountable); + pages += change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + + if (prot_numa) + change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); return pages; @@ -114,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct 
*vma, pud_t * static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -126,7 +166,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pud++, addr = next, addr != end); return pages; @@ -134,7 +174,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -150,7 +190,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -162,7 +202,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; unsigned long pages; @@ -171,7 +211,7 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable); + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); 
mmu_notifier_invalidate_range_end(mm, start, end); return pages; @@ -249,7 +289,7 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); -- cgit v1.2.3 From 9532fec118d485ea37ab6e3ea372d68cd8b4cd0d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 15 Nov 2012 01:24:32 +0000 Subject: mm: numa: Migrate pages handled during a pmd_numa hinting fault To say that the PMD handling code was incorrectly transferred from autonuma is an understatement. The intention was to handle a PMD's worth of pages in the same fault and effectively batch the taking of the PTL and page migration. The copied version instead has the impact of clearing a number of pte_numa PTE entries, and whether any page migration takes place depends on racing. This just happens to work in some cases. This patch handles pte_numa faults in batch when a pmd_numa fault is handled. The pages are migrated if they are currently misplaced. Essentially this is making an assumption that NUMA locality is on a PMD boundary but, if necessary, that could be addressed by only setting pmd_numa if all the pages within that PMD are on the same node. 
Signed-off-by: Mel Gorman --- mm/mprotect.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'mm/mprotect.c') diff --git a/mm/mprotect.c b/mm/mprotect.c index 7ef6ae964e8f..dce6fb48edc4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,12 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable, int prot_numa, bool *ret_all_same_node) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + bool all_same_node = true; + int last_nid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -61,6 +63,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { + int this_nid = page_to_nid(page); + if (last_nid == -1) + last_nid = this_nid; + if (last_nid != this_nid) + all_same_node = false; + /* only check non-shared pages */ if (!pte_numa(oldpte) && page_mapcount(page) == 1) { @@ -81,7 +89,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (updated) pages++; - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -101,6 +108,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); + *ret_all_same_node = all_same_node; return pages; } @@ -127,6 +135,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * pmd_t *pmd; unsigned long next; unsigned long pages = 0; + bool all_same_node; pmd = pmd_offset(pud, addr); do { @@ -143,9 +152,15 @@ static inline unsigned long change_pmd_range(struct vm_area_struct 
*vma, pud_t * if (pmd_none_or_clear_bad(pmd)) continue; pages += change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa); - - if (prot_numa) + dirty_accountable, prot_numa, &all_same_node); + + /* + * If we are changing protections for NUMA hinting faults then + * set pmd_numa if the examined pages were all on the same + * node. This allows a regular PMD to be handled as one fault + * and effectively batches the taking of the PTL + */ + if (prot_numa && all_same_node) change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); -- cgit v1.2.3