From d16dfc550f5326a4000f3322582a7c05dec91d7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2011 17:11:45 -0700 Subject: mm: mmu_gather rework Rework the existing mmu_gather infrastructure. The direct purpose of these patches was to allow preemptible mmu_gather, but even without that I think these patches provide an improvement to the status quo. The first 9 patches rework the mmu_gather infrastructure. For review purpose I've split them into generic and per-arch patches with the last of those a generic cleanup. The next patch provides generic RCU page-table freeing, and the followup is a patch converting s390 to use this. I've also got 4 patches from DaveM lined up (not included in this series) that uses this to implement gup_fast() for sparc64. Then there is one patch that extends the generic mmu_gather batching. After that follow the mm preemptibility patches, these make part of the mm a lot more preemptible. It converts i_mmap_lock and anon_vma->lock to mutexes which together with the mmu_gather rework makes mmu_gather preemptible as well. Making i_mmap_lock a mutex also enables a clean-up of the truncate code. This also allows for preemptible mmu_notifiers, something that XPMEM I think wants. Furthermore, it removes the new and universially detested unmap_mutex. This patch: Remove the first obstacle towards a fully preemptible mmu_gather. The current scheme assumes mmu_gather is always done with preemption disabled and uses per-cpu storage for the page batches. Change this to try and allocate a page for batching and in case of failure, use a small on-stack array to make some progress. Preemptible mmu_gather is desired in general and usable once i_mmap_lock becomes a mutex. Doing it before the mutex conversion saves us from having to rework the code by moving the mmu_gather bits inside the pte_lock. Also avoid flushing the tlb batches from under the pte lock, this is useful even without the i_mmap_lock conversion as it significantly reduces pte lock hold times. [akpm@linux-foundation.org: fix comment tpyo] Signed-off-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: David Miller Cc: Martin Schwidefsky Cc: Russell King Cc: Paul Mundt Cc: Jeff Dike Cc: Richard Weinberger Cc: Tony Luck Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 96 ++++++++++++++++++++++++++++++++++------------- include/linux/mm.h | 2 +- 2 files changed, 70 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e43f9766259f..2d3547c84235 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -5,6 +5,8 @@ * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * + * Copyright 2011 Red Hat, Inc., Peter Zijlstra + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -22,51 +24,71 @@ * and page free order so much.. */ #ifdef CONFIG_SMP - #ifdef ARCH_FREE_PTR_NR - #define FREE_PTR_NR ARCH_FREE_PTR_NR - #else - #define FREE_PTE_NR 506 - #endif #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U) #else - #define FREE_PTE_NR 1 #define tlb_fast_mode(tlb) 1 #endif +/* + * If we can't allocate a page to make a big batch of page pointers + * to work on, then just handle a few from the on-stack structure. + */ +#define MMU_GATHER_BUNDLE 8 + /* struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; unsigned int nr; /* set to ~0U means fast mode */ + unsigned int max; /* nr < max */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ - struct page * pages[FREE_PTE_NR]; +#ifdef HAVE_ARCH_MMU_GATHER + struct arch_mmu_gather arch; +#endif + struct page **pages; + struct page *local[MMU_GATHER_BUNDLE]; }; -/* Users of the generic TLB shootdown code must declare this storage space. */ -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); +static inline void __tlb_alloc_page(struct mmu_gather *tlb) +{ + unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); + + if (addr) { + tlb->pages = (void *)addr; + tlb->max = PAGE_SIZE / sizeof(struct page *); + } +} /* tlb_gather_mmu - * Return a pointer to an initialized struct mmu_gather. + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @fullmm argument is used when @mm is without + * users and we're going to destroy the full address space (exit/execve). */ -static inline struct mmu_gather * -tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) +static inline void +tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { - struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); - tlb->mm = mm; - /* Use fast mode if only one CPU is online */ - tlb->nr = num_online_cpus() > 1 ? 0U : ~0U; + tlb->max = ARRAY_SIZE(tlb->local); + tlb->pages = tlb->local; + + if (num_online_cpus() > 1) { + tlb->nr = 0; + __tlb_alloc_page(tlb); + } else /* Use fast mode if only one CPU is online */ + tlb->nr = ~0U; - tlb->fullmm = full_mm_flush; + tlb->fullmm = fullmm; - return tlb; +#ifdef HAVE_ARCH_MMU_GATHER + tlb->arch = ARCH_MMU_GATHER_INIT; +#endif } static inline void -tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +tlb_flush_mmu(struct mmu_gather *tlb) { if (!tlb->need_flush) return; @@ -75,6 +97,13 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) if (!tlb_fast_mode(tlb)) { free_pages_and_swap_cache(tlb->pages, tlb->nr); tlb->nr = 0; + /* + * If we are using the local on-stack array of pages for MMU + * gather, try allocating an off-stack array again as we have + * recently freed pages. + */ + if (tlb->pages == tlb->local) + __tlb_alloc_page(tlb); } } @@ -85,29 +114,42 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - tlb_flush_mmu(tlb, start, end); + tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ check_pgt_cache(); - put_cpu_var(mmu_gathers); + if (tlb->pages != tlb->local) + free_pages((unsigned long)tlb->pages, 0); } -/* tlb_remove_page +/* __tlb_remove_page * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while * handling the additional races in SMP caused by other CPUs caching valid - * mappings in their TLBs. + * mappings in their TLBs. Returns the number of free page slots left. + * When out of page slots we must call tlb_flush_mmu(). */ -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { tlb->need_flush = 1; if (tlb_fast_mode(tlb)) { free_page_and_swap_cache(page); - return; + return 1; /* avoid calling tlb_flush_mmu() */ } tlb->pages[tlb->nr++] = page; - if (tlb->nr >= FREE_PTE_NR) - tlb_flush_mmu(tlb, 0, 0); + VM_BUG_ON(tlb->nr > tlb->max); + + return tlb->max - tlb->nr; +} + +/* tlb_remove_page + * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when + * required. + */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + if (!__tlb_remove_page(tlb, page)) + tlb_flush_mmu(tlb); } /** diff --git a/include/linux/mm.h b/include/linux/mm.h index d2948af126ca..ffcce9bf2b54 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -906,7 +906,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, +unsigned long unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); -- cgit v1.2.3