From fbbb7f4e149f6dd19a8dbebc9fa5c5b72173c6de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:01 -0400 Subject: percpu: remove the usage of separate populated bitmap in percpu-vm percpu-vm uses pcpu_get_pages_and_bitmap() to acquire temp pages array and populated bitmap and uses the two during [de]population. The temp bitmap is used only to build the new bitmap that is copied to chunk->populated after the operation succeeds; however, the new bitmap can be trivially set after success without using the temp bitmap. This patch removes the temp populated bitmap usage from percpu-vm.c. * pcpu_get_pages_and_bitmap() is renamed to pcpu_get_pages() and no longer hands out the temp bitmap. * @populated arugment is dropped from all the related functions. @populated updates in pcpu_[un]map_pages() are dropped. * Two loops in pcpu_map_pages() are merged. * pcpu_[de]populated_chunk() modify chunk->populated bitmap directly from @page_start and @page_end after success. Signed-off-by: Tejun Heo Acked-by: Christoph Lameter --- mm/percpu-vm.c | 93 ++++++++++++++++------------------------------------------ 1 file changed, 25 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 51108165f829..47b47bfbcb66 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -20,46 +20,24 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, } /** - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @bitmapp: output parameter for bitmap * @may_alloc: may allocate the array * - * Returns pointer to array of pointers to struct page and bitmap, - * both of which can be indexed with pcpu_page_idx(). The returned - * array is cleared to zero and *@bitmapp is copied from - * @chunk->populated. Note that there is only one array and bitmap - * and access exclusion is the caller's responsibility. - * - * CONTEXT: - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. - * Otherwise, don't care. + * Returns pointer to array of pointers to struct page which can be indexed + * with pcpu_page_idx(). Note that there is only one array and access + * exclusion is the caller's responsibility. * * RETURNS: - * Pointer to temp pages array on success, NULL on failure. + * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, - unsigned long **bitmapp, - bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk, bool may_alloc) { static struct page **pages; - static unsigned long *bitmap; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * - sizeof(unsigned long); - - if (!pages || !bitmap) { - if (may_alloc && !pages) - pages = pcpu_mem_zalloc(pages_size); - if (may_alloc && !bitmap) - bitmap = pcpu_mem_zalloc(bitmap_size); - if (!pages || !bitmap) - return NULL; - } - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); - - *bitmapp = bitmap; + if (!pages && may_alloc) + pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -67,7 +45,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * @@ -75,8 +52,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -95,7 +71,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @@ -104,8 +79,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; unsigned int cpu, tcpu; @@ -164,7 +138,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free - * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * @@ -175,8 +148,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -192,8 +164,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } - - bitmap_clear(populated, page_start, page_end - page_start); } /** @@ -228,7 +198,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped - * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * @@ -236,13 +205,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * - * This function is responsible for setting corresponding bits in - * @chunk->populated bitmap and whatever is necessary for reverse - * lookup (addr -> chunk). + * This function is responsible for setting up whatever is necessary for + * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; @@ -253,18 +220,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, page_end - page_start); if (err < 0) goto err; - } - /* mapping successful, link chunk and mark populated */ - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) + for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); - __set_bit(i, populated); } - return 0; - err: for_each_possible_cpu(tcpu) { if (tcpu == cpu) @@ -314,7 +275,6 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_end = PFN_UP(off + size); int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned long *populated; unsigned int cpu; int rs, re, rc; @@ -327,28 +287,27 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) /* need to allocate and map pages, this chunk can't be immutable */ WARN_ON(chunk->immutable); - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + pages = pcpu_get_pages(chunk, true); if (!pages) return -ENOMEM; /* alloc and map */ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + rc = pcpu_alloc_pages(chunk, pages, rs, re); if (rc) goto err_free; free_end = re; } pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, populated, rs, re); + rc = pcpu_map_pages(chunk, pages, rs, re); if (rc) goto err_unmap; unmap_end = re; } pcpu_post_map_flush(chunk, page_start, page_end); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + bitmap_set(chunk->populated, page_start, page_end - page_start); clear: for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -357,11 +316,11 @@ clear: err_unmap: pcpu_pre_unmap_flush(chunk, page_start, unmap_end); pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, rs, re); pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); err_free: pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, populated, rs, re); + pcpu_free_pages(chunk, pages, rs, re); return rc; } @@ -383,7 +342,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); struct page **pages; - unsigned long *populated; int rs, re; /* quick path, check whether it's empty already */ @@ -400,22 +358,21 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + pages = pcpu_get_pages(chunk, false); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, rs, re); /* no need to flush tlb, vmalloc will handle it lazily */ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, populated, rs, re); + pcpu_free_pages(chunk, pages, rs, re); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + bitmap_clear(chunk->populated, page_start, page_end - page_start); } static struct pcpu_chunk *pcpu_create_chunk(void) -- cgit v1.2.3 From cdb4cba5a3c9fa27240d04f4f8dad316b10d995b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:01 -0400 Subject: percpu: remove @may_alloc from pcpu_get_pages() pcpu_get_pages() creates the temp pages array if not already allocated and returns the pointer to it. As the function is called from both [de]population paths and depopulation can only happen after at least one successful population, the param doesn't make any difference - the allocation will always happen on the population path anyway. Remove @may_alloc from pcpu_get_pages(). Also, add an lockdep assertion pcpu_alloc_mutex instead of vaguely stating that the exclusion is the caller's responsibility. Signed-off-by: Tejun Heo --- mm/percpu-vm.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 47b47bfbcb66..d9e0b615492e 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -22,21 +22,22 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, /** * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @may_alloc: may allocate the array * * Returns pointer to array of pointers to struct page which can be indexed - * with pcpu_page_idx(). Note that there is only one array and access - * exclusion is the caller's responsibility. + * with pcpu_page_idx(). Note that there is only one array and accesses + * should be serialized by pcpu_alloc_mutex. * * RETURNS: * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages(struct pcpu_chunk *chunk, bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - if (!pages && may_alloc) + lockdep_assert_held(&pcpu_alloc_mutex); + + if (!pages) pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -287,7 +288,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) /* need to allocate and map pages, this chunk can't be immutable */ WARN_ON(chunk->immutable); - pages = pcpu_get_pages(chunk, true); + pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; @@ -358,7 +359,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages(chunk, false); + pages = pcpu_get_pages(chunk); BUG_ON(!pages); /* unmap and free */ -- cgit v1.2.3 From dca496451bddea9aa87b7510dc2eb413d1a19dfd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:01 -0400 Subject: percpu: move common parts out of pcpu_[de]populate_chunk() percpu-vm and percpu-km implement separate versions of pcpu_[de]populate_chunk() and some part which is or should be common are currently in the specific implementations. Make the following changes. * Allocate area clearing is moved from the pcpu_populate_chunk() implementations to pcpu_alloc(). This makes percpu-km's version noop. * Quick exit tests in pcpu_[de]populate_chunk() of percpu-vm are moved to their respective callers so that they are applied to percpu-km too. This doesn't make any meaningful difference as both functions are noop for percpu-km; however, this is more consistent and will help implementing atomic allocation support. Signed-off-by: Tejun Heo --- mm/percpu-km.c | 5 ----- mm/percpu-vm.c | 27 +-------------------------- mm/percpu.c | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 31 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fefc6a2..9a9096f08867 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -35,11 +35,6 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - return 0; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index d9e0b615492e..edf709793318 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -265,7 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * @size: size of the area to populate in bytes * * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. + * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. @@ -276,18 +276,8 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_end = PFN_UP(off + size); int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned int cpu; int rs, re, rc; - /* quick path, check whether all pages are already there */ - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - goto clear; - - /* need to allocate and map pages, this chunk can't be immutable */ - WARN_ON(chunk->immutable); - pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; @@ -308,10 +298,6 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) } pcpu_post_map_flush(chunk, page_start, page_end); - bitmap_set(chunk->populated, page_start, page_end - page_start); -clear: - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; err_unmap: @@ -345,15 +331,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) struct page **pages; int rs, re; - /* quick path, check whether it's empty already */ - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - return; - - /* immutable chunks can't be depopulated */ - WARN_ON(chunk->immutable); - /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must @@ -372,8 +349,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) pcpu_free_pages(chunk, pages, rs, re); - - bitmap_clear(chunk->populated, page_start, page_end - page_start); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index da997f9800bd..6087384f6ef0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -709,7 +709,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc; + int slot, off, new_alloc, cpu; + int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -802,17 +803,32 @@ restart: area_found: spin_unlock_irqrestore(&pcpu_lock, flags); - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + /* populate if not all pages are already there */ + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + rs = page_start; + pcpu_next_pop(chunk, &rs, &re, page_end); + + if (rs != page_start || re != page_end) { + WARN_ON(chunk->immutable); + + if (pcpu_populate_chunk(chunk, off, size)) { + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_free_area(chunk, off); + err = "failed to populate"; + goto fail_unlock; + } + + bitmap_set(chunk->populated, page_start, page_end - page_start); } mutex_unlock(&pcpu_alloc_mutex); - /* return address relative to base address */ + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; @@ -903,7 +919,12 @@ static void pcpu_reclaim(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + int rs = 0, re; + + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(pcpu_unit_size)); + if (rs || re != PFN_UP(pcpu_unit_size)) + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + pcpu_destroy_chunk(chunk); } -- cgit v1.2.3 From a93ace487a339dccf7040be7fee08c3415188e14 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: move region iterations out of pcpu_[de]populate_chunk() Previously, pcpu_[de]populate_chunk() were called with the range which may contain multiple target regions in it and pcpu_[de]populate_chunk() iterated over the regions. This has the benefit of batching up cache flushes for all the regions; however, we're planning to add more bookkeeping logic around [de]population to support atomic allocations and this delegation of iterations gets in the way. This patch moves the region iterations out of pcpu_[de]populate_chunk() into its callers - pcpu_alloc() and pcpu_reclaim() - so that we can later add logic to track more states around them. This change may make cache and tlb flushes more frequent but multi-region [de]populations are rare anyway and if this actually becomes a problem, it's not difficult to factor out cache flushes as separate callbacks which are directly invoked from percpu.c. Signed-off-by: Tejun Heo --- mm/percpu-km.c | 6 ++++-- mm/percpu-vm.c | 57 ++++++++++++++++----------------------------------------- mm/percpu.c | 19 ++++++++----------- 3 files changed, 28 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 9a9096f08867..a6e34bc1aff4 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -33,12 +33,14 @@ #include -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { return 0; } -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { /* nada */ } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index edf709793318..538998a137d2 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -261,8 +261,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. @@ -270,66 +270,43 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; struct page **pages; - int rs, re, rc; pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, rs, re); - if (rc) - goto err_free; - free_end = re; - } + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + return -ENOMEM; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, rs, re); - return rc; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * from @chunk. * * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); struct page **pages; - int rs, re; /* * If control reaches here, there must have been at least one @@ -342,13 +319,11 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, rs, re); + pcpu_unmap_pages(chunk, pages, page_start, page_end); /* no need to flush tlb, vmalloc will handle it lazily */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, rs, re); + pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index 6087384f6ef0..fe5de97d7caa 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -807,20 +807,17 @@ area_found: page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - - if (rs != page_start || re != page_end) { + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { WARN_ON(chunk->immutable); - if (pcpu_populate_chunk(chunk, off, size)) { + if (pcpu_populate_chunk(chunk, rs, re)) { spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, page_start, page_end - page_start); + bitmap_set(chunk->populated, rs, re - rs); } mutex_unlock(&pcpu_alloc_mutex); @@ -919,12 +916,12 @@ static void pcpu_reclaim(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &todo, list) { - int rs = 0, re; - - pcpu_next_unpop(chunk, &rs, &re, PFN_UP(pcpu_unit_size)); - if (rs || re != PFN_UP(pcpu_unit_size)) - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + int rs, re; + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + bitmap_clear(chunk->populated, rs, re - rs); + } pcpu_destroy_chunk(chunk); } -- cgit v1.2.3 From a63d4ac4ab6094c051a5a240260d16117a7a2f86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: make percpu-km set chunk->populated bitmap properly percpu-km instantiates the whole chunk on creation and doesn't make use of chunk->populated bitmap and leaves it as zero. While this currently doesn't cause any problem, the inconsistency makes it difficult to build further logic on top of chunk->populated. This patch makes percpu-km fill chunk->populated on creation so that the bitmap is always consistent. Signed-off-by: Tejun Heo Acked-by: Christoph Lameter --- mm/percpu-km.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index a6e34bc1aff4..67a971b7f745 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -67,6 +67,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + + bitmap_fill(chunk->populated, nr_pages); + return chunk; } -- cgit v1.2.3 From b38d08f3181c5025a7ce84646494cc4748492a3b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: restructure locking At first, the percpu allocator required a sleepable context for both alloc and free paths and used pcpu_alloc_mutex to protect everything. Later, pcpu_lock was introduced to protect the index data structure so that the free path can be invoked from atomic contexts. The conversion only updated what's necessary and left most of the allocation path under pcpu_alloc_mutex. The percpu allocator is planned to add support for atomic allocation and this patch restructures locking so that the coverage of pcpu_alloc_mutex is further reduced. * pcpu_alloc() now grab pcpu_alloc_mutex only while creating a new chunk and populating the allocated area. Everything else is now protected soley by pcpu_lock. After this change, multiple instances of pcpu_extend_area_map() may race but the function already implements sufficient synchronization using pcpu_lock. This also allows multiple allocators to arrive at new chunk creation. To avoid creating multiple empty chunks back-to-back, a new chunk is created iff there is no other empty chunk after grabbing pcpu_alloc_mutex. * pcpu_lock is now held while modifying chunk->populated bitmap. After this, all data structures are protected by pcpu_lock. Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 ++ mm/percpu.c | 75 +++++++++++++++++++++++++++------------------------------- 2 files changed, 37 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 67a971b7f745..e662b4947a65 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -68,7 +68,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + spin_lock_irq(&pcpu_lock); bitmap_fill(chunk->populated, nr_pages); + spin_unlock_irq(&pcpu_lock); return chunk; } diff --git a/mm/percpu.c b/mm/percpu.c index fe5de97d7caa..e59f7b405bed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -152,31 +152,12 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; /* - * Synchronization rules. - * - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks, populated bitmap and - * vmalloc mapping. The latter is a spinlock and protects the index - * data structures - chunk slots, chunks and area maps in chunks. - * - * During allocation, pcpu_alloc_mutex is kept locked all the time and - * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. In - * general, percpu memory can't be allocated with irq off but - * irqsave/restore are still used in alloc path so that it can be used - * from early init path - sched_init() specifically. - * - * Free path accesses and alters only the index data structures, so it - * can be safely called from atomic context. When memory needs to be - * returned to the system, free path schedules reclaim_work which - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be - * reclaimed, release both locks and frees the chunks. Note that it's - * necessary to grab both locks to remove a chunk from circulation as - * allocation path might be referencing the chunk with only - * pcpu_alloc_mutex locked. + * Free path accesses and alters only the index data structures and can be + * safely called from atomic context. When memory needs to be returned to + * the system, free path schedules reclaim_work. */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ -static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ +static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ @@ -709,7 +690,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc, cpu; + int slot, off, new_alloc, cpu, ret; int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -729,7 +710,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -745,7 +725,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } @@ -771,7 +751,7 @@ restart: if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); /* @@ -787,37 +767,53 @@ restart: } } - /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = pcpu_create_chunk(); - if (!chunk) { - err = "failed to allocate new chunk"; - goto fail_unlock_mutex; + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. + */ + mutex_lock(&pcpu_alloc_mutex); + + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { + chunk = pcpu_create_chunk(); + if (!chunk) { + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); } - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_relocate(chunk, -1); + mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ + mutex_lock(&pcpu_alloc_mutex); page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { WARN_ON(chunk->immutable); - if (pcpu_populate_chunk(chunk, rs, re)) { - spin_lock_irqsave(&pcpu_lock, flags); + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, rs, re - rs); + spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); @@ -832,8 +828,7 @@ area_found: fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); -fail_unlock_mutex: - mutex_unlock(&pcpu_alloc_mutex); +fail: if (warn_limit) { pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " "%s\n", size, align, err); -- cgit v1.2.3 From a16037c8dfc2734c1a2c8e3ffd4766ed25f2a41d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:02 -0400 Subject: percpu: make pcpu_alloc_area() capable of allocating only from populated areas Update pcpu_alloc_area() so that it can skip unpopulated areas if the new parameter @pop_only is true. This is implemented by a new function, pcpu_fit_in_area(), which determines the amount of head padding considering the alignment and populated state. @pop_only is currently always false but this will be used to implement atomic allocation. Signed-off-by: Tejun Heo --- mm/percpu.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index e59f7b405bed..e18aa143aab1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -399,11 +399,61 @@ out_unlock: return 0; } +/** + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area + * @chunk: chunk the candidate area belongs to + * @off: the offset to the start of the candidate area + * @this_size: the size of the candidate area + * @size: the size of the target allocation + * @align: the alignment of the target allocation + * @pop_only: only allocate from already populated region + * + * We're trying to allocate @size bytes aligned at @align. @chunk's area + * at @off sized @this_size is a candidate. This function determines + * whether the target allocation fits in the candidate area and returns the + * number of bytes to pad after @off. If the target area doesn't fit, -1 + * is returned. + * + * If @pop_only is %true, this function only considers the already + * populated part of the candidate area. + */ +static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, + int size, int align, bool pop_only) +{ + int cand_off = off; + + while (true) { + int head = ALIGN(cand_off, align) - off; + int page_start, page_end, rs, re; + + if (this_size < head + size) + return -1; + + if (!pop_only) + return head; + + /* + * If the first unpopulated page is beyond the end of the + * allocation, the whole allocation is populated; + * otherwise, retry from the end of the unpopulated area. + */ + page_start = PFN_DOWN(head + off); + page_end = PFN_UP(head + off + size); + + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + if (rs >= page_end) + return head; + cand_off = re * PAGE_SIZE; + } +} + /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes * @align: wanted align + * @pop_only: allocate only from the populated area * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -418,7 +468,8 @@ out_unlock: * Allocated offset in @chunk on success, -1 if no matching area is * found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, + bool pop_only) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -434,11 +485,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) if (off & 1) continue; - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - this_size = (p[1] & ~1) - off; - if (this_size < head + size) { + + head = pcpu_fit_in_area(chunk, off, this_size, size, align, + pop_only); + if (head < 0) { if (!seen_free) { chunk->first_free = i; seen_free = true; @@ -730,7 +781,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, false); if (off >= 0) goto area_found; @@ -761,7 +812,7 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, false); if (off >= 0) goto area_found; } -- cgit v1.2.3 From e04d320838f573d8fa989a0d7af0972f9b0142d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:04 -0400 Subject: percpu: indent the population block in pcpu_alloc() The next patch will conditionalize the population block in pcpu_alloc() which will end up making a rather large indentation change obfuscating the actual logic change. This patch puts the block under "if (true)" so that the next patch can avoid indentation changes. The defintions of the local variables which are used only in the block are moved into the block. This patch is purely cosmetic. Signed-off-by: Tejun Heo --- mm/percpu.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index e18aa143aab1..577d84fb3002 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -742,7 +742,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) struct pcpu_chunk *chunk; const char *err; int slot, off, new_alloc, cpu, ret; - int page_start, page_end, rs, re; unsigned long flags; void __percpu *ptr; @@ -847,27 +846,32 @@ area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ - mutex_lock(&pcpu_alloc_mutex); - page_start = PFN_DOWN(off); - page_end = PFN_UP(off + size); + if (true) { + int page_start, page_end, rs, re; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - WARN_ON(chunk->immutable); + mutex_lock(&pcpu_alloc_mutex); - ret = pcpu_populate_chunk(chunk, rs, re); + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); - spin_lock_irqsave(&pcpu_lock, flags); - if (ret) { - mutex_unlock(&pcpu_alloc_mutex); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); + pcpu_free_area(chunk, off); + err = "failed to populate"; + goto fail_unlock; + } + bitmap_set(chunk->populated, rs, re - rs); + spin_unlock_irqrestore(&pcpu_lock, flags); } - bitmap_set(chunk->populated, rs, re - rs); - spin_unlock_irqrestore(&pcpu_lock, flags); - } - mutex_unlock(&pcpu_alloc_mutex); + mutex_unlock(&pcpu_alloc_mutex); + } /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) -- cgit v1.2.3 From 5835d96e9ce4efdba8c6cefffc2f1575925456de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:04 -0400 Subject: percpu: implement [__]alloc_percpu_gfp() Now that pcpu_alloc_area() can allocate only from populated areas, it's easy to add atomic allocation support to [__]alloc_percpu(). Update pcpu_alloc() so that it accepts @gfp and skips all the blocking operations and allocates only from the populated areas if @gfp doesn't contain GFP_KERNEL. New interface functions [__]alloc_percpu_gfp() are added. While this means that atomic allocations are possible, this isn't complete yet as there's no mechanism to ensure that certain amount of populated areas is kept available and atomic allocations may keep failing under certain conditions. Signed-off-by: Tejun Heo --- mm/percpu.c | 64 +++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 577d84fb3002..c52b93117dc2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -151,11 +151,6 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; -/* - * Free path accesses and alters only the index data structures and can be - * safely called from atomic context. When memory needs to be returned to - * the system, free path schedules reclaim_work. - */ static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ @@ -727,20 +722,21 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; + bool is_atomic = !(gfp & GFP_KERNEL); int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -773,14 +769,15 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) while ((new_alloc = pcpu_need_to_extend(chunk))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; @@ -797,6 +794,8 @@ restart: new_alloc = pcpu_need_to_extend(chunk); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { @@ -811,7 +810,7 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align, false); + off = pcpu_alloc_area(chunk, size, align, is_atomic); if (off >= 0) goto area_found; } @@ -824,6 +823,9 @@ restart: * tasks to create chunks simultaneously. Serialize and create iff * there's still no empty chunk after grabbing the mutex. */ + if (is_atomic) + goto fail; + mutex_lock(&pcpu_alloc_mutex); if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { @@ -846,7 +848,7 @@ area_found: spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ - if (true) { + if (!is_atomic) { int page_start, page_end, rs, re; mutex_lock(&pcpu_alloc_mutex); @@ -884,9 +886,9 @@ area_found: fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); @@ -895,22 +897,34 @@ fail: } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -932,7 +946,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** -- cgit v1.2.3 From 9c824b6a172c8d44a6b037946bae90127c969b1b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: make sure chunk->map array has available space An allocation attempt may require extending chunk->map array which requires GFP_KERNEL context which isn't available for atomic allocations. This patch ensures that chunk->map array usually keeps some amount of available space by directly allocating buffer space during GFP_KERNEL allocations and scheduling async extension during atomic ones. This should make atomic allocation failures from map space exhaustion rare. Signed-off-by: Tejun Heo --- mm/percpu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index c52b93117dc2..546ced05cf33 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,8 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 +#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -102,9 +104,12 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct work_struct map_extend_work;/* async ->map[] extension */ + void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ @@ -318,9 +323,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) /** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest + * @is_atomic: the allocation context * - * Determine whether area map of @chunk needs to be extended to - * accommodate a new allocation. + * Determine whether area map of @chunk needs to be extended. If + * @is_atomic, only the amount necessary for a new allocation is + * considered; however, async extension is scheduled if the left amount is + * low. If !@is_atomic, it aims for more empty space. Combined, this + * ensures that the map is likely to have enough available space to + * accomodate atomic allocations which can't extend maps directly. * * CONTEXT: * pcpu_lock. @@ -329,15 +339,25 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * New target map allocation length if extension is necessary, 0 * otherwise. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { - int new_alloc; + int margin, new_alloc; + + if (is_atomic) { + margin = 3; - if (chunk->map_alloc >= chunk->map_used + 3) + if (chunk->map_alloc < + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + schedule_work(&chunk->map_extend_work); + } else { + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; + } + + if (chunk->map_alloc >= chunk->map_used + margin) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 3) + while (new_alloc < chunk->map_used + margin) new_alloc *= 2; return new_alloc; @@ -394,6 +414,20 @@ out_unlock: return 0; } +static void pcpu_map_extend_workfn(struct work_struct *work) +{ + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, + map_extend_work); + int new_alloc; + + spin_lock_irq(&pcpu_lock); + new_alloc = pcpu_need_to_extend(chunk, false); + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); +} + /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -647,6 +681,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -767,7 +802,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk))) { + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic || pcpu_extend_area_map(chunk, new_alloc) < 0) { @@ -792,7 +827,7 @@ restart: if (size > chunk->contig_hint) continue; - new_alloc = pcpu_need_to_extend(chunk); + new_alloc = pcpu_need_to_extend(chunk, is_atomic); if (new_alloc) { if (is_atomic) continue; @@ -1418,6 +1453,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1446,6 +1482,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); -- cgit v1.2.3 From b539b87fed37ffc16c89a6bc3beca2d7aed82e1c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: implmeent pcpu_nr_empty_pop_pages and chunk->nr_populated pcpu_nr_empty_pop_pages counts the number of empty populated pages across all chunks and chunk->nr_populated counts the number of populated pages in a chunk. Both will be used to implement pre/async population for atomic allocations. pcpu_chunk_[de]populated() are added to update chunk->populated, chunk->nr_populated and pcpu_nr_empty_pop_pages together. All successful chunk [de]populations should be followed by the corresponding pcpu_chunk_[de]populated() calls. Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 +- mm/percpu.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 114 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index e662b4947a65..10e3d0b8a86d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; spin_lock_irq(&pcpu_lock); - bitmap_fill(chunk->populated, nr_pages); + pcpu_chunk_populated(chunk, 0, nr_pages); spin_unlock_irq(&pcpu_lock); return chunk; diff --git a/mm/percpu.c b/mm/percpu.c index 546ced05cf33..4f2d58760c9c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -113,6 +113,7 @@ struct pcpu_chunk { void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ + int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -161,6 +162,12 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* + * The number of empty populated pages, protected by pcpu_lock. The + * reserved chunk doesn't contribute to the count. + */ +static int pcpu_nr_empty_pop_pages; + /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); @@ -295,6 +302,38 @@ static void pcpu_mem_free(void *ptr, size_t size) vfree(ptr); } +/** + * pcpu_count_occupied_pages - count the number of pages an area occupies + * @chunk: chunk of interest + * @i: index of the area in question + * + * Count the number of pages chunk's @i'th area occupies. When the area's + * start and/or end address isn't aligned to page boundary, the straddled + * page is included in the count iff the rest of the page is free. + */ +static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) +{ + int off = chunk->map[i] & ~1; + int end = chunk->map[i + 1] & ~1; + + if (!PAGE_ALIGNED(off) && i > 0) { + int prev = chunk->map[i - 1]; + + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) + off = round_down(off, PAGE_SIZE); + } + + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { + int next = chunk->map[i + 1]; + int nend = chunk->map[i + 2] & ~1; + + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) + end = round_up(end, PAGE_SIZE); + } + + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); +} + /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest @@ -483,6 +522,7 @@ static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, * @size: wanted size in bytes * @align: wanted align * @pop_only: allocate only from the populated area + * @occ_pages_p: out param for the number of pages the area occupies * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -498,7 +538,7 @@ static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, * found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, - bool pop_only) + bool pop_only, int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -587,6 +627,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, chunk->free_size -= size; *p |= 1; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); pcpu_chunk_relocate(chunk, oslot); return off; } @@ -602,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, * pcpu_free_area - free area to a pcpu_chunk * @chunk: chunk of interest * @freeme: offset of area to free + * @occ_pages_p: out param for the number of pages the area occupies * * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap @@ -610,7 +652,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, * CONTEXT: * pcpu_lock. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, + int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int off = 0; @@ -641,6 +684,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) *p = off &= ~1; chunk->free_size += (p[1] & ~1) - off; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* merge with next? */ if (!(p[1] & 1)) to_free++; @@ -696,6 +741,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) pcpu_mem_free(chunk, pcpu_chunk_struct_size); } +/** + * pcpu_chunk_populated - post-population bookkeeping + * @chunk: pcpu_chunk which got populated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been populated to @chunk. Update + * the bookkeeping information accordingly. Must be called after each + * successful population. + */ +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_set(chunk->populated, page_start, nr); + chunk->nr_populated += nr; + pcpu_nr_empty_pop_pages += nr; +} + +/** + * pcpu_chunk_depopulated - post-depopulation bookkeeping + * @chunk: pcpu_chunk which got depopulated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been depopulated from @chunk. + * Update the bookkeeping information accordingly. Must be called after + * each successful depopulation. + */ +static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_clear(chunk->populated, page_start, nr); + chunk->nr_populated -= nr; + pcpu_nr_empty_pop_pages -= nr; +} + /* * Chunk management implementation. * @@ -772,6 +861,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, struct pcpu_chunk *chunk; const char *err; bool is_atomic = !(gfp & GFP_KERNEL); + int occ_pages = 0; int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -812,7 +902,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align, is_atomic); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; @@ -845,7 +936,8 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align, is_atomic); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; } @@ -899,17 +991,20 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { mutex_unlock(&pcpu_alloc_mutex); - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; } - bitmap_set(chunk->populated, rs, re - rs); + pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages -= occ_pages; + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1019,7 +1114,9 @@ static void pcpu_reclaim(struct work_struct *work) pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { pcpu_depopulate_chunk(chunk, rs, re); - bitmap_clear(chunk->populated, rs, re - rs); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); } @@ -1041,7 +1138,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int off, occ_pages; if (!ptr) return; @@ -1055,7 +1152,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); + + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += occ_pages; /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { @@ -1459,6 +1559,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); + schunk->nr_populated = pcpu_unit_pages; if (ai->reserved_size) { schunk->free_size = ai->reserved_size; @@ -1488,6 +1589,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); + dchunk->nr_populated = pcpu_unit_pages; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; @@ -1498,6 +1600,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + pcpu_nr_empty_pop_pages += + pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ -- cgit v1.2.3 From fe6bd8c3d28357174587c4fe895d10b00321b692 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: rename pcpu_reclaim_work to pcpu_balance_work pcpu_reclaim_work will also be used to populate chunks asynchronously. Rename it to pcpu_balance_work in preparation. pcpu_reclaim() is renamed to pcpu_balance_workfn() and some of its local variables are renamed too. This is pure rename. Signed-off-by: Tejun Heo --- mm/percpu.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 4f2d58760c9c..28a830590b4c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -168,9 +168,9 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* reclaim work to release fully free chunks, scheduled from free path */ -static void pcpu_reclaim(struct work_struct *work); -static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +/* balance work is used to populate or destroy chunks asynchronously */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_addr_in_first_chunk(void *addr) { @@ -1080,36 +1080,33 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_reclaim - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - reclaim fully free chunks, workqueue function * @work: unused * * Reclaim all fully free chunks except for the first one. - * - * CONTEXT: - * workqueue context. */ -static void pcpu_reclaim(struct work_struct *work) +static void pcpu_balance_workfn(struct work_struct *work) { - LIST_HEAD(todo); - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, head, list) { + list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_move(&chunk->list, &todo); + list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, &todo, list) { + list_for_each_entry_safe(chunk, next, &to_free, list) { int rs, re; pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { @@ -1163,7 +1160,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_reclaim_work); + schedule_work(&pcpu_balance_work); break; } } -- cgit v1.2.3 From 1a4d76076cda69b0abf15463a8cebc172406da25 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Sep 2014 14:46:05 -0400 Subject: percpu: implement asynchronous chunk population The percpu allocator now supports atomic allocations by only allocating from already populated areas but the mechanism to ensure that there's adequate amount of populated areas was missing. This patch expands pcpu_balance_work so that in addition to freeing excess free chunks it also populates chunks to maintain an adequate level of populated areas. pcpu_alloc() schedules pcpu_balance_work if the amount of free populated areas is too low or after an atomic allocation failure. * PERPCU_DYNAMIC_RESERVE is increased by two pages to account for PCPU_EMPTY_POP_PAGES_LOW. * pcpu_async_enabled is added to gate both async jobs - chunk->map_extend_work and pcpu_balance_work - so that we don't end up scheduling them while the needed subsystems aren't up yet. Signed-off-by: Tejun Heo --- mm/percpu.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 113 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 28a830590b4c..867efd38d879 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -78,6 +78,8 @@ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ */ static int pcpu_nr_empty_pop_pages; -/* balance work is used to populate or destroy chunks asynchronously */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; + +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) schedule_work(&chunk->map_extend_work); } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; @@ -1005,6 +1021,9 @@ area_found: if (chunk != pcpu_reserved_chunk) pcpu_nr_empty_pop_pages -= occ_pages; + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); @@ -1023,6 +1042,11 @@ fail: if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_blance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } @@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_balance_workfn - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. @@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work) LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. + */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. + */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_balance_work); + pcpu_schedule_balance_work(); break; } } @@ -2187,3 +2284,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab or + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); -- cgit v1.2.3 From 908c7f1949cb7cc6e92ba8f18f2998e87e265b8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:29 +0900 Subject: percpu_counter: add @gfp to percpu_counter_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_counter_init() so that !GFP_KERNEL allocation masks can be used with percpu_counters too. We could have left percpu_counter_init() alone and added percpu_counter_init_gfp(); however, the number of users isn't that high and introducing _gfp variants to all percpu data structures would be quite ugly, so let's just do the conversion. This is the one with the most users. Other percpu data structures are a lot easier to convert. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Jan Kara Acked-by: "David S. Miller" Cc: x86@kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andrew Morton --- mm/backing-dev.c | 2 +- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/shmem.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..f19a818be2d3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); if (err) goto err; } diff --git a/mm/mmap.c b/mm/mmap.c index c1f2ea4a0b99..d7ec93e25fa1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3196,7 +3196,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index a881d9673c6b..bd1808e194a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -539,7 +539,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } diff --git a/mm/shmem.c b/mm/shmem.c index 0e5fb225007c..d4bc55d3f107 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2993,7 +2993,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; -- cgit v1.2.3 From 20ae00792c6f1f18fc4fc5965445a145df92827e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: proportions: add @gfp to init functions Percpu allocator now supports allocation mask. Add @gfp to [flex_]proportions init functions so that !GFP_KERNEL allocation masks can be used with them too. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Peter Zijlstra --- mm/backing-dev.c | 2 +- mm/page-writeback.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f19a818be2d3..64ec49d1772b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = fprop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); if (err) { err: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef1744d..508599403721 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** -- cgit v1.2.3 From 23cb8981ed929b4dd48141401cd0fd31e0fa4ed0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Sep 2014 08:02:45 +0900 Subject: percpu: fix locking regression in the failure path of pcpu_alloc() While updating locking, b38d08f3181c ("percpu: restructure locking") broke pcpu_create_chunk() creation path in pcpu_alloc(). It returns without releasing pcpu_alloc_mutex. Fix it. Signed-off-by: Tejun Heo Reported-by: Julia Lawall --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 867efd38d879..af3dd2704efd 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -974,6 +974,7 @@ restart: if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { + mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } -- cgit v1.2.3 From bb2e226b3bef596dd56be97df655d857b4603923 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 21 Sep 2014 15:04:53 -0700 Subject: Revert "percpu: free percpu allocation info for uniprocessor system" This reverts commit 3189eddbcafc ("percpu: free percpu allocation info for uniprocessor system"). The commit causes a hang with a crisv32 image. This may be an architecture problem, but at least for now the revert is necessary to be able to boot a crisv32 image. Cc: Tejun Heo Cc: Honggang Li Signed-off-by: Guenter Roeck Signed-off-by: Tejun Heo Fixes: 3189eddbcafc ("percpu: free percpu allocation info for uniprocessor system") Cc: stable@vger.kernel.org # Please don't apply 3189eddbcafc --- mm/percpu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index af3dd2704efd..e10f9f7a8887 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2250,8 +2250,6 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); - - pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 6ae833c7fe0c6ef1f0ab13cc775da230d6f4c256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Oct 2014 12:01:52 -0400 Subject: percpu: fix how @gfp is interpreted by the percpu allocator When @gfp is specified, the percpu allocator is interested in whether it contains all of GFP_KERNEL or not. If it does, the normal allocation path is taken; otherwise, the atomic allocation path. Unfortunately, pcpu_alloc() was incorrectly testing for whether @gfp contains any part of GFP_KERNEL. Fix it by testing "(gfp & GFP_KERNEL) != GFP_KERNEL" instead of "!(gfp & GFP_KERNEL)" to decide whether the allocation should be atomic or not. Signed-off-by: Tejun Heo --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index e10f9f7a8887..014bab65e0ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -876,7 +876,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - bool is_atomic = !(gfp & GFP_KERNEL); + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; int occ_pages = 0; int slot, off, new_alloc, cpu, ret; unsigned long flags; -- cgit v1.2.3