From d1b9432712a25eeb06114fb4b587133525a47de5 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 4 Dec 2013 18:19:06 +0800 Subject: aio: clean up aio ring in the fail path Clean up the aio ring file in the fail path of aio_setup_ring and ioctx_alloc. And maybe it can fix the GPF issue reported by Dave Jones: https://lkml.org/lkml/2013/11/25/898 Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index ad460d78d6c5..a2f92aa23ee3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -316,8 +316,10 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!ctx->ring_pages) + if (!ctx->ring_pages) { + put_aio_ring_file(ctx); return -ENOMEM; + } } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -594,7 +596,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; - goto err; + goto err_ctx; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); @@ -611,6 +613,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err_cleanup: aio_nr_sub(ctx->max_reqs); +err_ctx: + aio_free_ring(ctx); err: free_percpu(ctx->cpu); free_percpu(ctx->reqs.pcpu_count); -- cgit v1.2.3 From 1881686f842065d2f92ec9c6424830ffc17d23b0 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sat, 21 Dec 2013 15:49:28 -0500 Subject: aio: fix kioctx leak introduced by "aio: Fix a trinity splat" e34ecee2ae791df674dfb466ce40692ca6218e43 reworked the percpu reference counting to correct a bug trinity found. Unfortunately, the change lead to kioctxes being leaked because there was no final reference count to put. Add that reference count back in to fix things. Signed-off-by: Benjamin LaHaise Cc: stable@vger.kernel.org --- fs/aio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 6efb7f6cb22e..fd1c0baf15bb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -652,7 +652,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); - percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) -- cgit v1.2.3 From 8e321fefb0e60bae4e2a28d20fc4fa30758d27c6 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sat, 21 Dec 2013 17:56:08 -0500 Subject: aio/migratepages: make aio migrate pages sane The arbitrary restriction on page counts offered by the core migrate_page_move_mapping() code results in rather suspicious looking fiddling with page reference counts in the aio_migratepage() operation. To fix this, make migrate_page_move_mapping() take an extra_count parameter that allows aio to tell the code about its own reference count on the page being migrated. While cleaning up aio_migratepage(), make it validate that the old page being passed in is actually what aio_migratepage() expects to prevent misbehaviour in the case of races. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index fd1c0baf15bb..efa708b29054 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx) int i; for (i = 0; i < ctx->nr_pages; i++) { + struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, page_count(ctx->ring_pages[i])); - put_page(ctx->ring_pages[i]); + page = ctx->ring_pages[i]; + if (!page) + continue; + ctx->ring_pages[i] = NULL; + put_page(page); } put_aio_ring_file(ctx); @@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, unsigned long flags; int rc; + rc = 0; + + /* Make sure the old page hasn't already been changed */ + spin_lock(&mapping->private_lock); + ctx = mapping->private_data; + if (ctx) { + pgoff_t idx; + spin_lock_irqsave(&ctx->completion_lock, flags); + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; + } else + rc = -EINVAL; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else + rc = -EINVAL; + spin_unlock(&mapping->private_lock); + + if (rc != 0) + return rc; + /* Writeback must be complete */ BUG_ON(PageWriteback(old)); - put_page(old); + get_page(new); - rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { - get_page(old); + put_page(new); return rc; } - get_page(new); - /* We can potentially race against kioctx teardown here. Use the * address_space's private data lock to protect the mapping's * private_data. @@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, spin_lock_irqsave(&ctx->completion_lock, flags); migrate_page_copy(new, old); idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) - ctx->ring_pages[idx] = new; + if (idx < (pgoff_t)ctx->nr_pages) { + /* And only do the move if things haven't changed */ + if (ctx->ring_pages[idx] == old) + ctx->ring_pages[idx] = new; + else + rc = -EAGAIN; + } else + rc = -EINVAL; spin_unlock_irqrestore(&ctx->completion_lock, flags); } else rc = -EBUSY; spin_unlock(&mapping->private_lock); + if (rc == MIGRATEPAGE_SUCCESS) + put_page(old); + else + put_page(new); + return rc; } #endif -- cgit v1.2.3 From 3dc9acb67600393249a795934ccdfc291a200e6b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Dec 2013 05:11:12 +0900 Subject: aio: clean up and fix aio_setup_ring page mapping Since commit 36bc08cc01709 ("fs/aio: Add support to aio ring pages migration") the aio ring setup code has used a special per-ring backing inode for the page allocations, rather than just using random anonymous pages. However, rather than remembering the pages as it allocated them, it would allocate the pages, insert them into the file mapping (dirty, so that they couldn't be free'd), and then forget about them. And then to look them up again, it would mmap the mapping, and then use "get_user_pages()" to get back an array of the pages we just created. Now, not only is that incredibly inefficient, it also leaked all the pages if the mmap failed (which could happen due to excessive number of mappings, for example). So clean it all up, making it much more straightforward. Also remove some left-overs of the previous (broken) mm_populate() usage that was removed in commit d6c355c7dabc ("aio: fix race in ring buffer page lookup introduced by page migration support") but left the pointless and now misleading MAP_POPULATE flag around. Tested-and-acked-by: Benjamin LaHaise Signed-off-by: Linus Torvalds --- fs/aio.c | 58 +++++++++++++++++++++++----------------------------------- 1 file changed, 23 insertions(+), 35 deletions(-) (limited to 'fs/aio.c') diff --git a/fs/aio.c b/fs/aio.c index 6efb7f6cb22e..643db8fc43c5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -326,7 +326,7 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; unsigned nr_events = ctx->max_reqs; struct mm_struct *mm = current->mm; - unsigned long size, populate; + unsigned long size, unused; int nr_pages; int i; struct file *file; @@ -347,6 +347,20 @@ static int aio_setup_ring(struct kioctx *ctx) return -EAGAIN; } + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); + + ctx->ring_pages = ctx->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) { + put_aio_ring_file(ctx); + return -ENOMEM; + } + } + for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_inode->i_mapping, @@ -358,19 +372,14 @@ static int aio_setup_ring(struct kioctx *ctx) SetPageUptodate(page); SetPageDirty(page); unlock_page(page); + + ctx->ring_pages[i] = page; } - ctx->aio_ring_file = file; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) - / sizeof(struct io_event); + ctx->nr_pages = i; - ctx->ring_pages = ctx->internal_pages; - if (nr_pages > AIO_RING_PAGES) { - ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!ctx->ring_pages) { - put_aio_ring_file(ctx); - return -ENOMEM; - } + if (unlikely(i != nr_pages)) { + aio_free_ring(ctx); + return -EAGAIN; } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -379,9 +388,9 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&mm->mmap_sem); ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, 0, &populate); + MAP_SHARED, 0, &unused); + up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { - up_write(&mm->mmap_sem); ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; @@ -389,27 +398,6 @@ static int aio_setup_ring(struct kioctx *ctx) pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); - /* We must do this while still holding mmap_sem for write, as we - * need to be protected against userspace attempting to mremap() - * or munmap() the ring buffer. - */ - ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, - 1, 0, ctx->ring_pages, NULL); - - /* Dropping the reference here is safe as the page cache will hold - * onto the pages for us. It is also required so that page migration - * can unmap the pages and get the right reference count. - */ - for (i = 0; i < ctx->nr_pages; i++) - put_page(ctx->ring_pages[i]); - - up_write(&mm->mmap_sem); - - if (unlikely(ctx->nr_pages != nr_pages)) { - aio_free_ring(ctx); - return -EAGAIN; - } - ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ -- cgit v1.2.3