author     Jens Axboe <axboe@kernel.dk>    2024-03-12 20:24:21 -0600
committer  Jens Axboe <axboe@kernel.dk>    2024-04-15 08:10:26 -0600
commit     87585b05757dc70545efb434669708d276125559 (patch)
tree       d3020002d23692f431baf61797e51d439312950f /io_uring/io_uring.c
parent     e270bfd22a2a10d1cfbaddf23e79b6d0b405d21e (diff)
io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring
Rather than using remap_pfn_range() for this and manually freeing the
memory later, switch to vm_insert_pages() and have it Just Work.

This requires a bit of effort on the mmap lookup side, as the ctx
uring_lock isn't held there. That lock is what otherwise protects
buffer_lists from being torn down, and it's not safe to grab it from
mmap context, as that would introduce an ABBA deadlock between the mmap
lock and the ctx uring_lock.

Instead, look up the buffer_list under RCU, as the list is already RCU
freed. Use the existing reference count to determine whether it's safe
to grab a reference to it (i.e., only if it isn't already zero),
and drop that reference when done with the mapping. If the mmap
reference is the last one, the buffer_list and the associated memory can
go away, since the vma insertion has references to the inserted pages at
that point.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
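As a rough illustration of the lookup scheme described above: the helpers
io_pbuf_get_bl() and io_put_bl() referenced in the diff below live in
io_uring/kbuf.c, which is outside this diffstat, so the bodies here are a
hedged sketch rather than the actual code. The ctx->io_bl_xa xarray and the
bl->refs counter are assumptions about the data layout.

struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
                                      unsigned long bgid)
{
        struct io_buffer_list *bl;
        bool ret;

        /*
         * mmap context cannot take the ctx uring_lock, so the list may be
         * getting torn down concurrently. Find it under RCU and only use
         * it if a reference can still be grabbed, i.e. the refcount is
         * not already zero.
         */
        rcu_read_lock();
        bl = xa_load(&ctx->io_bl_xa, bgid);
        ret = bl && atomic_inc_not_zero(&bl->refs);
        rcu_read_unlock();

        return ret ? bl : ERR_PTR(-EINVAL);
}

void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
        /*
         * Dropping the last reference frees the list; the ring memory
         * stays alive through the references the vma insertion took on
         * the individual pages.
         */
        if (atomic_dec_and_test(&bl->refs))
                kfree_rcu(bl, rcu);     /* assumes an rcu_head member named 'rcu' */
}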
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--  io_uring/io_uring.c  58
1 file changed, 16 insertions, 42 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0ef418faac33..50e859da59e1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -303,7 +303,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	INIT_HLIST_HEAD(&ctx->io_buf_list);
 	ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
 	ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
@@ -2598,15 +2597,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-static void io_pages_unmap(void *ptr, struct page ***pages,
-			   unsigned short *npages)
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+		    bool put_pages)
 {
 	bool do_vunmap = false;
 
 	if (!ptr)
 		return;
 
-	if (*npages) {
+	if (put_pages && *npages) {
 		struct page **to_free = *pages;
 		int i;
 
@@ -2628,14 +2627,6 @@ static void io_pages_unmap(void *ptr, struct page ***pages,
 	*npages = 0;
 }
 
-void io_mem_free(void *ptr)
-{
-	if (!ptr)
-		return;
-
-	folio_put(virt_to_folio(ptr));
-}
-
 static void io_pages_free(struct page ***pages, int npages)
 {
 	struct page **page_array = *pages;
@@ -2730,8 +2721,10 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
-		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
-		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
+		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
+				true);
+		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
+				true);
 	} else {
 		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 		ctx->n_ring_pages = 0;
@@ -2788,8 +2781,8 @@ err:
 	return ERR_PTR(-ENOMEM);
 }
 
-static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-			  size_t size)
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+		   size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
 	struct page **pages;
@@ -2819,17 +2812,6 @@ done:
 	return ret;
 }
 
-void *io_mem_alloc(size_t size)
-{
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
-	void *ret;
-
-	ret = (void *) __get_free_pages(gfp, get_order(size));
-	if (ret)
-		return ret;
-	return ERR_PTR(-ENOMEM);
-}
-
 static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
 				unsigned int cq_entries, size_t *sq_offset)
 {
@@ -2926,7 +2908,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 	io_rings_free(ctx);
-	io_kbuf_mmap_list_free(ctx);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
@@ -3396,10 +3377,8 @@ static void *io_uring_validate_mmap_request(struct file *file,
 {
 	struct io_ring_ctx *ctx = file->private_data;
 	loff_t offset = pgoff << PAGE_SHIFT;
-	struct page *page;
-	void *ptr;
 
-	switch (offset & IORING_OFF_MMAP_MASK) {
+	switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
 		/* Don't allow mmap if the ring was setup without it */
@@ -3414,6 +3393,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	case IORING_OFF_PBUF_RING: {
 		struct io_buffer_list *bl;
 		unsigned int bgid;
+		void *ptr;
 
 		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
 		bl = io_pbuf_get_bl(ctx, bgid);
@@ -3421,17 +3401,11 @@ static void *io_uring_validate_mmap_request(struct file *file,
 			return bl;
 		ptr = bl->buf_ring;
 		io_put_bl(ctx, bl);
-		break;
+		return ptr;
 	}
-	default:
-		return ERR_PTR(-EINVAL);
 	}
 
-	page = virt_to_head_page(ptr);
-	if (sz > page_size(page))
-		return ERR_PTR(-EINVAL);
-
-	return ptr;
+	return ERR_PTR(-EINVAL);
 }
 
 int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
@@ -3450,7 +3424,6 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	struct io_ring_ctx *ctx = file->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
 	long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long pfn;
 	void *ptr;
 
 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
@@ -3465,10 +3438,11 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	case IORING_OFF_SQES:
 		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
 						ctx->n_sqe_pages);
+	case IORING_OFF_PBUF_RING:
+		return io_pbuf_mmap(file, vma);
 	}
 
-	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
-	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+	return -EINVAL;
 }
 
 static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
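The new IORING_OFF_PBUF_RING case above hands the request to io_pbuf_mmap(),
which lives in io_uring/kbuf.c and therefore does not appear in this diffstat;
likewise, only the declaration of io_uring_mmap_pages() shows up as context.
Below is a hedged sketch of how that path plausibly feeds the buffer ring
pages into vm_insert_pages(); the bodies are reconstructed, and the
bl->buf_pages / bl->buf_nr_pages field names are assumptions.

int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
                        struct page **pages, int npages)
{
        unsigned long nr_pages = npages;

        /* the vma takes its own page references; no remap_pfn_range(),
         * no manual freeing of the mapping later */
        vm_flags_set(vma, VM_DONTEXPAND);
        return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
}

int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct io_ring_ctx *ctx = file->private_data;
        loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
        struct io_buffer_list *bl;
        int bgid, ret;

        bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;

        /* RCU + refcount lookup, as described in the commit message */
        bl = io_pbuf_get_bl(ctx, bgid);
        if (IS_ERR(bl))
                return PTR_ERR(bl);

        ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
        /* drop the mmap reference; the inserted pages keep the memory alive */
        io_put_bl(ctx, bl);
        return ret;
}

If that reference was the last one, the buffer_list and its backing memory can
go away immediately, since the vma now holds its own reference on every
inserted page.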