author     Jens Axboe <axboe@kernel.dk>    2024-03-12 20:24:21 -0600
committer  Jens Axboe <axboe@kernel.dk>    2024-04-15 08:10:26 -0600
commit     87585b05757dc70545efb434669708d276125559 (patch)
tree       d3020002d23692f431baf61797e51d439312950f /io_uring/io_uring.c
parent     e270bfd22a2a10d1cfbaddf23e79b6d0b405d21e (diff)
io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring
Rather than use remap_pfn_range() for this and manually free later, switch to using vm_insert_page() and have it Just Work.

This requires a bit of effort on the mmap lookup side, as the ctx uring_lock isn't held, which otherwise protects buffer_lists from being torn down, and it's not safe to grab it from mmap context, as that would introduce an ABBA deadlock between the mmap lock and the ctx uring_lock. Instead, look up the buffer_list under RCU, as the list is RCU freed already. Use the existing reference count to determine whether it's possible to safely grab a reference to it (e.g. if it's not zero already), and drop that reference when done with the mapping. If the mmap reference is the last one, the buffer_list and the associated memory can go away, since the vma insertion has references to the inserted pages at that point.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
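The locking scheme described in the message can be hard to picture from the diff alone. Below is a minimal, illustrative C sketch of the refcount-guarded RCU lookup it describes; struct my_buf_list, my_lookup_bl(), my_put_bl() and the xarray used for the lookup are assumptions for illustration, not the actual io_uring definitions.

/*
 * Illustrative sketch only, not the io_uring implementation. It shows the
 * pattern from the commit message: look up an RCU-freed object from a
 * context that cannot take the lock normally protecting it, and only use
 * the object if its existing refcount can be bumped from non-zero.
 * struct my_buf_list, my_lookup_bl() and my_put_bl() are hypothetical.
 */
#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/xarray.h>

struct my_buf_list {
	atomic_t	refs;		/* existing reference count */
	void		*buf_ring;	/* memory backing the mapped ring */
	struct rcu_head	rcu;		/* object is freed via kfree_rcu() */
};

static struct my_buf_list *my_lookup_bl(struct xarray *xa, unsigned int bgid)
{
	struct my_buf_list *bl;

	rcu_read_lock();
	bl = xa_load(xa, bgid);
	/*
	 * RCU keeps the memory valid for the duration of the read-side
	 * critical section, but the object may already be on its way out.
	 * Only return it if a reference could still be taken.
	 */
	if (bl && !atomic_inc_not_zero(&bl->refs))
		bl = NULL;
	rcu_read_unlock();
	return bl;
}

static void my_put_bl(struct my_buf_list *bl)
{
	/* Last reference gone: the object and its memory may be freed. */
	if (atomic_dec_and_test(&bl->refs))
		kfree_rcu(bl, rcu);
}

In the mmap path this would translate to: take such a reference while validating the request, insert the backing pages into the vma (vm_insert_pages() takes its own references on the pages), then drop the lookup reference. Even if that was the last reference, the mapping stays valid because the vma now pins the pages.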
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--  io_uring/io_uring.c  | 58
1 file changed, 16 insertions(+), 42 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0ef418faac33..50e859da59e1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -303,7 +303,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
- INIT_HLIST_HEAD(&ctx->io_buf_list);
ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
@@ -2598,15 +2597,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void io_pages_unmap(void *ptr, struct page ***pages,
- unsigned short *npages)
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+ bool put_pages)
{
bool do_vunmap = false;
if (!ptr)
return;
- if (*npages) {
+ if (put_pages && *npages) {
struct page **to_free = *pages;
int i;
@@ -2628,14 +2627,6 @@ static void io_pages_unmap(void *ptr, struct page ***pages,
*npages = 0;
}
-void io_mem_free(void *ptr)
-{
- if (!ptr)
- return;
-
- folio_put(virt_to_folio(ptr));
-}
-
static void io_pages_free(struct page ***pages, int npages)
{
struct page **page_array = *pages;
@@ -2730,8 +2721,10 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
static void io_rings_free(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
- io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
- io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
+ io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
+ true);
+ io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
+ true);
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
@@ -2788,8 +2781,8 @@ err:
return ERR_PTR(-ENOMEM);
}
-static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
- size_t size)
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+ size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
struct page **pages;
@@ -2819,17 +2812,6 @@ done:
return ret;
}
-void *io_mem_alloc(size_t size)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
- void *ret;
-
- ret = (void *) __get_free_pages(gfp, get_order(size));
- if (ret)
- return ret;
- return ERR_PTR(-ENOMEM);
-}
-
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset)
{
@@ -2926,7 +2908,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
io_rings_free(ctx);
- io_kbuf_mmap_list_free(ctx);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
@@ -3396,10 +3377,8 @@ static void *io_uring_validate_mmap_request(struct file *file,
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
- struct page *page;
- void *ptr;
- switch (offset & IORING_OFF_MMAP_MASK) {
+ switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
/* Don't allow mmap if the ring was setup without it */
@@ -3414,6 +3393,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
case IORING_OFF_PBUF_RING: {
struct io_buffer_list *bl;
unsigned int bgid;
+ void *ptr;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
bl = io_pbuf_get_bl(ctx, bgid);
@@ -3421,17 +3401,11 @@ static void *io_uring_validate_mmap_request(struct file *file,
return bl;
ptr = bl->buf_ring;
io_put_bl(ctx, bl);
- break;
+ return ptr;
}
- default:
- return ERR_PTR(-EINVAL);
}
- page = virt_to_head_page(ptr);
- if (sz > page_size(page))
- return ERR_PTR(-EINVAL);
-
- return ptr;
+ return ERR_PTR(-EINVAL);
}
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
@@ -3450,7 +3424,6 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
long offset = vma->vm_pgoff << PAGE_SHIFT;
- unsigned long pfn;
void *ptr;
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
@@ -3465,10 +3438,11 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
case IORING_OFF_SQES:
return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
ctx->n_sqe_pages);
+ case IORING_OFF_PBUF_RING:
+ return io_pbuf_mmap(file, vma);
}
- pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+ return -EINVAL;
}
static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,