Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--  fs/io_uring.c  446
1 file changed, 348 insertions, 98 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index 0e9fb2cb1984..fdc18321d70c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4,15 +4,28 @@ * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. When the application reads the CQ ring - * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() - * the kernel uses after writing the tail. Failure to do so could cause a - * delay in when the application notices that completion events available. - * This isn't a fatal condition. Likewise, the application must use an - * appropriate smp_wmb() both before writing the SQ tail, and after writing - * the SQ tail. The first one orders the sqe writes with the tail write, and - * the latter is paired with the smp_rmb() the kernel will issue before - * reading the SQ tail on submission. + * the application and kernel side. + * + * After the application reads the CQ ring tail, it must use an + * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses + * before writing the tail (using smp_load_acquire to read the tail will + * do). It also needs a smp_mb() before updating CQ head (ordering the + * entry load(s) with the head store), pairing with an implicit barrier + * through a control-dependency in io_get_cqring (smp_store_release to + * store head will do). Failure to do so could lead to reading invalid + * CQ entries. + * + * Likewise, the application must use an appropriate smp_wmb() before + * writing the SQ tail (ordering SQ entry stores with the tail store), + * which pairs with smp_load_acquire in io_get_sqring (smp_store_release + * to store the tail will do). And it needs a barrier ordering the SQ + * head load before writing new SQ entries (smp_load_acquire to read + * head will do). + * + * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application + * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* + * updating the SQ tail; a full memory barrier smp_mb() is needed + * between. * * Also see the examples in the liburing library: * @@ -70,20 +83,108 @@ struct io_uring { u32 tail ____cacheline_aligned_in_smp; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_SQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ struct io_sq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head and the application controls tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ u32 dropped; + /* + * Runtime flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 
+ */ u32 flags; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ u32 array[]; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_cqring_offsets when calling io_uring_setup. + */ struct io_cq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The application controls head and the kernel tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending thatn there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ u32 overflow; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ struct io_uring_cqe cqes[]; }; @@ -121,6 +222,8 @@ struct io_ring_ctx { unsigned sq_mask; unsigned sq_thread_idle; struct io_uring_sqe *sq_sqes; + + struct list_head defer_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -138,6 +241,7 @@ struct io_ring_ctx { unsigned cq_mask; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; + struct eventfd_ctx *cq_ev_fd; } ____cacheline_aligned_in_smp; /* @@ -221,13 +325,16 @@ struct io_kiocb { struct list_head list; unsigned int flags; refcount_t refs; -#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ #define REQ_F_PREPPED 16 /* prep already done */ +#define REQ_F_IO_DRAIN 32 /* drain existing IO first */ +#define REQ_F_IO_DRAINED 64 /* drain done */ u64 user_data; - u64 error; + u32 error; /* iopoll result from callback */ + u32 sequence; struct work_struct work; }; @@ -255,6 +362,8 @@ struct io_submit_state { unsigned int ios_left; }; +static void io_sq_wq_submit_work(struct work_struct *work); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -306,10 +415,36 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); + INIT_LIST_HEAD(&ctx->defer_list); return ctx; } -static void io_commit_cqring(struct io_ring_ctx *ctx) +static inline bool io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + return false; + + return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped; +} + +static 
struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + if (list_empty(&ctx->defer_list)) + return NULL; + + req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); + if (!io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; +} + +static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_cq_ring *ring = ctx->cq_ring; @@ -317,12 +452,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&ring->r.tail, ctx->cached_cq_tail); - /* - * Write sider barrier of tail update, app has read side. See - * comment at the top of this file. - */ - smp_wmb(); - if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); @@ -330,14 +459,29 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) } } +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + __io_commit_cqring(ctx); + + while ((req = io_get_deferred_req(ctx)) != NULL) { + req->flags |= REQ_F_IO_DRAINED; + queue_work(ctx->sqo_wq, &req->work); + } +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_cq_ring *ring = ctx->cq_ring; unsigned tail; tail = ctx->cached_cq_tail; - /* See comment at the top of the file */ - smp_rmb(); + /* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) return NULL; @@ -373,6 +517,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); + if (ctx->cq_ev_fd) + eventfd_signal(ctx->cq_ev_fd, 1); } static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, @@ -774,10 +920,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; - if (force_nonblock) { + + /* don't allow async punt if RWF_NOWAIT was requested */ + if (kiocb->ki_flags & IOCB_NOWAIT) + req->flags |= REQ_F_NOWAIT; + + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; - req->flags |= REQ_F_FORCE_NONBLOCK; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -1120,6 +1270,54 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret = 0; + + if (!req->file) + return -EBADF; + /* Prep already done (EAGAIN retry) */ + if (req->flags & REQ_F_PREPPED) + return 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + return -EINVAL; + + req->flags |= REQ_F_PREPPED; + return ret; +} + +static int io_sync_file_range(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + loff_t sqe_off; + loff_t sqe_len; + unsigned flags; + int ret; + + ret = io_prep_sfr(req, sqe); + if (ret) + return ret; + + /* sync_file_range always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + + sqe_off = READ_ONCE(sqe->off); + sqe_len = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->sync_range_flags); + + ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); + + 
io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_put_req(req); + return 0; +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -1322,7 +1520,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock(&poll->head->lock); } if (mask) { /* no async, we'd stolen it */ - req->error = mangle_poll(mask); ipt.error = 0; io_poll_complete(ctx, req, mask); } @@ -1335,6 +1532,34 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_sqe *sqe_copy; + + if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) + return 0; + + sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + if (!sqe_copy) + return -EAGAIN; + + spin_lock_irq(&ctx->completion_lock); + if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) { + spin_unlock_irq(&ctx->completion_lock); + kfree(sqe_copy); + return 0; + } + + memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); + req->submit.sqe = sqe_copy; + + INIT_WORK(&req->work, io_sq_wq_submit_work); + list_add_tail(&req->list, &ctx->defer_list); + spin_unlock_irq(&ctx->completion_lock); + return -EIOCBQUEUED; +} + static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -1374,6 +1599,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_POLL_REMOVE: ret = io_poll_remove(req, s->sqe); break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_sync_file_range(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; @@ -1436,8 +1664,7 @@ restart: struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; - /* Ensure we clear previously set forced non-block flag */ - req->flags &= ~REQ_F_FORCE_NONBLOCK; + /* Ensure we clear previously set non-block flag */ req->rw.ki_flags &= ~IOCB_NOWAIT; ret = 0; @@ -1467,10 +1694,11 @@ restart: break; cond_resched(); } while (1); - - /* drop submission reference */ - io_put_req(req); } + + /* drop submission reference */ + io_put_req(req); + if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); io_put_req(req); @@ -1582,6 +1810,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, flags = READ_ONCE(s->sqe->flags); fd = READ_ONCE(s->sqe->fd); + if (flags & IOSQE_IO_DRAIN) { + req->flags |= REQ_F_IO_DRAIN; + req->sequence = ctx->cached_sq_head - 1; + } + if (!io_op_needs_file(s->sqe)) { req->file = NULL; return 0; @@ -1611,7 +1844,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, int ret; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) + if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN))) return -EINVAL; req = io_get_req(ctx, state); @@ -1622,8 +1855,15 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(ret)) goto out; + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret == -EIOCBQUEUED) + ret = 0; + return ret; + } + ret = __io_submit_sqe(ctx, req, s, true); - if (ret == -EAGAIN) { + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); @@ -1697,24 +1937,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * write new data to them. 
*/ smp_store_release(&ring->r.head, ctx->cached_sq_head); - - /* - * write side barrier of head update, app has read side. See - * comment at the top of this file - */ - smp_wmb(); } } /* - * Undo last io_get_sqring() - */ -static void io_drop_sqring(struct io_ring_ctx *ctx) -{ - ctx->cached_sq_head--; -} - -/* * Fetch an sqe, if one is available. Note that s->sqe will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always @@ -1736,8 +1962,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) * though the application is the one updating it. */ head = ctx->cached_sq_head; - /* See comment at the top of this file */ - smp_rmb(); /* make sure SQ entry isn't read before tail */ if (head == smp_load_acquire(&ring->r.tail)) return false; @@ -1753,8 +1977,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; ring->dropped++; - /* See comment at the top of this file */ - smp_wmb(); return false; } @@ -1878,13 +2100,11 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); continue; } finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); } i = 0; @@ -1929,7 +2149,7 @@ static int io_sq_thread(void *data) static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; - int i, ret = 0, submit = 0; + int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, to_submit); @@ -1938,6 +2158,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; + int ret; if (!io_get_sqring(ctx, &s)) break; @@ -1945,21 +2166,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; + submit++; ret = io_submit_sqe(ctx, &s, statep); - if (ret) { - io_drop_sqring(ctx); - break; - } - - submit++; + if (ret) + io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); } io_commit_sqring(ctx); if (statep) io_submit_state_end(statep); - return submit ? 
submit : ret; + return submit; } static unsigned io_cqring_events(struct io_cq_ring *ring) @@ -2145,7 +2363,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) left = ctx->nr_user_files; while (left) { unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - int ret; ret = __io_sqe_files_scm(ctx, this_files, total); if (ret) @@ -2240,10 +2457,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, mmgrab(current->mm); ctx->sqo_mm = current->mm; - ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) - goto err; - if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) @@ -2254,11 +2467,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu; + int cpu = array_index_nospec(p->sq_thread_cpu, + nr_cpu_ids); - cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) + if (!cpu_online(cpu)) goto err; ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, @@ -2321,8 +2534,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) static void io_mem_free(void *ptr) { - struct page *page = virt_to_head_page(ptr); + struct page *page; + if (!ptr) + return; + + page = virt_to_head_page(ptr); if (put_page_testzero(page)) free_compound_page(page); } @@ -2363,7 +2580,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); - kfree(imu->bvec); + kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -2455,9 +2672,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, if (!pages || nr_pages > got_pages) { kfree(vmas); kfree(pages); - pages = kmalloc_array(nr_pages, sizeof(struct page *), + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - vmas = kmalloc_array(nr_pages, + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), GFP_KERNEL); if (!pages || !vmas) { @@ -2469,7 +2686,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, got_pages = nr_pages; } - imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { @@ -2480,8 +2697,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { @@ -2508,6 +2726,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, } if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); + kvfree(imu->bvec); goto err; } @@ -2530,16 +2749,48 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ctx->nr_user_bufs++; } - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); return 0; err: - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); io_sqe_buffer_unregister(ctx); return ret; } +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) +{ + __s32 __user *fds = arg; + int fd; + + if (ctx->cq_ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ctx->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ctx->cq_ev_fd)) { + int ret = PTR_ERR(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; + return ret; + 
} + + return 0; +} + +static int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + if (ctx->cq_ev_fd) { + eventfd_ctx_put(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; + return 0; + } + + return -ENXIO; +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); @@ -2549,6 +2800,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); + io_eventfd_unregister(ctx); #if defined(CONFIG_UNIX) if (ctx->ring_sock) @@ -2573,7 +2825,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) __poll_t mask = 0; poll_wait(file, &ctx->cq_wait, wait); - /* See comment at the top of this file */ + /* + * synchronizes with barrier from wq_has_sleeper call in + * io_commit_cqring + */ smp_rmb(); if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != ctx->sq_ring->ring_entries) @@ -2687,24 +2942,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - - if (submitted < 0) - goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; min_complete = min(min_complete, ctx->cq_entries); - /* - * The application could have included the 'to_submit' count - * in how many events it wanted to wait for. If we failed to - * submit the desired count, we may need to adjust the number - * of events to poll/wait for. - */ - if (submitted < to_submit) - min_complete = min_t(unsigned, submitted, min_complete); - if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); @@ -2750,17 +2993,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->sq_ring); + if (!ctx->sq_sqes) return -ENOMEM; - } cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) { - io_mem_free(ctx->sq_ring); - io_mem_free(ctx->sq_sqes); + if (!cq_ring) return -ENOMEM; - } ctx->cq_ring = cq_ring; cq_ring->ring_mask = p->cq_entries - 1; @@ -2976,6 +3214,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_files_unregister(ctx); break; + case IORING_REGISTER_EVENTFD: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_eventfd_register(ctx, arg); + break; + case IORING_UNREGISTER_EVENTFD: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_eventfd_unregister(ctx); + break; default: ret = -EINVAL; break; |
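The reworked comment block at the top of fs/io_uring.c spells out which barriers the application side needs: acquire when loading a head or tail the kernel writes, release when publishing a head or tail the kernel reads, and a full barrier between the SQ tail update and the IORING_SQ_NEED_WAKEUP check under IORING_SETUP_SQPOLL. Below is a minimal userspace sketch of both ring ends, using the GCC/Clang __atomic builtins as stand-ins for smp_load_acquire()/smp_store_release(); the app_sq/app_cq structures are invented for the example and merely collect pointers into the two mmapped regions whose real offsets come from io_uring_setup().

#include <linux/io_uring.h>

/* Hypothetical handles onto the mmapped rings (offsets from io_uring_setup). */
struct app_sq {
	unsigned *head;			/* written by the kernel */
	unsigned *tail;			/* written by the application */
	unsigned *ring_mask;
	unsigned *flags;		/* holds IORING_SQ_NEED_WAKEUP */
	unsigned *array;		/* indices into the sqe array */
	struct io_uring_sqe *sqes;	/* mmapped at IORING_OFF_SQES */
};

struct app_cq {
	unsigned *head;			/* written by the application */
	unsigned *tail;			/* written by the kernel */
	unsigned *ring_mask;
	struct io_uring_cqe *cqes;
};

/* Queue one sqe; returns 1 if the SQ thread needs a wakeup, -1 if full. */
static int app_submit_one(struct app_sq *sq, const struct io_uring_sqe *src)
{
	unsigned tail = *sq->tail;
	/* acquire: order the head load before the sqe stores below */
	unsigned head = __atomic_load_n(sq->head, __ATOMIC_ACQUIRE);

	if (tail - head > *sq->ring_mask)
		return -1;				/* ring is full */

	unsigned idx = tail & *sq->ring_mask;
	sq->sqes[idx] = *src;				/* fill the entry */
	sq->array[idx] = idx;				/* publish its index */

	/* release: order the sqe/array stores before the new tail */
	__atomic_store_n(sq->tail, tail + 1, __ATOMIC_RELEASE);

	/* SQPOLL only: full barrier between tail update and the flags check */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);
	return (*sq->flags & IORING_SQ_NEED_WAKEUP) ? 1 : 0;
}

/* Reap one cqe; returns 0 if none are pending. */
static int app_reap_one(struct app_cq *cq, struct io_uring_cqe *out)
{
	unsigned head = *cq->head;
	/* acquire: order the tail load before reading the cqe contents */
	unsigned tail = __atomic_load_n(cq->tail, __ATOMIC_ACQUIRE);

	if (head == tail)
		return 0;
	*out = cq->cqes[head & *cq->ring_mask];
	/* release: only now may the kernel overwrite this cqe slot */
	__atomic_store_n(cq->head, head + 1, __ATOMIC_RELEASE);
	return 1;
}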
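The new IOSQE_IO_DRAIN flag pairs naturally with the new IORING_OP_SYNC_FILE_RANGE opcode: a drained request sits on defer_list until every previously submitted request has completed, so a range sync can be ordered after a batch of writes without stalling in userspace. A sketch of preparing such an sqe (the helper name and the user_data tag are made up; filling and submitting it goes through the ring as in the sketch above):

#include <string.h>
#include <linux/io_uring.h>

static void prep_drained_sync(struct io_uring_sqe *sqe, int fd,
			      __u64 off, __u32 len, __u32 sync_flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
	sqe->fd = fd;
	sqe->off = off;
	sqe->len = len;
	sqe->sync_range_flags = sync_flags;	/* e.g. SYNC_FILE_RANGE_WRITE */
	sqe->flags = IOSQE_IO_DRAIN;		/* wait for all prior sqes first */
	sqe->user_data = 42;			/* arbitrary completion tag */
}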
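IORING_REGISTER_EVENTFD/IORING_UNREGISTER_EVENTFD let the application attach an eventfd that io_cqring_ev_posted() signals for every posted completion, which is the straightforward way to fold a ring into an existing poll/epoll loop. A registration sketch, assuming raw syscall(2) access rather than any library wrapper:

#include <unistd.h>
#include <sys/syscall.h>
#include <sys/eventfd.h>
#include <linux/io_uring.h>

/* Returns the registered eventfd, or -1 on error. */
static int register_cq_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	if (efd < 0)
		return -1;

	/* arg points at a single __s32 descriptor, nr_args must be 1 */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
		    &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* becomes readable whenever a cqe is posted */
}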