diff options
Diffstat (limited to 'io_uring')
-rw-r--r-- | io_uring/Makefile | 10 | ||||
-rw-r--r-- | io_uring/advise.c | 16 | ||||
-rw-r--r-- | io_uring/cancel.h | 4 | ||||
-rw-r--r-- | io_uring/eventfd.c | 161 | ||||
-rw-r--r-- | io_uring/eventfd.h | 8 | ||||
-rw-r--r-- | io_uring/fdinfo.c | 17 | ||||
-rw-r--r-- | io_uring/io-wq.c | 64 | ||||
-rw-r--r-- | io_uring/io-wq.h | 2 | ||||
-rw-r--r-- | io_uring/io_uring.c | 429 | ||||
-rw-r--r-- | io_uring/io_uring.h | 36 | ||||
-rw-r--r-- | io_uring/kbuf.c | 105 | ||||
-rw-r--r-- | io_uring/kbuf.h | 94 | ||||
-rw-r--r-- | io_uring/memmap.c | 7 | ||||
-rw-r--r-- | io_uring/msg_ring.c | 122 | ||||
-rw-r--r-- | io_uring/msg_ring.h | 1 | ||||
-rw-r--r-- | io_uring/napi.c | 82 | ||||
-rw-r--r-- | io_uring/napi.h | 18 | ||||
-rw-r--r-- | io_uring/net.c | 176 | ||||
-rw-r--r-- | io_uring/net.h | 6 | ||||
-rw-r--r-- | io_uring/opdef.c | 39 | ||||
-rw-r--r-- | io_uring/opdef.h | 4 | ||||
-rw-r--r-- | io_uring/poll.c | 1 | ||||
-rw-r--r-- | io_uring/register.c | 160 | ||||
-rw-r--r-- | io_uring/register.h | 1 | ||||
-rw-r--r-- | io_uring/rsrc.c | 323 | ||||
-rw-r--r-- | io_uring/rsrc.h | 15 | ||||
-rw-r--r-- | io_uring/rw.c | 76 | ||||
-rw-r--r-- | io_uring/sqpoll.c | 39 | ||||
-rw-r--r-- | io_uring/statx.c | 3 | ||||
-rw-r--r-- | io_uring/timeout.c | 2 | ||||
-rw-r--r-- | io_uring/uring_cmd.c | 9 | ||||
-rw-r--r-- | io_uring/xattr.c | 4 |
32 files changed, 1375 insertions, 659 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile index fc1b23c524e8..53167bef37d7 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,11 +2,15 @@ # # Makefile for io_uring +ifdef CONFIG_GCOV_PROFILE_URING +GCOV_PROFILE := y +endif + obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ tctx.o filetable.o rw.o net.o poll.o \ - uring_cmd.o openclose.o sqpoll.o \ - xattr.o nop.o fs.o splice.o sync.o \ - msg_ring.o advise.o openclose.o \ + eventfd.o uring_cmd.o openclose.o \ + sqpoll.o xattr.o nop.o fs.o splice.o \ + sync.o msg_ring.o advise.o openclose.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o diff --git a/io_uring/advise.c b/io_uring/advise.c index 7085804c513c..cb7b881665e5 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -17,14 +17,14 @@ struct io_fadvise { struct file *file; u64 offset; - u32 len; + u64 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; - u32 len; + u64 len; u32 advice; }; @@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); + ma->len = READ_ONCE(sqe->off); + if (!ma->len) + ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; @@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); + fa->len = READ_ONCE(sqe->addr); + if (!fa->len) + fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 76b32e65c03c..b33995e00ba9 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -27,10 +27,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { - if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq) + if (req->cancel_seq_set && sequence == req->work.cancel_seq) return true; - req->flags |= REQ_F_CANCEL_SEQ; + req->cancel_seq_set = true; req->work.cancel_seq = sequence; return false; } diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c new file mode 100644 index 000000000000..e37fddd5d9ce --- /dev/null +++ b/io_uring/eventfd.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/eventfd.h> +#include <linux/eventpoll.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> + +#include "io-wq.h" +#include "eventfd.h" + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; + refcount_t refs; + atomic_t ops; +}; + +enum { + IO_EVENTFD_OP_SIGNAL_BIT, +}; + +static void io_eventfd_free(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); +} + +static void io_eventfd_do_signal(struct rcu_head *rcu) +{ + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + + if (refcount_dec_and_test(&ev_fd->refs)) + io_eventfd_free(rcu); +} + +void io_eventfd_signal(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd = NULL; + + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return; + + guard(rcu)(); + + /* + * rcu_dereference ctx->io_ev_fd once and use it for both for checking + * and eventfd_signal + */ + ev_fd = rcu_dereference(ctx->io_ev_fd); + + /* + * Check again if ev_fd exists incase an io_eventfd_unregister call + * completed between the NULL check of ctx->io_ev_fd at the start of + * the function and rcu_read_lock. + */ + if (unlikely(!ev_fd)) + return; + if (!refcount_inc_not_zero(&ev_fd->refs)) + return; + if (ev_fd->eventfd_async && !io_wq_current_is_worker()) + goto out; + + if (likely(eventfd_signal_allowed())) { + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + } else { + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return; + } + } +out: + if (refcount_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx) +{ + bool skip; + + spin_lock(&ctx->completion_lock); + + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count + * only changing IFF a new CQE has been added to the CQ ring. There's + * no depedency on 1:1 relationship between how many times this + * function is called (and hence the eventfd count) and number of CQEs + * posted to the CQ ring. + */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; + + io_eventfd_signal(ctx); +} + +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) +{ + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; + + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + + kfree(ev_fd); + return ret; + } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + refcount_set(&ev_fd->refs, 1); + atomic_set(&ev_fd->ops, 0); + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; +} + +int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + if (refcount_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); + return 0; + } + + return -ENXIO; +} diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h new file mode 100644 index 000000000000..d394f49c6321 --- /dev/null +++ b/io_uring/eventfd.h @@ -0,0 +1,8 @@ + +struct io_ring_ctx; +int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async); +int io_eventfd_unregister(struct io_ring_ctx *ctx); + +void io_eventfd_flush_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b1e0e0d85349..6b1247664b35 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -177,9 +177,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len); } if (has_lock && !xa_empty(&ctx->personalities)) { unsigned long index; @@ -221,7 +220,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) cqe->user_data, cqe->res, cqe->flags); } - spin_unlock(&ctx->completion_lock); + +#ifdef CONFIG_NET_RX_BUSY_POLL + if (ctx->napi_enabled) { + seq_puts(m, "NAPI:\tenabled\n"); + seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); + if (ctx->napi_prefer_busy_poll) + seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); + else + seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); + } else { + seq_puts(m, "NAPI:\tdisabled\n"); + } +#endif } #endif diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index d1c47a9d9215..a38f36b68060 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/rculist_nulls.h> #include <linux/cpu.h> +#include <linux/cpuset.h> #include <linux/task_work.h> #include <linux/audit.h> #include <linux/mmu_context.h> @@ -23,6 +24,7 @@ #include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) +#define WORKER_INIT_LIMIT 3 enum { IO_WORKER_F_UP = 0, /* up and active */ @@ -58,6 +60,7 @@ struct io_worker { unsigned long create_state; struct callback_head create_work; + int init_retries; union { struct rcu_head rcu; @@ -159,7 +162,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, struct io_wq_work *work) { - return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) @@ -451,7 +454,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return work->flags >> IO_WQ_HASH_SHIFT; + return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -592,8 +595,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct, next_hashed = wq_next_work(work); - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) - work->flags |= IO_WQ_WORK_CANCEL; + if (do_kill && + (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -744,7 +748,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data) return true; } -static inline bool io_should_retry_thread(long err) +static inline bool io_should_retry_thread(struct io_worker *worker, long err) { /* * Prevent perpetual task_work retry, if the task (or its group) is @@ -752,6 +756,8 @@ static inline bool io_should_retry_thread(long err) */ if (fatal_signal_pending(current)) return false; + if (worker->init_retries++ >= WORKER_INIT_LIMIT) + return false; switch (err) { case -EAGAIN: @@ -778,7 +784,7 @@ static void create_worker_cont(struct callback_head *cb) io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); @@ -845,7 +851,7 @@ fail: tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { io_init_new_worker(wq, worker, tsk); - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; } else { @@ -891,7 +897,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); work = wq->free_work(work); } while (work); @@ -926,8 +932,12 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); - unsigned long work_flags = work->flags; - struct io_cb_cancel_data match; + unsigned int work_flags = atomic_read(&work->flags); + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; bool do_create; /* @@ -935,7 +945,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) * been marked as one that should not get executed, cancel it here. */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - (work->flags & IO_WQ_WORK_CANCEL)) { + (work_flags & IO_WQ_WORK_CANCEL)) { io_run_cancel(work, wq); return; } @@ -965,10 +975,6 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ - match.fn = io_wq_work_match_item, - match.data = work, - match.cancel_all = false, - io_acct_cancel_pending_work(wq, acct, &match); } } @@ -982,7 +988,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) unsigned int bit; bit = hash_ptr(val, IO_WQ_HASH_ORDER); - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags); } static bool __io_wq_worker_cancel(struct io_worker *worker, @@ -990,7 +996,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, struct io_wq_work *work) { if (work && match->fn(work, match->data)) { - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); __set_notify_signal(worker->task); return true; } @@ -1162,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) goto err; - cpumask_copy(wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(data->task, wq->cpu_mask); wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); @@ -1317,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) { + cpumask_var_t allowed_mask; + int ret = 0; + if (!tctx || !tctx->io_wq) return -EINVAL; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + return -ENOMEM; + rcu_read_lock(); - if (mask) - cpumask_copy(tctx->io_wq->cpu_mask, mask); - else - cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); + cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask); + if (mask) { + if (cpumask_subset(mask, allowed_mask)) + cpumask_copy(tctx->io_wq->cpu_mask, mask); + else + ret = -EINVAL; + } else { + cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask); + } rcu_read_unlock(); - return 0; + free_cpumask_var(allowed_mask); + return ret; } /* diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 2b2a6406dd8e..b3b004a7b625 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void); static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return work->flags & IO_WQ_WORK_HASHED; + return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 86fd72f6a1c2..b2736e3491b8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,12 +95,14 @@ #include "futex.h" #include "napi.h" #include "uring_cmd.h" +#include "msg_ring.h" #include "memmap.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" +#include "eventfd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -314,9 +316,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, sizeof(struct uring_cache)); + spin_lock_init(&ctx->msg_lock); + ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_kiocb)); ret |= io_futex_cache_init(ctx); if (ret) - goto err; + goto free_ref; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -344,12 +349,16 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) io_napi_init(ctx); return ctx; + +free_ref: + percpu_ref_exit(&ctx->refs); err: io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); @@ -461,9 +470,9 @@ static void io_prep_async_work(struct io_kiocb *req) } req->work.list.next = NULL; - req->work.flags = 0; + atomic_set(&req->work.flags, 0); if (req->flags & REQ_F_FORCE_ASYNC) - req->work.flags |= IO_WQ_WORK_CONCURRENT; + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); @@ -479,7 +488,7 @@ static void io_prep_async_work(struct io_kiocb *req) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) - req->work.flags |= IO_WQ_WORK_UNBOUND; + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); } } @@ -519,7 +528,7 @@ static void io_queue_iowq(struct io_kiocb *req) * worker for it). */ if (WARN_ON_ONCE(!same_thread_group(req->task, current))) - req->work.flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); @@ -527,6 +536,17 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +{ + io_queue_iowq(req); +} + +void io_req_queue_iowq(struct io_kiocb *req) +{ + req->io_task_work.func = io_req_queue_iowq_tw; + io_req_task_work_add(req); +} + static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { @@ -541,84 +561,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void io_eventfd_ops(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - int ops = atomic_xchg(&ev_fd->ops, 0); - - if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback - * ordering in a race but if references are 0 we know we have to free - * it regardless. - */ - if (atomic_dec_and_test(&ev_fd->refs)) { - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); - } -} - -static void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd = NULL; - - rcu_read_lock(); - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ - ev_fd = rcu_dereference(ctx->io_ev_fd); - - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call - * completed between the NULL check of ctx->io_ev_fd at the start of - * the function and rcu_read_lock. - */ - if (unlikely(!ev_fd)) - goto out; - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - goto out; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; - - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - } else { - atomic_inc(&ev_fd->refs); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); - else - atomic_dec(&ev_fd->refs); - } - -out: - rcu_read_unlock(); -} - -static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - bool skip; - - spin_lock(&ctx->completion_lock); - - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. - */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - if (skip) - return; - - io_eventfd_signal(ctx); -} - void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) @@ -696,6 +638,21 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) } list_del(&ocqe->list); kfree(ocqe); + + /* + * For silly syzbot cases that deliberately overflow by huge + * amounts, check if we need to resched and drop and + * reacquire the locks if so. Nothing real would ever hit this. + * Ideally we'd have a non-posting unlock for this, but hard + * to care for a non-real case. + */ + if (need_resched()) { + io_cq_unlock_post(ctx); + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + io_cq_lock(ctx); + } } if (list_empty(&ctx->cq_overflow_list)) { @@ -878,20 +835,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { bool filled; - io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + return filled; +} + +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + bool filled; + + io_cq_lock(ctx); + filled = __io_post_aux_cqe(ctx, user_data, res, cflags); io_cq_unlock_post(ctx); return filled; } /* + * Must be called from inline task_work so we now a flush will happen later, + * and obviously with ctx->uring_lock held (tw always has that). + */ +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { + spin_lock(&ctx->completion_lock); + io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + spin_unlock(&ctx->completion_lock); + } + ctx->submit_state.cq_flush = true; +} + +/* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ @@ -953,7 +933,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); io_req_complete_defer(req); @@ -1175,9 +1155,10 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +static inline void io_req_local_work_add(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned flags) { - struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1191,6 +1172,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; + guard(rcu)(); + head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; @@ -1259,8 +1242,8 @@ static void io_req_normal_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sqd = ctx->sq_data; - if (wq_has_sleeper(&sqd->wait)) - wake_up(&sqd->wait); + if (sqd->thread) + __set_notify_signal(sqd->thread); return; } @@ -1272,13 +1255,18 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - } else { + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_req_local_work_add(req, req->ctx, flags); + else io_req_normal_work_add(req); - } +} + +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags) +{ + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + return; + io_req_local_work_add(req, ctx, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) @@ -1467,7 +1455,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) } __io_cq_unlock_post(ctx); - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (!wq_list_empty(&state->compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } @@ -1813,14 +1801,14 @@ void io_wq_submit_work(struct io_wq_work *work) io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) { + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); goto fail; } @@ -1890,7 +1878,7 @@ fail: } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret < 0) + if (ret) io_req_task_queue_fail(req, ret); } @@ -2053,11 +2041,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); - req->flags = (io_req_flags_t) sqe_flags; + req->flags = (__force io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->rsrc_node = NULL; req->task = current; + req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; @@ -2193,7 +2182,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. */ if (unlikely(link->head)) { - trace_io_uring_link(req, link->head); + trace_io_uring_link(req, link->last); link->last->link = req; link->last = req; @@ -2390,22 +2379,92 @@ static bool current_pending_io(void) return percpu_counter_read_positive(&tctx->inflight); } -/* when returns >0, the caller should retry */ -static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq) +static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) { - int ret; + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); - if (unlikely(READ_ONCE(ctx->check_cq))) - return 1; - if (unlikely(!llist_empty(&ctx->work_llist))) - return 1; - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) - return 1; - if (unlikely(task_sigpending(current))) - return -EINTR; - if (unlikely(io_should_wake(iowq))) - return 0; + WRITE_ONCE(iowq->hit_timeout, 1); + iowq->min_timeout = 0; + wake_up_process(iowq->wq.private); + return HRTIMER_NORESTART; +} + +/* + * Doing min_timeout portion. If we saw any timeouts, events, or have work, + * wake up. If not, and we have a normal timeout, switch to that and keep + * sleeping. + */ +static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) +{ + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); + struct io_ring_ctx *ctx = iowq->ctx; + + /* no general timeout, or shorter (or equal), we are done */ + if (iowq->timeout == KTIME_MAX || + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) + goto out_wake; + /* work we may need to run, wake function will see if we need to wake */ + if (io_has_work(ctx)) + goto out_wake; + /* got events since we started waiting, min timeout is done */ + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) + goto out_wake; + /* if we have any events and min timeout expired, we're done */ + if (io_cqring_events(ctx)) + goto out_wake; + + /* + * If using deferred task_work running and application is waiting on + * more than one request, ensure we reset it now where we are switching + * to normal sleeps. Any request completion post min_wait should wake + * the task and return. + */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + if (!llist_empty(&ctx->work_llist)) + goto out_wake; + } + + iowq->t.function = io_cqring_timer_wakeup; + hrtimer_set_expires(timer, iowq->timeout); + return HRTIMER_RESTART; +out_wake: + return io_cqring_timer_wakeup(timer); +} + +static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, + clockid_t clock_id, ktime_t start_time) +{ + ktime_t timeout; + + hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); + if (iowq->min_timeout) { + timeout = ktime_add_ns(iowq->min_timeout, start_time); + iowq->t.function = io_cqring_min_timer_wakeup; + } else { + timeout = iowq->timeout; + iowq->t.function = io_cqring_timer_wakeup; + } + + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); + + if (!READ_ONCE(iowq->hit_timeout)) + schedule(); + + hrtimer_cancel(&iowq->t); + destroy_hrtimer_on_stack(&iowq->t); + __set_current_state(TASK_RUNNING); + + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; +} + +static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ + int ret = 0; /* * Mark us as being in io_wait if we have pending requests, so cpufreq @@ -2414,25 +2473,50 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, */ if (current_pending_io()) current->in_iowait = 1; - ret = 0; - if (iowq->timeout == KTIME_MAX) + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); + else schedule(); - else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) - ret = -ETIME; current->in_iowait = 0; return ret; } +/* If this returns > 0, the caller should retry */ +static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + ktime_t start_time) +{ + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(!llist_empty(&ctx->work_llist))) + return 1; + if (unlikely(task_work_pending(current))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; + + return __io_cqring_wait_schedule(ctx, iowq, start_time); +} + +struct ext_arg { + size_t argsz; + struct __kernel_timespec __user *ts; + const sigset_t __user *sig; + ktime_t min_time; +}; + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ -static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz, - struct __kernel_timespec __user *uts) +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, + struct ext_arg *ext_arg) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; + ktime_t start_time; int ret; if (!io_allowed_run_tw(ctx)) @@ -2450,28 +2534,33 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.hit_timeout = 0; + iowq.min_timeout = ext_arg->min_time; iowq.timeout = KTIME_MAX; + start_time = io_get_time(ctx); - if (uts) { + if (ext_arg->ts) { struct timespec64 ts; - if (get_timespec64(&ts, uts)) + if (get_timespec64(&ts, ext_arg->ts)) return -EFAULT; - iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - io_napi_adjust_timeout(ctx, &iowq, &ts); + iowq.timeout = timespec64_to_ktime(ts); + if (!(flags & IORING_ENTER_ABS_TIMER)) + iowq.timeout = ktime_add(iowq.timeout, start_time); } - if (sig) { + if (ext_arg->sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - sigsz); + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, + ext_arg->argsz); else #endif - ret = set_user_sigmask(sig, sigsz); + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); if (ret) return ret; @@ -2481,8 +2570,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { - int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); unsigned long check_cq; + int nr_wait; + + /* if min timeout has been hit, don't reset wait count */ + if (!iowq.hit_timeout) + nr_wait = (int) iowq.cq_tail - + READ_ONCE(ctx->rings->cq.tail); + else + nr_wait = 1; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, nr_wait); @@ -2492,7 +2588,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq); + ret = io_cqring_wait_schedule(ctx, &iowq, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -2501,9 +2597,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ - io_run_task_work(); if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx, nr_wait); + io_run_task_work(); /* * Non-local task_work will be run on exit to userspace, but @@ -2597,13 +2693,11 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries #endif if (ctx->flags & IORING_SETUP_NO_SQARRAY) { - if (sq_offset) - *sq_offset = SIZE_MAX; + *sq_offset = SIZE_MAX; return off; } - if (sq_offset) - *sq_offset = off; + *sq_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) @@ -2650,6 +2744,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); @@ -3072,8 +3167,11 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) bool loop = false; io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); + inflight = tctx_inflight(tctx, false); if (!inflight) break; @@ -3148,9 +3246,8 @@ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t a return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, - struct __kernel_timespec __user **ts, - const sigset_t __user **sig) +static int io_get_ext_arg(unsigned flags, const void __user *argp, + struct ext_arg *ext_arg) { struct io_uring_getevents_arg arg; @@ -3159,8 +3256,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { - *sig = (const sigset_t __user *) argp; - *ts = NULL; + ext_arg->sig = (const sigset_t __user *) argp; + ext_arg->ts = NULL; return 0; } @@ -3168,15 +3265,14 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ - if (*argsz != sizeof(arg)) + if (ext_arg->argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; - if (arg.pad) - return -EINVAL; - *sig = u64_to_user_ptr(arg.sigmask); - *argsz = arg.sigmask_sz; - *ts = u64_to_user_ptr(arg.ts); + ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(arg.sigmask); + ext_arg->argsz = arg.sigmask_sz; + ext_arg->ts = u64_to_user_ptr(arg.ts); return 0; } @@ -3190,7 +3286,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | - IORING_ENTER_REGISTERED_RING))) + IORING_ENTER_REGISTERED_RING | + IORING_ENTER_ABS_TIMER))) return -EINVAL; /* @@ -3281,15 +3378,14 @@ iopoll_locked: } mutex_unlock(&ctx->uring_lock); } else { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; + struct ext_arg ext_arg = { .argsz = argsz }; - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + ret2 = io_get_ext_arg(flags, argp, &ext_arg); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); - ret2 = io_cqring_wait(ctx, min_complete, sig, - argsz, ts); + ret2 = io_cqring_wait(ctx, min_complete, flags, + &ext_arg); } } @@ -3460,6 +3556,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (!ctx) return -ENOMEM; + ctx->clockid = CLOCK_MONOTONIC; + ctx->clock_offset = 0; + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) @@ -3571,7 +3670,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE; + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3674,6 +3773,11 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __init io_uring_init(void) { + struct kmem_cache_args kmem_args = { + .useroffset = offsetof(struct io_kiocb, cmd.data), + .usersize = sizeof_field(struct io_kiocb, cmd.data), + }; + #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ @@ -3758,12 +3862,9 @@ static int __init io_uring_init(void) * range, and HARDENED_USERCOPY will complain if we haven't * correctly annotated this range. */ - req_cachep = kmem_cache_create_usercopy("io_kiocb", - sizeof(struct io_kiocb), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, - offsetof(struct io_kiocb, cmd.data), - sizeof_field(struct io_kiocb, cmd.data), NULL); + req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU); io_buf_cachep = KMEM_CACHE(io_buffer, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 624ca9076a50..70b6675941ff 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -39,11 +39,15 @@ struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; unsigned cq_tail; + unsigned cq_min_tail; unsigned nr_timeouts; + int hit_timeout; + ktime_t min_timeout; ktime_t timeout; + struct hrtimer t; #ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; #endif }; @@ -65,6 +69,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); @@ -73,6 +78,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); +void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, + unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); @@ -87,6 +94,7 @@ int io_uring_alloc_task_context(struct task_struct *task, int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); +void io_req_queue_iowq(struct io_kiocb *req); int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); @@ -104,12 +112,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -enum { - IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, -}; - -void io_eventfd_ops(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) @@ -282,7 +284,14 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; + /* + * SQPOLL must use the actual sqring head, as using the cached_sq_head + * is race prone if the SQPOLL thread has grabbed entries but not yet + * committed them to the ring. For !SQPOLL, this doesn't matter, but + * since this helper is just used for SQPOLL sqring waits (or POLLOUT), + * just read the actual sqring head unconditionally. + */ + return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries; } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -318,6 +327,7 @@ static inline int io_run_task_work(void) if (current->io_uring) { unsigned int count = 0; + __set_current_state(TASK_RUNNING); tctx_task_work_run(current->io_uring, UINT_MAX, &count); if (count) ret = true; @@ -433,13 +443,21 @@ static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) return true; - if (file_can_poll(req->file)) { + if (req->file && file_can_poll(req->file)) { req->flags |= REQ_F_CAN_POLL; return true; } return false; } +static inline ktime_t io_get_time(struct io_ring_ctx *ctx) +{ + if (ctx->clockid == CLOCK_MONOTONIC) + return ktime_get(); + + return ktime_get_with_offset(ctx->clock_offset); +} + enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d2945c9c812b..d407576ddfb7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return true; } -void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { /* * We can add this buffer back to two lists: @@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - __io_put_kbuf_list(req, &ctx->io_buffers_comp); + __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); spin_unlock(&ctx->completion_lock); } else { lockdep_assert_held(&req->ctx->uring_lock); - __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); } } @@ -129,13 +129,7 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, iov[0].iov_base = buf; iov[0].iov_len = *len; - return 0; -} - -static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, - __u16 head, __u16 mask) -{ - return &br->bufs[head & mask]; + return 1; } static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, @@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). */ - req->flags &= ~REQ_F_BUFFERS_COMMIT; + io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; - bl->head++; } return u64_to_user_ptr(buf->addr); } @@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->is_buf_ring) + if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -218,12 +211,26 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { - int needed; + u32 len = READ_ONCE(buf->len); - needed = (arg->max_len + buf->len - 1) / buf->len; - needed = min(needed, PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; + if (unlikely(!len)) + return -ENOBUFS; + /* + * Limit incremental buffers to 1 segment. No point trying + * to peek ahead and map more than we need, when the buffers + * themselves should be large when setup with + * IOU_PBUF_RING_INC. + */ + if (bl->flags & IOBL_INC) { + nr_avail = 1; + } else { + size_t needed; + + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } } /* @@ -248,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->buf_index = buf->bid; do { - /* truncate end piece, if needed */ - if (buf->len > arg->max_len) - buf->len = arg->max_len; + u32 len = buf->len; + + /* truncate end piece, if needed, for non partial buffers */ + if (len > arg->max_len) { + len = arg->max_len; + if (!(bl->flags & IOBL_INC)) + buf->len = len; + } iov->iov_base = u64_to_user_ptr(buf->addr); - iov->iov_len = buf->len; + iov->iov_len = len; iov++; - arg->out_len += buf->len; - arg->max_len -= buf->len; + arg->out_len += len; + arg->max_len -= len; if (!arg->max_len) break; @@ -284,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, if (unlikely(!bl)) goto out_unlock; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = io_ring_buffers_peek(req, arg, bl); /* * Don't recycle these buffers if we need to go through poll. @@ -294,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, * committed them, they cannot be put back in the queue. */ if (ret > 0) { - req->flags |= REQ_F_BL_NO_RECYCLE; - req->buf_list->head += ret; + req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; + io_kbuf_commit(req, bl, arg->out_len, ret); } } else { ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); @@ -317,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) if (unlikely(!bl)) return -ENOENT; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = io_ring_buffers_peek(req, arg, bl); if (ret > 0) req->flags |= REQ_F_BUFFERS_COMMIT; @@ -337,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { i = bl->buf_ring->tail - bl->head; if (bl->buf_nr_pages) { int j; - if (!bl->is_mmap) { + if (!(bl->flags & IOBL_MMAP)) { for (j = 0; j < bl->buf_nr_pages; j++) unpin_user_page(bl->buf_pages[j]); } io_pages_unmap(bl->buf_ring, &bl->buf_pages, - &bl->buf_nr_pages, bl->is_mmap); - bl->is_mmap = 0; + &bl->buf_nr_pages, bl->flags & IOBL_MMAP); + bl->flags &= ~IOBL_MMAP; } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); - bl->is_buf_ring = 0; + bl->flags &= ~IOBL_BUF_RING; return i; } @@ -439,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -586,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->is_buf_ring) { + if (bl->flags & IOBL_BUF_RING) { ret = -EINVAL; goto err; } @@ -638,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; - bl->is_buf_ring = 1; - bl->is_mmap = 0; + bl->flags |= IOBL_BUF_RING; + bl->flags &= ~IOBL_MMAP; return 0; error_unpin: unpin_user_pages(pages, nr_pages); @@ -657,11 +669,12 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); - if (!bl->buf_ring) + if (IS_ERR(bl->buf_ring)) { + bl->buf_ring = NULL; return -ENOMEM; + } - bl->is_buf_ring = 1; - bl->is_mmap = 1; + bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); return 0; } @@ -678,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (reg.flags & ~IOU_PBUF_RING_MMAP) + if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; if (!(reg.flags & IOU_PBUF_RING_MMAP)) { if (!reg.ring_addr) @@ -700,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->is_buf_ring || !list_empty(&bl->buf_list)) + if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -716,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!ret) { bl->nr_entries = reg.ring_entries; bl->mask = reg.ring_entries - 1; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; io_buffer_add_list(ctx, bl, reg.bgid); return 0; @@ -742,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; xa_erase(&ctx->io_bl_xa, bl->bgid); @@ -766,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) return -ENOENT; - if (!bl->is_buf_ring) + if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; buf_status.head = bl->head; @@ -797,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ ret = false; - if (bl && bl->is_mmap) + if (bl && bl->flags & IOBL_MMAP) ret = atomic_inc_not_zero(&bl->refs); rcu_read_unlock(); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b90aca3a57fa..36aadfe5ac00 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -4,6 +4,16 @@ #include <uapi/linux/io_uring.h> +enum { + /* ring mapped provided buffers */ + IOBL_BUF_RING = 1, + /* ring mapped provided buffers, but mmap'ed by application */ + IOBL_MMAP = 2, + /* buffers are consumed incrementally rather than always fully */ + IOBL_INC = 4, + +}; + struct io_buffer_list { /* * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, @@ -25,12 +35,9 @@ struct io_buffer_list { __u16 head; __u16 mask; - atomic_t refs; + __u16 flags; - /* ring mapped provided buffers */ - __u8 is_buf_ring; - /* ring mapped provided buffers, but mmap'ed by application */ - __u8 is_mmap; + atomic_t refs; }; struct io_buffer { @@ -52,8 +59,8 @@ struct buf_sel_arg { struct iovec *iovs; size_t out_len; size_t max_len; - int nr_iovs; - int mode; + unsigned short nr_iovs; + unsigned short mode; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, @@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); +void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); @@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] + +static inline bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) +{ + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; + + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + + if (bl->flags & IOBL_INC) { + struct io_uring_buf *buf; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + if (WARN_ON_ONCE(len > buf->len)) + len = buf->len; + buf->len -= len; + if (buf->len) { + buf->addr += len; + return false; + } + } + + bl->head += nr; + return true; +} + +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) { struct io_buffer_list *bl = req->buf_list; + bool ret = true; if (bl) { - if (req->flags & REQ_F_BUFFERS_COMMIT) { - bl->head += nr; - req->flags &= ~REQ_F_BUFFERS_COMMIT; - } + ret = io_kbuf_commit(req, bl, len, nr); req->buf_index = bl->bgid; } req->flags &= ~REQ_F_BUFFER_RING; + return ret; } -static inline void __io_put_kbuf_list(struct io_kiocb *req, +static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req, 1); + __io_put_kbuf_ring(req, len, 1); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); @@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; - __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); + /* len == 0 is fine here, non-ring will always drop all of it */ + __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); } -static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, - unsigned issue_flags) +static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, + int nbufs, unsigned issue_flags) { unsigned int ret; @@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, return 0; ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); - if (req->flags & REQ_F_BUFFER_RING) - __io_put_kbuf_ring(req, nbufs); - else - __io_put_kbuf(req, issue_flags); + if (req->flags & REQ_F_BUFFER_RING) { + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + } else { + __io_put_kbuf(req, len, issue_flags); + } return ret; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, +static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - return __io_put_kbufs(req, 1, issue_flags); + return __io_put_kbufs(req, len, 1, issue_flags); } -static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, - unsigned issue_flags) +static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, + int nbufs, unsigned issue_flags) { - return __io_put_kbufs(req, nbufs, issue_flags); + return __io_put_kbufs(req, len, nbufs, issue_flags); } #endif diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 523d982af2b0..a0f32a255fd1 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -244,6 +244,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned int npages; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); @@ -253,8 +254,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, - ctx->n_ring_pages); + npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ctx->n_sqe_pages); @@ -305,7 +306,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else addr = 0UL; #endif - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 81c4a9d43729..7fd9badcfaf8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,9 +11,9 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" +#include "alloc_cache.h" #include "msg_ring.h" - /* All valid masks for MSG_RING */ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ IORING_MSG_RING_FLAGS_PASS) @@ -68,59 +68,70 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - if (!target_ctx->task_complete) - return false; - return current != target_ctx->submitter_task; + return target_ctx->task_complete; } -static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) +static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) { - struct io_ring_ctx *ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct task_struct *task = READ_ONCE(ctx->submitter_task); + struct io_ring_ctx *ctx = req->ctx; - if (unlikely(!task)) - return -EOWNERDEAD; + io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); + if (spin_trylock(&ctx->msg_lock)) { + if (io_alloc_cache_put(&ctx->msg_cache, req)) + req = NULL; + spin_unlock(&ctx->msg_lock); + } + if (req) + kmem_cache_free(req_cachep, req); + percpu_ref_put(&ctx->refs); +} - init_task_work(&msg->tw, func); - if (task_work_add(task, &msg->tw, TWA_SIGNAL)) +static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, + int res, u32 cflags, u64 user_data) +{ + req->task = READ_ONCE(ctx->submitter_task); + if (!req->task) { + kmem_cache_free(req_cachep, req); return -EOWNERDEAD; + } + req->cqe.user_data = user_data; + io_req_set_res(req, res, cflags); + percpu_ref_get(&ctx->refs); + req->ctx = ctx; + req->io_task_work.func = io_msg_tw_complete; + io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + return 0; +} - return IOU_ISSUE_SKIP_COMPLETE; +static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req = NULL; + + if (spin_trylock(&ctx->msg_lock)) { + req = io_alloc_cache_get(&ctx->msg_cache); + spin_unlock(&ctx->msg_lock); + if (req) + return req; + } + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); } -static void io_msg_tw_complete(struct callback_head *head) +static int io_msg_data_remote(struct io_kiocb *req) { - struct io_msg *msg = container_of(head, struct io_msg, tw); - struct io_kiocb *req = cmd_to_io_kiocb(msg); struct io_ring_ctx *target_ctx = req->file->private_data; - int ret = 0; - - if (current->flags & PF_EXITING) { - ret = -EOWNERDEAD; - } else { - u32 flags = 0; - - if (msg->flags & IORING_MSG_RING_FLAGS_PASS) - flags = msg->cqe_flags; - - /* - * If the target ring is using IOPOLL mode, then we need to be - * holding the uring_lock for posting completions. Other ring - * types rely on the regular completion locking, which is - * handled while posting. - */ - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = -EOVERFLOW; - if (target_ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&target_ctx->uring_lock); - } + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct io_kiocb *target; + u32 flags = 0; - if (ret < 0) - req_set_fail(req); - io_req_queue_tw_complete(req, ret); + target = io_msg_get_kiocb(req->ctx); + if (unlikely(!target)) + return -ENOMEM; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + + return io_msg_remote_post(target_ctx, target, msg->len, flags, + msg->user_data); } static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) @@ -138,7 +149,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) return -EBADFD; if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_complete); + return io_msg_data_remote(req); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; @@ -218,6 +229,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head) io_req_queue_tw_complete(req, ret); } +static int io_msg_fd_remote(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct task_struct *task = READ_ONCE(ctx->submitter_task); + + if (unlikely(!task)) + return -EOWNERDEAD; + + init_task_work(&msg->tw, io_msg_tw_fd_complete); + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) + return -EOWNERDEAD; + + return IOU_ISSUE_SKIP_COMPLETE; +} + static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; @@ -240,7 +267,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) } if (io_msg_need_remote(target_ctx)) - return io_msg_exec_remote(req, io_msg_tw_fd_complete); + return io_msg_fd_remote(req); return io_msg_install_complete(req, issue_flags); } @@ -294,3 +321,10 @@ done: io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_msg_cache_free(const void *entry) +{ + struct io_kiocb *req = (struct io_kiocb *) entry; + + kmem_cache_free(req_cachep, req); +} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index 3987ee6c0e5f..3030f3942f0f 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -3,3 +3,4 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); void io_msg_ring_cleanup(struct io_kiocb *req); +void io_msg_cache_free(const void *entry); diff --git a/io_uring/napi.c b/io_uring/napi.c index 883a1a665907..d0cf694d0172 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -26,13 +26,18 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, hlist_for_each_entry_rcu(e, hash_list, node) { if (e->napi_id != napi_id) continue; - e->timeout = jiffies + NAPI_TIMEOUT; return e; } return NULL; } +static inline ktime_t net_to_ktime(unsigned long t) +{ + /* napi approximating usecs, reverse busy_loop_current_time */ + return ns_to_ktime(t << 10); +} + void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) { struct hlist_head *hash_list; @@ -102,14 +107,14 @@ static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) __io_napi_remove_stale(ctx); } -static inline bool io_napi_busy_loop_timeout(unsigned long start_time, - unsigned long bp_usec) +static inline bool io_napi_busy_loop_timeout(ktime_t start_time, + ktime_t bp) { - if (bp_usec) { - unsigned long end_time = start_time + bp_usec; - unsigned long now = busy_loop_current_time(); + if (bp) { + ktime_t end_time = ktime_add(start_time, bp); + ktime_t now = net_to_ktime(busy_loop_current_time()); - return time_after(now, end_time); + return ktime_after(now, end_time); } return true; @@ -124,7 +129,8 @@ static bool io_napi_busy_loop_should_end(void *data, return true; if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return true; - if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) + if (io_napi_busy_loop_timeout(net_to_ktime(start_time), + iowq->napi_busy_poll_dt)) return true; return false; @@ -181,10 +187,12 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, */ void io_napi_init(struct io_ring_ctx *ctx) { + u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; + INIT_LIST_HEAD(&ctx->napi_list); spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; - ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); + ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); } /* @@ -196,7 +204,6 @@ void io_napi_init(struct io_ring_ctx *ctx) void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - LIST_HEAD(napi_list); unsigned int i; spin_lock(&ctx->napi_lock); @@ -217,11 +224,13 @@ void io_napi_free(struct io_ring_ctx *ctx) int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; struct io_uring_napi napi; + if (ctx->flags & IORING_SETUP_IOPOLL) + return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) @@ -230,7 +239,7 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); + WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); WRITE_ONCE(ctx->napi_enabled, true); return 0; @@ -247,62 +256,40 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, 0); + WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); WRITE_ONCE(ctx->napi_enabled, false); return 0; } /* - * __io_napi_adjust_timeout() - Add napi id to the busy poll list + * __io_napi_busy_loop() - execute busy poll loop * @ctx: pointer to io-uring context structure * @iowq: pointer to io wait queue - * @ts: pointer to timespec or NULL * - * Adjust the busy loop timeout according to timespec and busy poll timeout. + * Execute the busy poll loop and merge the spliced off list. */ -void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) +void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { - unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); - - if (ts) { - struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + if (ctx->flags & IORING_SETUP_SQPOLL) + return; - if (timespec64_compare(ts, &poll_to_ts) > 0) { - *ts = timespec64_sub(*ts, poll_to_ts); - } else { - u64 to = timespec64_to_ns(ts); + iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); + if (iowq->timeout != KTIME_MAX) { + ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); - do_div(to, 1000); - ts->tv_sec = 0; - ts->tv_nsec = 0; - } + iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); } - iowq->napi_busy_poll_to = poll_to; -} - -/* - * __io_napi_busy_loop() - execute busy poll loop - * @ctx: pointer to io-uring context structure - * @iowq: pointer to io wait queue - * - * Execute the busy poll loop and merge the spliced off list. - */ -void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) -{ iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); - - if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled) - io_napi_blocking_busy_loop(ctx, iowq); + io_napi_blocking_busy_loop(ctx, iowq); } /* @@ -313,10 +300,9 @@ void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) */ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { - LIST_HEAD(napi_list); bool is_stale = false; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return 0; if (list_empty_careful(&ctx->napi_list)) return 0; diff --git a/io_uring/napi.h b/io_uring/napi.h index 6fc0393d0dbe..fd275ef0456d 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); -void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, struct timespec64 *ts); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx) return !list_empty(&ctx->napi_list); } -static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - struct timespec64 *ts) -{ - if (!io_napi(ctx)) - return; - __io_napi_adjust_timeout(ctx, iowq, ts); -} - static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { @@ -55,7 +44,7 @@ static inline void io_napi_add(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct socket *sock; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_enabled)) return; sock = sock_from_file(req->file); @@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx) static inline void io_napi_add(struct io_kiocb *req) { } -static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - struct timespec64 *ts) -{ -} static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { diff --git a/io_uring/net.c b/io_uring/net.c index 070dea9a4eda..18507658a921 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -51,6 +51,16 @@ struct io_connect { bool seen_econnaborted; }; +struct io_bind { + struct file *file; + int addr_len; +}; + +struct io_listen { + struct file *file; + int backlog; +}; + struct io_sr_msg { struct file *file; union { @@ -424,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->buf_group = req->buf_index; req->buf_list = NULL; } - if (req->flags & REQ_F_BUFFER_SELECT && sr->len) - return -EINVAL; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -489,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, unsigned int cflags; if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { - cflags = io_put_kbuf(req, issue_flags); + cflags = io_put_kbuf(req, *ret, issue_flags); goto finish; } - cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); + cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); if (bundle_finished || req->flags & REQ_F_BL_EMPTY) goto finish; @@ -589,30 +597,41 @@ retry_bundle: if (io_do_buffer_select(req)) { struct buf_sel_arg arg = { .iovs = &kmsg->fast_iov, - .max_len = INT_MAX, + .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, - .mode = KBUF_MODE_EXPAND, }; if (kmsg->free_iov) { arg.nr_iovs = kmsg->free_iov_nr; arg.iovs = kmsg->free_iov; - arg.mode |= KBUF_MODE_FREE; + arg.mode = KBUF_MODE_FREE; } if (!(sr->flags & IORING_RECVSEND_BUNDLE)) arg.nr_iovs = 1; + else + arg.mode |= KBUF_MODE_EXPAND; ret = io_buffers_select(req, &arg, issue_flags); if (unlikely(ret < 0)) return ret; - sr->len = arg.out_len; - iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, - arg.out_len); if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { kmsg->free_iov_nr = ret; kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + sr->len = arg.out_len; + + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } else { + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, + arg.iovs, ret, arg.out_len); } } @@ -817,20 +836,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, bool mshot_finished, unsigned issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - unsigned int cflags; - - if (sr->flags & IORING_RECVSEND_BUNDLE) - cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), - issue_flags); - else - cflags = io_put_kbuf(req, issue_flags); + unsigned int cflags = 0; if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - /* bundle with no more immediate buffers, we're done */ - if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) - goto finish; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), + issue_flags); + /* bundle with no more immediate buffers, we're done */ + if (req->flags & REQ_F_BL_EMPTY) + goto finish; + } else { + cflags |= io_put_kbuf(req, *ret, issue_flags); + } /* * Fill CQE for this receive and see if we should keep trying to @@ -1084,6 +1103,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { kmsg->free_iov_nr = ret; kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; } } else { void __user *buf; @@ -1113,6 +1133,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; size_t len = sr->len; + bool mshot_finished; if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) @@ -1129,13 +1150,15 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) retry_multishot: if (io_do_buffer_select(req)) { ret = io_recv_buf_select(req, kmsg, &len, issue_flags); - if (unlikely(ret)) + if (unlikely(ret)) { + kmsg->msg.msg_inq = -1; goto out_free; + } sr->buf = NULL; } - kmsg->msg.msg_inq = -1; kmsg->msg.msg_flags = 0; + kmsg->msg.msg_inq = -1; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); @@ -1165,6 +1188,7 @@ out_free: req_set_fail(req); } + mshot_finished = ret <= 0; if (ret > 0) ret += sr->done_io; else if (sr->done_io) @@ -1172,7 +1196,7 @@ out_free: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) goto retry_multishot; return ret; @@ -1265,14 +1289,14 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); } -static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, +static int io_sg_from_iter_iovec(struct sk_buff *skb, struct iov_iter *from, size_t length) { skb_zcopy_downgrade_managed(skb); - return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + return zerocopy_fill_skb_from_iter(skb, from, length); } -static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, +static int io_sg_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -1285,7 +1309,7 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, if (!frag) shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; else if (unlikely(!skb_zcopy_managed(skb))) - return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + return zerocopy_fill_skb_from_iter(skb, from, length); bi.bi_size = min(from->count, length); bi.bi_bvec_done = from->iov_offset; @@ -1312,14 +1336,6 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - - if (sk && sk->sk_type == SOCK_STREAM) { - sk_wmem_queued_add(sk, truesize); - if (!skb_zcopy_pure(skb)) - sk_mem_charge(sk, truesize); - } else { - refcount_add(truesize, &skb->sk->sk_wmem_alloc); - } return ret; } @@ -1528,9 +1544,12 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; bool fixed = !!accept->file_slot; + struct proto_accept_arg arg = { + .flags = force_nonblock ? O_NONBLOCK : 0, + }; struct file *file; + unsigned cflags; int ret, fd; if (!(req->flags & REQ_F_POLLED) && @@ -1543,7 +1562,9 @@ retry: if (unlikely(fd < 0)) return fd; } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + arg.err = 0; + arg.is_empty = -1; + file = do_accept(req->file, &arg, accept->addr, accept->addr_len, accept->flags); if (IS_ERR(file)) { if (!fixed) @@ -1571,17 +1592,26 @@ retry: accept->file_slot); } + cflags = 0; + if (!arg.is_empty) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, 0); + io_req_set_res(req, ret, cflags); return IOU_OK; } if (ret < 0) return ret; - if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE)) - goto retry; + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) + goto retry; + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; + } - io_req_set_res(req, ret, 0); + io_req_set_res(req, ret, cflags); return IOU_STOP_MULTISHOT; } @@ -1701,6 +1731,70 @@ out: return IOU_OK; } +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct sockaddr __user *uaddr; + struct io_async_msghdr *io; + + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + bind->addr_len = READ_ONCE(sqe->addr2); + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); +} + +int io_bind(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); + struct io_async_msghdr *io = req->async_data; + struct socket *sock; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_bind_socket(sock, &io->addr, bind->addr_len); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); + + if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2) + return -EINVAL; + + listen->backlog = READ_ONCE(sqe->len); + return 0; +} + +int io_listen(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); + struct socket *sock; + int ret; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_listen_socket(sock, listen->backlog); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return 0; +} + void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; diff --git a/io_uring/net.h b/io_uring/net.h index 0eb1c1920fc9..52bfee05f06a 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -49,6 +49,12 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); +int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_bind(struct io_kiocb *req, unsigned int issue_flags); + +int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_listen(struct io_kiocb *req, unsigned int issue_flags); + void io_netmsg_cache_free(const void *entry); #else static inline void io_netmsg_cache_free(const void *entry) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 2de5cca9504e..a2be3bbca5ff 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -495,6 +495,26 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_ftruncate_prep, .issue = io_ftruncate, }, + [IORING_OP_BIND] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_bind_prep, + .issue = io_bind, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_LISTEN] = { +#if defined(CONFIG_NET) + .needs_file = 1, + .prep = io_listen_prep, + .issue = io_listen, + .async_size = sizeof(struct io_async_msghdr), +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const struct io_cold_def io_cold_defs[] = { @@ -516,10 +536,12 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ_FIXED] = { .name = "READ_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .name = "WRITE_FIXED", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { @@ -582,10 +604,12 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ] = { .name = "READ", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .name = "WRITE", + .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { @@ -692,6 +716,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WAITID] = { .name = "WAITID", @@ -711,6 +736,12 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FTRUNCATE] = { .name = "FTRUNCATE", }, + [IORING_OP_BIND] = { + .name = "BIND", + }, + [IORING_OP_LISTEN] = { + .name = "LISTEN", + }, }; const char *io_uring_get_opcode(u8 opcode) @@ -720,6 +751,14 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_uring_op_supported(u8 opcode) +{ + if (opcode < IORING_OP_LAST && + io_issue_defs[opcode].prep != io_eopnotsupp_prep) + return true; + return false; +} + void __init io_uring_optable_init(void) { int i; diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 7ee6f5aa90aa..14456436ff74 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -17,8 +17,6 @@ struct io_issue_def { unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; /* supports ioprio */ @@ -47,5 +45,7 @@ struct io_cold_def { extern const struct io_issue_def io_issue_defs[]; extern const struct io_cold_def io_cold_defs[]; +bool io_uring_op_supported(u8 opcode); + void io_uring_optable_init(void); #endif diff --git a/io_uring/poll.c b/io_uring/poll.c index 0a8e02944689..1f63b60e85e7 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -347,6 +347,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) v &= IO_POLL_REF_MASK; } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); + io_napi_add(req); return IOU_POLL_NO_ACTION; } diff --git a/io_uring/register.c b/io_uring/register.c index ef8c908346a4..eca26d4884d9 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -27,65 +27,11 @@ #include "cancel.h" #include "kbuf.h" #include "napi.h" +#include "eventfd.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); - return 0; -} - -int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); - return 0; - } - - return -ENXIO; -} - static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -93,9 +39,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, size_t size; int i, ret; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; p = kzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; @@ -108,12 +55,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, goto out; p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_issue_defs[i].not_supported) + if (io_uring_op_supported(i)) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; @@ -355,8 +300,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, } if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } if (copy_to_user(arg, new_count, sizeof(new_count))) @@ -380,12 +327,39 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return 0; err: if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } return ret; } +static int io_register_clock(struct io_ring_ctx *ctx, + struct io_uring_clock_register __user *arg) +{ + struct io_uring_clock_register reg; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + + switch (reg.clockid) { + case CLOCK_MONOTONIC: + ctx->clock_offset = 0; + break; + case CLOCK_BOOTTIME: + ctx->clock_offset = TK_OFFS_BOOT; + break; + default: + return -EINVAL; + } + + ctx->clockid = reg.clockid; + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -562,6 +536,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_napi(ctx, arg); break; + case IORING_REGISTER_CLOCK: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_clock(ctx, arg); + break; + case IORING_REGISTER_CLONE_BUFFERS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_clone_buffers(ctx, arg); + break; default: ret = -EINVAL; break; @@ -570,21 +556,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, return ret; } -SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, - void __user *, arg, unsigned int, nr_args) +/* + * Given an 'fd' value, return the ctx associated with if. If 'registered' is + * true, then the registered index is used. Otherwise, the normal fd table. + * Caller must call fput() on the returned file, unless it's an ERR_PTR. + */ +struct file *io_uring_register_get_file(unsigned int fd, bool registered) { - struct io_ring_ctx *ctx; - long ret = -EBADF; struct file *file; - bool use_registered_ring; - - use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); - opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; - - if (opcode >= IORING_REGISTER_LAST) - return -EINVAL; - if (use_registered_ring) { + if (registered) { /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. @@ -592,27 +573,44 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return -EINVAL; + return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; - if (unlikely(!file)) - return -EBADF; } else { file = fget(fd); - if (unlikely(!file)) - return -EBADF; - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(file)) - goto out_fput; } + if (unlikely(!file)) + return ERR_PTR(-EBADF); + if (io_is_uring_fops(file)) + return file; + fput(file); + return ERR_PTR(-EOPNOTSUPP); +} + +SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, + void __user *, arg, unsigned int, nr_args) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + struct file *file; + bool use_registered_ring; + + use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); + opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; + + if (opcode >= IORING_REGISTER_LAST) + return -EINVAL; + + file = io_uring_register_get_file(fd, use_registered_ring); + if (IS_ERR(file)) + return PTR_ERR(file); ctx = file->private_data; mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); -out_fput: if (!use_registered_ring) fput(file); return ret; diff --git a/io_uring/register.h b/io_uring/register.h index c9da997d503c..a5f39d5ef9e0 100644 --- a/io_uring/register.h +++ b/io_uring/register.h @@ -4,5 +4,6 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); +struct file *io_uring_register_get_file(unsigned int fd, bool registered); #endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 65417c9553b1..6f3b6de230bd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -17,6 +17,7 @@ #include "openclose.h" #include "rsrc.h" #include "memmap.h" +#include "register.h" struct io_rsrc_update { struct file *file; @@ -37,7 +38,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, static const struct io_mapped_ubuf dummy_ubuf = { /* set invalid range, so io_import_fixed() fails meeting it */ .ubuf = -1UL, - .ubuf_end = 0, + .len = UINT_MAX, }; int __io_account_mem(struct user_struct *user, unsigned long nr_pages) @@ -85,31 +86,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); @@ -139,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo struct io_mapped_ubuf *imu = *slot; unsigned int i; + *slot = NULL; if (imu != &dummy_ubuf) { + if (!refcount_dec_and_test(&imu->refs)) + return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); } - *slot = NULL; } static void io_rsrc_put_work(struct io_rsrc_node *node) @@ -249,6 +227,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { + finish_wait(&ctx->rsrc_quiesce_wq, &we); mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; @@ -256,7 +235,6 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } schedule(); - __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; } while (!list_empty(&ctx->rsrc_ref_list)); @@ -420,8 +398,10 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, unsigned int nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct iovec fast_iov, *iov; struct page *last_hpage = NULL; + struct iovec __user *uvec; + u64 user_data = up->data; __u32 done; int i, err; @@ -434,21 +414,24 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu; u64 tag = 0; - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) + uvec = u64_to_user_ptr(user_data); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + err = PTR_ERR(iov); break; + } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } - err = io_buffer_validate(&iov); + err = io_buffer_validate(iov); if (err) break; - if (!iov.iov_base && tag) { + if (!iov->iov_base && tag) { err = -EINVAL; break; } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); if (err) break; @@ -465,6 +448,10 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; + if (ctx->compat) + user_data += sizeof(struct compat_iovec); + else + user_data += sizeof(struct iovec); } return done ? done : err; } @@ -871,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } +static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data, int nr_folios) +{ + struct page **page_array = *pages, **new_array = NULL; + int nr_pages_left = *nr_pages, i, j; + + /* Store head pages only*/ + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), + GFP_KERNEL); + if (!new_array) + return false; + + new_array[0] = compound_head(page_array[0]); + /* + * The pages are bound to the folio, it doesn't + * actually unpin them but drops all but one reference, + * which is usually put down by io_buffer_unmap(). + * Note, needs a better helper. + */ + if (data->nr_pages_head > 1) + unpin_user_pages(&page_array[1], data->nr_pages_head - 1); + + j = data->nr_pages_head; + nr_pages_left -= data->nr_pages_head; + for (i = 1; i < nr_folios; i++) { + unsigned int nr_unpin; + + new_array[i] = page_array[j]; + nr_unpin = min_t(unsigned int, nr_pages_left - 1, + data->nr_pages_mid - 1); + if (nr_unpin) + unpin_user_pages(&page_array[j+1], nr_unpin); + j += data->nr_pages_mid; + nr_pages_left -= data->nr_pages_mid; + } + kvfree(page_array); + *pages = new_array; + *nr_pages = nr_folios; + return true; +} + +static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data) +{ + struct page **page_array = *pages; + struct folio *folio = page_folio(page_array[0]); + unsigned int count = 1, nr_folios = 1; + int i; + + if (*nr_pages <= 1) + return false; + + data->nr_pages_mid = folio_nr_pages(folio); + if (data->nr_pages_mid == 1) + return false; + + data->folio_shift = folio_shift(folio); + /* + * Check if pages are contiguous inside a folio, and all folios have + * the same page count except for the head and tail. + */ + for (i = 1; i < *nr_pages; i++) { + if (page_folio(page_array[i]) == folio && + page_array[i] == page_array[i-1] + 1) { + count++; + continue; + } + + if (nr_folios == 1) { + if (folio_page_idx(folio, page_array[i-1]) != + data->nr_pages_mid - 1) + return false; + + data->nr_pages_head = count; + } else if (count != data->nr_pages_mid) { + return false; + } + + folio = page_folio(page_array[i]); + if (folio_size(folio) != (1UL << data->folio_shift) || + folio_page_idx(folio, page_array[i]) != 0) + return false; + + count = 1; + nr_folios++; + } + if (nr_folios == 1) + data->nr_pages_head = count; + + return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); +} + static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage) @@ -880,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unsigned long off; size_t size; int ret, nr_pages, i; - struct folio *folio = NULL; + struct io_imu_folio_data data; + bool coalesced; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) @@ -895,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, goto done; } - /* If it's a huge page, try to coalesce them into a single bvec entry */ - if (nr_pages > 1) { - folio = page_folio(pages[0]); - for (i = 1; i < nr_pages; i++) { - /* - * Pages must be consecutive and on the same folio for - * this to work - */ - if (page_folio(pages[i]) != folio || - pages[i] != pages[i - 1] + 1) { - folio = NULL; - break; - } - } - if (folio) { - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - unpin_user_pages(&pages[1], nr_pages - 1); - nr_pages = 1; - } - } + /* If it's huge page(s), try to coalesce them into fewer bvec entries */ + coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) @@ -931,23 +988,23 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, goto done; } - off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; + imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; + imu->folio_shift = PAGE_SHIFT; + if (coalesced) + imu->folio_shift = data.folio_shift; + refcount_set(&imu->refs, 1); + off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); *pimu = imu; ret = 0; - if (folio) { - bvec_set_page(&imu->bvec[0], pages[0], size, off); - goto done; - } for (i = 0; i < nr_pages; i++) { size_t vec_len; - vec_len = min_t(size_t, size, PAGE_SIZE - off); + vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; @@ -970,8 +1027,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, { struct page *last_hpage = NULL; struct io_rsrc_data *data; + struct iovec fast_iov, *iov = &fast_iov; + const struct iovec __user *uvec; int i, ret; - struct iovec iov; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); @@ -988,24 +1046,32 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } + if (!arg) + memset(iov, 0, sizeof(*iov)); + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) + uvec = (struct iovec __user *) arg; + iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + if (IS_ERR(iov)) { + ret = PTR_ERR(iov); break; - ret = io_buffer_validate(&iov); + } + ret = io_buffer_validate(iov); if (ret) break; - } else { - memset(&iov, 0, sizeof(iov)); + if (ctx->compat) + arg += sizeof(struct compat_iovec); + else + arg += sizeof(struct iovec); } - if (!iov.iov_base && *io_get_tag_slot(data, i)) { + if (!iov->iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], &last_hpage); if (ret) break; @@ -1031,7 +1097,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; /* @@ -1049,25 +1115,19 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * we know that: * * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the + * 2) all bvecs are the same in size, except potentially the * first and last bvec * * So just find our index, and adjust the iterator afterwards. * If the offset is within the first bvec (or the whole first * bvec, just use iov_iter_advance(). This makes it easier * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. + * be folio_size aligned. */ const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { - /* - * Note, huge pages buffers consists of one large - * bvec entry and should always go this way. The other - * branch doesn't expect non PAGE_SIZE'd chunks. - */ iter->bvec = bvec; - iter->nr_segs = bvec->bv_len; iter->count -= offset; iter->iov_offset = offset; } else { @@ -1075,14 +1135,105 @@ int io_import_fixed(int ddir, struct iov_iter *iter, /* skip first vec */ offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); + seg_skip = 1 + (offset >> imu->folio_shift); iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; + iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } return 0; } + +static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) +{ + struct io_mapped_ubuf **user_bufs; + struct io_rsrc_data *data; + int i, ret, nbufs; + + /* + * Drop our own lock here. We'll setup the data we need and reference + * the source buffers, then re-grab, check, and assign at the end. + */ + mutex_unlock(&ctx->uring_lock); + + mutex_lock(&src_ctx->uring_lock); + ret = -ENXIO; + nbufs = src_ctx->nr_user_bufs; + if (!nbufs) + goto out_unlock; + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data); + if (ret) + goto out_unlock; + + ret = -ENOMEM; + user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL); + if (!user_bufs) + goto out_free_data; + + for (i = 0; i < nbufs; i++) { + struct io_mapped_ubuf *src = src_ctx->user_bufs[i]; + + if (src != &dummy_ubuf) + refcount_inc(&src->refs); + user_bufs[i] = src; + } + + /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ + mutex_unlock(&src_ctx->uring_lock); + mutex_lock(&ctx->uring_lock); + if (!ctx->user_bufs) { + ctx->user_bufs = user_bufs; + ctx->buf_data = data; + ctx->nr_user_bufs = nbufs; + return 0; + } + + /* someone raced setting up buffers, dump ours */ + for (i = 0; i < nbufs; i++) + io_buffer_unmap(ctx, &user_bufs[i]); + io_rsrc_data_free(data); + kfree(user_bufs); + return -EBUSY; +out_free_data: + io_rsrc_data_free(data); +out_unlock: + mutex_unlock(&src_ctx->uring_lock); + mutex_lock(&ctx->uring_lock); + return ret; +} + +/* + * Copy the registered buffers from the source ring whose file descriptor + * is given in the src_fd to the current ring. This is identical to registering + * the buffers with ctx, except faster as mappings already exist. + * + * Since the memory is already accounted once, don't account it again. + */ +int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_clone_buffers buf; + bool registered_src; + struct file *file; + int ret; + + if (ctx->user_bufs || ctx->nr_user_bufs) + return -EBUSY; + if (copy_from_user(&buf, arg, sizeof(buf))) + return -EFAULT; + if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED) + return -EINVAL; + if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) + return -EINVAL; + + registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; + file = io_uring_register_get_file(buf.src_fd, registered_src); + if (IS_ERR(file)) + return PTR_ERR(file); + ret = io_clone_buffers(ctx, file->private_data); + if (!registered_src) + fput(file); + return ret; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c032ca3436ca..8ed588036210 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -22,8 +22,6 @@ struct io_rsrc_put { }; }; -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - struct io_rsrc_data { struct io_ring_ctx *ctx; @@ -44,12 +42,22 @@ struct io_rsrc_node { struct io_mapped_ubuf { u64 ubuf; - u64 ubuf_end; + unsigned int len; unsigned int nr_bvecs; + unsigned int folio_shift; + refcount_t refs; unsigned long acct_pages; struct bio_vec bvec[] __counted_by(nr_bvecs); }; +struct io_imu_folio_data { + /* Head folio can be partially included in the fixed buf */ + unsigned int nr_pages_head; + /* For non-head/tail folios, has to be fully included */ + unsigned int nr_pages_mid; + unsigned int folio_shift; +}; + void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); @@ -59,6 +67,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len); +int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/rw.c b/io_uring/rw.c index 894c43a5fc0e..354c4e175654 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -31,9 +31,19 @@ struct io_rw { rwf_t flags; }; -static inline bool io_file_supports_nowait(struct io_kiocb *req) +static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) { - return req->flags & REQ_F_SUPPORT_NOWAIT; + /* If FMODE_NOWAIT is set for a file, we're golden */ + if (req->flags & REQ_F_SUPPORT_NOWAIT) + return true; + /* No FMODE_NOWAIT, if we can poll, check the status */ + if (io_file_can_poll(req)) { + struct poll_table_struct pt = { ._key = mask }; + + return vfs_poll(req->file, &pt) & mask; + } + /* No FMODE_NOWAIT support, and file isn't pollable. Tough luck. */ + return false; } #ifdef CONFIG_COMPAT @@ -467,8 +477,7 @@ static void io_req_io_end(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) { /* * Reissue will start accounting again, finish the * current cycle. @@ -511,7 +520,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, 0); + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); io_req_rw_cleanup(req, 0); io_req_task_complete(req, ts); @@ -593,7 +602,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, */ io_req_io_end(req); io_req_set_res(req, final_ret, - io_put_kbuf(req, issue_flags)); + io_put_kbuf(req, ret, issue_flags)); io_req_rw_cleanup(req, issue_flags); return IOU_OK; } @@ -759,7 +768,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) struct file *file = rw->kiocb.ki_filp; if (likely(file->f_op->read_iter)) - return call_read_iter(file, &rw->kiocb, iter); + return file->f_op->read_iter(&rw->kiocb, iter); else if (file->f_op->read) return loop_rw_iter(READ, rw, iter); else @@ -772,7 +781,7 @@ static bool need_complete_io(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -787,7 +796,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; - ret = kiocb_set_rw_flags(kiocb, rw->flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; kiocb->ki_flags |= IOCB_ALLOC_CACHE; @@ -797,8 +806,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) * supports async. Otherwise it's impossible to use O_NONBLOCK files * reliably. If not, or it IOCB_NOWAIT is set, don't retry. */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + if (kiocb->ki_flags & IOCB_NOWAIT || + ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT)))) req->flags |= REQ_F_NOWAIT; if (ctx->flags & IORING_SETUP_IOPOLL) { @@ -832,15 +841,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } - - ret = io_rw_init_file(req, FMODE_READ); + ret = io_rw_init_file(req, FMODE_READ, READ); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) + if (unlikely(!io_file_supports_nowait(req, EPOLLIN))) return -EAGAIN; kiocb->ki_flags |= IOCB_NOWAIT; } else { @@ -856,6 +864,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(rw, &io->iter); + /* + * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT + * issue, even though they should be returning -EAGAIN. To be safe, + * retry from blocking context for either. + */ + if (ret == -EOPNOTSUPP && force_nonblock) + ret = -EAGAIN; + if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; /* If we can poll, just do that. */ @@ -946,13 +962,6 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) ret = __io_read(req, issue_flags); /* - * If the file doesn't support proper NOWAIT, then disable multishot - * and stay in single shot mode. - */ - if (!io_file_supports_nowait(req)) - req->flags &= ~REQ_F_APOLL_MULTISHOT; - - /* * If we get -EAGAIN, recycle our buffer and just let normal poll * handling arm it. */ @@ -966,17 +975,18 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_MULTISHOT) return IOU_ISSUE_SKIP_COMPLETE; return -EAGAIN; - } - - /* - * Any successful return value will keep the multishot read armed. - */ - if (ret > 0 && req->flags & REQ_F_APOLL_MULTISHOT) { + } else if (ret <= 0) { + io_kbuf_recycle(req, issue_flags); + if (ret < 0) + req_set_fail(req); + } else { /* - * Put our buffer and post a CQE. If we fail to post a CQE, then + * Any successful return value will keep the multishot read + * armed, if it's still set. Put our buffer and post a CQE. If + * we fail to post a CQE, or multishot is no longer set, then * jump to the termination path. This request is then done. */ - cflags = io_put_kbuf(req, issue_flags); + cflags = io_put_kbuf(req, ret, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { @@ -1013,14 +1023,14 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; - ret = io_rw_init_file(req, FMODE_WRITE); + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) + if (unlikely(!io_file_supports_nowait(req, EPOLLOUT))) goto ret_eagain; /* Check if we can support NOWAIT. */ @@ -1046,7 +1056,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &io->iter); + ret2 = req->file->f_op->write_iter(kiocb, &io->iter); else if (req->file->f_op->write) ret2 = loop_rw_iter(WRITE, rw, &io->iter); else @@ -1168,7 +1178,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - req->cqe.flags = io_put_kbuf(req, 0); + req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); if (req->opcode != IORING_OP_URING_CMD) io_req_rw_cleanup(req, 0); } diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 554c7212aa46..a26593979887 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/cpuset.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -44,7 +45,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(data_race(sqd->thread) == current); atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); @@ -108,14 +109,14 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) struct fd f; f = fdget(p->wq_fd); - if (!f.file) + if (!fd_file(f)) return ERR_PTR(-ENXIO); - if (!io_is_uring_fops(f.file)) { + if (!io_is_uring_fops(fd_file(f))) { fdput(f); return ERR_PTR(-EINVAL); } - ctx_attach = f.file->private_data; + ctx_attach = fd_file(f)->private_data; sqd = ctx_attach->sq_data; if (!sqd) { fdput(f); @@ -176,7 +177,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { + if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) @@ -195,9 +196,6 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - if (io_napi(ctx)) - ret += io_napi_sqpoll_busy_poll(ctx); - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); if (creds) @@ -238,11 +236,13 @@ static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) if (*retry_list) { *retry_list = io_handle_tw_list(*retry_list, &count, max_entries); if (count >= max_entries) - return count; + goto out; max_entries -= count; } - *retry_list = tctx_task_work_run(tctx, max_entries, &count); +out: + if (task_work_pending(current)) + task_work_run(); return count; } @@ -320,6 +320,10 @@ static int io_sq_thread(void *data) if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) sqt_spin = true; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + if (io_napi(ctx)) + io_napi_sqpoll_busy_poll(ctx); + if (sqt_spin || !time_after(jiffies, timeout)) { if (sqt_spin) { io_sq_update_worktime(sqd, &start); @@ -416,9 +420,9 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct fd f; f = fdget(p->wq_fd); - if (!f.file) + if (!fd_file(f)) return -ENXIO; - if (!io_is_uring_fops(f.file)) { + if (!io_is_uring_fops(fd_file(f))) { fdput(f); return -EINVAL; } @@ -458,11 +462,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, return 0; if (p->flags & IORING_SETUP_SQ_AFF) { + cpumask_var_t allowed_mask; int cpu = p->sq_thread_cpu; ret = -EINVAL; if (cpu >= nr_cpu_ids || !cpu_online(cpu)) goto err_sqpoll; + ret = -ENOMEM; + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) + goto err_sqpoll; + ret = -EINVAL; + cpuset_cpus_allowed(current, allowed_mask); + if (!cpumask_test_cpu(cpu, allowed_mask)) { + free_cpumask_var(allowed_mask); + goto err_sqpoll; + } + free_cpumask_var(allowed_mask); sqd->sq_cpu = cpu; } else { sqd->sq_cpu = -1; diff --git a/io_uring/statx.c b/io_uring/statx.c index abb874209caa..f7f9b202eec0 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -37,8 +37,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sx->flags = READ_ONCE(sqe->statx_flags); sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags), - NULL); + getname_statx_lookup_flags(sx->flags)); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 1c9bf07499b1..9973876d91b0 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -639,7 +639,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) - __must_hold(&req->ctx->timeout_lock) + __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 21ac5fb2d5f0..39c3c816ec78 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -265,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return ret; + return IOU_OK; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -277,6 +277,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); +void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + io_req_queue_iowq(req); +} + static inline int io_uring_cmd_getsockopt(struct socket *sock, struct io_uring_cmd *cmd, unsigned int issue_flags) diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 44905b82eea8..6cf41c3bc369 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -96,7 +96,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + ix->filename = getname_flags(path, LOOKUP_FOLLOW); if (IS_ERR(ix->filename)) { ret = PTR_ERR(ix->filename); ix->filename = NULL; @@ -189,7 +189,7 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + ix->filename = getname_flags(path, LOOKUP_FOLLOW); if (IS_ERR(ix->filename)) { ret = PTR_ERR(ix->filename); ix->filename = NULL; |