diff options
-rw-r--r-- | include/linux/io_uring.h | 18 | ||||
-rw-r--r-- | include/linux/io_uring_types.h | 10 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 8 | ||||
-rw-r--r-- | io_uring/Makefile | 3 | ||||
-rw-r--r-- | io_uring/cancel.c | 5 | ||||
-rw-r--r-- | io_uring/io_uring.c | 43 | ||||
-rw-r--r-- | io_uring/io_uring.h | 1 | ||||
-rw-r--r-- | io_uring/kbuf.c | 58 | ||||
-rw-r--r-- | io_uring/opdef.c | 24 | ||||
-rw-r--r-- | io_uring/opdef.h | 2 | ||||
-rw-r--r-- | io_uring/rsrc.c | 37 | ||||
-rw-r--r-- | io_uring/rw.c | 92 | ||||
-rw-r--r-- | io_uring/rw.h | 2 | ||||
-rw-r--r-- | io_uring/uring_cmd.c | 49 | ||||
-rw-r--r-- | io_uring/waitid.c | 372 | ||||
-rw-r--r-- | io_uring/waitid.h | 15 | ||||
-rw-r--r-- | kernel/exit.c | 131 | ||||
-rw-r--r-- | kernel/exit.h | 30 |
18 files changed, 781 insertions, 119 deletions
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 106cdc55ff3b..b4391e0a9bc8 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -20,8 +20,15 @@ enum io_uring_cmd_flags { IO_URING_F_SQE128 = (1 << 8), IO_URING_F_CQE32 = (1 << 9), IO_URING_F_IOPOLL = (1 << 10), + + /* set when uring wants to cancel a previously issued command */ + IO_URING_F_CANCEL = (1 << 11), }; +/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ +#define IORING_URING_CMD_CANCELABLE (1U << 30) +#define IORING_URING_CMD_POLLED (1U << 31) + struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; @@ -82,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags); +struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) @@ -122,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } +static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ +} +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return NULL; +} #endif #endif diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 13d19b9be9f4..e4e67899b134 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -265,6 +265,12 @@ struct io_ring_ctx { */ struct io_wq_work_list iopoll_list; bool poll_multi_queue; + + /* + * Any cancelable uring_cmd is added to this list in + * ->uring_cmd() by io_uring_cmd_insert_cancelable() + */ + struct hlist_head cancelable_uring_cmd; } ____cacheline_aligned_in_smp; struct { @@ -313,6 +319,8 @@ struct io_ring_ctx { struct list_head cq_overflow_list; struct io_hash_table cancel_table; + struct hlist_head waitid_list; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ @@ -342,8 +350,6 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; - struct list_head io_buffers_pages; - #if defined(CONFIG_UNIX) struct socket *ring_sock; #endif diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8e61f8b7c2ce..425f64eee44e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -65,6 +65,7 @@ struct io_uring_sqe { __u32 xattr_flags; __u32 msg_ring_flags; __u32 uring_cmd_flags; + __u32 waitid_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -240,19 +241,20 @@ enum io_uring_op { IORING_OP_URING_CMD, IORING_OP_SEND_ZC, IORING_OP_SENDMSG_ZC, + IORING_OP_READ_MULTISHOT, + IORING_OP_WAITID, /* this goes last, obviously */ IORING_OP_LAST, }; /* - * sqe->uring_cmd_flags + * sqe->uring_cmd_flags top 8bits aren't available for userspace * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. - * IORING_URING_CMD_POLLED driver use only */ #define IORING_URING_CMD_FIXED (1U << 0) -#define IORING_URING_CMD_POLLED (1U << 31) +#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED /* diff --git a/io_uring/Makefile b/io_uring/Makefile index 8cc8e5387a75..7bd64e442567 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o + cancel.o kbuf.o rsrc.o rw.o opdef.o \ + notif.o waitid.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 7b23607cf4af..eb77a51c5a79 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -15,6 +15,7 @@ #include "tctx.h" #include "poll.h" #include "timeout.h" +#include "waitid.h" #include "cancel.h" struct io_cancel { @@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, if (ret != -ENOENT) return ret; + ret = io_waitid_cancel(ctx, cd, issue_flags); + if (ret != -ENOENT) + return ret; + spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 783ed0fff71b..b9e1af5772f3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "cancel.h" #include "net.h" #include "notif.h" +#include "waitid.h" #include "timeout.h" #include "poll.h" @@ -338,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_pages); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -348,8 +348,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_WQ_LIST(&ctx->locked_free_list); + INIT_HLIST_HEAD(&ctx->waitid_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); + INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); return ctx; err: kfree(ctx->cancel_table.hbs); @@ -3256,6 +3258,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } +static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool ret = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, + hash_node) { + struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, + struct io_uring_cmd); + struct file *file = req->file; + + if (!cancel_all && req->task != task) + continue; + + if (cmd->flags & IORING_URING_CMD_CANCELABLE) { + /* ->sqe isn't available if no async data */ + if (!req_has_async_data(req)) + cmd->sqe = NULL; + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); + ret = true; + } + } + io_submit_flush_completions(ctx); + + return ret; +} + static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) @@ -3303,6 +3336,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); + ret |= io_waitid_remove_all(ctx, task, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) @@ -4666,6 +4701,9 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + /* top 8bits are for internal use */ + BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); + io_uring_optable_init(); /* @@ -4681,6 +4719,9 @@ static int __init io_uring_init(void) SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, offsetof(struct io_kiocb, cmd.data), sizeof_field(struct io_kiocb, cmd.data), NULL); + io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 547c30582fb8..2ff719ae1b57 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -330,6 +330,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) } extern struct kmem_cache *req_cachep; +extern struct kmem_cache *io_buf_cachep; static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 556f4df25b0f..d5a04467666f 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -19,12 +19,17 @@ #define BGID_ARRAY 64 +/* BIDs are addressed by a 16-bit field in a CQE */ +#define MAX_BIDS_PER_BGID (1 << 16) + +struct kmem_cache *io_buf_cachep; + struct io_provide_buf { struct file *file; __u64 addr; __u32 len; __u32 bgid; - __u16 nbufs; + __u32 nbufs; __u16 bid; }; @@ -255,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; + struct list_head *item, *tmp; + struct io_buffer *buf; unsigned long index; int i; @@ -270,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) kfree(bl); } - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); + list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { + buf = list_entry(item, struct io_buffer, list); + kmem_cache_free(io_buf_cachep, buf); } } @@ -289,7 +293,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) + if (!tmp || tmp > MAX_BIDS_PER_BGID) return -EINVAL; memset(p, 0, sizeof(*p)); @@ -332,7 +336,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) + if (!tmp || tmp > MAX_BIDS_PER_BGID) return -E2BIG; p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); @@ -352,17 +356,18 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe tmp = READ_ONCE(sqe->off); if (tmp > USHRT_MAX) return -E2BIG; - if (tmp + p->nbufs >= USHRT_MAX) + if (tmp + p->nbufs > MAX_BIDS_PER_BGID) return -EINVAL; p->bid = tmp; return 0; } +#define IO_BUFFER_ALLOC_BATCH 64 + static int io_refill_buffer_cache(struct io_ring_ctx *ctx) { - struct io_buffer *buf; - struct page *page; - int bufs_in_page; + struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; + int allocated; /* * Completions that don't happen inline (eg not under uring_lock) will @@ -382,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx) /* * No free buffers and no completion entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. + * batch of buffer entries and add those to our freelist. */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - - list_add(&page->lru, &ctx->io_buffers_pages); - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; + allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, + ARRAY_SIZE(bufs), (void **) bufs); + if (unlikely(!allocated)) { + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. + */ + bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); + if (!bufs[0]) + return -ENOMEM; + allocated = 1; } + while (allocated) + list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); + return 0; } diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3b9c6489b8b6..aadcbf7136b0 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -33,6 +33,7 @@ #include "poll.h" #include "cancel.h" #include "rw.h" +#include "waitid.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -63,6 +64,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .vectored = 1, .prep = io_prep_rw, .issue = io_read, }, @@ -76,6 +78,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .vectored = 1, .prep = io_prep_rw, .issue = io_write, }, @@ -428,9 +431,21 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_READ_MULTISHOT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .prep = io_read_mshot_prep, + .issue = io_read_mshot, + }, + [IORING_OP_WAITID] = { + .prep = io_waitid_prep, + .issue = io_waitid, + }, }; - const struct io_cold_def io_cold_defs[] = { [IORING_OP_NOP] = { .name = "NOP", @@ -648,6 +663,13 @@ const struct io_cold_def io_cold_defs[] = { .fail = io_sendrecv_fail, #endif }, + [IORING_OP_READ_MULTISHOT] = { + .name = "READ_MULTISHOT", + }, + [IORING_OP_WAITID] = { + .name = "WAITID", + .async_size = sizeof(struct io_waitid_async), + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/opdef.h b/io_uring/opdef.h index c22c8696e749..9e5435ec27d0 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -29,6 +29,8 @@ struct io_issue_def { unsigned iopoll_queue : 1; /* opcode specific path will handle ->async_data allocation if needed */ unsigned manual_alloc : 1; + /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ + unsigned vectored : 1; int (*issue)(struct io_kiocb *, unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d9c853d10587..7034be555334 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1037,39 +1037,36 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct page **pages = NULL; - int pret, ret = -ENOMEM; + int ret; end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; + WARN_ON(!nr_pages); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) - goto done; + return ERR_PTR(-ENOMEM); - ret = 0; mmap_read_lock(current->mm); - pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages); - if (pret == nr_pages) + ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); + mmap_read_unlock(current->mm); + + /* success, mapped all pages */ + if (ret == nr_pages) { *npages = nr_pages; - else - ret = pret < 0 ? pret : -EFAULT; + return pages; + } - mmap_read_unlock(current->mm); - if (ret) { + /* partial map, or didn't map anything */ + if (ret >= 0) { /* if we did partial map, release any pages we did get */ - if (pret > 0) - unpin_user_pages(pages, pret); - goto done; - } - ret = 0; -done: - if (ret < 0) { - kvfree(pages); - pages = ERR_PTR(ret); + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; } - return pages; + kvfree(pages); + return ERR_PTR(ret); } static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, diff --git a/io_uring/rw.c b/io_uring/rw.c index c8c822fa7980..ec0cc38ea682 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -123,6 +123,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +/* + * Multishot read is prepared just like a normal read/write request, only + * difference is that we set the MULTISHOT flag. + */ +int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = io_prep_rw(req, sqe); + if (unlikely(ret)) + return ret; + + req->flags |= REQ_F_APOLL_MULTISHOT; + return 0; +} + void io_readv_writev_cleanup(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; @@ -388,8 +404,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, buf = u64_to_user_ptr(rw->addr); sqe_len = rw->len; - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE || - (req->flags & REQ_F_BUFFER_SELECT)) { + if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, issue_flags); if (!buf) @@ -708,7 +723,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) return 0; } -int io_read(struct io_kiocb *req, unsigned int issue_flags) +static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_rw_state __s, *s = &__s; @@ -776,8 +791,11 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; - /* if we can poll, just do that */ - if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + /* + * If we can poll, just do that. For a vectored read, we'll + * need to copy state first. + */ + if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -853,7 +871,69 @@ done: /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return kiocb_done(req, ret, issue_flags); + return ret; +} + +int io_read(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = __io_read(req, issue_flags); + if (ret >= 0) + return kiocb_done(req, ret, issue_flags); + + return ret; +} + +int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) +{ + unsigned int cflags = 0; + int ret; + + /* + * Multishot MUST be used on a pollable file + */ + if (!file_can_poll(req->file)) + return -EBADFD; + + ret = __io_read(req, issue_flags); + + /* + * If we get -EAGAIN, recycle our buffer and just let normal poll + * handling arm it. + */ + if (ret == -EAGAIN) { + io_kbuf_recycle(req, issue_flags); + return -EAGAIN; + } + + /* + * Any successful return value will keep the multishot read armed. + */ + if (ret > 0) { + /* + * Put our buffer and post a CQE. If we fail to post a CQE, then + * jump to the termination path. This request is then done. + */ + cflags = io_put_kbuf(req, issue_flags); + + if (io_fill_cqe_req_aux(req, + issue_flags & IO_URING_F_COMPLETE_DEFER, + ret, cflags | IORING_CQE_F_MORE)) { + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; + } + } + + /* + * Either an error, or we've hit overflow posting the CQE. For any + * multishot request, hitting overflow will terminate it. + */ + io_req_set_res(req, ret, cflags); + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; } int io_write(struct io_kiocb *req, unsigned int issue_flags) diff --git a/io_uring/rw.h b/io_uring/rw.h index 4b89f9659366..c5aed03d42a4 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -23,3 +23,5 @@ int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); +int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 537795fddc87..00a5e5621a28 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -13,6 +13,51 @@ #include "rsrc.h" #include "uring_cmd.h" +static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + struct io_ring_ctx *ctx = req->ctx; + + if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) + return; + + cmd->flags &= ~IORING_URING_CMD_CANCELABLE; + io_ring_submit_lock(ctx, issue_flags); + hlist_del(&req->hash_node); + io_ring_submit_unlock(ctx, issue_flags); +} + +/* + * Mark this command as concelable, then io_uring_try_cancel_uring_cmd() + * will try to cancel this issued command by sending ->uring_cmd() with + * issue_flags of IO_URING_F_CANCEL. + * + * The command is guaranteed to not be done when calling ->uring_cmd() + * with IO_URING_F_CANCEL, but it is driver's responsibility to deal + * with race between io_uring canceling and normal completion. + */ +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + struct io_ring_ctx *ctx = req->ctx; + + if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { + cmd->flags |= IORING_URING_CMD_CANCELABLE; + io_ring_submit_lock(ctx, issue_flags); + hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd); + io_ring_submit_unlock(ctx, issue_flags); + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); + +struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->task; +} +EXPORT_SYMBOL_GPL(io_uring_cmd_get_task); + static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + io_uring_cmd_del_cancelable(ioucmd, issue_flags); + if (ret < 0) req_set_fail(req); @@ -91,7 +138,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags); - if (ioucmd->flags & ~IORING_URING_CMD_FIXED) + if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; if (ioucmd->flags & IORING_URING_CMD_FIXED) { diff --git a/io_uring/waitid.c b/io_uring/waitid.c new file mode 100644 index 000000000000..6f851978606d --- /dev/null +++ b/io_uring/waitid.c @@ -0,0 +1,372 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for async notification of waitid + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/compat.h> +#include <linux/io_uring.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "cancel.h" +#include "waitid.h" +#include "../kernel/exit.h" + +static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); + +#define IO_WAITID_CANCEL_FLAG BIT(31) +#define IO_WAITID_REF_MASK GENMASK(30, 0) + +struct io_waitid { + struct file *file; + int which; + pid_t upid; + int options; + atomic_t refs; + struct wait_queue_head *head; + struct siginfo __user *infop; + struct waitid_info info; +}; + +static void io_waitid_free(struct io_kiocb *req) +{ + struct io_waitid_async *iwa = req->async_data; + + put_pid(iwa->wo.wo_pid); + kfree(req->async_data); + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; +} + +#ifdef CONFIG_COMPAT +static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) +{ + struct compat_siginfo __user *infop; + bool ret; + + infop = (struct compat_siginfo __user *) iw->infop; + + if (!user_write_access_begin(infop, sizeof(*infop))) + return false; + + unsafe_put_user(signo, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user(iw->info.cause, &infop->si_code, Efault); + unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); + unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); + unsafe_put_user(iw->info.status, &infop->si_status, Efault); + ret = true; +done: + user_write_access_end(); + return ret; +Efault: + ret = false; + goto done; +} +#endif + +static bool io_waitid_copy_si(struct io_kiocb *req, int signo) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + bool ret; + + if (!iw->infop) + return true; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_waitid_compat_copy_si(iw, signo); +#endif + + if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) + return false; + + unsafe_put_user(signo, &iw->infop->si_signo, Efault); + unsafe_put_user(0, &iw->infop->si_errno, Efault); + unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); + unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); + unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault); + unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); + ret = true; +done: + user_write_access_end(); + return ret; +Efault: + ret = false; + goto done; +} + +static int io_waitid_finish(struct io_kiocb *req, int ret) +{ + int signo = 0; + + if (ret > 0) { + signo = SIGCHLD; + ret = 0; + } + + if (!io_waitid_copy_si(req, signo)) + ret = -EFAULT; + io_waitid_free(req); + return ret; +} + +static void io_waitid_complete(struct io_kiocb *req, int ret) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_tw_state ts = { .locked = true }; + + /* anyone completing better be holding a reference */ + WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); + + lockdep_assert_held(&req->ctx->uring_lock); + + /* + * Did cancel find it meanwhile? + */ + if (hlist_unhashed(&req->hash_node)) + return; + + hlist_del_init(&req->hash_node); + + ret = io_waitid_finish(req, ret); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + io_req_task_complete(req, &ts); +} + +static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; + + /* + * Mark us canceled regardless of ownership. This will prevent a + * potential retry from a spurious wakeup. + */ + atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); + + /* claim ownership */ + if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) + return false; + + spin_lock_irq(&iw->head->lock); + list_del_init(&iwa->wo.child_wait.entry); + spin_unlock_irq(&iw->head->lock); + io_waitid_complete(req, -ECANCELED); + return true; +} + +int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + int nr = 0; + + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) + return -ENOENT; + + io_ring_submit_lock(ctx, issue_flags); + hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { + if (req->cqe.user_data != cd->data && + !(cd->flags & IORING_ASYNC_CANCEL_ANY)) + continue; + if (__io_waitid_cancel(ctx, req)) + nr++; + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) + break; + } + io_ring_submit_unlock(ctx, issue_flags); + + if (nr) + return nr; + + return -ENOENT; +} + +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, + bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { + if (!io_match_task_safe(req, task, cancel_all)) + continue; + __io_waitid_cancel(ctx, req); + found = true; + } + + return found; +} + +static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; + + if (!atomic_sub_return(1, &iw->refs)) + return false; + + /* + * Wakeup triggered, racing with us. It was prevented from + * completing because of that, queue up the tw to do that. + */ + req->io_task_work.func = io_waitid_cb; + io_req_task_work_add(req); + remove_wait_queue(iw->head, &iwa->wo.child_wait); + return true; +} + +static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) +{ + struct io_waitid_async *iwa = req->async_data; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_tw_lock(ctx, ts); + + ret = __do_wait(&iwa->wo); + + /* + * If we get -ERESTARTSYS here, we need to re-arm and check again + * to ensure we get another callback. If the retry works, then we can + * just remove ourselves from the waitqueue again and finish the + * request. + */ + if (unlikely(ret == -ERESTARTSYS)) { + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + + /* Don't retry if cancel found it meanwhile */ + ret = -ECANCELED; + if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { + iw->head = ¤t->signal->wait_chldexit; + add_wait_queue(iw->head, &iwa->wo.child_wait); + ret = __do_wait(&iwa->wo); + if (ret == -ERESTARTSYS) { + /* retry armed, drop our ref */ + io_waitid_drop_issue_ref(req); + return; + } + + remove_wait_queue(iw->head, &iwa->wo.child_wait); + } + } + + io_waitid_complete(req, ret); +} + +static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); + struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); + struct io_kiocb *req = iwa->req; + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct task_struct *p = key; + + if (!pid_child_should_wake(wo, p)) + return 0; + + /* cancel is in progress */ + if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) + return 1; + + req->io_task_work.func = io_waitid_cb; + io_req_task_work_add(req); + list_del_init(&wait->entry); + return 1; +} + +int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + + if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) + return -EINVAL; + + iw->which = READ_ONCE(sqe->len); + iw->upid = READ_ONCE(sqe->fd); + iw->options = READ_ONCE(sqe->file_index); + iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + return 0; +} + +int io_waitid(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_ring_ctx *ctx = req->ctx; + struct io_waitid_async *iwa; + int ret; + + if (io_alloc_async_data(req)) + return -ENOMEM; + + iwa = req->async_data; + iwa->req = req; + + ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, + iw->options, NULL); + if (ret) + goto done; + + /* + * Mark the request as busy upfront, in case we're racing with the + * wakeup. If we are, then we'll notice when we drop this initial + * reference again after arming. + */ + atomic_set(&iw->refs, 1); + + /* + * Cancel must hold the ctx lock, so there's no risk of cancelation + * finding us until a) we remain on the list, and b) the lock is + * dropped. We only need to worry about racing with the wakeup + * callback. + */ + io_ring_submit_lock(ctx, issue_flags); + hlist_add_head(&req->hash_node, &ctx->waitid_list); + + init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); + iwa->wo.child_wait.private = req->task; + iw->head = ¤t->signal->wait_chldexit; + add_wait_queue(iw->head, &iwa->wo.child_wait); + + ret = __do_wait(&iwa->wo); + if (ret == -ERESTARTSYS) { + /* + * Nobody else grabbed a reference, it'll complete when we get + * a waitqueue callback, or if someone cancels it. + */ + if (!io_waitid_drop_issue_ref(req)) { + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + + /* + * Wakeup triggered, racing with us. It was prevented from + * completing because of that, queue up the tw to do that. + */ + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + + hlist_del_init(&req->hash_node); + remove_wait_queue(iw->head, &iwa->wo.child_wait); + ret = io_waitid_finish(req, ret); + + io_ring_submit_unlock(ctx, issue_flags); +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/waitid.h b/io_uring/waitid.h new file mode 100644 index 000000000000..956a8adafe8c --- /dev/null +++ b/io_uring/waitid.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../kernel/exit.h" + +struct io_waitid_async { + struct io_kiocb *req; + struct wait_opts wo; +}; + +int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_waitid(struct io_kiocb *req, unsigned int issue_flags); +int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags); +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, + bool cancel_all); diff --git a/kernel/exit.c b/kernel/exit.c index edb50b4c9972..2b4a232f2f68 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,8 @@ #include <asm/unistd.h> #include <asm/mmu_context.h> +#include "exit.h" + /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit @@ -1037,26 +1039,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } -struct waitid_info { - pid_t pid; - uid_t uid; - int status; - int cause; -}; - -struct wait_opts { - enum pid_type wo_type; - int wo_flags; - struct pid *wo_pid; - - struct waitid_info *wo_info; - int wo_stat; - struct rusage *wo_rusage; - - wait_queue_entry_t child_wait; - int notask_error; -}; - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || @@ -1520,6 +1502,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return false; + + if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) + return false; + + return true; +} + static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1527,13 +1520,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, child_wait); struct task_struct *p = key; - if (!eligible_pid(wo, p)) - return 0; + if (pid_child_should_wake(wo, p)) + return default_wake_function(wait, mode, sync, key); - if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) - return 0; - - return default_wake_function(wait, mode, sync, key); + return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) @@ -1582,16 +1572,10 @@ static int do_wait_pid(struct wait_opts *wo) return 0; } -static long do_wait(struct wait_opts *wo) +long __do_wait(struct wait_opts *wo) { - int retval; + long retval; - trace_sched_process_wait(wo->wo_pid); - - init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); - wo->child_wait.private = current; - add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); -repeat: /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that @@ -1603,24 +1587,23 @@ repeat: (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; - set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) - goto end; + return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) - goto end; + return retval; retval = ptrace_do_wait(wo, tsk); if (retval) - goto end; + return retval; if (wo->wo_flags & __WNOTHREAD) break; @@ -1630,27 +1613,44 @@ repeat: notask: retval = wo->notask_error; - if (!retval && !(wo->wo_flags & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } -end: + if (!retval && !(wo->wo_flags & WNOHANG)) + return -ERESTARTSYS; + + return retval; +} + +static long do_wait(struct wait_opts *wo) +{ + int retval; + + trace_sched_process_wait(wo->wo_pid); + + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); + + do { + set_current_state(TASK_INTERRUPTIBLE); + retval = __do_wait(wo); + if (retval != -ERESTARTSYS) + break; + if (signal_pending(current)) + break; + schedule(); + } while (1); + __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } -static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, - int options, struct rusage *ru) +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru) { - struct wait_opts wo; + unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; - long ret; - unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1693,19 +1693,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, return -EINVAL; } - wo.wo_type = type; - wo.wo_pid = pid; - wo.wo_flags = options; - wo.wo_info = infop; - wo.wo_rusage = ru; + wo->wo_type = type; + wo->wo_pid = pid; + wo->wo_flags = options; + wo->wo_info = infop; + wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) - wo.wo_flags |= WNOHANG; + wo->wo_flags |= WNOHANG; + + return 0; +} + +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, + int options, struct rusage *ru) +{ + struct wait_opts wo; + long ret; + + ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); + if (ret) + return ret; ret = do_wait(&wo); - if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; - put_pid(pid); + put_pid(wo.wo_pid); return ret; } diff --git a/kernel/exit.h b/kernel/exit.h new file mode 100644 index 000000000000..278faa26a653 --- /dev/null +++ b/kernel/exit.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef LINUX_WAITID_H +#define LINUX_WAITID_H + +struct waitid_info { + pid_t pid; + uid_t uid; + int status; + int cause; +}; + +struct wait_opts { + enum pid_type wo_type; + int wo_flags; + struct pid *wo_pid; + + struct waitid_info *wo_info; + int wo_stat; + struct rusage *wo_rusage; + + wait_queue_entry_t child_wait; + int notask_error; +}; + +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p); +long __do_wait(struct wait_opts *wo); +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru); +#endif |