From 01e68ce08a30db3d842ce7a55f7f6e0474a55f9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Mar 2023 07:18:51 -0700 Subject: io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers Every now and then reports come in that are puzzled on why changing affinity on the io-wq workers fails with EINVAL. This happens because they set PF_NO_SETAFFINITY as part of their creation, as io-wq organizes workers into groups based on what CPU they are running on. However, this is purely an optimization and not a functional requirement. We can allow setting affinity, and just lazily update our worker to wqe mappings. If a given io-wq thread times out, it normally exits if there's no more work to do. The exception is if it's the last worker available. For the timeout case, check the affinity of the worker against group mask and exit even if it's the last worker. New workers should be created with the right mask and in the right location. Reported-by:Daniel Dao Link: https://lore.kernel.org/io-uring/CA+wXwBQwgxB3_UphSny-yAP5b26meeOu1W4TwYVcD_+5gOhvPw@mail.gmail.com/ Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 411bb2d1acd4..f81c0a7136a5 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -616,7 +616,7 @@ static int io_wqe_worker(void *data) struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - bool last_timeout = false; + bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); @@ -632,8 +632,11 @@ static int io_wqe_worker(void *data) io_worker_handle_work(worker); raw_spin_lock(&wqe->lock); - /* timed out, exit unless we're the last worker */ - if (last_timeout && acct->nr_workers > 1) { + /* + * Last sleep timed out. Exit if we're not the last worker, + * or if someone modified our affinity. + */ + if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; raw_spin_unlock(&wqe->lock); __set_current_state(TASK_RUNNING); @@ -652,7 +655,11 @@ static int io_wqe_worker(void *data) continue; break; } - last_timeout = !ret; + if (!ret) { + last_timeout = true; + exit_mask = !cpumask_test_cpu(raw_smp_processor_id(), + wqe->cpu_mask); + } } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) @@ -704,7 +711,6 @@ static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wqe->cpu_mask); - tsk->flags |= PF_NO_SETAFFINITY; raw_spin_lock(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); -- cgit v1.2.3 From 03b3d6be73e81ddb7c2930d942cdd17f4cfd5ba5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Mar 2023 09:26:13 -0700 Subject: io_uring/uring_cmd: ensure that device supports IOPOLL It's possible for a file type to support uring commands, but not pollable ones. Hence before issuing one of those, we should check that it is supported and error out upfront if it isn't. Cc: stable@vger.kernel.org Fixes: 5756a3a7e713 ("io_uring: add iopoll infrastructure for io_uring_cmd") Link: https://github.com/axboe/liburing/issues/816 Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 446a189b78b0..2e4c483075d3 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -108,7 +108,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) struct file *file = req->file; int ret; - if (!req->file->f_op->uring_cmd) + if (!file->f_op->uring_cmd) return -EOPNOTSUPP; ret = security_uring_cmd(ioucmd); @@ -120,6 +120,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ctx->flags & IORING_SETUP_CQE32) issue_flags |= IO_URING_F_CQE32; if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!file->f_op->uring_cmd_iopoll) + return -EOPNOTSUPP; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; WRITE_ONCE(ioucmd->cookie, NULL); -- cgit v1.2.3 From fa780334a8c392d959ae05eb19f2410b3a1e6cb0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Mar 2023 09:51:13 -0700 Subject: =?UTF-8?q?io=5Furing:=20silence=20variable=20=E2=80=98prev?= =?UTF-8?q?=E2=80=99=20set=20but=20not=20used=20warning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If io_uring.o is built with W=1, it triggers a warning: io_uring/io_uring.c: In function ‘__io_submit_flush_completions’: io_uring/io_uring.c:1502:40: warning: variable ‘prev’ set but not used [-Wunused-but-set-variable] 1502 | struct io_wq_work_node *node, *prev; | ^~~~ which is due to the wq_list_for_each() iterator always keeping a 'prev' variable. Most users need this to remove an entry from a list, for example, but __io_submit_flush_completions() never does that. Add a basic helper that doesn't track prev instead, and use that in that function. Reported-by: Vincenzo Palazzo Reviewed-by: Vincenzo Palazzo Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/slist.h | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fd1cc35a1c00..722624b6d0dc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1499,14 +1499,14 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) static void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; + struct io_wq_work_node *node; __io_cq_lock(ctx); /* must come first to preserve CQE ordering in failure cases */ if (state->cqes_count) __io_flush_post_cqes(ctx); - wq_list_for_each(node, prev, &state->compl_reqs) { + __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); diff --git a/io_uring/slist.h b/io_uring/slist.h index 7c198a40d5f1..0eb194817242 100644 --- a/io_uring/slist.h +++ b/io_uring/slist.h @@ -3,6 +3,9 @@ #include +#define __wq_list_for_each(pos, head) \ + for (pos = (head)->first; pos; pos = (pos)->next) + #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) @@ -113,4 +116,4 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) return container_of(work->list.next, struct io_wq_work, list); } -#endif // INTERNAL_IO_SLIST_H \ No newline at end of file +#endif // INTERNAL_IO_SLIST_H -- cgit v1.2.3 From a5fc1441af7719e93dc7a638a960befb694ade89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Tue, 14 Mar 2023 19:33:32 +0100 Subject: io_uring/sqpoll: Do not set PF_NO_SETAFFINITY on sqpoll threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users may specify a CPU where the sqpoll thread would run. This may conflict with cpuset operations because of strict PF_NO_SETAFFINITY requirement. That flag is unnecessary for polling "kernel" threads, see the reasoning in commit 01e68ce08a30 ("io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers"). Drop the flag on poll threads too. Fixes: 01e68ce08a30 ("io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers") Link: https://lore.kernel.org/all/20230314162559.pnyxdllzgw7jozgx@blackpad/ Signed-off-by: Michal Koutný Link: https://lore.kernel.org/r/20230314183332.25834-1-mkoutny@suse.com Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 1 - 1 file changed, 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 0119d3f1a556..9db4bc1f521a 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -233,7 +233,6 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); else set_cpus_allowed_ptr(current, cpu_online_mask); - current->flags |= PF_NO_SETAFFINITY; mutex_lock(&sqd->lock); while (1) { -- cgit v1.2.3 From 6acd352dfee558194643adbed7e849fe80fd1b93 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Sat, 18 Mar 2023 02:25:38 +0800 Subject: io_uring: rsrc: Optimize return value variable 'ret' The initialization assignment of the variable ret is changed to 0, only in 'goto fail;' Use the ret variable as the function return value. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20230317182538.3027-1-zeming@nfschina.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 056f40946ff6..09a16d709cb5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -410,7 +410,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; - int ret = -ENOMEM; + int ret = 0; unsigned i; data = kzalloc(sizeof(*data), GFP_KERNEL); -- cgit v1.2.3 From 5da28edd7bd5518f97175ecea77615bb729a7a28 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Mar 2023 12:11:42 +0000 Subject: io_uring/msg_ring: let target know allocated index msg_ring requests transferring files support auto index selection via IORING_FILE_INDEX_ALLOC, however they don't return the selected index to the target ring and there is no other good way for the userspace to know where is the receieved file. Return the index for allocated slots and 0 otherwise, which is consistent with other fixed file installing requests. Cc: stable@vger.kernel.org # v6.0+ Fixes: e6130eba8a848 ("io_uring: add support for passing fixed file descriptors") Signed-off-by: Pavel Begunkov Link: https://github.com/axboe/liburing/issues/809 Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 8803c0979e2a..85fd7ce5f05b 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -202,7 +202,7 @@ static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flag * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. */ - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(target_ctx); @@ -229,6 +229,8 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct file *src_file = msg->src_file; + if (msg->len) + return -EINVAL; if (target_ctx == ctx) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) -- cgit v1.2.3 From d2acf789088bb562cea342b6a24e646df4d47839 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Mar 2023 15:26:05 +0000 Subject: io_uring/rsrc: fix folio accounting | BUG: Bad page state in process kworker/u8:0 pfn:5c001 | page:00000000bfda61c8 refcount:0 mapcount:0 mapping:0000000000000000 index:0x20001 pfn:0x5c001 | head:0000000011409842 order:9 entire_mapcount:0 nr_pages_mapped:0 pincount:1 | anon flags: 0x3fffc00000b0004(uptodate|head|mappedtodisk|swapbacked|node=0|zone=0|lastcpupid=0xffff) | raw: 03fffc0000000000 fffffc0000700001 ffffffff00700903 0000000100000000 | raw: 0000000000000200 0000000000000000 00000000ffffffff 0000000000000000 | head: 03fffc00000b0004 dead000000000100 dead000000000122 ffff00000a809dc1 | head: 0000000000020000 0000000000000000 00000000ffffffff 0000000000000000 | page dumped because: nonzero pincount | CPU: 3 PID: 9 Comm: kworker/u8:0 Not tainted 6.3.0-rc2-00001-gc6811bf0cd87 #1 | Hardware name: linux,dummy-virt (DT) | Workqueue: events_unbound io_ring_exit_work | Call trace: | dump_backtrace+0x13c/0x208 | show_stack+0x34/0x58 | dump_stack_lvl+0x150/0x1a8 | dump_stack+0x20/0x30 | bad_page+0xec/0x238 | free_tail_pages_check+0x280/0x350 | free_pcp_prepare+0x60c/0x830 | free_unref_page+0x50/0x498 | free_compound_page+0xcc/0x100 | free_transhuge_page+0x1f0/0x2b8 | destroy_large_folio+0x80/0xc8 | __folio_put+0xc4/0xf8 | gup_put_folio+0xd0/0x250 | unpin_user_page+0xcc/0x128 | io_buffer_unmap+0xec/0x2c0 | __io_sqe_buffers_unregister+0xa4/0x1e0 | io_ring_exit_work+0x68c/0x1188 | process_one_work+0x91c/0x1a58 | worker_thread+0x48c/0xe30 | kthread+0x278/0x2f0 | ret_from_fork+0x10/0x20 Mark reports an issue with the recent patches coalescing compound pages while registering them in io_uring. The reason is that we try to drop excessive references with folio_put_refs(), but pages were acquired with pin_user_pages(), which has extra accounting and so should be put down with matching unpin_user_pages() or at least gup_put_folio(). As a fix unpin_user_pages() all but first page instead, and let's figure out a better API after. Fixes: 57bebf807e2abcf8 ("io_uring/rsrc: optimise registered huge pages") Reported-by: Mark Rutland Reviewed-by: Jens Axboe Tested-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/10efd5507d6d1f05ea0f3c601830e08767e189bd.1678980230.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 09a16d709cb5..e2bac9f89902 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1235,7 +1235,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, } } if (folio) { - folio_put_refs(folio, nr_pages - 1); + /* + * The pages are bound to the folio, it doesn't + * actually unpin them but drops all but one reference, + * which is usually put down by io_buffer_unmap(). + * Note, needs a better helper. + */ + unpin_user_pages(&pages[1], nr_pages - 1); nr_pages = 1; } } -- cgit v1.2.3