From 01e68ce08a30db3d842ce7a55f7f6e0474a55f9a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 8 Mar 2023 07:18:51 -0700
Subject: io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers

Every now and then reports come in that are puzzled on why changing
affinity on the io-wq workers fails with EINVAL. This happens because they
set PF_NO_SETAFFINITY as part of their creation, as io-wq organizes
workers into groups based on what CPU they are running on.

However, this is purely an optimization and not a functional requirement.
We can allow setting affinity, and just lazily update our worker to wqe
mappings. If a given io-wq thread times out, it normally exits if there's
no more work to do. The exception is if it's the last worker available.
For the timeout case, check the affinity of the worker against group mask
and exit even if it's the last worker. New workers should be created with
the right mask and in the right location.

Reported-by:Daniel Dao <dqminh@cloudflare.com>
Link: https://lore.kernel.org/io-uring/CA+wXwBQwgxB3_UphSny-yAP5b26meeOu1W4TwYVcD_+5gOhvPw@mail.gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io-wq.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'io_uring')

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 411bb2d1acd4..f81c0a7136a5 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -616,7 +616,7 @@ static int io_wqe_worker(void *data)
 	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
-	bool last_timeout = false;
+	bool exit_mask = false, last_timeout = false;
 	char buf[TASK_COMM_LEN];
 
 	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
@@ -632,8 +632,11 @@ static int io_wqe_worker(void *data)
 			io_worker_handle_work(worker);
 
 		raw_spin_lock(&wqe->lock);
-		/* timed out, exit unless we're the last worker */
-		if (last_timeout && acct->nr_workers > 1) {
+		/*
+		 * Last sleep timed out. Exit if we're not the last worker,
+		 * or if someone modified our affinity.
+		 */
+		if (last_timeout && (exit_mask || acct->nr_workers > 1)) {
 			acct->nr_workers--;
 			raw_spin_unlock(&wqe->lock);
 			__set_current_state(TASK_RUNNING);
@@ -652,7 +655,11 @@ static int io_wqe_worker(void *data)
 				continue;
 			break;
 		}
-		last_timeout = !ret;
+		if (!ret) {
+			last_timeout = true;
+			exit_mask = !cpumask_test_cpu(raw_smp_processor_id(),
+							wqe->cpu_mask);
+		}
 	}
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
@@ -704,7 +711,6 @@ static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
 	tsk->worker_private = worker;
 	worker->task = tsk;
 	set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
-	tsk->flags |= PF_NO_SETAFFINITY;
 
 	raw_spin_lock(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
-- 
cgit v1.2.3


From 03b3d6be73e81ddb7c2930d942cdd17f4cfd5ba5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 8 Mar 2023 09:26:13 -0700
Subject: io_uring/uring_cmd: ensure that device supports IOPOLL

It's possible for a file type to support uring commands, but not
pollable ones. Hence before issuing one of those, we should check
that it is supported and error out upfront if it isn't.

Cc: stable@vger.kernel.org
Fixes: 5756a3a7e713 ("io_uring: add iopoll infrastructure for io_uring_cmd")
Link: https://github.com/axboe/liburing/issues/816
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/uring_cmd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 446a189b78b0..2e4c483075d3 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -108,7 +108,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *file = req->file;
 	int ret;
 
-	if (!req->file->f_op->uring_cmd)
+	if (!file->f_op->uring_cmd)
 		return -EOPNOTSUPP;
 
 	ret = security_uring_cmd(ioucmd);
@@ -120,6 +120,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	if (ctx->flags & IORING_SETUP_CQE32)
 		issue_flags |= IO_URING_F_CQE32;
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
+		if (!file->f_op->uring_cmd_iopoll)
+			return -EOPNOTSUPP;
 		issue_flags |= IO_URING_F_IOPOLL;
 		req->iopoll_completed = 0;
 		WRITE_ONCE(ioucmd->cookie, NULL);
-- 
cgit v1.2.3


From fa780334a8c392d959ae05eb19f2410b3a1e6cb0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 9 Mar 2023 09:51:13 -0700
Subject: =?UTF-8?q?io=5Furing:=20silence=20variable=20=E2=80=98prev?=
 =?UTF-8?q?=E2=80=99=20set=20but=20not=20used=20warning?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If io_uring.o is built with W=1, it triggers a warning:

io_uring/io_uring.c: In function ‘__io_submit_flush_completions’:
io_uring/io_uring.c:1502:40: warning: variable ‘prev’ set but not used [-Wunused-but-set-variable]
 1502 |         struct io_wq_work_node *node, *prev;
      |                                        ^~~~

which is due to the wq_list_for_each() iterator always keeping a 'prev'
variable. Most users need this to remove an entry from a list, for
example, but __io_submit_flush_completions() never does that.

Add a basic helper that doesn't track prev instead, and use that in
that function.

Reported-by: Vincenzo Palazzo <vincenzopalazzodev@gmail.com>
Reviewed-by: Vincenzo Palazzo <vincenzopalazzodev@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 4 ++--
 io_uring/slist.h    | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'io_uring')

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fd1cc35a1c00..722624b6d0dc 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1499,14 +1499,14 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
 static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
-	struct io_wq_work_node *node, *prev;
 	struct io_submit_state *state = &ctx->submit_state;
+	struct io_wq_work_node *node;
 
 	__io_cq_lock(ctx);
 	/* must come first to preserve CQE ordering in failure cases */
 	if (state->cqes_count)
 		__io_flush_post_cqes(ctx);
-	wq_list_for_each(node, prev, &state->compl_reqs) {
+	__wq_list_for_each(node, &state->compl_reqs) {
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 					    comp_list);
 
diff --git a/io_uring/slist.h b/io_uring/slist.h
index 7c198a40d5f1..0eb194817242 100644
--- a/io_uring/slist.h
+++ b/io_uring/slist.h
@@ -3,6 +3,9 @@
 
 #include <linux/io_uring_types.h>
 
+#define __wq_list_for_each(pos, head)				\
+	for (pos = (head)->first; pos; pos = (pos)->next)
+
 #define wq_list_for_each(pos, prv, head)			\
 	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
 
@@ -113,4 +116,4 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 	return container_of(work->list.next, struct io_wq_work, list);
 }
 
-#endif // INTERNAL_IO_SLIST_H
\ No newline at end of file
+#endif // INTERNAL_IO_SLIST_H
-- 
cgit v1.2.3


From a5fc1441af7719e93dc7a638a960befb694ade89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?= <mkoutny@suse.com>
Date: Tue, 14 Mar 2023 19:33:32 +0100
Subject: io_uring/sqpoll: Do not set PF_NO_SETAFFINITY on sqpoll threads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Users may specify a CPU where the sqpoll thread would run. This may
conflict with cpuset operations because of strict PF_NO_SETAFFINITY
requirement. That flag is unnecessary for polling "kernel" threads, see
the reasoning in commit 01e68ce08a30 ("io_uring/io-wq: stop setting
PF_NO_SETAFFINITY on io-wq workers"). Drop the flag on poll threads too.

Fixes: 01e68ce08a30 ("io_uring/io-wq: stop setting PF_NO_SETAFFINITY on io-wq workers")
Link: https://lore.kernel.org/all/20230314162559.pnyxdllzgw7jozgx@blackpad/
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Link: https://lore.kernel.org/r/20230314183332.25834-1-mkoutny@suse.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/sqpoll.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 0119d3f1a556..9db4bc1f521a 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -233,7 +233,6 @@ static int io_sq_thread(void *data)
 		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
 	else
 		set_cpus_allowed_ptr(current, cpu_online_mask);
-	current->flags |= PF_NO_SETAFFINITY;
 
 	mutex_lock(&sqd->lock);
 	while (1) {
-- 
cgit v1.2.3


From 6acd352dfee558194643adbed7e849fe80fd1b93 Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Sat, 18 Mar 2023 02:25:38 +0800
Subject: io_uring: rsrc: Optimize return value variable 'ret'

The initialization assignment of the variable ret is changed to 0, only
in 'goto fail;' Use the ret variable as the function return value.

Signed-off-by: Li zeming <zeming@nfschina.com>
Link: https://lore.kernel.org/r/20230317182538.3027-1-zeming@nfschina.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/rsrc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 056f40946ff6..09a16d709cb5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -410,7 +410,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
 				     unsigned nr, struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
-	int ret = -ENOMEM;
+	int ret = 0;
 	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
-- 
cgit v1.2.3


From 5da28edd7bd5518f97175ecea77615bb729a7a28 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Mar 2023 12:11:42 +0000
Subject: io_uring/msg_ring: let target know allocated index

msg_ring requests transferring files support auto index selection via
IORING_FILE_INDEX_ALLOC, however they don't return the selected index
to the target ring and there is no other good way for the userspace to
know where is the receieved file.

Return the index for allocated slots and 0 otherwise, which is
consistent with other fixed file installing requests.

Cc: stable@vger.kernel.org # v6.0+
Fixes: e6130eba8a848 ("io_uring: add support for passing fixed file descriptors")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://github.com/axboe/liburing/issues/809
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/msg_ring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 8803c0979e2a..85fd7ce5f05b 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -202,7 +202,7 @@ static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flag
 	 * completes with -EOVERFLOW, then the sender must ensure that a
 	 * later IORING_OP_MSG_RING delivers the message.
 	 */
-	if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+	if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0))
 		ret = -EOVERFLOW;
 out_unlock:
 	io_double_unlock_ctx(target_ctx);
@@ -229,6 +229,8 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct file *src_file = msg->src_file;
 
+	if (msg->len)
+		return -EINVAL;
 	if (target_ctx == ctx)
 		return -EINVAL;
 	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
-- 
cgit v1.2.3


From d2acf789088bb562cea342b6a24e646df4d47839 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Mar 2023 15:26:05 +0000
Subject: io_uring/rsrc: fix folio accounting

| BUG: Bad page state in process kworker/u8:0  pfn:5c001
| page:00000000bfda61c8 refcount:0 mapcount:0 mapping:0000000000000000 index:0x20001 pfn:0x5c001
| head:0000000011409842 order:9 entire_mapcount:0 nr_pages_mapped:0 pincount:1
| anon flags: 0x3fffc00000b0004(uptodate|head|mappedtodisk|swapbacked|node=0|zone=0|lastcpupid=0xffff)
| raw: 03fffc0000000000 fffffc0000700001 ffffffff00700903 0000000100000000
| raw: 0000000000000200 0000000000000000 00000000ffffffff 0000000000000000
| head: 03fffc00000b0004 dead000000000100 dead000000000122 ffff00000a809dc1
| head: 0000000000020000 0000000000000000 00000000ffffffff 0000000000000000
| page dumped because: nonzero pincount
| CPU: 3 PID: 9 Comm: kworker/u8:0 Not tainted 6.3.0-rc2-00001-gc6811bf0cd87 #1
| Hardware name: linux,dummy-virt (DT)
| Workqueue: events_unbound io_ring_exit_work
| Call trace:
|  dump_backtrace+0x13c/0x208
|  show_stack+0x34/0x58
|  dump_stack_lvl+0x150/0x1a8
|  dump_stack+0x20/0x30
|  bad_page+0xec/0x238
|  free_tail_pages_check+0x280/0x350
|  free_pcp_prepare+0x60c/0x830
|  free_unref_page+0x50/0x498
|  free_compound_page+0xcc/0x100
|  free_transhuge_page+0x1f0/0x2b8
|  destroy_large_folio+0x80/0xc8
|  __folio_put+0xc4/0xf8
|  gup_put_folio+0xd0/0x250
|  unpin_user_page+0xcc/0x128
|  io_buffer_unmap+0xec/0x2c0
|  __io_sqe_buffers_unregister+0xa4/0x1e0
|  io_ring_exit_work+0x68c/0x1188
|  process_one_work+0x91c/0x1a58
|  worker_thread+0x48c/0xe30
|  kthread+0x278/0x2f0
|  ret_from_fork+0x10/0x20

Mark reports an issue with the recent patches coalescing compound pages
while registering them in io_uring. The reason is that we try to drop
excessive references with folio_put_refs(), but pages were acquired
with pin_user_pages(), which has extra accounting and so should be put
down with matching unpin_user_pages() or at least gup_put_folio().

As a fix unpin_user_pages() all but first page instead, and let's figure
out a better API after.

Fixes: 57bebf807e2abcf8 ("io_uring/rsrc: optimise registered huge pages")
Reported-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Tested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/10efd5507d6d1f05ea0f3c601830e08767e189bd.1678980230.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/rsrc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'io_uring')

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 09a16d709cb5..e2bac9f89902 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1235,7 +1235,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 			}
 		}
 		if (folio) {
-			folio_put_refs(folio, nr_pages - 1);
+			/*
+			 * The pages are bound to the folio, it doesn't
+			 * actually unpin them but drops all but one reference,
+			 * which is usually put down by io_buffer_unmap().
+			 * Note, needs a better helper.
+			 */
+			unpin_user_pages(&pages[1], nr_pages - 1);
 			nr_pages = 1;
 		}
 	}
-- 
cgit v1.2.3