author    Linus Torvalds <torvalds@linux-foundation.org>    2024-09-16 13:29:00 +0200
committer Linus Torvalds <torvalds@linux-foundation.org>    2024-09-16 13:29:00 +0200
commit    3a4d319a8fb5a9bbdf5b31ef32841eb286b1dcc2 (patch)
tree      a9d890edc874d231729308c164de5be310d33651 /io_uring/rsrc.c
parent    69a3a0a45a2f72412c2ba31761cc9193bb746fef (diff)
parent    7cc2a6eadcd7a5aa36ac63e6659f5c6138c7f4d2 (diff)
Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:

 - NAPI fixes and cleanups (Pavel, Olivier)

 - Add support for absolute timeouts (Pavel)

 - Fixes for io-wq/sqpoll affinities (Felix)

 - Efficiency improvements for dealing with huge pages (Chenliang)

 - Support for a minwait mode, where the application essentially has
   two timeouts - a smaller one that defines the batch timeout, and the
   overall larger one similar to what we had before. This enables
   efficient use of batching based on count + timeout, while still
   working well with periods of less intensive workloads

 - Use ITER_UBUF for single segment sends

 - Add support for incremental buffer consumption. Right now each
   operation will always consume a full buffer. With incremental
   consumption, a recv/read operation only consumes the part of the
   buffer that it needs to satisfy the operation

 - Add support for GCOV for io_uring, to help retain a high coverage of
   test to code ratio

 - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly
   converted to a blocking retry

 - Add support for cloning registered buffers from one ring to another

 - Misc cleanups (Anuj, me)

* tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits)
  io_uring: add IORING_REGISTER_COPY_BUFFERS method
  io_uring/register: provide helper to get io_ring_ctx from 'fd'
  io_uring/rsrc: add reference count to struct io_mapped_ubuf
  io_uring/rsrc: clear 'slot' entry upfront
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  ...
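As a rough userspace sketch of the buffer-cloning feature pulled in here
(IORING_REGISTER_COPY_BUFFERS, implemented by io_register_copy_buffers()
further down in this diff): the field and flag names below mirror what the
kernel side reads (src_fd, flags, pad, IORING_REGISTER_SRC_REGISTERED), but
the exact uapi struct layout, the nr_args value of 1, and the need for
6.12+ uapi headers are assumptions, not something this page spells out.

/* Hedged sketch, not taken from this page: clone the registered buffers
 * of one ring into another via IORING_REGISTER_COPY_BUFFERS. Requires
 * 6.12+ uapi headers providing struct io_uring_copy_buffers and the
 * IORING_REGISTER_SRC_REGISTERED flag (assumption).
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int clone_registered_buffers(int dst_ring_fd, int src_ring_fd,
				    int src_is_registered)
{
	struct io_uring_copy_buffers buf;

	memset(&buf, 0, sizeof(buf));	/* pad must be zero, see memchr_inv() check */
	buf.src_fd = src_ring_fd;
	if (src_is_registered)		/* src_fd is a registered ring index */
		buf.flags |= IORING_REGISTER_SRC_REGISTERED;

	/* nr_args of 1 is an assumption about the register opcode's contract */
	return syscall(__NR_io_uring_register, dst_ring_fd,
		       IORING_REGISTER_COPY_BUFFERS, &buf, 1);
}

Per the -EBUSY check at the top of io_register_copy_buffers() below, cloning
only works into a ring that has no buffers registered yet.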
Diffstat (limited to 'io_uring/rsrc.c')
-rw-r--r--  io_uring/rsrc.c | 245
1 file changed, 203 insertions(+), 42 deletions(-)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 453867add7ca..a7164aa7d13e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -17,6 +17,7 @@
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
+#include "register.h"
struct io_rsrc_update {
struct file *file;
@@ -114,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
struct io_mapped_ubuf *imu = *slot;
unsigned int i;
+ *slot = NULL;
if (imu != &dummy_ubuf) {
+ if (!refcount_dec_and_test(&imu->refs))
+ return;
for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
kvfree(imu);
}
- *slot = NULL;
}
static void io_rsrc_put_work(struct io_rsrc_node *node)
@@ -855,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+ struct io_imu_folio_data *data, int nr_folios)
+{
+ struct page **page_array = *pages, **new_array = NULL;
+ int nr_pages_left = *nr_pages, i, j;
+
+ /* Store head pages only*/
+ new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!new_array)
+ return false;
+
+ new_array[0] = compound_head(page_array[0]);
+ /*
+ * The pages are bound to the folio, it doesn't
+ * actually unpin them but drops all but one reference,
+ * which is usually put down by io_buffer_unmap().
+ * Note, needs a better helper.
+ */
+ if (data->nr_pages_head > 1)
+ unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+ j = data->nr_pages_head;
+ nr_pages_left -= data->nr_pages_head;
+ for (i = 1; i < nr_folios; i++) {
+ unsigned int nr_unpin;
+
+ new_array[i] = page_array[j];
+ nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+ data->nr_pages_mid - 1);
+ if (nr_unpin)
+ unpin_user_pages(&page_array[j+1], nr_unpin);
+ j += data->nr_pages_mid;
+ nr_pages_left -= data->nr_pages_mid;
+ }
+ kvfree(page_array);
+ *pages = new_array;
+ *nr_pages = nr_folios;
+ return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+ struct io_imu_folio_data *data)
+{
+ struct page **page_array = *pages;
+ struct folio *folio = page_folio(page_array[0]);
+ unsigned int count = 1, nr_folios = 1;
+ int i;
+
+ if (*nr_pages <= 1)
+ return false;
+
+ data->nr_pages_mid = folio_nr_pages(folio);
+ if (data->nr_pages_mid == 1)
+ return false;
+
+ data->folio_shift = folio_shift(folio);
+ /*
+ * Check if pages are contiguous inside a folio, and all folios have
+ * the same page count except for the head and tail.
+ */
+ for (i = 1; i < *nr_pages; i++) {
+ if (page_folio(page_array[i]) == folio &&
+ page_array[i] == page_array[i-1] + 1) {
+ count++;
+ continue;
+ }
+
+ if (nr_folios == 1) {
+ if (folio_page_idx(folio, page_array[i-1]) !=
+ data->nr_pages_mid - 1)
+ return false;
+
+ data->nr_pages_head = count;
+ } else if (count != data->nr_pages_mid) {
+ return false;
+ }
+
+ folio = page_folio(page_array[i]);
+ if (folio_size(folio) != (1UL << data->folio_shift) ||
+ folio_page_idx(folio, page_array[i]) != 0)
+ return false;
+
+ count = 1;
+ nr_folios++;
+ }
+ if (nr_folios == 1)
+ data->nr_pages_head = count;
+
+ return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage)
@@ -864,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unsigned long off;
size_t size;
int ret, nr_pages, i;
- struct folio *folio = NULL;
+ struct io_imu_folio_data data;
+ bool coalesced;
*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base)
@@ -879,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done;
}
- /* If it's a huge page, try to coalesce them into a single bvec entry */
- if (nr_pages > 1) {
- folio = page_folio(pages[0]);
- for (i = 1; i < nr_pages; i++) {
- /*
- * Pages must be consecutive and on the same folio for
- * this to work
- */
- if (page_folio(pages[i]) != folio ||
- pages[i] != pages[i - 1] + 1) {
- folio = NULL;
- break;
- }
- }
- if (folio) {
- /*
- * The pages are bound to the folio, it doesn't
- * actually unpin them but drops all but one reference,
- * which is usually put down by io_buffer_unmap().
- * Note, needs a better helper.
- */
- unpin_user_pages(&pages[1], nr_pages - 1);
- nr_pages = 1;
- }
- }
+ /* If it's huge page(s), try to coalesce them into fewer bvec entries */
+ coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu)
@@ -915,23 +988,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done;
}
- off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len;
/* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
+ imu->folio_shift = PAGE_SHIFT;
+ imu->folio_mask = PAGE_MASK;
+ if (coalesced) {
+ imu->folio_shift = data.folio_shift;
+ imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+ }
+ refcount_set(&imu->refs, 1);
+ off = (unsigned long) iov->iov_base & ~imu->folio_mask;
*pimu = imu;
ret = 0;
- if (folio) {
- bvec_set_page(&imu->bvec[0], pages[0], size, off);
- goto done;
- }
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
- vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
off = 0;
size -= vec_len;
@@ -1042,23 +1118,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* we know that:
*
* 1) it's a BVEC iter, we set it up
- * 2) all bvecs are PAGE_SIZE in size, except potentially the
+ * 2) all bvecs are the same in size, except potentially the
* first and last bvec
*
* So just find our index, and adjust the iterator afterwards.
* If the offset is within the first bvec (or the whole first
* bvec, just use iov_iter_advance(). This makes it easier
* since we can just skip the first segment, which may not
- * be PAGE_SIZE aligned.
+ * be folio_size aligned.
*/
const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) {
- /*
- * Note, huge pages buffers consists of one large
- * bvec entry and should always go this way. The other
- * branch doesn't expect non PAGE_SIZE'd chunks.
- */
iter->bvec = bvec;
iter->count -= offset;
iter->iov_offset = offset;
@@ -1067,14 +1138,104 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
/* skip first vec */
offset -= bvec->bv_len;
- seg_skip = 1 + (offset >> PAGE_SHIFT);
+ seg_skip = 1 + (offset >> imu->folio_shift);
iter->bvec = bvec + seg_skip;
iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset;
- iter->iov_offset = offset & ~PAGE_MASK;
+ iter->iov_offset = offset & ~imu->folio_mask;
}
}
return 0;
}
+
+static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+{
+ struct io_mapped_ubuf **user_bufs;
+ struct io_rsrc_data *data;
+ int i, ret, nbufs;
+
+ /*
+ * Drop our own lock here. We'll setup the data we need and reference
+ * the source buffers, then re-grab, check, and assign at the end.
+ */
+ mutex_unlock(&ctx->uring_lock);
+
+ mutex_lock(&src_ctx->uring_lock);
+ ret = -ENXIO;
+ nbufs = src_ctx->nr_user_bufs;
+ if (!nbufs)
+ goto out_unlock;
+ ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
+ if (ret)
+ goto out_unlock;
+
+ ret = -ENOMEM;
+ user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
+ if (!user_bufs)
+ goto out_free_data;
+
+ for (i = 0; i < nbufs; i++) {
+ struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
+
+ refcount_inc(&src->refs);
+ user_bufs[i] = src;
+ }
+
+ /* Have a ref on the bufs now, drop src lock and re-grab our own lock */
+ mutex_unlock(&src_ctx->uring_lock);
+ mutex_lock(&ctx->uring_lock);
+ if (!ctx->user_bufs) {
+ ctx->user_bufs = user_bufs;
+ ctx->buf_data = data;
+ ctx->nr_user_bufs = nbufs;
+ return 0;
+ }
+
+ /* someone raced setting up buffers, dump ours */
+ for (i = 0; i < nbufs; i++)
+ io_buffer_unmap(ctx, &user_bufs[i]);
+ io_rsrc_data_free(data);
+ kfree(user_bufs);
+ return -EBUSY;
+out_free_data:
+ io_rsrc_data_free(data);
+out_unlock:
+ mutex_unlock(&src_ctx->uring_lock);
+ mutex_lock(&ctx->uring_lock);
+ return ret;
+}
+
+/*
+ * Copy the registered buffers from the source ring whose file descriptor
+ * is given in the src_fd to the current ring. This is identical to registering
+ * the buffers with ctx, except faster as mappings already exist.
+ *
+ * Since the memory is already accounted once, don't account it again.
+ */
+int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_copy_buffers buf;
+ bool registered_src;
+ struct file *file;
+ int ret;
+
+ if (ctx->user_bufs || ctx->nr_user_bufs)
+ return -EBUSY;
+ if (copy_from_user(&buf, arg, sizeof(buf)))
+ return -EFAULT;
+ if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
+ return -EINVAL;
+ if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
+ return -EINVAL;
+
+ registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
+ file = io_uring_register_get_file(buf.src_fd, registered_src);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ ret = io_copy_buffers(ctx, file->private_data);
+ if (!registered_src)
+ fput(file);
+ return ret;
+}
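
To make the folio-granular math that io_import_fixed() now uses (seg_skip and
iov_offset computed with imu->folio_shift/folio_mask instead of PAGE_SHIFT/
PAGE_MASK) concrete, here is a small standalone sketch of the same offset
arithmetic. The function and struct names are made up for illustration and
are not kernel API; only the arithmetic mirrors the patch.

/* Illustrative only: how an offset into a registered buffer maps to a
 * bvec index and intra-bvec offset once bvecs are folio-sized.
 */
#include <stddef.h>
#include <stdio.h>

struct fixed_buf_pos {
	size_t seg;	/* index of the bvec to start from */
	size_t off;	/* byte offset inside that bvec */
};

/* first_len is the length of bvec[0], which may be shorter than a folio
 * because the user address need not be folio aligned ('off' in the patch).
 */
static struct fixed_buf_pos fixed_buf_seek(size_t offset, size_t first_len,
					   unsigned int folio_shift)
{
	size_t folio_mask = (1UL << folio_shift) - 1;
	struct fixed_buf_pos pos;

	if (offset < first_len) {		/* stays within the first bvec */
		pos.seg = 0;
		pos.off = offset;
		return pos;
	}
	offset -= first_len;			/* skip first vec */
	pos.seg = 1 + (offset >> folio_shift);	/* seg_skip in the patch */
	pos.off = offset & folio_mask;		/* offset & ~imu->folio_mask */
	return pos;
}

int main(void)
{
	/* A 4 MiB buffer backed by 2 MiB huge pages coalesces into two
	 * folio-sized bvecs (folio_shift == 21) instead of 1024 PAGE_SIZE
	 * bvecs; an offset of 3 MiB lands 1 MiB into the second bvec.
	 */
	struct fixed_buf_pos p = fixed_buf_seek(3 << 20, 2 << 20, 21);

	printf("seg=%zu off=%zu\n", p.seg, p.off);	/* seg=1 off=1048576 */
	return 0;
}

For buffers that are not coalesced, imu->folio_shift stays at PAGE_SHIFT and
imu->folio_mask at PAGE_MASK, so the computation degenerates to exactly what
the old PAGE_SIZE-based code did.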