summary | refs | log | tree | commit | diff | stats
path: root/io_uring
diff options
context:
space:
mode:
Diffstat (limited to 'io_uring')
-rw-r--r-- io_uring/io_uring.c | 24
-rw-r--r-- io_uring/kbuf.c | 2
-rw-r--r-- io_uring/rsrc.c | 109
-rw-r--r-- io_uring/zcrx.c | 56
-rw-r--r-- io_uring/zcrx.h | 6
5 files changed, 122 insertions, 75 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c6209fe44cb1..a2b256e96d5d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -872,10 +872,15 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
lockdep_assert(!io_wq_current_is_worker());
lockdep_assert_held(&ctx->uring_lock);
- __io_cq_lock(ctx);
- posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
+ if (!ctx->lockless_cq) {
+ spin_lock(&ctx->completion_lock);
+ posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
+ }
+
ctx->submit_state.cq_flush = true;
- __io_cq_unlock_post(ctx);
return posted;
}
@@ -1078,21 +1083,22 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
while (node) {
req = container_of(node, struct io_kiocb, io_task_work.node);
node = node->next;
- if (sync && last_ctx != req->ctx) {
+ if (last_ctx != req->ctx) {
if (last_ctx) {
- flush_delayed_work(&last_ctx->fallback_work);
+ if (sync)
+ flush_delayed_work(&last_ctx->fallback_work);
percpu_ref_put(&last_ctx->refs);
}
last_ctx = req->ctx;
percpu_ref_get(&last_ctx->refs);
}
- if (llist_add(&req->io_task_work.node,
- &req->ctx->fallback_llist))
- schedule_delayed_work(&req->ctx->fallback_work, 1);
+ if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist))
+ schedule_delayed_work(&last_ctx->fallback_work, 1);
}
if (last_ctx) {
- flush_delayed_work(&last_ctx->fallback_work);
+ if (sync)
+ flush_delayed_work(&last_ctx->fallback_work);
percpu_ref_put(&last_ctx->refs);
}
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 098109259671..953d5e742569 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -504,6 +504,8 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
p->nbufs = tmp;
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
+ if (!p->len)
+ return -EINVAL;
if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
&size))
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 5e64a8bb30a4..f80a77c4973f 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -175,6 +175,18 @@ void io_rsrc_cache_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->imu_cache, kfree);
}
+/*
+ * Reset the tag of every populated slot in @data to zero.
+ *
+ * Empty (NULL) slots are skipped. In this patch the helper is invoked on the
+ * failure paths of file and buffer registration, right before the table is
+ * unregistered.
+ * NOTE(review): presumably this keeps the teardown from reporting tags for a
+ * registration that never succeeded - confirm against the unregister path.
+ */
+static void io_clear_table_tags(struct io_rsrc_data *data)
+{
+	int idx;
+
+	for (idx = 0; idx < data->nr; idx++) {
+		struct io_rsrc_node *cur = data->nodes[idx];
+
+		if (!cur)
+			continue;
+		cur->tag = 0;
+	}
+}
+
__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
struct io_rsrc_data *data)
{
@@ -583,6 +595,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
return 0;
fail:
+ io_clear_table_tags(&ctx->file_table.data);
io_sqe_files_unregister(ctx);
return ret;
}
@@ -902,8 +915,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
ctx->buf_table = data;
- if (ret)
+ if (ret) {
+ io_clear_table_tags(&ctx->buf_table);
io_sqe_buffers_unregister(ctx);
+ }
return ret;
}
@@ -1017,10 +1032,33 @@ static int validate_fixed_range(u64 buf_addr, size_t len,
return 0;
}
+/*
+ * Set up @iter over @len bytes of the kernel-registered buffer @imu,
+ * starting @offset bytes into the buffer.
+ *
+ * The iterator is first built over the leading @offset + @len bytes of
+ * imu->bvec and then advanced by @offset with iov_iter_advance(). When the
+ * request ends before the end of the buffer, iter->nr_segs is trimmed to the
+ * segments the request actually touches.
+ *
+ * Always returns 0.
+ */
+static int io_import_kbuf(int ddir, struct iov_iter *iter,
+			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
+{
+	size_t count = len + offset;
+
+	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
+	iov_iter_advance(iter, offset);
+
+	if (count < imu->len) {
+		const struct bio_vec *bvec = iter->bvec;
+
+		/*
+		 * After the advance the data begins iter->iov_offset bytes
+		 * into the first remaining bvec. Measure the span from the
+		 * start of that bvec; otherwise a request that starts
+		 * mid-segment and spills into the next one is counted one
+		 * segment short.
+		 */
+		len += iter->iov_offset;
+		while (len > bvec->bv_len) {
+			len -= bvec->bv_len;
+			bvec++;
+		}
+		iter->nr_segs = 1 + bvec - iter->bvec;
+	}
+	return 0;
+}
+
static int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len)
{
+ const struct bio_vec *bvec;
+ size_t folio_mask;
+ unsigned nr_segs;
size_t offset;
int ret;
@@ -1032,56 +1070,35 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
if (!(imu->dir & (1 << ddir)))
return -EFAULT;
- /*
- * Might not be a start of buffer, set size appropriately
- * and advance us to the beginning.
- */
offset = buf_addr - imu->ubuf;
- iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
- if (offset) {
- /*
- * Don't use iov_iter_advance() here, as it's really slow for
- * using the latter parts of a big fixed buffer - it iterates
- * over each segment manually. We can cheat a bit here for user
- * registered nodes, because we know that:
- *
- * 1) it's a BVEC iter, we set it up
- * 2) all bvecs are the same in size, except potentially the
- * first and last bvec
- *
- * So just find our index, and adjust the iterator afterwards.
- * If the offset is within the first bvec (or the whole first
- * bvec, just use iov_iter_advance(). This makes it easier
- * since we can just skip the first segment, which may not
- * be folio_size aligned.
- */
- const struct bio_vec *bvec = imu->bvec;
-
- /*
- * Kernel buffer bvecs, on the other hand, don't necessarily
- * have the size property of user registered ones, so we have
- * to use the slow iter advance.
- */
- if (offset < bvec->bv_len) {
- iter->count -= offset;
- iter->iov_offset = offset;
- } else if (imu->is_kbuf) {
- iov_iter_advance(iter, offset);
- } else {
- unsigned long seg_skip;
+ if (imu->is_kbuf)
+ return io_import_kbuf(ddir, iter, imu, len, offset);
- /* skip first vec */
- offset -= bvec->bv_len;
- seg_skip = 1 + (offset >> imu->folio_shift);
+ /*
+ * Don't use iov_iter_advance() here, as it's really slow for
+ * using the latter parts of a big fixed buffer - it iterates
+ * over each segment manually. We can cheat a bit here for user
+ * registered nodes, because we know that:
+ *
+ * 1) it's a BVEC iter, we set it up
+ * 2) all bvecs are the same in size, except potentially the
+ * first and last bvec
+ */
+ folio_mask = (1UL << imu->folio_shift) - 1;
+ bvec = imu->bvec;
+ if (offset >= bvec->bv_len) {
+ unsigned long seg_skip;
- iter->bvec += seg_skip;
- iter->nr_segs -= seg_skip;
- iter->count -= bvec->bv_len + offset;
- iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
- }
+ /* skip first vec */
+ offset -= bvec->bv_len;
+ seg_skip = 1 + (offset >> imu->folio_shift);
+ bvec += seg_skip;
+ offset &= folio_mask;
}
-
+ nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
+ iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
+ iter->iov_offset = offset;
return 0;
}
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 80d4a6f71d29..fe86606b9f30 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -26,6 +26,11 @@
#include "zcrx.h"
#include "rsrc.h"
+/* Return the zcrx interface queue stored in the page pool's mp_priv field. */
+static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
+{
+	struct io_zcrx_ifq *ifq = pp->mp_priv;
+
+	return ifq;
+}
+
#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
@@ -46,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
+ guard(mutex)(&ifq->dma_lock); /* scoped guard on ifq->dma_lock, the same lock io_zcrx_map_area() takes; held until the function returns */
+
if (area->is_mapped)
__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
+ area->is_mapped = false; /* cleared unconditionally so a later io_zcrx_map_area() maps again instead of returning early */
}
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
int i;
+ guard(mutex)(&ifq->dma_lock);
+ if (area->is_mapped)
+ return 0;
+
for (i = 0; i < area->nia.num_niovs; i++) {
struct net_iov *niov = &area->nia.niovs[i];
dma_addr_t dma;
@@ -181,7 +193,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
kvfree(area->nia.niovs);
kvfree(area->user_refs);
if (area->pages) {
- unpin_user_pages(area->pages, area->nia.num_niovs);
+ unpin_user_pages(area->pages, area->nr_folios);
kvfree(area->pages);
}
kfree(area);
@@ -192,7 +204,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_area_reg *area_reg)
{
struct io_zcrx_area *area;
- int i, ret, nr_pages;
+ int i, ret, nr_pages, nr_iovs;
struct iovec iov;
if (area_reg->flags || area_reg->rq_area_token)
@@ -220,27 +232,28 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
area->pages = NULL;
goto err;
}
- area->nia.num_niovs = nr_pages;
+ area->nr_folios = nr_iovs = nr_pages;
+ area->nia.num_niovs = nr_iovs;
- area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
+ area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->nia.niovs)
goto err;
- area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
+ area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->freelist)
goto err;
- for (i = 0; i < nr_pages; i++)
+ for (i = 0; i < nr_iovs; i++)
area->freelist[i] = i;
- area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
+ area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->user_refs)
goto err;
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_iovs; i++) {
struct net_iov *niov = &area->nia.niovs[i];
niov->owner = &area->nia;
@@ -248,7 +261,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
atomic_set(&area->user_refs[i], 0);
}
- area->free_count = nr_pages;
+ area->free_count = nr_iovs;
area->ifq = ifq;
/* we're only supporting one area per ifq for now */
area->area_id = 0;
@@ -274,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
ifq->ctx = ctx;
spin_lock_init(&ifq->lock);
spin_lock_init(&ifq->rq_lock);
+ mutex_init(&ifq->dma_lock);
return ifq;
}
@@ -323,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
put_device(ifq->dev);
io_free_rbuf_ring(ifq);
+ mutex_destroy(&ifq->dma_lock);
kfree(ifq);
}
@@ -353,7 +368,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
return -EFAULT;
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
return -EFAULT;
- if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+ if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
+ reg.__resv2 || reg.zcrx_id)
return -EINVAL;
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
return -EINVAL;
@@ -393,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
goto err;
get_device(ifq->dev);
- ret = io_zcrx_map_area(ifq, ifq->area);
- if (ret)
- goto err;
-
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -584,7 +596,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
- struct io_zcrx_ifq *ifq = pp->mp_priv;
+ struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
/* pp should already be ensuring that */
if (unlikely(pp->alloc.count))
@@ -616,7 +628,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
static int io_pp_zc_init(struct page_pool *pp)
{
- struct io_zcrx_ifq *ifq = pp->mp_priv;
+ struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
+ int ret;
if (WARN_ON_ONCE(!ifq))
return -EINVAL;
@@ -629,13 +642,17 @@ static int io_pp_zc_init(struct page_pool *pp)
if (pp->p.dma_dir != DMA_FROM_DEVICE)
return -EOPNOTSUPP;
+ ret = io_zcrx_map_area(ifq, ifq->area);
+ if (ret)
+ return ret;
+
percpu_ref_get(&ifq->ctx->refs);
return 0;
}
static void io_pp_zc_destroy(struct page_pool *pp)
{
- struct io_zcrx_ifq *ifq = pp->mp_priv;
+ struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
struct io_zcrx_area *area = ifq->area;
if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
@@ -664,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
struct io_zcrx_ifq *ifq = mp_priv;
io_zcrx_drop_netdev(ifq);
+ if (ifq->area)
+ io_zcrx_unmap_area(ifq, ifq->area);
+
p->mp_ops = NULL;
p->mp_priv = NULL;
}
@@ -790,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
niov = netmem_to_net_iov(frag->netmem);
if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
- niov->pp->mp_priv != ifq)
+ io_pp_to_ifq(niov->pp) != ifq)
return -EFAULT;
if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 706cc7300780..f2bc811f022c 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -15,6 +15,7 @@ struct io_zcrx_area {
bool is_mapped;
u16 area_id;
struct page **pages;
+ unsigned long nr_folios;
/* freelist */
spinlock_t freelist_lock ____cacheline_aligned_in_smp;
@@ -26,17 +27,18 @@ struct io_zcrx_ifq {
struct io_ring_ctx *ctx;
struct io_zcrx_area *area;
+ spinlock_t rq_lock ____cacheline_aligned_in_smp;
struct io_uring *rq_ring;
struct io_uring_zcrx_rqe *rqes;
- u32 rq_entries;
u32 cached_rq_head;
- spinlock_t rq_lock;
+ u32 rq_entries;
u32 if_rxq;
struct device *dev;
struct net_device *netdev;
netdevice_tracker netdev_tracker;
spinlock_t lock;
+ struct mutex dma_lock;
};
#if defined(CONFIG_IO_URING_ZCRX)