summaryrefslogtreecommitdiffstats
path: root/net/xdp/xsk.c
diff options
context:
space:
mode:
authorBjörn Töpel <bjorn.topel@intel.com>2020-05-20 21:20:53 +0200
committerAlexei Starovoitov <ast@kernel.org>2020-05-21 17:31:26 -0700
commit2b43470add8c8ff1e1ee28dffc5c5df97e955d09 (patch)
treef2e61f5891abf48c657365a324d091cdf3922ff0 /net/xdp/xsk.c
parent89e4a376e3a3dab639a3947a6c7cf5d461d1aa4c (diff)
downloadlinux-2b43470add8c8ff1e1ee28dffc5c5df97e955d09.tar.gz
linux-2b43470add8c8ff1e1ee28dffc5c5df97e955d09.tar.bz2
linux-2b43470add8c8ff1e1ee28dffc5c5df97e955d09.zip
xsk: Introduce AF_XDP buffer allocation API
In order to simplify AF_XDP zero-copy enablement for NIC driver developers, a new AF_XDP buffer allocation API is added. The implementation is based on a single core (single producer/consumer) buffer pool for the AF_XDP UMEM. A buffer is allocated using the xsk_buff_alloc() function, and returned using xsk_buff_free(). If a buffer is disassociated with the pool, e.g. when a buffer is passed to an AF_XDP socket, a buffer is said to be released. Currently, the release function is only used by the AF_XDP internals and not visible to the driver. Drivers using this API should register the XDP memory model with the new MEM_TYPE_XSK_BUFF_POOL type. The API is defined in net/xdp_sock_drv.h. The buffer type is struct xdp_buff, and follows the lifetime of regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to a NAPI context. In other words, the API is not replacing xdp_frames. In addition to introducing the API and implementations, the AF_XDP core is migrated to use the new APIs. rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test robot) Added headroom/chunk size getter. (Maxim/Björn) v1->v2: Swapped SoBs. (Maxim) v2->v3: Initialize struct xdp_buff member frame_sz. (Björn) Add API to query the DMA address of a frame. (Maxim) Do DMA sync for CPU till the end of the frame to handle possible growth (frame_sz). (Maxim) Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com
Diffstat (limited to 'net/xdp/xsk.c')
-rw-r--r--net/xdp/xsk.c147
1 files changed, 54 insertions, 93 deletions
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 8bda654e82ec..6933f0d494ba 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -117,76 +117,67 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
-/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
- * each page. This is only required in copy mode.
- */
-static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
- u32 len, u32 metalen)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- void *to_buf = xdp_umem_get_data(umem, addr);
-
- addr = xsk_umem_add_offset_to_addr(addr);
- if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) {
- void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
- u64 page_start = addr & ~(PAGE_SIZE - 1);
- u64 first_len = PAGE_SIZE - (addr - page_start);
-
- memcpy(to_buf, from_buf, first_len);
- memcpy(next_pg_addr, from_buf + first_len,
- len + metalen - first_len);
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+ u64 addr;
+ int err;
- return;
+ addr = xp_get_handle(xskb);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len);
+ if (err) {
+ xs->rx_dropped++;
+ return err;
}
- memcpy(to_buf, from_buf, len + metalen);
+ xp_release(xskb);
+ return 0;
}
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
- u64 offset = xs->umem->headroom;
- u64 addr, memcpy_addr;
- void *from_buf;
+ void *from_buf, *to_buf;
u32 metalen;
- int err;
-
- if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
- len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
- xs->rx_dropped++;
- return -ENOSPC;
- }
- if (unlikely(xdp_data_meta_unsupported(xdp))) {
- from_buf = xdp->data;
+ if (unlikely(xdp_data_meta_unsupported(from))) {
+ from_buf = from->data;
+ to_buf = to->data;
metalen = 0;
} else {
- from_buf = xdp->data_meta;
- metalen = xdp->data - xdp->data_meta;
+ from_buf = from->data_meta;
+ metalen = from->data - from->data_meta;
+ to_buf = to->data - metalen;
}
- memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
-
- offset += metalen;
- addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- err = xskq_prod_reserve_desc(xs->rx, addr, len);
- if (!err) {
- xskq_cons_release(xs->umem->fq);
- xdp_return_buff(xdp);
- return 0;
- }
-
- xs->rx_dropped++;
- return err;
+ memcpy(to_buf, from_buf, len + metalen);
}
-static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
+ bool explicit_free)
{
- int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len);
+ struct xdp_buff *xsk_xdp;
+ int err;
- if (err)
+ if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
+ xs->rx_dropped++;
+ return -ENOSPC;
+ }
+
+ xsk_xdp = xsk_buff_alloc(xs->umem);
+ if (!xsk_xdp) {
xs->rx_dropped++;
+ return -ENOSPC;
+ }
- return err;
+ xsk_copy_xdp(xsk_xdp, xdp, len);
+ err = __xsk_rcv_zc(xs, xsk_xdp, len);
+ if (err) {
+ xsk_buff_free(xsk_xdp);
+ return err;
+ }
+ if (explicit_free)
+ xdp_return_buff(xdp);
+ return 0;
}
static bool xsk_is_bound(struct xdp_sock *xs)
@@ -199,7 +190,8 @@ static bool xsk_is_bound(struct xdp_sock *xs)
return false;
}
-static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
+ bool explicit_free)
{
u32 len;
@@ -211,8 +203,10 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
len = xdp->data_end - xdp->data;
- return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
- __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+ return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY ||
+ xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
+ __xsk_rcv_zc(xs, xdp, len) :
+ __xsk_rcv(xs, xdp, len, explicit_free);
}
static void xsk_flush(struct xdp_sock *xs)
@@ -224,46 +218,11 @@ static void xsk_flush(struct xdp_sock *xs)
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
- u32 metalen = xdp->data - xdp->data_meta;
- u32 len = xdp->data_end - xdp->data;
- u64 offset = xs->umem->headroom;
- void *buffer;
- u64 addr;
int err;
spin_lock_bh(&xs->rx_lock);
-
- if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
- err = -EINVAL;
- goto out_unlock;
- }
-
- if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
- len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
- err = -ENOSPC;
- goto out_drop;
- }
-
- addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- buffer = xdp_umem_get_data(xs->umem, addr);
- memcpy(buffer, xdp->data_meta, len + metalen);
-
- addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
- err = xskq_prod_reserve_desc(xs->rx, addr, len);
- if (err)
- goto out_drop;
-
- xskq_cons_release(xs->umem->fq);
- xskq_prod_submit(xs->rx);
-
- spin_unlock_bh(&xs->rx_lock);
-
- xs->sk.sk_data_ready(&xs->sk);
- return 0;
-
-out_drop:
- xs->rx_dropped++;
-out_unlock:
+ err = xsk_rcv(xs, xdp, false);
+ xsk_flush(xs);
spin_unlock_bh(&xs->rx_lock);
return err;
}
@@ -273,7 +232,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
int err;
- err = xsk_rcv(xs, xdp);
+ err = xsk_rcv(xs, xdp, true);
if (err)
return err;
@@ -404,7 +363,7 @@ static int xsk_generic_xmit(struct sock *sk)
skb_put(skb, len);
addr = desc.addr;
- buffer = xdp_umem_get_data(xs->umem, addr);
+ buffer = xsk_buff_raw_get_data(xs->umem, addr);
err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
@@ -860,6 +819,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
&xs->umem->cq;
err = xsk_init_queue(entries, q, true);
+ if (optname == XDP_UMEM_FILL_RING)
+ xp_set_fq(xs->umem->pool, *q);
mutex_unlock(&xs->mutex);
return err;
}