From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:38 +0200 Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap Introduce the capability to attach an eBPF program to cpumap entries. The idea behind this feature is to add the possibility to define on which CPU run the eBPF program if the underlying hw does not support RSS. Current supported verdicts are XDP_DROP and XDP_PASS. This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu sample available in the kernel tree to identify possible performance regressions. Results show there are no observable differences in packet-per-second: $./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1 rx: 354.8 Kpps rx: 356.0 Kpps rx: 356.8 Kpps rx: 356.3 Kpps rx: 356.6 Kpps rx: 356.6 Kpps rx: 356.7 Kpps rx: 355.8 Kpps rx: 356.8 Kpps rx: 356.8 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org --- net/core/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b61075828358..b820527f0a8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5448,6 +5448,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -8875,6 +8877,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, + "BPF_XDP_CPUMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); -- cgit v1.2.3 From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:23 +0200 Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols. When called, SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what bind() API allows to express. Two use-cases driving this work are: (1) steer packets destined to an IP range, on fixed port to a socket 192.0.2.0/24, port 80 -> NGINX socket (2) steer packets destined to an IP address, on any port to a socket 198.51.100.1, any port -> L7 proxy socket In its run-time context program receives information about the packet that triggered the socket lookup. Namely IP version, L4 protocol identifier, and address 4-tuple. Context can be further extended to include ingress interface identifier. To select a socket BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...) helper to record the selection. Transport layer then uses the selected socket as a result of socket lookup. In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail. This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on local delivery path in ipv4 and ipv6 stacks. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com --- net/core/filter.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index bdd2382e655d..d099436b3ff5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9229,6 +9229,186 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) -- cgit v1.2.3 From 1559b4aa1db443096af493c7d621dc156054babe Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:25 +0200 Subject: inet: Run SK_LOOKUP BPF program on socket lookup Run a BPF program before looking up a listening socket on the receive path. Program selects a listening socket to yield as result of socket lookup by calling bpf_sk_assign() helper and returning SK_PASS code. Program can revert its decision by assigning a NULL socket with bpf_sk_assign(). Alternatively, BPF program can also fail the lookup by returning with SK_DROP, or let the lookup continue as usual with SK_PASS on return, when no socket has been selected with bpf_sk_assign(). This lets the user match packets with listening sockets freely at the last possible point on the receive path, where we know that packets are destined for local delivery after undergoing policing, filtering, and routing. With BPF code selecting the socket, directing packets destined to an IP range or to a port range to a single socket becomes possible. In case multiple programs are attached, they are run in series in the order in which they were attached. The end result is determined from return codes of all the programs according to following rules: 1. If any program returned SK_PASS and selected a valid socket, the socket is used as result of socket lookup. 2. If more than one program returned SK_PASS and selected a socket, last selection takes effect. 3. If any program returned SK_DROP, and no program returned SK_PASS and selected a socket, socket lookup fails with -ECONNREFUSED. 4. If all programs returned SK_PASS and none of them selected a socket, socket lookup continues to htable-based lookup. Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-5-jakub@cloudflare.com --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index d099436b3ff5..2bd129b5ae74 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9230,6 +9230,9 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; +DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); +EXPORT_SYMBOL(bpf_sk_lookup_enabled); + BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, struct sock *, sk, u64, flags) { -- cgit v1.2.3 From bc4f0548f683a3d53359cef15f088d2d5bb4bc39 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 20 Jul 2020 09:33:58 -0700 Subject: bpf: Compute bpf_skc_to_*() helper socket btf ids at build time Currently, socket types (struct tcp_sock, udp_sock, etc.) used by bpf_skc_to_*() helpers are computed when vmlinux_btf is first built in the kernel. Commit 5a2798ab32ba ("bpf: Add BTF_ID_LIST/BTF_ID/BTF_ID_UNUSED macros") implemented a mechanism to compute btf_ids at kernel build time which can simplify kernel implementation and reduce runtime overhead by removing in-kernel btf_id calculation. This patch did exactly this, removing in-kernel btf_id computation and utilizing build-time btf_id computation. If CONFIG_DEBUG_INFO_BTF is not defined, BTF_ID_LIST will define an array with size of 5, which is not enough for btf_sock_ids. So define its own static array if CONFIG_DEBUG_INFO_BTF is not defined. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200720163358.1393023-1-yhs@fb.com --- net/core/filter.c | 49 ++++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 2bd129b5ae74..5a65fb4b95ff 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9426,19 +9426,19 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) * sock_common as the first argument in its memory layout. */ #define BTF_SOCK_TYPE_xxx \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock") \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock") + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, inet_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, inet_connection_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, inet_request_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, inet_timewait_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, request_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, sock_common) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, tcp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, tcp_request_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) enum { #define BTF_SOCK_TYPE(name, str) name, @@ -9447,26 +9447,13 @@ BTF_SOCK_TYPE_xxx MAX_BTF_SOCK_TYPE, }; -static int btf_sock_ids[MAX_BTF_SOCK_TYPE]; - -#ifdef CONFIG_BPF_SYSCALL -static const char *bpf_sock_types[] = { -#define BTF_SOCK_TYPE(name, str) str, +#ifdef CONFIG_DEBUG_INFO_BTF +BTF_ID_LIST(btf_sock_ids) +#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE -}; - -void init_btf_sock_ids(struct btf *btf) -{ - int i, btf_id; - - for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) { - btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i], - BTF_KIND_STRUCT); - if (btf_id > 0) - btf_sock_ids[i] = btf_id; - } -} +#else +static u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; #endif static bool check_arg_btf_id(u32 btf_id, u32 arg) -- cgit v1.2.3 From fce557bcef119a1bc5ab3cb02678cf454bcaf424 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 20 Jul 2020 09:34:02 -0700 Subject: bpf: Make btf_sock_ids global tcp and udp bpf_iter can reuse some socket ids in btf_sock_ids, so make it global. I put the extern definition in btf_ids.h as a central place so it can be easily discovered by developers. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200720163402.1393427-1-yhs@fb.com --- net/core/filter.c | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 5a65fb4b95ff..654c346b7d91 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9421,39 +9421,13 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } -/* Define a list of socket types which can be the argument for - * skc_to_*_sock() helpers. All these sockets should have - * sock_common as the first argument in its memory layout. - */ -#define BTF_SOCK_TYPE_xxx \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, inet_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, inet_connection_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, inet_request_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, inet_timewait_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, request_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, sock_common) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, tcp_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, tcp_request_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) - -enum { -#define BTF_SOCK_TYPE(name, str) name, -BTF_SOCK_TYPE_xxx -#undef BTF_SOCK_TYPE -MAX_BTF_SOCK_TYPE, -}; - #ifdef CONFIG_DEBUG_INFO_BTF -BTF_ID_LIST(btf_sock_ids) +BTF_ID_LIST_GLOBAL(btf_sock_ids) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE #else -static u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; +u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; #endif static bool check_arg_btf_id(u32 btf_id, u32 arg) -- cgit v1.2.3