From 303def35f64e37bcd5401d202889f5fbc0241179 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 17 May 2018 14:16:58 -0700 Subject: bpf: allow sk_msg programs to read sock fields Currently sk_msg programs only have access to the raw data. However, when building policies it is often useful to make them specific to the socket endpoint. This allows using the socket tuple as input into filters, etc. This patch adds ctx access to the sock fields. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/core/filter.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 6d0d1560bd70..aec5ebafb262 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5148,18 +5148,23 @@ static bool sk_msg_is_valid_access(int off, int size, switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; + if (size != sizeof(__u64)) + return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; + if (size != sizeof(__u64)) + return false; break; + default: + if (size != sizeof(__u32)) + return false; } if (off < 0 || off >= sizeof(struct sk_msg_md)) return false; if (off % size != 0) return false; - if (size != sizeof(__u64)) - return false; return true; } @@ -5835,7 +5840,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_ops, local_ip4): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), @@ -6152,6 +6158,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct sk_msg_md, data): @@ -6164,6 +6171,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, data_end)); break; + case offsetof(struct sk_msg_md, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct sk_msg_md, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct sk_msg_md, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+ offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, local_ip6[0]) ... + offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct sk_msg_md, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct sk_msg_md, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct sk_msg_md, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct sk_msg_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; } return insn - insn_buf; -- cgit v1.2.3 From 4f74fede40df8dbdb5261d50682491126675aac3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 09:08:15 -0700 Subject: bpf: Add mtu checking to FIB forwarding helper Add a check that the egress MTU can handle the packet to be forwarded. If the MTU is less than the packet length, return 0, meaning the packet is expected to continue up the stack for help, e.g. fragmenting the packet or sending an ICMP error. The XDP path needs to leverage the FIB entry for an MTU on the route spec or an exception entry for a given destination. The skb path lets is_skb_forwardable decide if the packet can be sent.
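As an illustration of the resulting behavior (not part of this patch), a minimal XDP forwarding sketch built on the helper could look roughly as follows; the section name, the include path, and the way params is populated are assumptions:

	#include <linux/bpf.h>
	#include "bpf_helpers.h" /* assumed: tools/testing/selftests/bpf */

	SEC("xdp_fwd")
	int xdp_fwd_prog(struct xdp_md *ctx)
	{
		struct bpf_fib_lookup params = {};
		int rc;

		/* A real program parses the packet and fills in family,
		 * the src/dst addresses, tot_len (used for the MTU check
		 * described above) and the ingress ifindex.
		 */
		params.ifindex = ctx->ingress_ifindex;

		rc = bpf_fib_lookup(ctx, &params, sizeof(params), 0);
		/* At this point in the series the helper returns the
		 * egress ifindex on success; 0 means the stack should
		 * handle the packet, e.g. because it exceeds the egress
		 * MTU and needs fragmentation or an ICMP error.
		 */
		if (rc > 0)
			return bpf_redirect(rc, 0);
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";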
Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- net/core/filter.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index aec5ebafb262..ba3ff5aa575a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in_device *in_dev; struct neighbour *neigh; @@ -4098,6 +4098,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct fib_nh *nh; struct flowi4 fl4; int err; + u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4149,6 +4150,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) + return 0; + } + nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ @@ -4177,7 +4184,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, - u32 flags) + u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; @@ -4188,6 +4195,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, struct flowi6 fl6; int strict = 0; int oif; + u32 mtu; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) @@ -4250,6 +4258,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl6.flowi6_oif, NULL, strict); + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); + if (params->tot_len > mtu) + return 0; + } + if (f6i->fib6_nh.nh_lwtstate) return 0; @@ -4282,12 +4296,12 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, - flags); + flags, true); #endif } return 0; @@ -4306,20 +4320,34 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { + struct net *net = dev_net(skb->dev); + int index = 0; + if (plen < sizeof(*params)) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv4_fib_lookup(net, params, flags, false); + break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); + index = bpf_ipv6_fib_lookup(net, params, flags, false); + break; #endif } - return -ENOTSUPP; + + if (index > 0) { + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, index); + if (!is_skb_forwardable(dev, skb)) + index = 0; + } + + return index; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { -- cgit v1.2.3 From fe94cc290f535709d3c5ebd1e472dfd0aec7ee79 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux 
Date: Sun, 20 May 2018 14:58:14 +0100 Subject: bpf: Add IPv6 Segment Routing helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF seg6local hook should be powerful enough to enable users to implement most of the use-cases one could think of. After some thinking, we figured out that the following actions should be possible on an SRv6 packet, requiring 3 specific helpers: - bpf_lwt_seg6_store_bytes: Modify non-sensitive fields of the SRH - bpf_lwt_seg6_adjust_srh: Grow or shrink an SRH (to add/delete TLVs) - bpf_lwt_seg6_action: Apply some SRv6 network programming actions (specifically End.X, End.T, End.B6 and End.B6.Encap) The specifications of these helpers are provided in the patch (see include/uapi/linux/bpf.h). The non-sensitive fields of the SRH are the following: flags, tag and TLVs. The other fields cannot be modified, to maintain the SRH's integrity. Flags, tag and TLVs can easily be modified as their validity can be checked afterwards via seg6_validate_srh. It is not allowed to modify the segments directly. If one wants to add segments on the path, they should stack a new SRH using the End.B6 action via bpf_lwt_seg6_action. Growing, shrinking or editing TLVs via the helpers will flag the SRH as invalid, and it will have to be re-validated before re-entering the IPv6 layer. This flag is stored in a per-CPU buffer, along with the current header length in bytes. Storing the SRH length in bytes in the control block is mandatory when using bpf_lwt_seg6_adjust_srh. The Header Ext. Length field contains the SRH length rounded up to 8 bytes (a padding TLV can be inserted to ensure the 8-byte boundary). When adding/deleting TLVs within the BPF program, the SRH may temporarily be in an invalid state where its length cannot be rounded to 8 bytes without remainder, hence the need to store the length in bytes separately. The caller of the BPF program can then ensure that the SRH's final length is valid using this value. Again, a final SRH modified by a BPF program which does not respect the 8-byte boundary will be discarded as it will be considered invalid. Finally, a fourth helper is provided, bpf_lwt_push_encap, which is available from the LWT BPF IN hook, but not from the seg6local BPF one. This helper allows encapsulating a Segment Routing Header (either with a new outer IPv6 header, or by inlining it directly into the existing IPv6 header) into a non-SRv6 packet. This helper is required if we want to offer the possibility to dynamically encapsulate an SRH for non-SRv6 packets, as the BPF seg6local hook only works on traffic already containing an SRH. This is the BPF equivalent of the seg6 LWT infrastructure, which achieves the same purpose but with a static SRH per route. These helpers require CONFIG_IPV6=y (and not =m).
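To make the intended usage concrete, a hedged sketch of a program using the two SRH modification helpers follows; the section name, the offset computation, and the TLV layout are illustrative assumptions, not part of this patch:

	#include <linux/bpf.h>
	#include <linux/ipv6.h>
	#include <linux/seg6.h>
	#include "bpf_helpers.h" /* assumed: tools/testing/selftests/bpf */

	SEC("add_tlv")
	int add_pad_tlv(struct __sk_buff *skb)
	{
		/* Assumed layout: TLVs begin right after the segment list
		 * of an SRH carrying a single segment; a real program
		 * would parse the SRH to compute this offset.
		 */
		__u32 off = sizeof(struct ipv6hdr) +
			    sizeof(struct ipv6_sr_hdr) +
			    sizeof(struct in6_addr);
		/* Hypothetical 8-byte padding TLV (type/length assumed) */
		__u8 tlv[8] = { 4, 6, 0, 0, 0, 0, 0, 0 };

		if (bpf_lwt_seg6_adjust_srh(skb, off, sizeof(tlv)) < 0)
			return BPF_DROP;
		if (bpf_lwt_seg6_store_bytes(skb, off, tlv, sizeof(tlv)) < 0)
			return BPF_DROP;
		/* The SRH is now flagged invalid in the per-CPU state and
		 * will be re-validated before re-entering the IPv6 layer.
		 */
		return BPF_OK;
	}

	char _license[] SEC("license") = "GPL";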
Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- net/core/filter.c | 285 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 262 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index ba3ff5aa575a..2e05dcfda6d7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -64,6 +64,10 @@ #include #include #include +#include +#include +#include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3363,28 +3367,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == bpf_skb_change_tail || - func == bpf_skb_adjust_room || - func == bpf_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_xdp_adjust_tail) - return true; - - return false; -} - static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { @@ -4360,6 +4342,264 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ + int err; + struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; + + if (!seg6_validate_srh(srh, len)) + return -EINVAL; + + switch (type) { + case BPF_LWT_ENCAP_SEG6_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EBADMSG; + + err = seg6_do_srh_inline(skb, srh); + break; + case BPF_LWT_ENCAP_SEG6: + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); + break; + default: + return -EINVAL; + } + + bpf_compute_data_pointers(skb); + if (err) + return err; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, + u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + case BPF_LWT_ENCAP_SEG6: + case BPF_LWT_ENCAP_SEG6_INLINE: + return bpf_push_seg6_encap(skb, type, hdr, len); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_push_encap_proto = { + .func = bpf_lwt_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_tlvs, *srh_end, *ptr; + struct ipv6_sr_hdr *srh; + int srhoff = 0; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + + ptr = skb->data + offset; + if (ptr >= srh_tlvs && ptr + len <= 
srh_end) + srh_state->valid = 0; + else if (ptr < (void *)&srh->flags || + ptr + len > (void *)&srh->segments) + return -EFAULT; + + if (unlikely(bpf_try_make_writable(skb, offset + len))) + return -EFAULT; + + memcpy(skb->data + offset, from, len); + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { + .func = bpf_lwt_seg6_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh; + int srhoff = 0; + int err; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + if (!srh_state->valid) { + if (unlikely((srh_state->hdrlen & 7) != 0)) + return -EBADMSG; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + return -EBADMSG; + + srh_state->valid = 1; + } + + switch (action) { + case SEG6_LOCAL_ACTION_END_X: + if (param_len != sizeof(struct in6_addr)) + return -EINVAL; + return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); + case SEG6_LOCAL_ACTION_END_T: + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_B6: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + case SEG6_LOCAL_ACTION_END_B6_ENCAP: + err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, + param, param_len); + if (!err) + srh_state->hdrlen = + ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + return err; + default: + return -EINVAL; + } +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { + .func = bpf_lwt_seg6_action, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, + s32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + void *srh_end, *srh_tlvs, *ptr; + struct ipv6_sr_hdr *srh; + struct ipv6hdr *hdr; + int srhoff = 0; + int ret; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + + srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + + ((srh->first_segment + 1) << 4)); + srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + + srh_state->hdrlen); + ptr = skb->data + offset; + + if (unlikely(ptr < srh_tlvs || ptr > srh_end)) + return -EFAULT; + if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) + return -EFAULT; + + if (len > 0) { + ret = skb_cow_head(skb, len); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, offset, len); + } else { + ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); + } + + bpf_compute_data_pointers(skb); + if (unlikely(ret < 0)) + return ret; + + hdr = (struct ipv6hdr *)skb->data; + hdr->payload_len = 
htons(skb->len - sizeof(struct ipv6hdr)); + + srh_state->hdrlen += len; + srh_state->valid = 0; + return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ + return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { + .func = bpf_lwt_seg6_adjust_srh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) +{ + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_head || + func == bpf_skb_change_tail || + func == bpf_skb_adjust_room || + func == bpf_skb_pull_data || + func == bpf_clone_redirect || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail || + func == bpf_lwt_push_encap || + func == bpf_lwt_seg6_store_bytes || + func == bpf_lwt_seg6_adjust_srh || + func == bpf_lwt_seg6_action + ) + return true; + + return false; +} + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -4774,7 +5014,6 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } - /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, -- cgit v1.2.3 From cd3092c7f8db8c320ea7f1aa7c1adeac2450f43a Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:15 +0100 Subject: bpf: Split lwt inout verifier structures The new bpf_lwt_push_encap helper should only be accessible within the LWT BPF IN hook, and not the OUT one, as calling it there may lead to a kernel panic. At the moment, both LWT BPF IN and OUT share the same list of helpers, whose calls are authorized by the verifier. This patch separates the verifier ops for the IN and OUT hooks, and allows the IN hook to call the bpf_lwt_push_encap helper. This patch also takes the opportunity to group all lwt_*_func_proto functions together for clarity. At the moment, sock_ops_func_proto sits in the middle of lwt_inout_func_proto and lwt_xmit_func_proto.
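To illustrate why the split matters, here is a hedged sketch of an IN-hook program pushing an SRH with bpf_lwt_push_encap; after this patch, the same call is rejected by the verifier on the OUT hook. The SRH values and names below are assumptions:

	#include <linux/bpf.h>
	#include <linux/in6.h>
	#include <linux/seg6.h>
	#include "bpf_helpers.h" /* assumed: tools/testing/selftests/bpf */

	SEC("encap_srh")
	int lwt_in_encap(struct __sk_buff *skb)
	{
		struct {
			struct ipv6_sr_hdr srh;
			struct in6_addr seg;
		} __attribute__((packed)) hdr = {};

		hdr.srh.hdrlen = 2;	/* (8 + 16) / 8 - 1 */
		hdr.srh.type = 4;	/* SRH routing type */
		hdr.srh.segments_left = 0;
		hdr.srh.first_segment = 0;
		/* hdr.seg = <SID of the segment to insert>; */

		if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_SEG6, &hdr,
				       sizeof(hdr)) < 0)
			return BPF_DROP;
		return BPF_OK;
	}

	char _license[] SEC("license") = "GPL";

Such a program would be attached on the receive side via iproute2, along the lines of (syntax assumed): ip -6 route add fc42::/64 encap bpf in obj encap.o section encap_srh dev eth0.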
Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- net/core/filter.c | 83 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 2e05dcfda6d7..5dc44309d124 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4783,33 +4783,6 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } -static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; - case BPF_FUNC_csum_diff: - return &bpf_csum_diff_proto; - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_proto; - case BPF_FUNC_get_route_realm: - return &bpf_get_route_realm_proto; - case BPF_FUNC_get_hash_recalc: - return &bpf_get_hash_recalc_proto; - case BPF_FUNC_perf_event_output: - return &bpf_skb_event_output_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_skb_under_cgroup: - return &bpf_skb_under_cgroup_proto; - default: - return bpf_base_func_proto(func_id); - } -} - static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4875,6 +4848,44 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_push_encap_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4906,7 +4917,7 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id, prog); + return lwt_out_func_proto(func_id, prog); } } @@ -6587,13 +6598,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_verifier_ops = { - .get_func_proto = lwt_inout_func_proto, +const struct bpf_verifier_ops lwt_in_verifier_ops = { + .get_func_proto = lwt_in_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_in_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops lwt_out_verifier_ops = { + 
.get_func_proto = lwt_out_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_prog_ops lwt_inout_prog_ops = { +const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -- cgit v1.2.3 From 004d4b274e2a1a895a0e5dc66158b90a7d463d44 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:16 +0100 Subject: ipv6: sr: Add seg6local action End.BPF This patch adds the End.BPF action to the LWT seg6local infrastructure. This action works like any other seg6local End action, meaning that an IPv6 header with an SRH is needed, whose DA has to be equal to the SID of the action. It will also advance the SRH to the next segment; the BPF program does not have to take care of this. Since the BPF program must not be a source of instability in the kernel, it is important to ensure that the integrity of the packet is maintained before yielding it back to the IPv6 layer. The hook hence keeps track of whether the SRH has been altered through the helpers, and re-validates its content if needed with seg6_validate_srh. The state kept for validation is stored in a per-CPU buffer. The BPF program is not allowed to directly write into the packet, and only some fields of the SRH can be altered through the helper bpf_lwt_seg6_store_bytes. Performance profiling has shown that the SRH re-validation does not induce a significant overhead. If the altered SRH is deemed invalid, the packet is dropped. This validation is also done before executing any action through bpf_lwt_seg6_action, and will not be performed again if the SRH is not modified after calling the action. The BPF program may return three return codes: - BPF_OK: the End.BPF action will look up the next destination through seg6_lookup_nexthop. - BPF_REDIRECT: if an action has been executed through the bpf_lwt_seg6_action helper, the BPF program should return this value, as the skb's destination is already set and the default lookup should not be performed. - BPF_DROP: the packet will be dropped.
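For context, a hedged sketch of a minimal End.BPF program under these rules (the section name is an assumption):

	#include <linux/bpf.h>
	#include "bpf_helpers.h" /* assumed: tools/testing/selftests/bpf */

	SEC("end_bpf")
	int do_end_bpf(struct __sk_buff *skb)
	{
		/* Inspect or edit the SRH with the seg6 helpers here. On
		 * BPF_OK the hook performs seg6_lookup_nexthop() itself;
		 * after a successful bpf_lwt_seg6_action() the program
		 * would return BPF_REDIRECT instead.
		 */
		return BPF_OK;
	}

	char _license[] SEC("license") = "GPL";

It could be attached with something like (iproute2 syntax modeled on the kernel selftests, assumed here): ip -6 route add fc00::1/128 encap seg6local action End.BPF endpoint obj prog.o sec end_bpf dev eth0.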
Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- net/core/filter.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 5dc44309d124..aa114c4acb25 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4921,6 +4921,21 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_lwt_seg6_store_bytes: + return &bpf_lwt_seg6_store_bytes_proto; + case BPF_FUNC_lwt_seg6_action: + return &bpf_lwt_seg6_action_proto; + case BPF_FUNC_lwt_seg6_adjust_srh: + return &bpf_lwt_seg6_adjust_srh_proto; + default: + return lwt_out_func_proto(func_id, prog); + } +} + static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6629,6 +6644,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { + .get_func_proto = lwt_seg6local_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { + .test_run = bpf_prog_test_run_skb, +}; + const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, -- cgit v1.2.3 From 67f29e07e131ffa13ea158c259a513f474c7df27 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:46 +0200 Subject: bpf: devmap introduce dev_map_enqueue Functionality is the same, but the ndo_xdp_xmit call is now simply invoked from inside the devmap.c code. V2: Fix compile issue reported by kbuild test robot V5: Cleanups requested by Daniel - Newlines before func definition - Use BUILD_BUG_ON checks - Remove unnecessary return value store in dev_map_enqueue Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index aa114c4acb25..c867106d3707 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3065,20 +3065,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: { - struct net_device *dev = fwd; - struct xdp_frame *xdpf; + struct bpf_dtab_netdev *dst = fwd; - if (!dev->netdev_ops->ndo_xdp_xmit) - return -EOPNOTSUPP; - - xdpf = convert_to_xdp_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - - /* TODO: move to inside map code instead, for bulk support - * err = dev_map_enqueue(dev, xdp); - */ - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + err = dev_map_enqueue(dst, xdp); if (err) return err; __dev_map_insert_ctx(map, index); -- cgit v1.2.3 From 38edddb81172e8b8decb057c0cd23271583a5fa0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:57 +0200 Subject: xdp: add tracepoint for devmap like cpumap have Notice how this allows us to get XDP statistics without affecting XDP performance, as the tracepoint is no longer activated on a per-packet basis. V5: Spotted by John Fastabend: 'sent' also counted 'drops' in this patch; a later patch corrects this, but it was a mistake in this intermediate step.
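As a usage note, the new tracepoint can be sampled out-of-band without touching the packet path; the event name is an assumption based on this patch:

	perf record -a -e xdp:xdp_devmap_xmit sleep 2
	perf script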
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index c867106d3707..36cf2f87d742 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3067,7 +3067,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, case BPF_MAP_TYPE_DEVMAP: { struct bpf_dtab_netdev *dst = fwd; - err = dev_map_enqueue(dst, xdp); + err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); -- cgit v1.2.3 From 389ab7f01af988c2a1ec5617eb0c7e220df1ef1c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:07 +0200 Subject: xdp: introduce xdp_return_frame_rx_napi When sending an xdp_frame through the xdp_do_redirect call, error cases can happen where the xdp_frame needs to be dropped, and returning an -errno code isn't sufficient/possible any longer (e.g. for the cpumap case). This is already fully supported by simply calling xdp_return_frame. This patch is an optimization, which provides xdp_return_frame_rx_napi, a faster variant for these error cases. It takes advantage of XDP RX running under NAPI protection. This change is mostly relevant for drivers using the page_pool allocator, as they can take advantage of this (tested with mlx5). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/xdp.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/xdp.c b/net/core/xdp.c index bf6758f74339..cb8c4e061a5a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,7 +308,13 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -static void xdp_return(void *data, struct xdp_mem_info *mem) +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolean + * is used for those call sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); if (xa) - page_pool_put_page(xa->page_pool, page); + page_pool_put_page(xa->page_pool, page, napi_direct); else put_page(page); rcu_read_unlock(); @@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) void xdp_return_frame(struct xdp_frame *xdpf) { - xdp_return(xdpf->data, &xdpf->mem); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + __xdp_return(xdpf->data, &xdpf->mem, true); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + void xdp_return_buff(struct xdp_buff *xdp) { - xdp_return(xdp->data, &xdp->rxq->mem); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_buff); -- cgit v1.2.3 From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:12 +0200 Subject: xdp: change ndo_xdp_xmit API to support bulking This patch changes the API for ndo_xdp_xmit to support bulking xdp_frames.
When the kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown. Most of the slowdown is caused by indirect DMA API function calls, but also by the net_device->ndo_xdp_xmit() call. Benchmarking the patch with CONFIG_RETPOLINE, using xdp_redirect_map with a single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed improved performance: for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps With frames available as a bulk inside the driver's ndo_xdp_xmit call, further optimizations are possible, like bulk DMA-mapping for TX. Testing without CONFIG_RETPOLINE shows the same performance for physical NIC drivers. The virtual NIC driver tun sees a huge performance boost, as it can avoid per-frame producer locking and instead amortize the locking cost over the bulk. V2: Fix compile errors reported by kbuild test robot V4: Isolated ndo, driver changes and callers. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 36cf2f87d742..1d75f9322275 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3039,7 +3039,7 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int err; + int sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; @@ -3049,9 +3049,9 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) - return err; + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); + if (sent <= 0) + return sent; dev->netdev_ops->ndo_xdp_flush(dev); return 0; } -- cgit v1.2.3
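To show what the bulked API looks like from the driver side, here is a hedged sketch modeled on the drivers converted elsewhere in this series; all foo_* names are hypothetical:

	static int foo_xdp_xmit(struct net_device *dev, int n,
				struct xdp_frame **frames)
	{
		struct foo_priv *priv = netdev_priv(dev);
		int i, drops = 0;

		for (i = 0; i < n; i++) {
			struct xdp_frame *xdpf = frames[i];

			/* DMA-map and place the frame on the TX ring; on
			 * failure, hand it back to its memory allocator
			 * via the NAPI-protected fast path and count it
			 * as dropped.
			 */
			if (foo_xmit_xdp_frame(priv, xdpf) < 0) {
				xdp_return_frame_rx_napi(xdpf);
				drops++;
			}
		}
		/* Bulk semantics: report how many frames were accepted;
		 * the caller still kicks the hardware via ndo_xdp_flush
		 * at this point in the series.
		 */
		return n - drops;
	}

This also shows the kind of error path that xdp_return_frame_rx_napi, added earlier in this series, was introduced for.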