From a2c09ac0fb6756d7085c359b6c020ef8b4205e0f Mon Sep 17 00:00:00 2001 From: Inju Song Date: Tue, 27 Mar 2018 23:14:40 +0900 Subject: netfilter: ipvs: Keep latest weight of destination The hashing table in scheduler such as source hash or maglev hash should ignore the changed weight to 0 and allow changing the weight from/to non-0 values. So, struct ip_vs_dest needs to keep weight with latest non-0 weight. Signed-off-by: Inju Song Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..0ac795b41ab8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -668,6 +668,7 @@ struct ip_vs_dest { volatile unsigned int flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ + atomic_t last_weight; /* server latest weight */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ -- cgit v1.2.3 From d1361840f8c519eaee9a78ffe09e4f0a1b586846 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:35 -0700 Subject: tcp: fix SO_RCVLOWAT and RCVBUF autotuning Applications might use SO_RCVLOWAT on TCP socket hoping to receive one [E]POLLIN event only when a given amount of bytes are ready in socket receive queue. Problem is that receive autotuning is not aware of this constraint, meaning sk_rcvbuf might be too small to allow all bytes to be stored. Add a new (struct proto_ops)->set_rcvlowat method so that a protocol can override the default setsockopt(SO_RCVLOWAT) behavior. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 1 + include/net/tcp.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 2248a052061d..6554d3ba4396 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -197,6 +197,7 @@ struct proto_ops { int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); + int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ diff --git a/include/net/tcp.h b/include/net/tcp.h index 9c9b3768b350..b2318242cad8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -402,6 +402,7 @@ void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); +int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -- cgit v1.2.3 From 03f45c883c6f391ed4fff8292415b35bd1107519 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:37 -0700 Subject: tcp: avoid extra wakeups for SO_RCVLOWAT users SO_RCVLOWAT is properly handled in tcp_poll(), so that POLLIN is only generated when enough bytes are available in receive queue, after David change (commit c7004482e8dc "tcp: Respect SO_RCVLOWAT in tcp_poll().") But TCP still calls sk->sk_data_ready() for each chunk added in receive queue, meaning thread is awaken, and goes back to sleep shortly after. Tested: tcp_mmap test program, receiving 32768 MB of data with SO_RCVLOWAT set to 512KB -> Should get ~2 wakeups (c-switches) per MB, regardless of how many (tiny or big) packets were received. High speed (mostly full size GRO packets) received 32768 MB (100 % mmap'ed) in 8.03112 s, 34.2266 Gbit, cpu usage user:0.037 sys:1.404, 43.9758 usec per MB, 65497 c-switches received 32768 MB (99.9954 % mmap'ed) in 7.98453 s, 34.4263 Gbit, cpu usage user:0.03 sys:1.422, 44.3115 usec per MB, 65485 c-switches Low speed (sender is ratelimited and sends 1-MSS at a time, so GRO is not helping) received 22474.5 MB (100 % mmap'ed) in 6015.35 s, 0.0313414 Gbit, cpu usage user:0.05 sys:1.586, 72.7952 usec per MB, 44950 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b2318242cad8..0ee85c47c185 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); +void tcp_data_ready(struct sock *sk); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -- cgit v1.2.3 From 93ab6cc69162775201587cc9da00d5016dc890e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Apr 2018 10:33:38 -0700 Subject: tcp: implement mmap() for zero copy receive Some networks can make sure TCP payload can exactly fit 4KB pages, with well chosen MSS/MTU and architectures. Implement mmap() system call so that applications can avoid copying data without complex splice() games. Note that a successful mmap( X bytes) on TCP socket is consuming bytes, as if recvmsg() has been done. (tp->copied += X) Only PROT_READ mappings are accepted, as skb page frags are fundamentally shared and read only. If tcp_mmap() finds data that is not a full page, or a patch of urgent data, -EINVAL is returned, no bytes are consumed. Application must fallback to recvmsg() to read the problematic sequence. mmap() wont block, regardless of socket being in blocking or non-blocking mode. If not enough bytes are in receive queue, mmap() would return -EAGAIN, or -EIO if socket is in a state where no other bytes can be added into receive queue. An application might use SO_RCVLOWAT, poll() and/or ioctl( FIONREAD) to efficiently use mmap() On the sender side, MSG_EOR might help to clearly separate unaligned headers and 4K-aligned chunks if necessary. Tested: mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch. MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header) Without mmap() (tcp_mmap -s) received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit, cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit, cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit, cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit, cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches With mmap() on receiver (tcp_mmap -s -z) received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit, cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit, cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit, cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit, cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ee85c47c185..833154e3df17 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma); void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); -- cgit v1.2.3 From a5724fc3834643a975bd0db71f001ca65f4a8382 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Apr 2018 21:37:13 +0200 Subject: PCI: Add two more values for PCIe Max_Read_Request_Size This patch adds missing values for the max read request size. E.g. network driver r8169 uses a value of 4K. Signed-off-by: Heiner Kallweit Acked-by: Bjorn Helgaas Signed-off-by: David S. Miller --- include/uapi/linux/pci_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 103ba797a8f3..83ade9b5cf95 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -506,6 +506,8 @@ #define PCI_EXP_DEVCTL_READRQ_256B 0x1000 /* 256 Bytes */ #define PCI_EXP_DEVCTL_READRQ_512B 0x2000 /* 512 Bytes */ #define PCI_EXP_DEVCTL_READRQ_1024B 0x3000 /* 1024 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */ +#define PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */ #define PCI_EXP_DEVCTL_BCR_FLR 0x8000 /* Bridge Configuration Retry / FLR */ #define PCI_EXP_DEVSTA 10 /* Device Status */ #define PCI_EXP_DEVSTA_CED 0x0001 /* Correctable Error Detected */ -- cgit v1.2.3 From ef53e9e14714de2ce26eaae0244c07c426064d69 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 16 Apr 2018 15:07:13 -0700 Subject: net: Remove unused tcp_set_state tracepoint This tracepoint was replaced by inet_sock_set_state in 563e0bb and not used anywhere in the kernel anymore. Remove it. Signed-off-by: Andrey Ignatov Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 47 ---------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 878b2be7ce77..3dd68029d77a 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -166,53 +166,6 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_ARGS(sk) ); -TRACE_EVENT(tcp_set_state, - - TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), - - TP_ARGS(sk, oldstate, newstate), - - TP_STRUCT__entry( - __field(const void *, skaddr) - __field(int, oldstate) - __field(int, newstate) - __field(__u16, sport) - __field(__u16, dport) - __array(__u8, saddr, 4) - __array(__u8, daddr, 4) - __array(__u8, saddr_v6, 16) - __array(__u8, daddr_v6, 16) - ), - - TP_fast_assign( - struct inet_sock *inet = inet_sk(sk); - __be32 *p32; - - __entry->skaddr = sk; - __entry->oldstate = oldstate; - __entry->newstate = newstate; - - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - p32 = (__be32 *) __entry->saddr; - *p32 = inet->inet_saddr; - - p32 = (__be32 *) __entry->daddr; - *p32 = inet->inet_daddr; - - TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, - sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); - ), - - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", - __entry->sport, __entry->dport, - __entry->saddr, __entry->daddr, - __entry->saddr_v6, __entry->daddr_v6, - show_tcp_state_name(__entry->oldstate), - show_tcp_state_name(__entry->newstate)) -); - TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), -- cgit v1.2.3 From 5ab073ffd326480a6185d096e9703f62ef92b86c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:26 +0200 Subject: xdp: introduce xdp_return_frame API and use in cpumap Introduce an xdp_return_frame API, and convert over cpumap as the first user, given it have queued XDP frame structure to leverage. V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck. V6: Remove comment that id will be added later (Req by Alex Duyck) V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b2362ddfa694..e4207699c410 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -33,16 +33,43 @@ * also mandatory during RX-ring setup. */ +enum xdp_mem_type { + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_MAX, +}; + +struct xdp_mem_info { + u32 type; /* enum xdp_mem_type, but known size type */ +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; + struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ + +static inline +void xdp_return_frame(void *data, struct xdp_mem_info *mem) +{ + if (mem->type == MEM_TYPE_PAGE_SHARED) + page_frag_free(data); + + if (mem->type == MEM_TYPE_PAGE_ORDER0) { + struct page *page = virt_to_page(data); /* Assumes order0 page*/ + + put_page(page); + } +} + int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator); #endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From 106ca27f2922e8de820d1bd3d79b1cbdf2d78eea Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:37 +0200 Subject: xdp: move struct xdp_buff from filter.h to xdp.h This is done to prepare for the next patch, and it is also nice to move this XDP related struct out of filter.h. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 24 +----------------------- include/net/xdp.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index fc4e8f91b03d..4da8b2308174 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -30,6 +30,7 @@ struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; +struct xdp_buff; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -500,14 +501,6 @@ struct bpf_skb_data_end { void *data_end; }; -struct xdp_buff { - void *data; - void *data_end; - void *data_meta; - void *data_hard_start; - struct xdp_rxq_info *rxq; -}; - struct sk_msg_buff { void *data; void *data_end; @@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); -/* Drivers not supporting XDP metadata can use this helper, which - * rejects any room expansion for metadata as a result. - */ -static __always_inline void -xdp_set_data_meta_invalid(struct xdp_buff *xdp) -{ - xdp->data_meta = xdp->data + 1; -} - -static __always_inline bool -xdp_data_meta_unsupported(const struct xdp_buff *xdp) -{ - return unlikely(xdp->data_meta > xdp->data); -} - void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); diff --git a/include/net/xdp.h b/include/net/xdp.h index e4207699c410..15f8ade008b5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -50,6 +50,13 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_buff { + void *data; + void *data_end; + void *data_meta; + void *data_hard_start; + struct xdp_rxq_info *rxq; +}; static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) @@ -72,4 +79,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + #endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From c0048cff8abb69c956ce1277d17a3f7a14e41522 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:42 +0200 Subject: xdp: introduce a new xdp_frame type This is needed to convert drivers tuntap and virtio_net. This is a generalization of what is done inside cpumap, which will be converted later. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 15f8ade008b5..756c42811e78 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -58,6 +58,46 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct xdp_frame { + void *data; + u16 len; + u16 headroom; + u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; +}; + +/* Convert xdp_buff to xdp_frame */ +static inline +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +{ + struct xdp_frame *xdp_frame; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) + return NULL; + + /* Store info in top of packet */ + xdp_frame = xdp->data_hard_start; + + xdp_frame->data = xdp->data; + xdp_frame->len = xdp->data_end - xdp->data; + xdp_frame->headroom = headroom - sizeof(*xdp_frame); + xdp_frame->metasize = metasize; + + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_frame->mem = xdp->rxq->mem; + + return xdp_frame; +} + static inline void xdp_return_frame(void *data, struct xdp_mem_info *mem) { -- cgit v1.2.3 From 1ffcbc8537d0bc32aaca7000cb9c904ec4b6300f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:47 +0200 Subject: tun: convert to use generic xdp_frame and xdp_return_frame API The tuntap driver invented it's own driver specific way of queuing XDP packets, by storing the xdp_buff information in the top of the XDP frame data. Convert it over to use the more generic xdp_frame structure. The main problem with the in-driver method is that the xdp_rxq_info pointer cannot be trused/used when dequeueing the frame. V3: Remove check based on feedback from Jason Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/if_tun.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index fd00170b494f..3d2996dc7d85 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -22,7 +22,7 @@ #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_buff(void *ptr); +bool tun_is_xdp_frame(void *ptr); void *tun_xdp_to_ptr(void *ptr); void *tun_ptr_to_xdp(void *ptr); void tun_ptr_free(void *ptr); @@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } -static inline bool tun_is_xdp_buff(void *ptr) +static inline bool tun_is_xdp_frame(void *ptr) { return false; } -- cgit v1.2.3 From 70280ed91cb8acb43e8fd7a8094840846c172ac5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:57 +0200 Subject: bpf: cpumap convert to use generic xdp_frame The generic xdp_frame format, was inspired by the cpumap own internal xdp_pkt format. It is now time to convert it over to the generic xdp_frame format. The cpumap needs one extra field dev_rx. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 756c42811e78..ea3773f94f65 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -67,6 +67,7 @@ struct xdp_frame { * while mem info is valid on remote CPU. */ struct xdp_mem_info mem; + struct net_device *dev_rx; /* used by cpumap */ }; /* Convert xdp_buff to xdp_frame */ -- cgit v1.2.3 From 8d5d88527587516bd58ff0f3810f07c38e65e2be Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:12 +0200 Subject: xdp: rhashtable with allocator ID to pointer mapping Use the IDA infrastructure for getting a cyclic increasing ID number, that is used for keeping track of each registered allocator per RX-queue xdp_rxq_info. Instead of using the IDR infrastructure, which uses a radix tree, use a dynamic rhashtable, for creating ID to pointer lookup table, because this is faster. The problem that is being solved here is that, the xdp_rxq_info pointer (stored in xdp_buff) cannot be used directly, as the guaranteed lifetime is too short. The info is needed on a (potentially) remote CPU during DMA-TX completion time . In an xdp_frame the xdp_mem_info is stored, when it got converted from an xdp_buff, which is sufficient for the simple page refcnt based recycle schemes. For more advanced allocators there is a need to store a pointer to the registered allocator. Thus, there is a need to guard the lifetime or validity of the allocator pointer, which is done through this rhashtable ID map to pointer. The removal and validity of of the allocator and helper struct xdp_mem_allocator is guarded by RCU. The allocator will be created by the driver, and registered with xdp_rxq_info_reg_mem_model(). It is up-to debate who is responsible for freeing the allocator pointer or invoking the allocator destructor function. In any case, this must happen via RCU freeing. Use the IDA infrastructure for getting a cyclic increasing ID number, that is used for keeping track of each registered allocator per RX-queue xdp_rxq_info. V4: Per req of Jason Wang - Use xdp_rxq_info_reg_mem_model() in all drivers implementing XDP_REDIRECT, even-though it's not strictly necessary when allocator==NULL for type MEM_TYPE_PAGE_SHARED (given it's zero). V6: Per req of Alex Duyck - Introduce rhashtable_lookup() call in later patch V8: Address sparse should be static warnings (from kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index ea3773f94f65..5f67c62540aa 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,6 +41,7 @@ enum xdp_mem_type { struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ + u32 id; }; struct xdp_rxq_info { @@ -99,18 +100,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -static inline -void xdp_return_frame(void *data, struct xdp_mem_info *mem) -{ - if (mem->type == MEM_TYPE_PAGE_SHARED) - page_frag_free(data); - - if (mem->type == MEM_TYPE_PAGE_ORDER0) { - struct page *page = virt_to_page(data); /* Assumes order0 page*/ - - put_page(page); - } -} +void xdp_return_frame(void *data, struct xdp_mem_info *mem); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); -- cgit v1.2.3 From ff7d6b27f894f1469dc51ccb828b7363ccd9799f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:17 +0200 Subject: page_pool: refurbish version of page_pool code Need a fast page recycle mechanism for ndo_xdp_xmit API for returning pages on DMA-TX completion time, which have good cross CPU performance, given DMA-TX completion time can happen on a remote CPU. Refurbish my page_pool code, that was presented[1] at MM-summit 2016. Adapted page_pool code to not depend the page allocator and integration into struct page. The DMA mapping feature is kept, even-though it will not be activated/used in this patchset. [1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf V2: Adjustments requested by Tariq - Changed page_pool_create return codes, don't return NULL, only ERR_PTR, as this simplifies err handling in drivers. V4: many small improvements and cleanups - Add DOC comment section, that can be used by kernel-doc - Improve fallback mode, to work better with refcnt based recycling e.g. remove a WARN as pointed out by Tariq e.g. quicker fallback if ptr_ring is empty. V5: Fixed SPDX license as pointed out by Alexei V6: Adjustments requested by Eric Dumazet - Adjust ____cacheline_aligned_in_smp usage/placement - Move rcu_head in struct page_pool - Free pages quicker on destroy, minimize resources delayed an RCU period - Remove code for forward/backward compat ABI interface V8: Issues found by kbuild test robot - Address sparse should be static warnings - Only compile+link when a driver use/select page_pool, mlx5 selects CONFIG_PAGE_POOL, although its first used in two patches Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 include/net/page_pool.h (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h new file mode 100644 index 000000000000..1fe77db59518 --- /dev/null +++ b/include/net/page_pool.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.h + * Author: Jesper Dangaard Brouer + * Copyright (C) 2016 Red Hat, Inc. + */ + +/** + * DOC: page_pool allocator + * + * This page_pool allocator is optimized for the XDP mode that + * uses one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * Basic use involve replacing alloc_pages() calls with the + * page_pool_alloc_pages() call. Drivers should likely use + * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). + * + * If page_pool handles DMA mapping (use page->private), then API user + * is responsible for invoking page_pool_put_page() once. In-case of + * elevated refcnt, the DMA state is released, assuming other users of + * the page will eventually call put_page(). + * + * If no DMA mapping is done, then it can act as shim-layer that + * fall-through to alloc_page. As no state is kept on the page, the + * regular put_page() call is sufficient. + */ +#ifndef _NET_PAGE_POOL_H +#define _NET_PAGE_POOL_H + +#include /* Needed by ptr_ring */ +#include +#include + +#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ +#define PP_FLAG_ALL PP_FLAG_DMA_MAP + +/* + * Fast allocation side cache array/stack + * + * The cache size and refill watermark is related to the network + * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX + * ring is usually refilled and the max consumed elements will be 64, + * thus a natural max size of objects needed in the cache. + * + * Keeping room for more objects, is due to XDP_DROP use-case. As + * XDP_DROP allows the opportunity to recycle objects directly into + * this array, as it shares the same softirq/NAPI protection. If + * cache is already full (or partly full) then the XDP_DROP recycles + * would have to take a slower code path. + */ +#define PP_ALLOC_CACHE_SIZE 128 +#define PP_ALLOC_CACHE_REFILL 64 +struct pp_alloc_cache { + u32 count; + void *cache[PP_ALLOC_CACHE_SIZE]; +}; + +struct page_pool_params { + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; /* Numa node id to allocate from pages from */ + struct device *dev; /* device, for DMA pre-mapping purposes */ + enum dma_data_direction dma_dir; /* DMA mapping direction */ +}; + +struct page_pool { + struct rcu_head rcu; + struct page_pool_params p; + + /* + * Data structure for allocation side + * + * Drivers allocation side usually already perform some kind + * of resource protection. Piggyback on this protection, and + * require driver to protect allocation side. + * + * For NIC drivers this means, allocate a page_pool per + * RX-queue. As the RX-queue is already protected by + * Softirq/BH scheduling and napi_schedule. NAPI schedule + * guarantee that a single napi_struct will only be scheduled + * on a single CPU (see napi_schedule). + */ + struct pp_alloc_cache alloc ____cacheline_aligned_in_smp; + + /* Data structure for storing recycled pages. + * + * Returning/freeing pages is more complicated synchronization + * wise, because free's can happen on remote CPUs, with no + * association with allocation resource. + * + * Use ptr_ring, as it separates consumer and producer + * effeciently, it a way that doesn't bounce cache-lines. + * + * TODO: Implement bulk return pages into this structure. + */ + struct ptr_ring ring; +}; + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); + +static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) +{ + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); + + return page_pool_alloc_pages(pool, gfp); +} + +struct page_pool *page_pool_create(const struct page_pool_params *params); + +void page_pool_destroy(struct page_pool *pool); + +/* Never call this directly, use helpers below */ +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct); + +static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +{ + __page_pool_put_page(pool, page, false); +} +/* Very limited use-cases allow recycle direct */ +static inline void page_pool_recycle_direct(struct page_pool *pool, + struct page *page) +{ + __page_pool_put_page(pool, page, true); +} + +#endif /* _NET_PAGE_POOL_H */ -- cgit v1.2.3 From 57d0a1c1ac9e6a836bbab4698ba2a2e03f64bf1b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:22 +0200 Subject: xdp: allow page_pool as an allocator type in xdp_return_frame New allocator type MEM_TYPE_PAGE_POOL for page_pool usage. The registered allocator page_pool pointer is not available directly from xdp_rxq_info, but it could be (if needed). For now, the driver should keep separate track of the page_pool pointer, which it should use for RX-ring page allocation. As suggested by Saeed, to maintain a symmetric API it is the drivers responsibility to allocate/create and free/destroy the page_pool. Thus, after the driver have called xdp_rxq_info_unreg(), it is drivers responsibility to free the page_pool, but with a RCU free call. This is done easily via the page_pool helper page_pool_destroy() (which avoids touching any driver code during the RCU callback, which could happen after the driver have been unloaded). V8: address issues found by kbuild test robot - Address sparse should be static warnings - Allow xdp.o to be compiled without page_pool.o V9: Remove inline from .c file, compiler knows best Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 14 ++++++++++++++ include/net/xdp.h | 3 +++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1fe77db59518..c79087153148 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -117,7 +117,12 @@ void __page_pool_put_page(struct page_pool *pool, static inline void page_pool_put_page(struct page_pool *pool, struct page *page) { + /* When page_pool isn't compiled-in, net/core/xdp.c doesn't + * allow registering MEM_TYPE_PAGE_POOL, but shield linker. + */ +#ifdef CONFIG_PAGE_POOL __page_pool_put_page(pool, page, false); +#endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, @@ -126,4 +131,13 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } +static inline bool is_page_pool_compiled_in(void) +{ +#ifdef CONFIG_PAGE_POOL + return true; +#else + return false; +#endif +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 5f67c62540aa..d0ee437753dc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -36,6 +36,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ + MEM_TYPE_PAGE_POOL, MEM_TYPE_MAX, }; @@ -44,6 +45,8 @@ struct xdp_mem_info { u32 id; }; +struct page_pool; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; -- cgit v1.2.3 From 039930945a72d9af5ff04ae9b9e60658a52e0770 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:32 +0200 Subject: xdp: transition into using xdp_frame for return API Changing API xdp_return_frame() to take struct xdp_frame as argument, seems like a natural choice. But there are some subtle performance details here that needs extra care, which is a deliberate choice. When de-referencing xdp_frame on a remote CPU during DMA-TX completion, result in the cache-line is change to "Shared" state. Later when the page is reused for RX, then this xdp_frame cache-line is written, which change the state to "Modified". This situation already happens (naturally) for, virtio_net, tun and cpumap as the xdp_frame pointer is the queued object. In tun and cpumap, the ptr_ring is used for efficiently transferring cache-lines (with pointers) between CPUs. Thus, the only option is to de-referencing xdp_frame. It is only the ixgbe driver that had an optimization, in which it can avoid doing the de-reference of xdp_frame. The driver already have TX-ring queue, which (in case of remote DMA-TX completion) have to be transferred between CPUs anyhow. In this data area, we stored a struct xdp_mem_info and a data pointer, which allowed us to avoid de-referencing xdp_frame. To compensate for this, a prefetchw is used for telling the cache coherency protocol about our access pattern. My benchmarks show that this prefetchw is enough to compensate the ixgbe driver. V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address and offset in dma_sync call") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index d0ee437753dc..137ad5f9f40f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) return xdp_frame; } -void xdp_return_frame(void *data, struct xdp_mem_info *mem); +void xdp_return_frame(struct xdp_frame *xdpf); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); -- cgit v1.2.3 From 44fa2dbd475996ddc8f3a0e6113dee983e0ee3aa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:37 +0200 Subject: xdp: transition into using xdp_frame for ndo_xdp_xmit Changing API ndo_xdp_xmit to take a struct xdp_frame instead of struct xdp_buff. This brings xdp_return_frame and ndp_xdp_xmit in sync. This builds towards changing the API further to become a bulk API, because xdp_buff is not a queue-able object while xdp_frame is. V4: Adjust for commit 59655a5b6c83 ("tuntap: XDP_TX can use native XDP") V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf44503ea81a..14e0777ffcfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1165,7 +1165,7 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. * void (*ndo_xdp_flush)(struct net_device *dev); @@ -1356,7 +1356,7 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_buff *xdp); + struct xdp_frame *xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; -- cgit v1.2.3 From 032234d8231909ac049f22ea3b408487e1c103eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 10:00:39 -0700 Subject: net/ipv6: Make __inet6_bind static BPF core gets access to __inet6_bind via ipv6_bpf_stub_impl, so it is not invoked directly outside of af_inet6.c. Make it static and move inet6_bind after to avoid forward declaration. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 836f31af1369..68b167d98879 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1044,8 +1044,6 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); -int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); -- cgit v1.2.3 From bdb7cc643fc9db8d6ed9a2b9e524e27ac5882029 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Mon, 16 Apr 2018 13:42:16 -0400 Subject: ipv6: Count interface receive statistics on the ingress netdev The statistics such as InHdrErrors should be counted on the ingress netdev rather than on the dev from the dst, which is the egress. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/addrconf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 378d601258be..8312cc25a3af 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -307,6 +307,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +/** + * __in6_dev_get_safely - get inet6_dev pointer from netdevice + * @dev: network device + * + * This is a safer version of __in6_dev_get + */ +static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) +{ + if (likely(dev)) + return rcu_dereference_rtnl(dev->ip6_ptr); + else + return NULL; +} + /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device -- cgit v1.2.3 From 72f6d71e491e6ce269b564865b21fab0a4402dd3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 17 Apr 2018 14:11:28 +0800 Subject: vxlan: add ttl inherit support Like tos inherit, ttl inherit should also means inherit the inner protocol's ttl values, which actually not implemented in vxlan yet. But we could not treat ttl == 0 as "use the inner TTL", because that would be used also when the "ttl" option is not specified and that would be a behavior change, and breaking real use cases. So add a different attribute IFLA_VXLAN_TTL_INHERIT when "ttl inherit" is specified with ip cmd. Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 11 +++++++++++ include/net/vxlan.h | 1 + include/uapi/linux/if_link.h | 1 + 3 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 540a4b4417bf..751646adc769 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -379,6 +379,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, + const struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + return iph->ttl; + else if (skb->protocol == htons(ETH_P_IPV6)) + return ((const struct ipv6hdr *)iph)->hop_limit; + else + return 0; +} + /* Propogate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index ad73d8b3fcc2..b99a02ae3934 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -262,6 +262,7 @@ struct vxlan_dev { #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 +#define VXLAN_F_TTL_INHERIT 0x10000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 68699f654118..b85266420bfb 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -516,6 +516,7 @@ enum { IFLA_VXLAN_COLLECT_METADATA, IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From a919525ad832d2bb1388b2303832a2307b30aeff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:07 -0700 Subject: net: Move fib_convert_metrics to metrics file Move logic of fib_convert_metrics into ip_metrics_convert. This allows the code that converts netlink attributes into metrics struct to be re-used in a later patch by IPv6. This is mostly a code move with the following changes to variable names: - fi->fib_net becomes net - fc_mx and fc_mx_len are passed as inputs pulled from fib_config - metrics array is passed as an input from fi->fib_metrics->metrics Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index ecffd843e7b8..dc4a2d6e58a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -396,6 +396,9 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk, return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics); + u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); -- cgit v1.2.3 From 7aef6859ee91ea867a3dff9ba47bca9b2de382f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:10 -0700 Subject: net/ipv6: Pass net to fib6_update_sernum Pass net namespace to fib6_update_sernum. It can not be marked const as fib6_new_sernum will change ipv6.fib6_sernum. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5e86fd9dc857..f0aaf1c8f1a8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -408,7 +408,7 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From afb1d4b59311a8252f67c214b37ec69d8100cb55 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:11 -0700 Subject: net/ipv6: Pass net namespace to route functions Pass network namespace reference into route add, delete and get functions. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 08b132381984..1130a1144dfd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -101,8 +101,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct rt6_info *); -int ip6_del_rt(struct rt6_info *); +int ip6_ins_rt(struct net *net, struct rt6_info *rt); +int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); @@ -137,7 +137,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, +struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, @@ -147,9 +147,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, +struct rt6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct rt6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); -- cgit v1.2.3 From e8478e80e5a74f4ce47b043735f0066588fb64c7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:13 -0700 Subject: net/ipv6: Save route type in rt6_info The RTN_ type for IPv6 FIB entries is currently embedded in rt6i_flags and dst.error. Since dst is going to be removed, it can no longer be relied on for FIB dumps so save the route type as fib6_type. fc_type is set in current users based on the algorithm in rt6_fill_node: - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST - else fc_type = RTN_UNICAST Similarly, fib6_type is set in the rt6_info templates based on the RTF_REJECT section of rt6_fill_node converting dst.error to RTN type. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0aaf1c8f1a8..0165820bbafb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,6 +174,7 @@ struct rt6_info { int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; -- cgit v1.2.3 From 5e670d844b2a4e47d1b9b9aceb14dd3c12a6d4bf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:14 -0700 Subject: net/ipv6: Move nexthop data to fib6_nh Introduce fib6_nh structure and move nexthop related data from rt6_info and rt6_info.dst to fib6_nh. References to dev, gateway or lwtstate from a FIB lookup perspective are converted to use fib6_nh; datapath references to dst version are left as is. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 16 ++++++++++++---- include/net/ip6_route.h | 6 +++--- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0165820bbafb..f0a88370ba95 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -127,6 +127,16 @@ struct rt6_exception { #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 +struct fib6_nh { + struct in6_addr nh_gw; + struct net_device *nh_dev; + struct lwtunnel_state *nh_lwtstate; + + unsigned int nh_flags; + atomic_t nh_upper_bound; + int nh_weight; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -149,12 +159,9 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; - atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; - unsigned int rt6i_nh_flags; - /* These are in a separate cache line. */ struct rt6key rt6i_dst ____cacheline_aligned_in_smp; u32 rt6i_flags; @@ -171,13 +178,14 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ - int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, unused:6; + + struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1130a1144dfd..655e13017a45 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -273,10 +273,10 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) { - return a->dst.dev == b->dst.dev && + return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && - ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && - !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); + ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && + !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } #endif -- cgit v1.2.3 From d4ead6b34b67fd711639324b6465a050bcb197d4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:16 -0700 Subject: net/ipv6: move metrics from dst to rt6_info Similar to IPv4, add fib metrics to the fib struct, which at the moment is rt6_info. Will be moved to fib6_info in a later patch. Copy metrics into dst by reference using refcount. To make the transition: - add dst_metrics to rt6_info. Default to dst_default_metrics if no metrics are passed during route add. No need for a separate pmtu entry; it can reference the MTU slot in fib6_metrics - ip6_convert_metrics allocates memory in the FIB entry and uses ip_metrics_convert to copy from netlink attribute to metrics entry - the convert metrics call is done in ip6_route_info_create simplifying the route add path + fib6_commit_metrics and fib6_copy_metrics and the temporary mx6_config are no longer needed - add fib6_metric_set helper to change the value of a metric in the fib entry since dst_metric_set can no longer be used - cow_metrics for IPv6 can drop to dst_cow_metrics_generic - rt6_dst_from_metrics_check is no longer needed - rt6_fill_node needs the FIB entry and dst as separate arguments to keep compatibility with existing output. Current dst address is renamed to dest. (to be consistent with IPv4 rt6_fill_node really should be split into 2 functions similar to fib_dump_info and rt_fill_info) - rt6_fill_node no longer needs the temporary metrics variable Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f0a88370ba95..1f8dc9d12abb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -94,11 +94,6 @@ struct fib6_gc_args { #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif -struct mx6_config { - const u32 *mx; - DECLARE_BITMAP(mx_valid, RTAX_MAX); -}; - /* * routing information * @@ -176,7 +171,6 @@ struct rt6_info { struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; - u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; @@ -185,6 +179,8 @@ struct rt6_info { should_flush:1, unused:6; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; }; @@ -390,8 +386,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack); + struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct rt6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, @@ -420,6 +415,12 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb); void fib6_update_sernum(struct net *net, struct rt6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +{ + return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); +} + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); -- cgit v1.2.3 From 14895687d36805f051bb54014c32e48e5937f7e1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:17 -0700 Subject: net/ipv6: move expires into rt6_info Add expires to rt6_info for FIB entries, and add fib6 helpers to manage it. Data path use of dst.expires remains. The transition is fairly straightforward: when working with fib entries, rt->dst.expires is just rt->expires, rt6_clean_expires is replaced with fib6_clean_expires, rt6_set_expires becomes fib6_set_expires, and rt6_check_expired becomes fib6_check_expired, where the fib6 versions are added by this patch. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1f8dc9d12abb..c73b985734f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -179,6 +179,7 @@ struct rt6_info { should_flush:1, unused:6; + unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct fib6_nh fib6_nh; @@ -197,6 +198,26 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline void fib6_clean_expires(struct rt6_info *f6i) +{ + f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->expires = 0; +} + +static inline void fib6_set_expires(struct rt6_info *f6i, + unsigned long expires) +{ + f6i->expires = expires; + f6i->rt6i_flags |= RTF_EXPIRES; +} + +static inline bool fib6_check_expired(const struct rt6_info *f6i) +{ + if (f6i->rt6i_flags & RTF_EXPIRES) + return time_after(jiffies, f6i->expires); + return false; +} + static inline void rt6_clean_expires(struct rt6_info *rt) { rt->rt6i_flags &= ~RTF_EXPIRES; @@ -211,11 +232,9 @@ static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { - struct rt6_info *rt; + if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) + rt0->dst.expires = rt0->from->expires; - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); - if (rt && rt != rt0) - rt0->dst.expires = rt->dst.expires; dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } -- cgit v1.2.3 From 421842edeaf62c4e180b687f5a4efca8c19c49ad Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:18 -0700 Subject: net/ipv6: Add fib6_null_entry ip6_null_entry will stay a dst based return for lookups that fail to match an entry. Add a new fib6_null_entry which constitutes the root node and leafs for fibs. Replace existing references to ip6_null_entry with the new fib6_null_entry when dealing with FIBs. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c29f09cfc9d7..74e4e1e449d5 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,8 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *ip6_null_entry; + struct rt6_info *fib6_null_entry; + struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; -- cgit v1.2.3 From 3b6761d18bc11f2af2a6fc494e9026d39593f22c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:20 -0700 Subject: net/ipv6: Move dst flags to booleans in fib entries Continuing to wean FIB paths off of dst_entry, use a bool to hold requests for certain dst settings. Add a helper to convert the flags to DST flags when a FIB entry is converted to a dst_entry. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c73b985734f5..159f651dee55 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -177,7 +177,10 @@ struct rt6_info { u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, - unused:6; + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; unsigned long expires; struct dst_metrics *fib6_metrics; -- cgit v1.2.3 From f8a1b43b709d8ef33a8de2f8f35095b4a4413713 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:21 -0700 Subject: net/ipv6: Create a neigh_lookup for FIB entries The router discovery code has a FIB entry and wants to validate the gateway has a neighbor entry. Refactor the existing dst_neigh_lookup for IPv6 and create a new function that takes the gateway and device and returns a neighbor entry. Use the new function in ndisc_router_discovery to validate the gateway. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 655e13017a45..cb6fb7e16a28 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,4 +279,7 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, struct sk_buff *skb, + const void *daddr); #endif -- cgit v1.2.3 From acb54e3cba404c20f07733f3222c0418a7724a5b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:22 -0700 Subject: net/ipv6: Add gfp_flags to route add functions Most FIB entries can be added using memory allocated with GFP_KERNEL. Add gfp_flags to ip6_route_add and addrconf_dst_alloc. Code paths that can be reached from the packet path (e.g., ndisc and autoconfig) or atomic notifiers use GFP_ATOMIC; paths from user context (adding addresses and routes) use GFP_KERNEL. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index cb6fb7e16a28..ff70266e30d7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -100,7 +100,8 @@ void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); -int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); @@ -138,7 +139,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast); + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); -- cgit v1.2.3 From 23fb93a4d3f118a900790066d03368a296dce0d6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:23 -0700 Subject: net/ipv6: Cleanup exception and cache route handling IPv6 FIB will only contain FIB entries with exception routes added to the FIB entry. Once this transformation is complete, FIB lookups will return a fib6_info with the lookup functions still returning a dst based rt6_info. The current code uses rt6_info for both paths and overloads the rt6_info variable usually called 'rt'. This patch introduces a new 'f6i' variable name for the result of the FIB lookup and keeps 'rt' as the dst based return variable. 'f6i' becomes a fib6_info in a later patch which is why it is introduced as f6i now; avoids the additional churn in the later patch. In addition, remove RTF_CACHE and dst checks from fib6 add and delete since they can not happen now and will never happen after the data type flip. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ff70266e30d7..686cdc7f356a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -106,7 +106,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt); int ip6_del_rt(struct net *net, struct rt6_info *rt); void rt6_flush_exceptions(struct rt6_info *rt); -int rt6_remove_exception_rt(struct rt6_info *rt); void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, unsigned long now); -- cgit v1.2.3 From a64efe142f5e70b7e39276a414bbb3b96691c608 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:24 -0700 Subject: net/ipv6: introduce fib6_info struct and helpers Add fib6_info struct and alloc, destroy, hold and release helpers. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 159f651dee55..630392ae12d8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -38,6 +38,7 @@ #endif struct rt6_info; +struct fib6_info; struct fib6_config { u32 fc_table; @@ -132,6 +133,46 @@ struct fib6_nh { int nh_weight; }; +struct fib6_info { + struct fib6_table *rt6i_table; + struct fib6_info __rcu *rt6_next; + struct fib6_node __rcu *rt6i_node; + + /* Multipath routes: + * siblings is a list of fib6_info that have the the same metric/weight, + * destination, but not the same gateway. nsiblings is just a cache + * to speed up lookup. + */ + struct list_head rt6i_siblings; + unsigned int rt6i_nsiblings; + + atomic_t rt6i_ref; + struct inet6_dev *rt6i_idev; + unsigned long expires; + struct dst_metrics *fib6_metrics; +#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] + + struct rt6key rt6i_dst; + u32 rt6i_flags; + struct rt6key rt6i_src; + struct rt6key rt6i_prefsrc; + + struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; + + u32 rt6i_metric; + u8 rt6i_protocol; + u8 fib6_type; + u8 exception_bucket_flushed:1, + should_flush:1, + dst_nocount:1, + dst_nopolicy:1, + dst_host:1, + unused:3; + + struct fib6_nh fib6_nh; +}; + struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; @@ -291,6 +332,20 @@ static inline void ip6_rt_put(struct rt6_info *rt) void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); +struct rt6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct rt6_info *f6i); + +static inline void fib6_info_hold(struct rt6_info *f6i) +{ + atomic_inc(&f6i->rt6i_ref); +} + +static inline void fib6_info_release(struct rt6_info *f6i) +{ + if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) + fib6_info_destroy(f6i); +} + static inline void rt6_hold(struct rt6_info *rt) { atomic_inc(&rt->rt6i_ref); -- cgit v1.2.3 From 93531c6743157d7e8c5792f8ed1a57641149d62c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:25 -0700 Subject: net/ipv6: separate handling of FIB entries from dst based routes Last step before flipping the data type for FIB entries: - use fib6_info_alloc to create FIB entries in ip6_route_info_create and addrconf_dst_alloc - use fib6_info_release in place of dst_release, ip6_rt_put and rt6_release - remove the dst_hold before calling __ip6_ins_rt or ip6_del_rt - when purging routes, drop per-cpu routes - replace inc and dec of rt6i_ref with fib6_info_hold and fib6_info_release - use rt->from since it points to the FIB entry - drop references to exception bucket, fib6_metrics and per-cpu from dst entries (those are relevant for fib entries only) Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 +--- include/net/ip6_route.h | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 630392ae12d8..6c3d92bb3459 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -314,9 +314,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - rt = rt->from; - - rt6_get_cookie_safe(rt, &cookie); + rt6_get_cookie_safe(rt->from, &cookie); return cookie; } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 686cdc7f356a..57d0d45667f1 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -114,8 +114,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = - rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL; + struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL; int err = 0; if (rt && rt->rt6i_prefsrc.plen) -- cgit v1.2.3 From 8d1c802b2815edc97af8a58c5045ebaf3848621a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:26 -0700 Subject: net/ipv6: Flip FIB entries to fib6_info Convert all code paths referencing a FIB entry from rt6_info to fib6_info. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/if_inet6.h | 4 ++-- include/net/ip6_fib.h | 42 +++++++++++++++++++++--------------------- include/net/ip6_route.h | 28 ++++++++++++++-------------- include/net/netns/ipv6.h | 2 +- 4 files changed, 38 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d4088d1a688d..d6089b2e64fe 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -64,7 +64,7 @@ struct inet6_ifaddr { struct delayed_work dad_work; struct inet6_dev *idev; - struct rt6_info *rt; + struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; @@ -144,7 +144,7 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; struct inet6_dev *aca_idev; - struct rt6_info *aca_rt; + struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; refcount_t aca_refcnt; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6c3d92bb3459..d41b7bd69fb3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -75,12 +75,12 @@ struct fib6_node { #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif - struct rt6_info __rcu *leaf; + struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info __rcu *rr_ptr; + struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -176,7 +176,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; - struct rt6_info *from; + struct fib6_info *from; /* * Tail elements of dst_entry (__refcnt etc.) @@ -242,20 +242,20 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } -static inline void fib6_clean_expires(struct rt6_info *f6i) +static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->rt6i_flags &= ~RTF_EXPIRES; f6i->expires = 0; } -static inline void fib6_set_expires(struct rt6_info *f6i, +static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->rt6i_flags |= RTF_EXPIRES; } -static inline bool fib6_check_expired(const struct rt6_info *f6i) +static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); @@ -288,7 +288,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, u32 *cookie) { struct fib6_node *fn; @@ -330,15 +330,15 @@ static inline void ip6_rt_put(struct rt6_info *rt) void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); -struct rt6_info *fib6_info_alloc(gfp_t gfp_flags); -void fib6_info_destroy(struct rt6_info *f6i); +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); +void fib6_info_destroy(struct fib6_info *f6i); -static inline void fib6_info_hold(struct rt6_info *f6i) +static inline void fib6_info_hold(struct fib6_info *f6i) { atomic_inc(&f6i->rt6i_ref); } -static inline void fib6_info_release(struct rt6_info *f6i) +static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) fib6_info_destroy(f6i); @@ -371,7 +371,7 @@ enum fib6_walk_state { struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; - struct rt6_info *leaf; + struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; @@ -435,7 +435,7 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ - struct rt6_info *rt; + struct fib6_info *rt; }; /* @@ -457,14 +457,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *saddr, int src_len, bool exact_match); -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); -int fib6_add(struct fib6_node *root, struct rt6_info *rt, +int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); -int fib6_del(struct rt6_info *rt, struct nl_info *info); +int fib6_del(struct fib6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); @@ -487,11 +487,11 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); -void fib6_update_sernum(struct net *net, struct rt6_info *rt); -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); +void fib6_update_sernum(struct net *net, struct fib6_info *rt); +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); -void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val); -static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric) +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); +static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 57d0d45667f1..d5fb1e4ae7ac 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,7 +66,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) { return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; @@ -102,14 +102,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct rt6_info *rt); -int ip6_del_rt(struct net *net, struct rt6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *rt); +int ip6_del_rt(struct net *net, struct fib6_info *rt); -void rt6_flush_exceptions(struct rt6_info *rt); -void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *rt); +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) @@ -136,9 +136,9 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, - const struct in6_addr *addr, bool anycast, - gfp_t gfp_flags); +struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, + const struct in6_addr *addr, bool anycast, + gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); @@ -147,10 +147,10 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, * support functions for ND * */ -struct rt6_info *rt6_get_dflt_router(struct net *net, +struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev); -struct rt6_info *rt6_add_dflt_router(struct net *net, +struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct rt6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *rt, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct rt6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *rt); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); @@ -271,7 +271,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, return daddr; } -static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) +static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && a->rt6i_idev == b->rt6i_idev && diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 74e4e1e449d5..97b3a54579c8 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -60,7 +60,7 @@ struct netns_ipv6 { #endif struct xt_table *ip6table_nat; #endif - struct rt6_info *fib6_null_entry; + struct fib6_info *fib6_null_entry; struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; -- cgit v1.2.3 From 77634cc67dc1ffc8ae6d869af6dee4b2ea6025ee Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Apr 2018 17:33:27 -0700 Subject: net/ipv6: Remove unused code and variables for rt6_info Drop unneeded elements from rt6_info struct and rearrange layout to something more relevant for the data path. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 60 +++------------------------------------------------ 1 file changed, 3 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d41b7bd69fb3..a36116b92100 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -175,58 +175,20 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; - struct rt6_info __rcu *rt6_next; struct fib6_info *from; - /* - * Tail elements of dst_entry (__refcnt etc.) - * and these elements (rarely used in hot path) are in - * the same cache line. - */ - struct fib6_table *rt6i_table; - struct fib6_node __rcu *rt6i_node; - + struct rt6key rt6i_dst; + struct rt6key rt6i_src; struct in6_addr rt6i_gateway; - - /* Multipath routes: - * siblings is a list of rt6_info that have the the same metric/weight, - * destination, but not the same gateway. nsiblings is just a cache - * to speed up lookup. - */ - struct list_head rt6i_siblings; - unsigned int rt6i_nsiblings; - - atomic_t rt6i_ref; - - /* These are in a separate cache line. */ - struct rt6key rt6i_dst ____cacheline_aligned_in_smp; + struct inet6_dev *rt6i_idev; u32 rt6i_flags; - struct rt6key rt6i_src; struct rt6key rt6i_prefsrc; struct list_head rt6i_uncached; struct uncached_list *rt6i_uncached_list; - struct inet6_dev *rt6i_idev; - struct rt6_info * __percpu *rt6i_pcpu; - struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - - u32 rt6i_metric; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; - u8 rt6i_protocol; - u8 fib6_type; - u8 exception_bucket_flushed:1, - should_flush:1, - dst_nocount:1, - dst_nopolicy:1, - dst_host:1, - unused:3; - - unsigned long expires; - struct dst_metrics *fib6_metrics; -#define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] - struct fib6_nh fib6_nh; }; #define for_each_fib6_node_rt_rcu(fn) \ @@ -328,8 +290,6 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(&rt->dst); } -void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); - struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); void fib6_info_destroy(struct fib6_info *f6i); @@ -344,20 +304,6 @@ static inline void fib6_info_release(struct fib6_info *f6i) fib6_info_destroy(f6i); } -static inline void rt6_hold(struct rt6_info *rt) -{ - atomic_inc(&rt->rt6i_ref); -} - -static inline void rt6_release(struct rt6_info *rt) -{ - if (atomic_dec_and_test(&rt->rt6i_ref)) { - rt6_free_pcpu(rt); - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } -} - enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, -- cgit v1.2.3 From b32cc5b9a346319c171e3ad905e0cddda032b5eb Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Tue, 17 Apr 2018 21:42:13 -0700 Subject: bpf: adding bpf_xdp_adjust_tail helper Adding new bpf helper which would allow us to manipulate xdp's data_end pointer, and allow us to reduce packet's size indended use case: to generate ICMP messages from XDP context, where such message would contain truncated original packet. Signed-off-by: Nikita V. Shirokov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c5ec89732a8d..9a2d1a04eb24 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -755,6 +755,13 @@ union bpf_attr { * @addr: pointer to struct sockaddr to bind socket to * @addr_len: length of sockaddr structure * Return: 0 on success or negative error code + * + * int bpf_xdp_adjust_tail(xdp_md, delta) + * Adjust the xdp_md.data_end by delta. Only shrinking of packet's + * size is supported. + * @xdp_md: pointer to xdp_md + * @delta: A negative integer to be added to xdp_md.data_end + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -821,7 +828,8 @@ union bpf_attr { FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ FN(msg_pull_data), \ - FN(bind), + FN(bind), \ + FN(xdp_adjust_tail), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From a2dd6877b43ef14129f258910d60b2e81b32100b Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Tue, 17 Apr 2018 17:30:31 -0400 Subject: soc: ti: K2G: provide APIs to support driver probe deferral This patch provide APIs to allow client drivers to support probe deferral. On K2G SoC, devices can be probed only after the ti_sci_pm_domains driver is probed and ready. As drivers may get probed at different order, any driver that depends on knav dma and qmss drivers, for example netcp network driver, needs to defer probe until knav devices are probed and ready to service. To do this, add an API to query the device ready status from the knav dma and qmss devices. Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- include/linux/soc/ti/knav_dma.h | 12 ++++++++++++ include/linux/soc/ti/knav_qmss.h | 1 + 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index 66693bc4c6ad..7127ec301537 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -167,6 +167,8 @@ struct knav_dma_desc { void *knav_dma_open_channel(struct device *dev, const char *name, struct knav_dma_cfg *config); void knav_dma_close_channel(void *channel); +int knav_dma_get_flow(void *channel); +bool knav_dma_device_ready(void); #else static inline void *knav_dma_open_channel(struct device *dev, const char *name, struct knav_dma_cfg *config) @@ -176,6 +178,16 @@ static inline void *knav_dma_open_channel(struct device *dev, const char *name, static inline void knav_dma_close_channel(void *channel) {} +static inline int knav_dma_get_flow(void *channel) +{ + return -EINVAL; +} + +static inline bool knav_dma_device_ready(void) +{ + return false; +} + #endif #endif /* __SOC_TI_KEYSTONE_NAVIGATOR_DMA_H__ */ diff --git a/include/linux/soc/ti/knav_qmss.h b/include/linux/soc/ti/knav_qmss.h index 9f0ebb3bad27..9745df6ed9d3 100644 --- a/include/linux/soc/ti/knav_qmss.h +++ b/include/linux/soc/ti/knav_qmss.h @@ -86,5 +86,6 @@ int knav_pool_desc_map(void *ph, void *desc, unsigned size, void *knav_pool_desc_unmap(void *ph, dma_addr_t dma, unsigned dma_sz); dma_addr_t knav_pool_desc_virt_to_dma(void *ph, void *virt); void *knav_pool_desc_dma_to_virt(void *ph, dma_addr_t dma); +bool knav_qmss_device_ready(void); #endif /* __SOC_TI_KNAV_QMSS_H__ */ -- cgit v1.2.3 From ce20cdf498bdfa6c6130be75cf4359dad305ebcd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 9 Apr 2018 16:40:34 +0200 Subject: netfilter: xt_NFLOG: use nf_log_packet instead of nfulnl_log_packet. The nfulnl_log_packet() is added to make sure that the NFLOG target works as only user-space logger. but now, nf_log_packet() can find proper log function using NF_LOG_TYPE_ULOG and NF_LOG_TYPE_LOG. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nfnetlink_log.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h index 612cfb63ac68..ea32a7d3cf1b 100644 --- a/include/net/netfilter/nfnetlink_log.h +++ b/include/net/netfilter/nfnetlink_log.h @@ -1,18 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _KER_NFNETLINK_LOG_H -#define _KER_NFNETLINK_LOG_H - -void -nfulnl_log_packet(struct net *net, - u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *li_user, - const char *prefix); - -#define NFULNL_COPY_DISABLED 0xff - -#endif /* _KER_NFNETLINK_LOG_H */ - -- cgit v1.2.3 From e21db6f69a95b846ff04e31fe0a86004cbd000d7 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:48 -0700 Subject: tcp: track total bytes delivered with ECN CE marks Introduce a new delivered_ce stat in tcp socket to estimate number of packets being marked with CE bits. The estimation is done via ACKs with ECE bit. Depending on the actual receiver behavior, the estimation could have biases. Since the TCP sender can't really see the CE bit in the data path, so the sender is technically counting packets marked delivered with the "ECE / ECN-Echo" flag set. With RFC3168 ECN, because the ECE bit is sticky, this count can drastically overestimate the nummber of CE-marked data packets With DCTCP-style ECN this should be reasonably precise unless there is loss in the ACK path, in which case it's not precise. With AccECN proposal this can be made still more precise, even in the case some degree of ACK loss. However this is sender's best estimate of CE information. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 8f4c54986f97..20585d5c4e1c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,6 +281,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl. rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ -- cgit v1.2.3 From feb5f2ec646483fb66f9ad7218b1aad2a93a2a5c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 17 Apr 2018 23:18:49 -0700 Subject: tcp: export packets delivery info Export data delivered and delivered with CE marks to 1) SNMP TCPDelivered and TCPDeliveredCE 2) getsockopt(TCP_INFO) 3) Timestamping API SOF_TIMESTAMPING_OPT_STATS Note that for SCM_TSTAMP_ACK, the delivery info in SOF_TIMESTAMPING_OPT_STATS is reported before the info was fully updated on the ACK. These stats help application monitor TCP delivery and ECN status on per host, per connection, even per message level. Signed-off-by: Yuchung Cheng Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ include/uapi/linux/tcp.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 33a70ece462f..d02e859301ff 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -276,6 +276,8 @@ enum LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ + LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ + LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 560374c978f9..379b08700a54 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -224,6 +224,9 @@ struct tcp_info { __u64 tcpi_busy_time; /* Time (usec) busy sending data */ __u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */ __u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */ + + __u32 tcpi_delivered; + __u32 tcpi_delivered_ce; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ @@ -244,6 +247,8 @@ enum { TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ + TCP_NLA_DELIVERED, /* Data pkts delivered incl. out-of-order */ + TCP_NLA_DELIVERED_CE, /* Like above but only ones w/ CE marks */ }; -- cgit v1.2.3 From af81f9e75e546de43f4f0bf532d880ddc16a4dc8 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:08 +0100 Subject: netfilter: nf_flow_table: use IP_CT_DIR_* values for FLOW_OFFLOAD_DIR_* Simplifies further code cleanups Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 833752dd0c58..09ba67598991 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -6,6 +6,7 @@ #include #include #include +#include #include struct nf_flowtable; @@ -27,11 +28,10 @@ struct nf_flowtable { }; enum flow_offload_tuple_dir { - FLOW_OFFLOAD_DIR_ORIGINAL, - FLOW_OFFLOAD_DIR_REPLY, - __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, + FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, + FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX }; -#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) struct flow_offload_tuple { union { -- cgit v1.2.3 From 88078d98d1bb085d72af8437707279e203524fa5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Apr 2018 11:43:15 -0700 Subject: net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends After working on IP defragmentation lately, I found that some large packets defeat CHECKSUM_COMPLETE optimization because of NIC adding zero paddings on the last (small) fragment. While removing the padding with pskb_trim_rcsum(), we set skb->ip_summed to CHECKSUM_NONE, forcing a full csum validation, even if all prior fragments had CHECKSUM_COMPLETE set. We can instead compute the checksum of the part we are trimming, usually smaller than the part we keep. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9065477ed255..d274059529eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3131,6 +3131,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) return skb->data; } +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim @@ -3144,9 +3145,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (likely(len >= skb->len)) return 0; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - return __pskb_trim(skb, len); + return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) -- cgit v1.2.3 From 93c2fb253d177a0b8f4f93592441f88c9b7d6245 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:38:59 -0700 Subject: net/ipv6: Rename fib6_info struct elements Change the prefix for fib6_info struct elements from rt6i_ to fib6_. rt6i_pcpu and rt6i_exception_bucket are left as is given that they point to rt6_info entries. Rename only; not functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 38 +++++++++++++++++++------------------- include/net/ip6_route.h | 26 +++++++++++++------------- 2 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a36116b92100..994dd67207fb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -134,34 +134,34 @@ struct fib6_nh { }; struct fib6_info { - struct fib6_table *rt6i_table; + struct fib6_table *fib6_table; struct fib6_info __rcu *rt6_next; - struct fib6_node __rcu *rt6i_node; + struct fib6_node __rcu *fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ - struct list_head rt6i_siblings; - unsigned int rt6i_nsiblings; + struct list_head fib6_siblings; + unsigned int fib6_nsiblings; - atomic_t rt6i_ref; - struct inet6_dev *rt6i_idev; + atomic_t fib6_ref; + struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] - struct rt6key rt6i_dst; - u32 rt6i_flags; - struct rt6key rt6i_src; - struct rt6key rt6i_prefsrc; + struct rt6key fib6_dst; + u32 fib6_flags; + struct rt6key fib6_src; + struct rt6key fib6_prefsrc; struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; - u32 rt6i_metric; - u8 rt6i_protocol; + u32 fib6_metric; + u8 fib6_protocol; u8 fib6_type; u8 exception_bucket_flushed:1, should_flush:1, @@ -206,7 +206,7 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) static inline void fib6_clean_expires(struct fib6_info *f6i) { - f6i->rt6i_flags &= ~RTF_EXPIRES; + f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } @@ -214,12 +214,12 @@ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; - f6i->rt6i_flags |= RTF_EXPIRES; + f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { - if (f6i->rt6i_flags & RTF_EXPIRES) + if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } @@ -250,14 +250,14 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *rt, +static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); + fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = fn->fn_sernum; @@ -295,12 +295,12 @@ void fib6_info_destroy(struct fib6_info *f6i); static inline void fib6_info_hold(struct fib6_info *f6i) { - atomic_inc(&f6i->rt6i_ref); + atomic_inc(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && atomic_dec_and_test(&f6i->rt6i_ref)) + if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) fib6_info_destroy(f6i); } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d5fb1e4ae7ac..b4b85109984f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,9 +66,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt) +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == RTF_GATEWAY; } @@ -102,23 +102,23 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); -int ip6_ins_rt(struct net *net, struct fib6_info *rt); -int ip6_del_rt(struct net *net, struct fib6_info *rt); +int ip6_ins_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i); -void rt6_flush_exceptions(struct fib6_info *rt); -void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, +void rt6_flush_exceptions(struct fib6_info *f6i); +void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); -static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt, +static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL; + struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (rt && rt->rt6i_prefsrc.plen) - *saddr = rt->rt6i_prefsrc.addr; + if (f6i && f6i->fib6_prefsrc.plen) + *saddr = f6i->fib6_prefsrc.addr; else err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, daddr, prefs, saddr); @@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg { struct net *net; }; -int rt6_dump_route(struct fib6_info *rt, void *p_arg); +int rt6_dump_route(struct fib6_info *f6i, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); -void rt6_multipath_rebalance(struct fib6_info *rt); +void rt6_multipath_rebalance(struct fib6_info *f6i); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); @@ -274,7 +274,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->rt6i_idev == b->rt6i_idev && + a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } -- cgit v1.2.3 From 360a9887c8c01a715b2b4b131f7c7462f7cce576 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:00 -0700 Subject: net/ipv6: Rename addrconf_dst_alloc addrconf_dst_alloc now returns a fib6_info. Update the name and its users to reflect the change. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index b4b85109984f..31e24f821ef6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -136,7 +136,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); -struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev, +struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags); -- cgit v1.2.3 From 9ee8cbb2fd4a7d6f483a20c4b8e82d8b1cf685fa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:01 -0700 Subject: net/ipv6: Remove aca_idev aca_idev has only 1 user - inet6_fill_ifacaddr - and it only wants the device index which can be extracted from the fib6_info nexthop. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 - include/net/ip6_fib.h | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index d6089b2e64fe..db389253dc2a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -143,7 +143,6 @@ struct ipv6_ac_socklist { struct ifacaddr6 { struct in6_addr aca_addr; - struct inet6_dev *aca_idev; struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; int aca_users; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 994dd67207fb..bd11c990c353 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -410,6 +410,11 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); +static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_dev; +} + void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); -- cgit v1.2.3 From 6fe749414446d86cfaae1c84bfdd556fe4fa8fca Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:03 -0700 Subject: net/ipv6: Change ip6_route_get_saddr to get dev from route Prior to 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") host routes and anycast routes were installed with the device set to loopback (or VRF device once that feature was added). In the older code dst.dev was set to loopback (needed for packet tx) and rt6i_idev was used to denote the actual interface. Commit 4832c30d5458 changed the code to have dst.dev pointing to the real device with the switch to lo or vrf device done on dst clones. As a consequence of this change ip6_route_get_saddr can just pass the nexthop device to ipv6_dev_get_saddr. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 31e24f821ef6..c0620330035c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -114,14 +114,15 @@ static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, unsigned int prefs, struct in6_addr *saddr) { - struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL; int err = 0; - if (f6i && f6i->fib6_prefsrc.plen) + if (f6i && f6i->fib6_prefsrc.plen) { *saddr = f6i->fib6_prefsrc.addr; - else - err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, - daddr, prefs, saddr); + } else { + struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; + + err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); + } return err; } -- cgit v1.2.3 From 647d4c1363a85bec63ecf929d4ab4aae78b2a960 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:04 -0700 Subject: net/ipv6: Remove compare of fib6_idev from rt6_duplicate_nexthop After 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") the comparison of idev does not add value since it correlates to the nexthop device which is already compared. Remove the idev comparison. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index c0620330035c..8df4ff798b04 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -275,7 +275,6 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - a->fib6_idev == b->fib6_idev && ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } -- cgit v1.2.3 From dcd1f572954f9d66d7b4a65df894ed5b4c467368 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Apr 2018 15:39:05 -0700 Subject: net/ipv6: Remove fib6_idev fib6_idev can be obtained from __in6_dev_get on the nexthop device rather than caching it in the fib6_info. Remove it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index bd11c990c353..642211174692 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -147,7 +147,6 @@ struct fib6_info { unsigned int fib6_nsiblings; atomic_t fib6_ref; - struct inet6_dev *fib6_idev; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] -- cgit v1.2.3 From 69b693f0aefa0ed521e8bd02260523b5ae446ad7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:55:57 -0700 Subject: bpf: btf: Introduce BPF Type Format (BTF) This patch introduces BPF type Format (BTF). BTF (BPF Type Format) is the meta data format which describes the data types of BPF program/map. Hence, it basically focus on the C programming language which the modern BPF is primary using. The first use case is to provide a generic pretty print capability for a BPF map. BTF has its root from CTF (Compact C-Type format). To simplify the handling of BTF data, BTF removes the differences between small and big type/struct-member. Hence, BTF consistently uses u32 instead of supporting both "one u16" and "two u32 (+padding)" in describing type and struct-member. It also raises the number of types (and functions) limit from 0x7fff to 0x7fffffff. Due to the above changes, the format is not compatible to CTF. Hence, BTF starts with a new BTF_MAGIC and version number. This patch does the first verification pass to the BTF. The first pass checks: 1. meta-data size (e.g. It does not go beyond the total btf's size) 2. name_offset is valid 3. Each BTF_KIND (e.g. int, enum, struct....) does its own check of its meta-data. Some other checks, like checking a struct's member is referring to a valid type, can only be done in the second pass. The second verification pass will be implemented in the next patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 130 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 include/uapi/linux/btf.h (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h new file mode 100644 index 000000000000..74a30b1090df --- /dev/null +++ b/include/uapi/linux/btf.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_MAGIC_SWAP 0x9FeB +#define BTF_VERSION 1 +#define BTF_FLAGS_COMPR 0x01 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + + __u32 parent_label; + __u32 parent_name; + + /* All offsets are in bytes relative to the end of this header */ + __u32 label_off; /* offset of label section */ + __u32 object_off; /* offset of data object section*/ + __u32 func_off; /* offset of function section */ + __u32 type_off; /* offset of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x7fffffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x7fffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +/* The type id is referring to a parent BTF */ +#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) +#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) + +/* String is in the ELF string section */ +#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) +#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) + +struct btf_type { + __u32 name; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bits 31: root + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) + +#define BTF_KIND_UNKN 0 /* Unknown */ +#define BTF_KIND_INT 1 /* Integer */ +#define BTF_KIND_PTR 2 /* Pointer */ +#define BTF_KIND_ARRAY 3 /* Array */ +#define BTF_KIND_STRUCT 4 /* Struct */ +#define BTF_KIND_UNION 5 /* Union */ +#define BTF_KIND_ENUM 6 /* Enumeration */ +#define BTF_KIND_FWD 7 /* Forward */ +#define BTF_KIND_TYPEDEF 8 /* Typedef */ +#define BTF_KIND_VOLATILE 9 /* Volatile */ +#define BTF_KIND_CONST 10 /* Const */ +#define BTF_KIND_RESTRICT 11 /* Restrict */ +#define BTF_KIND_MAX 11 +#define NR_BTF_KINDS 12 + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. + */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED 0x1 +#define BTF_INT_CHAR 0x2 +#define BTF_INT_BOOL 0x4 +#define BTF_INT_VARARGS 0x8 + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum { + __u32 name; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name; + __u32 type; + __u32 offset; /* offset in bits */ +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ -- cgit v1.2.3 From eb3f595dab40b61011c5f123507a7db2df6f0e65 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:55:58 -0700 Subject: bpf: btf: Validate type reference After collecting all btf_type in the first pass in an earlier patch, the second pass (in this patch) can validate the reference types (e.g. the referring type does exist and it does not refer to itself). While checking the reference type, it also gathers other information (e.g. the size of an array). This info will be useful in checking the struct's members in a later patch. They will also be useful in doing pretty print later. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 include/linux/btf.h (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h new file mode 100644 index 000000000000..f14b60368753 --- /dev/null +++ b/include/linux/btf.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#ifndef _LINUX_BTF_H +#define _LINUX_BTF_H 1 + +#include + +struct btf; +struct btf_type; + +/* Figure out the size of a type_id. If type_id is a modifier + * (e.g. const), it will be resolved to find out the type with size. + * + * For example: + * In describing "const void *", type_id is "const" and "const" + * refers to "void *". The return type will be "void *". + * + * If type_id is a simple "int", then return type will be "int". + * + * @btf: struct btf object + * @type_id: Find out the size of type_id. The type_id of the return + * type is set to *type_id. + * @ret_size: It can be NULL. If not NULL, the size of the return + * type is set to *ret_size. + * Return: The btf_type (resolved to another type with size info if needed). + * NULL is returned if type_id itself does not have size info + * (e.g. void) or it cannot be resolved to another type that + * has size info. + * *type_id and *ret_size will not be changed in the + * NULL return case. + */ +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, + u32 *ret_size); + +#endif -- cgit v1.2.3 From b00b8daec828dd59af7d1f7a42acd6e5867f80c6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:00 -0700 Subject: bpf: btf: Add pretty print capability for data with BTF type info This patch adds pretty print capability for data with BTF type info. The current usage is to allow pretty print for a BPF map. The next few patches will allow a read() on a pinned map with BTF type info for its key and value. This patch uses the seq_printf() infra. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index f14b60368753..d8bdab0280ba 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -33,5 +33,7 @@ struct btf_type; const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m); #endif -- cgit v1.2.3 From f56a653c1fd13a197076dec4461c656fd2adec73 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:01 -0700 Subject: bpf: btf: Add BPF_BTF_LOAD command This patch adds a BPF_BTF_LOAD command which 1) loads and verifies the BTF (implemented in earlier patches) 2) returns a BTF fd to userspace. In the next patch, the BTF fd can be specified during BPF_MAP_CREATE. It currently limits to CAP_SYS_ADMIN. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 4 ++++ include/uapi/linux/bpf.h | 9 +++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index d8bdab0280ba..a7c7072535ea 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -8,7 +8,11 @@ struct btf; struct btf_type; +union bpf_attr; +void btf_put(struct btf *btf); +int btf_new_fd(const union bpf_attr *attr); +struct btf *btf_get_by_fd(int fd); /* Figure out the size of a type_id. If type_id is a modifier * (e.g. const), it will be resolved to find out the type with size. * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9a2d1a04eb24..795bcd577750 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -95,6 +95,7 @@ enum bpf_cmd { BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, }; enum bpf_map_type { @@ -363,6 +364,14 @@ union bpf_attr { __u64 name; __u32 prog_fd; } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: -- cgit v1.2.3 From 60197cfb6e11ffc03aa0ed23765b2f7e70b2e2d4 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:02 -0700 Subject: bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd This patch adds BPF_OBJ_GET_INFO_BY_FD support to BTF fd. The original BTF data, which was used to create the BTF fd during the earlier BPF_BTF_LOAD call, will be returned. The userspace is expected to allocate buffer to info.info and the buffer size is set to info.info_len before calling BPF_OBJ_GET_INFO_BY_FD. The original BTF data is copied to the userspace buffer (info.info). Only upto the user's specified info.info_len will be copied. The original BTF data size is set to info.info_len. The userspace needs to check if it is bigger than its allocated buffer size. If it is, the userspace should realloc with the kernel-returned info.info_len and call the BPF_OBJ_GET_INFO_BY_FD again. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index a7c7072535ea..a966dc6d61ee 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -10,9 +10,14 @@ struct btf; struct btf_type; union bpf_attr; +extern const struct file_operations btf_fops; + void btf_put(struct btf *btf); int btf_new_fd(const union bpf_attr *attr); struct btf *btf_get_by_fd(int fd); +int btf_get_info_by_fd(const struct btf *btf, + const union bpf_attr *attr, + union bpf_attr __user *uattr); /* Figure out the size of a type_id. If type_id is a modifier * (e.g. const), it will be resolved to find out the type with size. * -- cgit v1.2.3 From a26ca7c982cb576749cbdd01e8ecde4bf010d60a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 18 Apr 2018 15:56:03 -0700 Subject: bpf: btf: Add pretty print support to the basic arraymap This patch adds pretty print support to the basic arraymap. Support for other bpf maps can be added later. This patch adds new attrs to the BPF_MAP_CREATE command to allow specifying the btf_fd, btf_key_id and btf_value_id. The BPF_MAP_CREATE can then associate the btf to the map if the creating map supports BTF. A BTF supported map needs to implement two new map ops, map_seq_show_elem() and map_check_btf(). This patch has implemented these new map ops for the basic arraymap. It also adds file_operations, bpffs_map_fops, to the pinned map such that the pinned map can be opened and read. After that, the user has an intuitive way to do "cat bpffs/pathto/a-pinned-map" instead of getting an error. bpffs_map_fops should not be extended further to support other operations. Other operations (e.g. write/key-lookup...) should be realized by the userspace tools (e.g. bpftool) through the BPF_OBJ_GET_INFO_BY_FD, map's lookup/update interface...etc. Follow up patches will allow the userspace to obtain the BTF from a map-fd. Here is a sample output when reading a pinned arraymap with the following map's value: struct map_value { int count_a; int count_b; }; cat /sys/fs/bpf/pinned_array_map: 0: {1,2} 1: {3,4} 2: {5,6} ... Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 20 +++++++++++++++++--- include/uapi/linux/bpf.h | 3 +++ 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 95a7abd0ee92..ee5275e7d4df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,8 @@ struct perf_event; struct bpf_prog; struct bpf_map; struct sock; +struct seq_file; +struct btf; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -43,10 +45,14 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); + void (*map_seq_show_elem)(struct bpf_map *map, void *key, + struct seq_file *m); + int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, + u32 key_type_id, u32 value_type_id); }; struct bpf_map { - /* 1st cacheline with read-mostly members of which some + /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). */ const struct bpf_map_ops *ops ____cacheline_aligned; @@ -62,10 +68,13 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + u32 btf_key_id; + u32 btf_value_id; + struct btf *btf; bool unpriv_array; - /* 7 bytes hole */ + /* 55 bytes hole */ - /* 2nd cacheline with misc members to avoid false sharing + /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ struct user_struct *user ____cacheline_aligned; @@ -100,6 +109,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_support_seq_show(const struct bpf_map *map) +{ + return map->ops->map_seq_show_elem && map->ops->map_check_btf; +} + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 795bcd577750..c8383a289f7b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -280,6 +280,9 @@ union bpf_attr { */ char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_id; /* BTF type_id of the key */ + __u32 btf_value_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit v1.2.3 From 9e4d60938a2b2aae3c2c213c923d9eb8d0a87ba2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:50 +0200 Subject: net: phy: mdio-gpio: Remove reset function The platform data can contain a function to call to reset the bit banging interface. It is not used, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/mdio-gpio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 11f00cdabe3d..6e8f01a570f2 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -26,8 +26,6 @@ struct mdio_gpio_platform_data { u32 phy_mask; u32 phy_ignore_ta_mask; int irqs[PHY_MAX_ADDR]; - /* reset callback */ - int (*reset)(struct mii_bus *bus); }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From a3283e2576736463e07ddc684efb2e587a3bbda2 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:51 +0200 Subject: net: phy: mdio-bitbang: Remove reset support The mdio-gpio driver was the only user of the interface reset option. Since it no longer uses it, remove it from the bit banging code. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio-bitbang.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h index a8ac9cfa014c..5d71e8a8500f 100644 --- a/include/linux/mdio-bitbang.h +++ b/include/linux/mdio-bitbang.h @@ -33,8 +33,6 @@ struct mdiobb_ops { struct mdiobb_ctrl { const struct mdiobb_ops *ops; - /* reset callback */ - int (*reset)(struct mii_bus *bus); }; /* The returned bus is not yet registered with the phy layer. */ -- cgit v1.2.3 From c1b3eb04682f80af74572aa25601dbb555c6bc6e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:52 +0200 Subject: net: phy: mdio-gpio: remove support for ignoring turn around This is not needed any more by devices using platform data, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/mdio-gpio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 6e8f01a570f2..b8f914a30126 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -24,7 +24,6 @@ struct mdio_gpio_platform_data { bool mdo_active_low; u32 phy_mask; - u32 phy_ignore_ta_mask; int irqs[PHY_MAX_ADDR]; }; -- cgit v1.2.3 From 185a16b60a239f9db3fe8b8ce931a2fd61330853 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:53 +0200 Subject: net: phy: mdio-gpio: remove support for phy mask This is not needed any more by devices using platform data, so remove it. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/mdio-gpio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index b8f914a30126..7d55dfef56dc 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -23,7 +23,6 @@ struct mdio_gpio_platform_data { bool mdio_active_low; bool mdo_active_low; - u32 phy_mask; int irqs[PHY_MAX_ADDR]; }; -- cgit v1.2.3 From 68abb4f25d7eaf4d5f40108aa748621ddab68209 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:54 +0200 Subject: net: phy: mdio-gpio: Remove support for IRQs in platform data No current devices use IRQs in platform data, so remove support for it. The MDIO core will also initialise the new bus such that all addresses are polled, so remove the unneeded re-initialisation. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/mdio-gpio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index 7d55dfef56dc..af3be0c4ff9b 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -22,8 +22,6 @@ struct mdio_gpio_platform_data { bool mdc_active_low; bool mdio_active_low; bool mdo_active_low; - - int irqs[PHY_MAX_ADDR]; }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From c82fc4814a93f36c9b536b5af8dd617e4ee1380f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:55 +0200 Subject: net: phy: mdio-gpio: Swap to using gpio descriptors This simplifies the code, removing the need to handle active low flags, etc. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/mdio-gpio.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h index af3be0c4ff9b..bd91fa98a3aa 100644 --- a/include/linux/platform_data/mdio-gpio.h +++ b/include/linux/platform_data/mdio-gpio.h @@ -15,13 +15,9 @@ struct mdio_gpio_platform_data { /* GPIO numbers for bus pins */ - unsigned int mdc; - unsigned int mdio; - unsigned int mdo; - - bool mdc_active_low; - bool mdio_active_low; - bool mdo_active_low; + struct gpio_desc *mdc; + struct gpio_desc *mdio; + struct gpio_desc *mdo; }; #endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From fb78a95e22123da57cb79b98bdc62eb33965ca8a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:58 +0200 Subject: net: phy: mdio-gpio: Add #defines for the GPIO index's The GPIOs are described in device tree using a list, without names. Add defines to indicate what each index in the list means. These defines should also be used by platform devices passing GPIOs via a GPIO lookup table. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio-gpio.h | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 include/linux/mdio-gpio.h (limited to 'include') diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h new file mode 100644 index 000000000000..cea443a672cb --- /dev/null +++ b/include/linux/mdio-gpio.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MDIO_GPIO_H +#define __LINUX_MDIO_GPIO_H + +#define MDIO_GPIO_MDC 0 +#define MDIO_GPIO_MDIO 1 +#define MDIO_GPIO_MDO 2 + +#endif -- cgit v1.2.3 From 0207dd1173fe31c153ffd439c4bb33d1341829b1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 19 Apr 2018 01:02:59 +0200 Subject: net: phy: mdio-gpio: Remove redundant platform data header The platform data header file is now unused. Remove it, but add an extra include which it brought in. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/mdio-gpio.h | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h (limited to 'include') diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h deleted file mode 100644 index bd91fa98a3aa..000000000000 --- a/include/linux/platform_data/mdio-gpio.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * MDIO-GPIO bus platform data structures - * - * Copyright (C) 2008, Paulius Zaleckas - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ - -#ifndef __LINUX_MDIO_GPIO_H -#define __LINUX_MDIO_GPIO_H - -#include - -struct mdio_gpio_platform_data { - /* GPIO numbers for bus pins */ - struct gpio_desc *mdc; - struct gpio_desc *mdio; - struct gpio_desc *mdo; -}; - -#endif /* __LINUX_MDIO_GPIO_H */ -- cgit v1.2.3 From 27cced20192d25ae528db8fe694c95c7656f3d56 Mon Sep 17 00:00:00 2001 From: Michael Karcher Date: Thu, 19 Apr 2018 14:05:22 +1200 Subject: net-next: ax88796: Add block_input/output hooks to ax_plat_data Add platform specific hooks for block transfer reads/writes of packet buffer data, superseding the default provided ax_block_input/output. Currently used for m68k Amiga XSurf100. Signed-off-by: Michael Karcher Signed-off-by: Michael Schmitz Signed-off-by: David S. Miller --- include/net/ax88796.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/ax88796.h b/include/net/ax88796.h index b9a3beca0ce4..363b0ca5f7e8 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -12,6 +12,9 @@ #ifndef __NET_AX88796_PLAT_H #define __NET_AX88796_PLAT_H +struct sk_buff; +struct net_device; + #define AXFLG_HAS_EEPROM (1<<0) #define AXFLG_MAC_FROMDEV (1<<1) /* device already has MAC */ #define AXFLG_HAS_93CX6 (1<<2) /* use eeprom_93cx6 driver */ @@ -26,6 +29,12 @@ struct ax_plat_data { u32 *reg_offsets; /* register offsets */ u8 *mac_addr; /* MAC addr (only used when AXFLG_MAC_FROMPLATFORM is used */ + + /* uses default ax88796 buffer if set to NULL */ + void (*block_output)(struct net_device *dev, int count, + const unsigned char *buf, int star_page); + void (*block_input)(struct net_device *dev, int count, + struct sk_buff *skb, int ring_offset); }; #endif /* __NET_AX88796_PLAT_H */ -- cgit v1.2.3 From cec4c1c54a643608c262bd9bb72cf9bbec64f44a Mon Sep 17 00:00:00 2001 From: Michael Karcher Date: Thu, 19 Apr 2018 14:05:23 +1200 Subject: net-next: ax88796: add interrupt status callback to platform data To be able to tell the ax88796 driver whether it is sensible to enter the 8390 interrupt handler, an "is this interrupt caused by the 88796" callback has been added to the ax_plat_data structure (with NULL being compatible to the previous behaviour). Signed-off-by: Michael Karcher Signed-off-by: Michael Schmitz Signed-off-by: David S. Miller --- include/net/ax88796.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ax88796.h b/include/net/ax88796.h index 363b0ca5f7e8..84b3785d0e66 100644 --- a/include/net/ax88796.h +++ b/include/net/ax88796.h @@ -14,6 +14,7 @@ struct sk_buff; struct net_device; +struct platform_device; #define AXFLG_HAS_EEPROM (1<<0) #define AXFLG_MAC_FROMDEV (1<<1) /* device already has MAC */ @@ -35,6 +36,10 @@ struct ax_plat_data { const unsigned char *buf, int star_page); void (*block_input)(struct net_device *dev, int count, struct sk_buff *skb, int ring_offset); + /* returns nonzero if a pending interrupt request might by caused by + * the ax88786. Handles all interrupts if set to NULL + */ + int (*check_irq)(struct platform_device *pdev); }; #endif /* __NET_AX88796_PLAT_H */ -- cgit v1.2.3 From a4dfa72d0acd1c99a160e25c099849ae37ad13fd Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Thu, 19 Apr 2018 11:06:18 +0200 Subject: tipc: set default MTU for UDP media Currently, all bearers are configured with MTU value same as the underlying L2 device. However, in case of bearers with media type UDP, higher throughput is possible with a fixed and higher emulated MTU value than adapting to the underlying L2 MTU. In this commit, we introduce a parameter mtu in struct tipc_media and a default value is set for UDP. A default value of 14k was determined by experimentation and found to have a higher throughput than 16k. MTU for UDP bearers are assigned the above set value of media MTU. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. Miller --- include/uapi/linux/tipc_config.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 3f29e3c8ed06..4b2c93b1934c 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -185,6 +185,11 @@ #define TIPC_DEF_LINK_WIN 50 #define TIPC_MAX_LINK_WIN 8191 +/* + * Default MTU for UDP media + */ + +#define TIPC_DEF_LINK_UDP_MTU 14000 struct tipc_node_info { __be32 addr; /* network address of node */ -- cgit v1.2.3 From 901271e0403af638c224987c2a4e55cebade7e91 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Thu, 19 Apr 2018 11:06:19 +0200 Subject: tipc: implement configuration of UDP media MTU In previous commit, we changed the default emulated MTU for UDP bearers to 14k. This commit adds the functionality to set/change the default value by configuring new MTU for UDP media. UDP bearer(s) have to be disabled and enabled back for the new MTU to take effect. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 0affb682e5e3..85c11982c89b 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -266,6 +266,7 @@ enum { TIPC_NLA_PROP_PRIO, /* u32 */ TIPC_NLA_PROP_TOL, /* u32 */ TIPC_NLA_PROP_WIN, /* u32 */ + TIPC_NLA_PROP_MTU, /* u32 */ __TIPC_NLA_PROP_MAX, TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 -- cgit v1.2.3 From 809c45a091d93e05c6e9b5d53bb3f1185273286b Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Thu, 19 Apr 2018 05:50:12 -0700 Subject: qed* : Add new TLV to request PF to update MAC in bulletin board There may be a need for VF driver to request PF to explicitly update its bulletin with a MAC address. e.g. When user assigns a MAC address to VF while VF is still down, and PF's bulletin board contains different MAC address, in this case, when VF's interface is brought up, it gets loaded with MAC address from bulletin board which is not desirable. To handle this corner case, we need a new TLV to request PF to update its bulletin board with suggested MAC. This request will be honored only for trusted VFs. Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- include/linux/qed/qed_eth_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 147d08ccf813..7f9756fe9180 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -352,6 +352,7 @@ struct qed_eth_ops { int (*configure_arfs_searcher)(struct qed_dev *cdev, enum qed_filter_config_mode mode); int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle); + int (*req_bulletin_update_mac)(struct qed_dev *cdev, u8 *mac); }; const struct qed_eth_ops *qed_get_eth_ops(void); -- cgit v1.2.3 From 1827b0678863bc97a1653fdf5308762b2aefcd56 Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Thu, 19 Apr 2018 17:59:39 +0100 Subject: lan78xx: Read LED states from Device Tree Add support for DT property "microchip,led-modes", a vector of zero to four cells (u32s) in the range 0-15, each of which sets the mode for one of the LEDs. Some possible values are: 0=link/activity 1=link1000/activity 2=link100/activity 3=link10/activity 4=link100/1000/activity 5=link10/1000/activity 6=link10/100/activity 14=off 15=on These values are given symbolic constants in a dt-bindings header. Also use the presence of the DT property to indicate that the LEDs should be enabled - necessary in the event that no valid OTP or EEPROM is available. Signed-off-by: Phil Elwell Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/dt-bindings/net/microchip-lan78xx.h | 21 +++++++++++++++++++++ include/linux/microchipphy.h | 3 +++ 2 files changed, 24 insertions(+) create mode 100644 include/dt-bindings/net/microchip-lan78xx.h (limited to 'include') diff --git a/include/dt-bindings/net/microchip-lan78xx.h b/include/dt-bindings/net/microchip-lan78xx.h new file mode 100644 index 000000000000..0742ff075307 --- /dev/null +++ b/include/dt-bindings/net/microchip-lan78xx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _DT_BINDINGS_MICROCHIP_LAN78XX_H +#define _DT_BINDINGS_MICROCHIP_LAN78XX_H + +/* LED modes for LAN7800/LAN7850 embedded PHY */ + +#define LAN78XX_LINK_ACTIVITY 0 +#define LAN78XX_LINK_1000_ACTIVITY 1 +#define LAN78XX_LINK_100_ACTIVITY 2 +#define LAN78XX_LINK_10_ACTIVITY 3 +#define LAN78XX_LINK_100_1000_ACTIVITY 4 +#define LAN78XX_LINK_10_1000_ACTIVITY 5 +#define LAN78XX_LINK_10_100_ACTIVITY 6 +#define LAN78XX_DUPLEX_COLLISION 8 +#define LAN78XX_COLLISION 9 +#define LAN78XX_ACTIVITY 10 +#define LAN78XX_AUTONEG_FAULT 12 +#define LAN78XX_FORCE_LED_OFF 14 +#define LAN78XX_FORCE_LED_ON 15 + +#endif diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h index eb492d47f717..8e4015e7f8d3 100644 --- a/include/linux/microchipphy.h +++ b/include/linux/microchipphy.h @@ -70,4 +70,7 @@ #define LAN88XX_MMD3_CHIP_ID (32877) #define LAN88XX_MMD3_CHIP_REV (32878) +/* Registers specific to the LAN7800/LAN7850 embedded phy */ +#define LAN78XX_PHY_LED_MODE_SELECT (0x1D) + #endif /* _MICROCHIPPHY_H */ -- cgit v1.2.3 From 07cb9623ee7d15cc9968969ac247edae6972fb8f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:10 +0100 Subject: ipv6: make ip6_dst_mtu_forward inline Just like ip_dst_mtu_maybe_forward(), to avoid a dependency with ipv6.ko. Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/ip6_route.h | 21 +++++++++++++++++++++ include/net/ipv6.h | 2 -- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index d5fb1e4ae7ac..376928c26d2d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -279,6 +279,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info * !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); } +static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu; + + if (dst_metric_locked(dst, RTAX_MTU)) { + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + } + + mtu = IPV6_MIN_MTU; + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 68b167d98879..765441867cfa 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -958,8 +958,6 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst); - int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, -- cgit v1.2.3 From 4f3780c004ef608477a534558eb16f46c5a1c534 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:11 +0100 Subject: netfilter: nf_flow_table: cache mtu in struct flow_offload_tuple Reduces the number of cache lines touched in the offload forwarding path. This is safe because PMTU limits are bypassed for the forwarding path (see commit f87c10a8aa1e for more details). Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 09ba67598991..76ee5c81b752 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -55,6 +55,8 @@ struct flow_offload_tuple { int oifidx; + u16 mtu; + struct dst_entry *dst_cache; }; -- cgit v1.2.3 From 6a3e030f08e1b700aa6d1ebdc7ebe4c44a2ef67a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:57 -0700 Subject: net/ipv6: Clean up rt expires helpers rt6_clean_expires and rt6_set_expires are no longer used. Removed them. rt6_update_expires has 1 caller in route.c, so move it from the header. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 642211174692..327a74cd6a0d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -223,27 +223,6 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) return false; } -static inline void rt6_clean_expires(struct rt6_info *rt) -{ - rt->rt6i_flags &= ~RTF_EXPIRES; - rt->dst.expires = 0; -} - -static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires) -{ - rt->dst.expires = expires; - rt->rt6i_flags |= RTF_EXPIRES; -} - -static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) -{ - if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from) - rt0->dst.expires = rt0->from->expires; - - dst_set_expires(&rt0->dst, timeout); - rt0->rt6i_flags |= RTF_EXPIRES; -} - /* Function to safely get fn->sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely -- cgit v1.2.3 From a269f1a764bb3abf5c73499fb74bc7ab1c2e986c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:37:58 -0700 Subject: net/ipv6: Rename rt6_get_cookie_safe rt6_get_cookie_safe takes a fib6_info and checks the sernum of the node. Update the name to reflect its purpose. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 327a74cd6a0d..dd1481ed8bdb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -228,8 +228,8 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i) * Return true if we can get cookie safely * Return false if not */ -static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i, - u32 *cookie) +static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, + u32 *cookie) { struct fib6_node *fn; bool status = false; @@ -254,7 +254,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - rt6_get_cookie_safe(rt->from, &cookie); + fib6_get_cookie_safe(rt->from, &cookie); return cookie; } -- cgit v1.2.3 From a87b7dc9f7fa7c87cbd575361553e8e9e4ab4473 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:00 -0700 Subject: net/ipv6: Move rcu locking to callers of fib6_get_cookie_safe A later patch protects 'from' in rt6_info and this simplifies the locking needed by it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dd1481ed8bdb..dc3505fb62b3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -234,7 +234,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, struct fib6_node *fn; bool status = false; - rcu_read_lock(); fn = rcu_dereference(f6i->fib6_node); if (fn) { @@ -244,7 +243,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, status = true; } - rcu_read_unlock(); return status; } @@ -252,10 +250,14 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) { u32 cookie = 0; + rcu_read_lock(); + if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) fib6_get_cookie_safe(rt->from, &cookie); + rcu_read_unlock(); + return cookie; } -- cgit v1.2.3 From a68886a691804d3f6d479ebf6825480fbafb6a00 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Apr 2018 15:38:02 -0700 Subject: net/ipv6: Make from in rt6_info rcu protected When a dst entry is created from a fib entry, the 'from' in rt6_info is set to the fib entry. The 'from' reference is used most notably for cookie checking - making sure stale dst entries are updated if the fib entry is changed. When a fib entry is deleted, the pcpu routes on it are walked releasing the fib6_info reference. This is needed for the fib6_info cleanup to happen and to make sure all device references are released in a timely manner. There is a race window when a FIB entry is deleted and the 'from' on the pcpu route is dropped and the pcpu route hits a cookie check. Handle this race using rcu on from. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dc3505fb62b3..1af450d4e923 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -174,7 +174,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; - struct fib6_info *from; + struct fib6_info __rcu *from; struct rt6key rt6i_dst; struct rt6key rt6i_src; @@ -248,13 +248,15 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + struct fib6_info *from; u32 cookie = 0; rcu_read_lock(); - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - fib6_get_cookie_safe(rt->from, &cookie); + from = rcu_dereference(rt->from); + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); -- cgit v1.2.3 From fbcf93ebcaef7d09881ee308b52cd84f5e43c622 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 21 Apr 2018 09:48:23 -0700 Subject: bpf: btf: Clean up btf.h in uapi This patch cleans up btf.h in uapi: 1) Rename "name" to "name_off" to better reflect it is an offset to the string section instead of a char array. 2) Remove unused value BTF_FLAGS_COMPR and BTF_MAGIC_SWAP Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 74a30b1090df..bcb56ee47014 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -6,9 +6,7 @@ #include #define BTF_MAGIC 0xeB9F -#define BTF_MAGIC_SWAP 0x9FeB #define BTF_VERSION 1 -#define BTF_FLAGS_COMPR 0x01 struct btf_header { __u16 magic; @@ -43,7 +41,7 @@ struct btf_header { #define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) struct btf_type { - __u32 name; + __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused @@ -105,7 +103,7 @@ struct btf_type { * info in "struct btf_type"). */ struct btf_enum { - __u32 name; + __u32 name_off; __s32 val; }; @@ -122,7 +120,7 @@ struct btf_array { * "struct btf_type"). */ struct btf_member { - __u32 name; + __u32 name_off; __u32 type; __u32 offset; /* offset in bits */ }; -- cgit v1.2.3 From 6163849d289be6ff2acd2fb520da303dec3219f0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 20 Apr 2018 23:18:26 +0800 Subject: net: introduce a new tracepoint for tcp_rcv_space_adjust tcp_rcv_space_adjust is called every time data is copied to user space, introducing a tcp tracepoint for which could show us when the packet is copied to user. When a tcp packet arrives, tcp_rcv_established() will be called and with the existed tracepoint tcp_probe we could get the time when this packet arrives. Then this packet will be copied to user, and tcp_rcv_space_adjust will be called and with this new introduced tracepoint we could get the time when this packet is copied to user. With these two tracepoints, we could figure out whether the user program processes this packet immediately or there's latency. Hence in the printk message, sk_cookie is printed as a key to relate tcp_rcv_space_adjust with tcp_probe. Maybe we could export sockfd in this new tracepoint as well, then we could relate this new tracepoint with epoll/read/recv* tracepoints, and finally that could show us the whole lifespan of this packet. But we could also implement that with pid as these functions are executed in process context. Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3dd68029d77a..c1a5284713b8 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -10,6 +10,7 @@ #include #include #include +#include #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ @@ -113,7 +114,7 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, */ DECLARE_EVENT_CLASS(tcp_event_sk, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), TP_ARGS(sk), @@ -125,6 +126,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk, __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) + __field(__u64, sock_cookie) ), TP_fast_assign( @@ -144,24 +146,34 @@ DECLARE_EVENT_CLASS(tcp_event_sk, TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); + + __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, - __entry->saddr_v6, __entry->daddr_v6) + __entry->saddr_v6, __entry->daddr_v6, + __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, - TP_PROTO(const struct sock *sk), + TP_PROTO(struct sock *sk), + + TP_ARGS(sk) +); + +DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, + + TP_PROTO(struct sock *sk), TP_ARGS(sk) ); @@ -232,6 +244,7 @@ TRACE_EVENT(tcp_probe, __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) + __field(__u64, sock_cookie) ), TP_fast_assign( @@ -256,15 +269,14 @@ TRACE_EVENT(tcp_probe, __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; + __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x " - "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u " - "rcv_wnd=%u", + TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", __entry->saddr, __entry->daddr, __entry->mark, __entry->length, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, - __entry->srtt, __entry->rcv_wnd) + __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #endif /* _TRACE_TCP_H */ -- cgit v1.2.3 From b16fb418b1bf2a9f14d5d2a4fe29bde1f5550b37 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 21 Apr 2018 09:41:31 -0700 Subject: net: fib_rules: add extack support Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/fib_rules.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index e5cfcfc7dd93..b473df5b9512 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -75,7 +75,8 @@ struct fib_rules_ops { int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, - struct nlattr **); + struct nlattr **, + struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, -- cgit v1.2.3 From c6849a3ac17e336811f1d5bba991d2a9bdc47af1 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 22 Apr 2018 21:50:04 +0800 Subject: net: init sk_cookie for inet socket With sk_cookie we can identify a socket, that is very helpful for traceing and statistic, i.e. tcp tracepiont and ebpf. So we'd better init it by default for inet socket. When using it, we just need call atomic64_read(&sk->sk_cookie). Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 15fe980a27ea..5c916e6dff36 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -25,6 +25,15 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +static inline +void sock_init_cookie(struct sock *sk) +{ + u64 res; + + res = atomic64_inc_return(&sock_net(sk)->cookie_gen); + atomic64_set(&sk->sk_cookie, res); +} + u64 sock_gen_cookie(struct sock *sk); int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); -- cgit v1.2.3 From 1ac4329a1cff2e0bb12b71c13ad53a0e05bc87a6 Mon Sep 17 00:00:00 2001 From: Denis Bolotin Date: Mon, 23 Apr 2018 14:56:05 +0300 Subject: qed: Add configuration information to register dump and debug data Configuration information is added to the debug data collection, in addition to register dump. Added qed_dbg_nvm_image() that receives an image type, allocates a buffer and reads the image. The images are saved in the buffers and the dump size is updated. Signed-off-by: Denis Bolotin Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b5b2bc9eacca..e53f9c7c2809 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -159,6 +159,9 @@ struct qed_dcbx_get { enum qed_nvm_images { QED_NVM_IMAGE_ISCSI_CFG, QED_NVM_IMAGE_FCOE_CFG, + QED_NVM_IMAGE_NVM_CFG1, + QED_NVM_IMAGE_DEFAULT_CFG, + QED_NVM_IMAGE_NVM_META, }; struct qed_link_eee_params { -- cgit v1.2.3 From a268de77faf6881756b4943b287fd78ec05a7d1e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:17 +0100 Subject: netfilter: nf_flow_table: move init code to nf_flow_table_core.c Reduces duplication of .gc and .params in flowtable type definitions and makes the API clearer Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 76ee5c81b752..f876e32a60b8 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -14,9 +14,8 @@ struct nf_flowtable; struct nf_flowtable_type { struct list_head list; int family; - void (*gc)(struct work_struct *work); + int (*init)(struct nf_flowtable *ft); void (*free)(struct nf_flowtable *ft); - const struct rhashtable_params *params; nf_hookfn *hook; struct module *owner; }; @@ -100,9 +99,8 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, void nf_flow_table_cleanup(struct net *net, struct net_device *dev); +int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void nf_flow_offload_work_gc(struct work_struct *work); -extern const struct rhashtable_params nf_flow_offload_rhash_params; void flow_offload_dead(struct flow_offload *flow); -- cgit v1.2.3 From 84453a90252ca0cd7d1bd229199a40c58bfe431e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:19 +0100 Subject: netfilter: nf_flow_table: track flow tables in nf_flow_table directly Avoids having nf_flow_table depend on nftables (useful for future iptables backport work) Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + include/net/netfilter/nf_tables.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f876e32a60b8..ab408adba688 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -21,6 +21,7 @@ struct nf_flowtable_type { }; struct nf_flowtable { + struct list_head list; struct rhashtable rhashtable; const struct nf_flowtable_type *type; struct delayed_work gc_work; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index cd368d1b8cb8..2f2062ae1c45 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1109,9 +1109,6 @@ struct nft_flowtable { struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); -- cgit v1.2.3 From 6bdc3c68d94c5d6adc675ee55361962e9dd2489d Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:20 +0100 Subject: netfilter: nf_flow_table: make flow_offload_dead inline It is too trivial to keep as a separate exported function Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ab408adba688..5aa49524ebef 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -103,7 +103,10 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); -void flow_offload_dead(struct flow_offload *flow); +static inline void flow_offload_dead(struct flow_offload *flow) +{ + flow->flags |= FLOW_OFFLOAD_DYING; +} int nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, -- cgit v1.2.3 From 59c466dd68e796f3a7a0709d90c72ce2d84e29c2 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 26 Feb 2018 10:15:21 +0100 Subject: netfilter: nf_flow_table: add a new flow state for tearing down offloading On cleanup, this will be treated differently from FLOW_OFFLOAD_DYING: If FLOW_OFFLOAD_DYING is set, the connection is going away, so both the offload state and the connection tracking entry will be deleted. If FLOW_OFFLOAD_TEARDOWN is set, the connection remains alive, but the offload state is torn down. This is useful for cases that require more complex state tracking / timeout handling on TCP, or if the connection has been idle for too long. Support for sending flows back to the slow path will be implemented in a following patch Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 5aa49524ebef..ba9fa4592f2b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -68,6 +68,7 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_SNAT 0x1 #define FLOW_OFFLOAD_DNAT 0x2 #define FLOW_OFFLOAD_DYING 0x4 +#define FLOW_OFFLOAD_TEARDOWN 0x8 struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; @@ -103,6 +104,7 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev); int nf_flow_table_init(struct nf_flowtable *flow_table); void nf_flow_table_free(struct nf_flowtable *flow_table); +void flow_offload_teardown(struct flow_offload *flow); static inline void flow_offload_dead(struct flow_offload *flow) { flow->flags |= FLOW_OFFLOAD_DYING; -- cgit v1.2.3 From cac20fcdf146b82d02e412d7a345f5826279cd82 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Mar 2018 12:06:51 +0200 Subject: netfilter: nf_tables: simplify lookup functions Replace the nf_tables_ prefix by nft_ and merge code into single lookup function whenever possible. In many cases we go over the 80-chars boundary function names, this save us ~50 LoC. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2f2062ae1c45..123e82a2f8bb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1015,9 +1015,9 @@ static inline void *nft_obj_data(const struct nft_object *obj) #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, u32 objtype, - u8 genmask); +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask); void nft_obj_notify(struct net *net, struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, @@ -1106,9 +1106,9 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask); +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); -- cgit v1.2.3 From 71cc0873e0e0a4c6dca899c42e3ac143f7960d8e Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 3 Apr 2018 23:15:39 +0200 Subject: netfilter: nf_tables: Simplify set backend selection Drop nft_set_type's ability to act as a container of multiple backend implementations it chooses from. Instead consolidate the whole selection logic in nft_select_set_ops() and the actual backend provided estimate() callback. This turns nf_tables_set_types into a list containing all available backends which is traversed when selecting one matching userspace requested criteria. Also, this change allows to embed nft_set_ops structure into nft_set_type and pull flags field into the latter as it's only used during selection phase. A crucial part of this change is to make sure the new layout respects hash backend constraints formerly enforced by nft_hash_select_ops() function: This is achieved by introduction of a specific estimate() callback for nft_hash_fast_ops which returns false for key lengths != 4. In turn, nft_hash_estimate() is changed to return false for key lengths == 4 so it won't be chosen by accident. Also, both callbacks must return false for unbounded sets as their size estimate depends on a known maximum element count. Note that this patch partially reverts commit 4f2921ca21b71 ("netfilter: nf_tables: meter: pick a set backend that supports updates") by making nft_set_ops_candidate() not explicitly look for an update callback but make NFT_SET_EVAL a regular backend feature flag which is checked along with the others. This way all feature requirements are checked in one go. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 123e82a2f8bb..de77d36e36b3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -275,23 +275,6 @@ struct nft_set_estimate { enum nft_set_class space; }; -/** - * struct nft_set_type - nf_tables set type - * - * @select_ops: function to select nft_set_ops - * @ops: default ops, used when no select_ops functions is present - * @list: used internally - * @owner: module reference - */ -struct nft_set_type { - const struct nft_set_ops *(*select_ops)(const struct nft_ctx *, - const struct nft_set_desc *desc, - u32 flags); - const struct nft_set_ops *ops; - struct list_head list; - struct module *owner; -}; - struct nft_set_ext; struct nft_expr; @@ -310,7 +293,6 @@ struct nft_expr; * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @elemsize: element private size - * @features: features supported by the implementation */ struct nft_set_ops { bool (*lookup)(const struct net *net, @@ -361,9 +343,23 @@ struct nft_set_ops { void (*destroy)(const struct nft_set *set); unsigned int elemsize; +}; + +/** + * struct nft_set_type - nf_tables set type + * + * @ops: set ops for this type + * @list: used internally + * @owner: module reference + * @features: features supported by the implementation + */ +struct nft_set_type { + const struct nft_set_ops ops; + struct list_head list; + struct module *owner; u32 features; - const struct nft_set_type *type; }; +#define to_set_type(o) container_of(o, struct nft_set_type, ops) int nft_register_set(struct nft_set_type *type); void nft_unregister_set(struct nft_set_type *type); -- cgit v1.2.3 From 2eb0f624b709e78ec8e2f4c3412947703db99301 Mon Sep 17 00:00:00 2001 From: Thierry Du Tre Date: Wed, 4 Apr 2018 15:38:22 +0200 Subject: netfilter: add NAT support for shifted portmap ranges This is a patch proposal to support shifted ranges in portmaps. (i.e. tcp/udp incoming port 5000-5100 on WAN redirected to LAN 192.168.1.5:2000-2100) Currently DNAT only works for single port or identical port ranges. (i.e. ports 5000-5100 on WAN interface redirected to a LAN host while original destination port is not altered) When different port ranges are configured, either 'random' mode should be used, or else all incoming connections are mapped onto the first port in the redirect range. (in described example WAN:5000-5100 will all be mapped to 192.168.1.5:2000) This patch introduces a new mode indicated by flag NF_NAT_RANGE_PROTO_OFFSET which uses a base port value to calculate an offset with the destination port present in the incoming stream. That offset is then applied as index within the redirect port range (index modulo rangewidth to handle range overflow). In described example the base port would be 5000. An incoming stream with destination port 5004 would result in an offset value 4 which means that the NAT'ed stream will be using destination port 2004. Other possibilities include deterministic mapping of larger or multiple ranges to a smaller range : WAN:5000-5999 -> LAN:5000-5099 (maps WAN port 5*xx to port 51xx) This patch does not change any current behavior. It just adds new NAT proto range functionality which must be selected via the specific flag when intended to use. A patch for iptables (libipt_DNAT.c + libip6t_DNAT.c) will also be proposed which makes this functionality immediately available. Signed-off-by: Thierry Du Tre Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_nat_masquerade.h | 2 +- include/net/netfilter/ipv6/nf_nat_masquerade.h | 2 +- include/net/netfilter/nf_nat.h | 2 +- include/net/netfilter/nf_nat_l3proto.h | 4 ++-- include/net/netfilter/nf_nat_l4proto.h | 8 ++++---- include/net/netfilter/nf_nat_redirect.h | 2 +- include/uapi/linux/netfilter/nf_nat.h | 12 +++++++++++- 7 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h index ebd869473603..cd24be4c4a99 100644 --- a/include/net/netfilter/ipv4/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -6,7 +6,7 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv4_register_notifier(void); diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h index 1ed4f2631ed6..0c3b5ebf0bb8 100644 --- a/include/net/netfilter/ipv6/nf_nat_masquerade.h +++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -3,7 +3,7 @@ #define _NF_NAT_MASQUERADE_IPV6_H_ unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); void nf_nat_masquerade_ipv6_register_notifier(void); void nf_nat_masquerade_ipv6_unregister_notifier(void); diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 207a467e7ca6..da3d601cadee 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -39,7 +39,7 @@ struct nf_conn_nat { /* Set up the info structure to map into this range. */ unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype); extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index ce7c2b4e64bb..ac47098a61dc 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -7,7 +7,7 @@ struct nf_nat_l3proto { u8 l3proto; bool (*in_range)(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range); + const struct nf_nat_range2 *range); u32 (*secure_port)(const struct nf_conntrack_tuple *t, __be16); @@ -33,7 +33,7 @@ struct nf_nat_l3proto { struct flowi *fl); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; int nf_nat_l3proto_register(const struct nf_nat_l3proto *); diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 67835ff8a2d9..b4d6b29bca62 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -34,12 +34,12 @@ struct nf_nat_l4proto { */ void (*unique_tuple)(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct); int (*nlattr_to_range)(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); }; /* Protocol registration. */ @@ -72,11 +72,11 @@ bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover); int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range); + struct nf_nat_range2 *range); #endif /*_NF_NAT_L4PROTO_H*/ diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h index 5ddabb08c472..c129aacc8ae8 100644 --- a/include/net/netfilter/nf_nat_redirect.h +++ b/include/net/netfilter/nf_nat_redirect.h @@ -7,7 +7,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_ipv4_multi_range_compat *mr, unsigned int hooknum); unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum); #endif /* _NF_NAT_REDIRECT_H_ */ diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index a33000da7229..4a95c0db14d4 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -10,6 +10,7 @@ #define NF_NAT_RANGE_PROTO_RANDOM (1 << 2) #define NF_NAT_RANGE_PERSISTENT (1 << 3) #define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) +#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) @@ -17,7 +18,7 @@ #define NF_NAT_RANGE_MASK \ (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ - NF_NAT_RANGE_PROTO_RANDOM_FULLY) + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET) struct nf_nat_ipv4_range { unsigned int flags; @@ -40,4 +41,13 @@ struct nf_nat_range { union nf_conntrack_man_proto max_proto; }; +struct nf_nat_range2 { + unsigned int flags; + union nf_inet_addr min_addr; + union nf_inet_addr max_addr; + union nf_conntrack_man_proto min_proto; + union nf_conntrack_man_proto max_proto; + union nf_conntrack_man_proto base_proto; +}; + #endif /* _NETFILTER_NF_NAT_H */ -- cgit v1.2.3 From cd9a5a15808403a7895c51b1378168d6c75cf8a6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 9 Apr 2018 00:00:57 +0900 Subject: netfilter: ebtables: remove EBT_MATCH and EBT_NOMATCH EBT_MATCH and EBT_NOMATCH are used to change return value. match functions(ebt_xxx.c) return false when received frame is not matched and returns true when received frame is matched. but, EBT_MATCH_ITERATE understands oppositely. so, to change return value, EBT_MATCH and EBT_NOMATCH are used. but, we can use operation '!' simply. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 0773b5a032f1..c6935be7c6ca 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -17,10 +17,6 @@ #include #include -/* return values for match() functions */ -#define EBT_MATCH 0 -#define EBT_NOMATCH 1 - struct ebt_match { struct list_head list; const char name[EBT_FUNCTION_MAXNAMELEN]; -- cgit v1.2.3 From a1d768f1a00db556e2aae9f92bdb38671e601da5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 13 Apr 2018 23:09:58 +0900 Subject: netfilter: ebtables: add ebt_get_target and ebt_get_target_c ebt_get_target similar to {ip/ip6/arp}t_get_target. and ebt_get_target_c similar to {ip/ip6/arp}t_get_target_c. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 0c7dc8315013..3b86c14ea49d 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -191,6 +191,12 @@ struct ebt_entry { unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; +static __inline__ struct ebt_entry_target * +ebt_get_target(struct ebt_entry *e) +{ + return (void *)e + e->target_offset; +} + /* {g,s}etsockopt numbers */ #define EBT_BASE_CTL 128 -- cgit v1.2.3 From 8e1102d5a1596dca10f51e3de800809944f8816d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Apr 2018 18:04:49 +0200 Subject: netfilter: nf_tables: support timeouts larger than 23 days Marco De Benedetto says: I would like to use a timeout of 30 days for elements in a set but it seems there is a some kind of problem above 24d20h31m23s. Fix this by using 'jiffies64' for timeout handling to get same behaviour on 32 and 64bit systems. nftables passes timeouts as u64 in milliseconds to the kernel, but on kernel side we used a mixture of 'long' and jiffies conversions rather than u64 and jiffies64. Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1237 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index de77d36e36b3..435c9e3b9181 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -585,7 +585,7 @@ static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } -static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext) +static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } @@ -603,7 +603,7 @@ static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext) static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && - time_is_before_eq_jiffies(*nft_set_ext_expiration(ext)); + time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext)); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, -- cgit v1.2.3 From bd2bbdb497dba24b9ca7f6257c83e496c64b6e9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Apr 2018 19:15:53 +0200 Subject: netfilter: merge meta_bridge into nft_meta It overcomplicates things for no reason. nft_meta_bridge only offers retrieval of bridge port interface name. Because of this being its own module, we had to export all nft_meta functions, which we can then make static again (which even reduces the size of nft_meta -- including bridge port retrieval...): before: text data bss dec hex filename 1838 832 0 2670 a6e net/bridge/netfilter/nft_meta_bridge.ko 6147 936 1 7084 1bac net/netfilter/nft_meta.ko after: 5826 936 1 6763 1a6b net/netfilter/nft_meta.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 44 ---------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 include/net/netfilter/nft_meta.h (limited to 'include') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h deleted file mode 100644 index 5c69e9b09388..000000000000 --- a/include/net/netfilter/nft_meta.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFT_META_H_ -#define _NFT_META_H_ - -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - -extern const struct nla_policy nft_meta_policy[]; - -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]); - -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); - -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt); - -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr); - -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data); - -#endif -- cgit v1.2.3 From 026a807c2de37aa826748c2ffa1969fc778406b2 Mon Sep 17 00:00:00 2001 From: Tal Gilboa Date: Tue, 24 Apr 2018 13:36:01 +0300 Subject: net/dim: Rename *_get_profile() functions to *_get_rx_moderation() Preparation for introducing adaptive TX to net DIM. Signed-off-by: Tal Gilboa Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/net_dim.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h index 29ed8fd6379a..7ca3c4deb3a6 100644 --- a/include/linux/net_dim.h +++ b/include/linux/net_dim.h @@ -129,17 +129,17 @@ profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { NET_DIM_CQE_PROFILES, }; -static inline struct net_dim_cq_moder net_dim_get_profile(u8 cq_period_mode, - int ix) +static inline struct net_dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) { - struct net_dim_cq_moder cq_moder; + struct net_dim_cq_moder cq_moder = profile[cq_period_mode][ix]; - cq_moder = profile[cq_period_mode][ix]; cq_moder.cq_period_mode = cq_period_mode; return cq_moder; } -static inline struct net_dim_cq_moder net_dim_get_def_profile(u8 rx_cq_period_mode) +static inline struct net_dim_cq_moder +net_dim_get_def_rx_moderation(u8 rx_cq_period_mode) { int default_profile_ix; @@ -148,7 +148,7 @@ static inline struct net_dim_cq_moder net_dim_get_def_profile(u8 rx_cq_period_mo else /* NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE */ default_profile_ix = NET_DIM_DEF_PROFILE_EQE; - return net_dim_get_profile(rx_cq_period_mode, default_profile_ix); + return net_dim_get_rx_moderation(rx_cq_period_mode, default_profile_ix); } static inline bool net_dim_on_top(struct net_dim *dim) -- cgit v1.2.3 From 623ad755226c2b4fb312772bae9cdeccd8219ee3 Mon Sep 17 00:00:00 2001 From: Tal Gilboa Date: Tue, 24 Apr 2018 13:36:02 +0300 Subject: net/dim: Support adaptive TX moderation Interrupt moderation for TX traffic requires different profiles than RX interrupt moderation. The main goal here is to reduce interrupt rate and allow better payload aggregation by keeping SKBs in the TX queue a bit longer. Ping-pong behavior would get a profile with a short timer, so latency wouldn't increase for these scenarios. There might be a slight degradation in bandwidth for single stream with large message sizes, since net.ipv4.tcp_limit_output_bytes is limiting the allowed TX traffic, but with many streams performance is always improved. Signed-off-by: Tal Gilboa Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/net_dim.h | 63 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h index 7ca3c4deb3a6..db99240d00bd 100644 --- a/include/linux/net_dim.h +++ b/include/linux/net_dim.h @@ -103,11 +103,12 @@ enum { #define NET_DIM_PARAMS_NUM_PROFILES 5 /* Adaptive moderation profiles */ #define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 #define NET_DIM_DEF_PROFILE_CQE 1 #define NET_DIM_DEF_PROFILE_EQE 1 /* All profiles sizes must be NET_PARAMS_DIM_NUM_PROFILES */ -#define NET_DIM_EQE_PROFILES { \ +#define NET_DIM_RX_EQE_PROFILES { \ {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ @@ -115,7 +116,7 @@ enum { {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ } -#define NET_DIM_CQE_PROFILES { \ +#define NET_DIM_RX_CQE_PROFILES { \ {2, 256}, \ {8, 128}, \ {16, 64}, \ @@ -123,32 +124,68 @@ enum { {64, 64} \ } +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + static const struct net_dim_cq_moder -profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { - NET_DIM_EQE_PROFILES, - NET_DIM_CQE_PROFILES, +rx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct net_dim_cq_moder +tx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, }; static inline struct net_dim_cq_moder net_dim_get_rx_moderation(u8 cq_period_mode, int ix) { - struct net_dim_cq_moder cq_moder = profile[cq_period_mode][ix]; + struct net_dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; cq_moder.cq_period_mode = cq_period_mode; return cq_moder; } static inline struct net_dim_cq_moder -net_dim_get_def_rx_moderation(u8 rx_cq_period_mode) +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} + +static inline struct net_dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) { - int default_profile_ix; + struct net_dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; - if (rx_cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE) - default_profile_ix = NET_DIM_DEF_PROFILE_CQE; - else /* NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE */ - default_profile_ix = NET_DIM_DEF_PROFILE_EQE; + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} + +static inline struct net_dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; - return net_dim_get_rx_moderation(rx_cq_period_mode, default_profile_ix); + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); } static inline bool net_dim_on_top(struct net_dim *dim) -- cgit v1.2.3 From a06ac0d67d9fda7c255476c6391032319030045d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 24 Apr 2018 23:07:45 +0800 Subject: Revert "net: init sk_cookie for inet socket" This reverts commit ("net: init sk_cookie for inet socket") Per discussion with Eric, when update sock_net(sk)->cookie_gen, the whole cache cache line will be invalidated, as this cache line is shared with all cpus, that may cause great performace hit. Bellow is the data form Eric. "Performance is reduced from ~5 Mpps to ~3.8 Mpps with 16 RX queues on my host" when running synflood test. Have to revert it to prevent from cache line false sharing. Signed-off-by: Yafang Shao Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 5c916e6dff36..15fe980a27ea 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -25,15 +25,6 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); -static inline -void sock_init_cookie(struct sock *sk) -{ - u64 res; - - res = atomic64_inc_return(&sock_net(sk)->cookie_gen); - atomic64_set(&sk->sk_cookie, res); -} - u64 sock_gen_cookie(struct sock *sk); int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); -- cgit v1.2.3 From 0c6f69a5e3641bccfe21fd0eeafad45a8c6db987 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 24 Apr 2018 08:29:13 +1000 Subject: rhashtable: remove outdated comments about grow_decision etc grow_decision and shink_decision no longer exist, so remove the remaining references to them. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 1f8ad121eb43..87d443a5b11d 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -836,9 +836,8 @@ out: * * It is safe to call this function from atomic context. * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. */ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, @@ -866,9 +865,8 @@ static inline int rhashtable_insert_fast( * * It is safe to call this function from atomic context. * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. */ static inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, @@ -890,9 +888,8 @@ static inline int rhltable_insert_key( * * It is safe to call this function from atomic context. * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. */ static inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, @@ -922,9 +919,8 @@ static inline int rhltable_insert( * * It is safe to call this function from atomic context. * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, @@ -981,9 +977,8 @@ static inline void *rhashtable_lookup_get_insert_fast( * * Lookups may occur in parallel with hashtable mutations and resizing. * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. * * Returns zero on success. */ @@ -1134,8 +1129,8 @@ static inline int __rhashtable_remove_fast( * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. + * Will automatically shrink the table if permitted when residency drops + * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ @@ -1156,8 +1151,8 @@ static inline int rhashtable_remove_fast( * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. + * Will automatically shrink the table if permitted when residency drops + * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ -- cgit v1.2.3 From 82266e98dd4d8e7d5b8e4a0fedeb91f2eb29d306 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 24 Apr 2018 08:29:13 +1000 Subject: rhashtable: Revise incorrect comment on r{hl, hash}table_walk_enter() Neither rhashtable_walk_enter() or rhltable_walk_enter() sleep, though they do take a spinlock without irq protection. So revise the comments to accurately state the contexts in which these functions can be called. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 87d443a5b11d..4e1f535c2034 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1268,8 +1268,9 @@ static inline int rhashtable_walk_init(struct rhashtable *ht, * For a completely stable walk you should construct your own data * structure outside the hash table. * - * This function may sleep so you must not call it from interrupt - * context or with spin locks held. + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ -- cgit v1.2.3 From 12bed760a78da6e12ac8252fec64d019a9eac523 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Tue, 24 Apr 2018 17:50:29 +0300 Subject: bpf: add helper for getting xfrm states This commit introduces a helper which allows fetching xfrm state parameters by eBPF programs attached to TC. Prototype: bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags) skb: pointer to skb index: the index in the skb xfrm_state secpath array xfrm_state: pointer to 'struct bpf_xfrm_state' size: size of 'struct bpf_xfrm_state' flags: reserved for future extensions The helper returns 0 on success. Non zero if no xfrm state at the index is found - or non exists at all. struct bpf_xfrm_state currently includes the SPI, peer IPv4/IPv6 address and the reqid; it can be further extended by adding elements to its end - indicating the populated fields by the 'size' argument - keeping backwards compatibility. Typical usage: struct bpf_xfrm_state x = {}; bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0); ... Signed-off-by: Eyal Birger Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8383a289f7b..e6679393b687 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -774,6 +774,15 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: A negative integer to be added to xdp_md.data_end * Return: 0 on success or negative on error + * + * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags) + * retrieve XFRM state + * @skb: pointer to skb + * @index: index of the xfrm state in the secpath + * @key: pointer to 'struct bpf_xfrm_state' + * @size: size of 'struct bpf_xfrm_state' + * @flags: room for future extensions + * Return: 0 on success or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -841,7 +850,8 @@ union bpf_attr { FN(msg_cork_bytes), \ FN(msg_pull_data), \ FN(bind), \ - FN(xdp_adjust_tail), + FN(xdp_adjust_tail), \ + FN(skb_get_xfrm_state), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -947,6 +957,19 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* user accessible mirror of in-kernel xfrm_state. + * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + /* Generic BPF return codes which all BPF program types may support. * The values are binary compatible with their TC_ACT_* counter-part to * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT -- cgit v1.2.3 From 7d775f63470c3b6ddf34c770c973293ab925a7bb Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Apr 2018 17:16:03 -0400 Subject: macvlan: Rename fwd_priv to accel_priv and add accessor function This change renames the fwd_priv member to accel_priv as this more accurately reflects the actual purpose of this value. In addition I am adding an accessor which will allow us to further abstract this in the future if needed. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/if_macvlan.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 4cb7aeeafce0..c5106ce8273a 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -21,7 +21,7 @@ struct macvlan_dev { struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; - void *fwd_priv; + void *accel_priv; struct vlan_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); @@ -86,4 +86,10 @@ macvlan_dev_real_dev(const struct net_device *dev) } #endif +static inline void *macvlan_accel_priv(struct net_device *dev) +{ + struct macvlan_dev *macvlan = netdev_priv(dev); + + return macvlan->accel_priv; +} #endif /* _LINUX_IF_MACVLAN_H */ -- cgit v1.2.3 From a222311283641c9a476426ccacb2a3c82bfe5aaa Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Apr 2018 17:16:19 -0400 Subject: macvlan: macvlan_count_rx shouldn't be static inline AND extern It doesn't make sense to define macvlan_count_rx as a static inline and then add a forward declaration after that as an extern. I am dropping the extern declaration since it seems like it is something that likely got missed when the function was made an inline. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/if_macvlan.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index c5106ce8273a..80089d6c5a0a 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -61,10 +61,6 @@ extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); -extern void macvlan_count_rx(const struct macvlan_dev *vlan, - unsigned int len, bool success, - bool multicast); - extern void macvlan_dellink(struct net_device *dev, struct list_head *head); extern int macvlan_link_register(struct rtnl_link_ops *ops); -- cgit v1.2.3 From 6cb1937d4eff6936089139169377fda208a2701a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Apr 2018 17:16:25 -0400 Subject: macvlan: Add function to test for destination filtering support This patch adds a function indicating if a given macvlan can fully supports destination filtering, especially as it relates to unicast traffic. For those macvlan interfaces that do not support destination filtering such passthru or source mode filtering we should not be enabling offload support. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/if_macvlan.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 80089d6c5a0a..0221390668a0 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -88,4 +88,13 @@ static inline void *macvlan_accel_priv(struct net_device *dev) return macvlan->accel_priv; } + +static inline bool macvlan_supports_dest_filter(struct net_device *dev) +{ + struct macvlan_dev *macvlan = netdev_priv(dev); + + return macvlan->mode == MACVLAN_MODE_PRIVATE || + macvlan->mode == MACVLAN_MODE_VEPA || + macvlan->mode == MACVLAN_MODE_BRIDGE; +} #endif /* _LINUX_IF_MACVLAN_H */ -- cgit v1.2.3 From 53cd4d8e4dfb231fa5ba125b6c0666b82a0504fa Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Apr 2018 17:16:30 -0400 Subject: macvlan: Provide function for interfaces to release HW offload This patch provides a basic function to allow a lower device to disable macvlan offload if it was previously enabled on a given macvlan. The idea here is to allow for recovery from failure should the lowerdev run out of resources. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/if_macvlan.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 0221390668a0..2e55e4cdbd8a 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -97,4 +97,12 @@ static inline bool macvlan_supports_dest_filter(struct net_device *dev) macvlan->mode == MACVLAN_MODE_VEPA || macvlan->mode == MACVLAN_MODE_BRIDGE; } + +static inline int macvlan_release_l2fw_offload(struct net_device *dev) +{ + struct macvlan_dev *macvlan = netdev_priv(dev); + + macvlan->accel_priv = NULL; + return dev_uc_add(macvlan->lowerdev, dev->dev_addr); +} #endif /* _LINUX_IF_MACVLAN_H */ -- cgit v1.2.3 From b5facfdba14ccb440bd1eac20870a8f23afa17f3 Mon Sep 17 00:00:00 2001 From: Ahmed Abdelsalam Date: Tue, 24 Apr 2018 20:23:16 +0200 Subject: ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode ECMP (equal-cost multipath) hashes are typically computed on the packets' 5-tuple(src IP, dst IP, src port, dst port, L4 proto). For encapsulated packets, the L4 data is not readily available and ECMP hashing will often revert to (src IP, dst IP). This will lead to traffic polarization on a single ECMP path, causing congestion and waste of network capacity. In IPv6, the 20-bit flow label field is also used as part of the ECMP hash. In the lack of L4 data, the hashing will be on (src IP, dst IP, flow label). Having a non-zero flow label is thus important for proper traffic load balancing when L4 data is unavailable (i.e., when packets are encapsulated). Currently, the seg6_do_srh_encap() function extracts the original packet's flow label and set it as the outer IPv6 flow label. There are two issues with this behaviour: a) There is no guarantee that the inner flow label is set by the source. b) If the original packet is not IPv6, the flow label will be set to zero (e.g., IPv4 or L2 encap). This patch adds a function, named seg6_make_flowlabel(), that computes a flow label from a given skb. It supports IPv6, IPv4 and L2 payloads, and leverages the per namespace 'seg6_flowlabel" sysctl value. The currently support behaviours are as follows: -1 set flowlabel to zero. 0 copy flowlabel from Inner paceket in case of Inner IPv6 (Set flowlabel to 0 in case IPv4/L2) 1 Compute the flowlabel using seg6_make_flowlabel() This patch has been tested for IPv6, IPv4, and L2 traffic. Signed-off-by: Ahmed Abdelsalam Acked-by: David Lebrun Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 97b3a54579c8..c978a31b0f84 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -43,6 +43,7 @@ struct netns_sysctl_ipv6 { int max_hbh_opts_cnt; int max_dst_opts_len; int max_hbh_opts_len; + int seg6_flowlabel; }; struct netns_ipv6 { -- cgit v1.2.3 From 9ce33e46531d4b9f94b0fa135781e27c7c4e32e8 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 24 Apr 2018 13:49:34 -0700 Subject: neighbour: support for NTF_EXT_LEARNED flag This patch extends NTF_EXT_LEARNED support to the neighbour system. Example use-case: An Ethernet VPN implementation (eg in FRR routing suite) can use this flag to add dynamic reachable external neigh entires learned via control plane. The use of neigh NTF_EXT_LEARNED in this patch is consistent with its use with bridge and vxlan fdb entries. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/neighbour.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e421f86af043..6c1eecd56a4d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -246,6 +246,7 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_OVERRIDE 0x00000001 #define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 +#define NEIGH_UPDATE_F_EXT_LEARNED 0x20000000 #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 @@ -526,5 +527,21 @@ static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, } while (read_seqretry(&n->ha_lock, seq)); } - +static inline void neigh_update_ext_learned(struct neighbour *neigh, u32 flags, + int *notify) +{ + u8 ndm_flags = 0; + + if (!(flags & NEIGH_UPDATE_F_ADMIN)) + return; + + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + if (ndm_flags & NTF_EXT_LEARNED) + neigh->flags |= NTF_EXT_LEARNED; + else + neigh->flags &= ~NTF_EXT_LEARNED; + *notify = 1; + } +} #endif -- cgit v1.2.3 From 47b3ba5175e940c9be5fd6aec05d6e7e048ff823 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 24 Apr 2018 18:17:34 -0300 Subject: sctp: fix const parameter violation in sctp_make_sack sctp_make_sack() make changes to the asoc and this cast is just bypassing the const attribute. As there is no need to have the const there, just remove it and fix the violation. Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Acked-by: Neil Horman --- include/net/sctp/sm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2d0e782c9055..f4b657478a30 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -207,7 +207,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, int len, __u8 flags, gfp_t gfp); struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn); -struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc); +struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc); struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, -- cgit v1.2.3 From 22e15b6f9c010ffded63e6c9720a656eb3d73835 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 25 Apr 2018 17:46:47 +0800 Subject: sctp: remove the unused sctp_assoc_is_match function After Commit 4f0087812648 ("sctp: apply rhashtable api to send/recv path"), there's no place using sctp_assoc_is_match, so remove it. Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a0ec462bc1a9..05594b248e52 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2091,10 +2091,6 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, enum sctp_transport_cmd command, sctp_sn_error_t error); struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *, __u32); -struct sctp_transport *sctp_assoc_is_match(struct sctp_association *, - struct net *, - const union sctp_addr *, - const union sctp_addr *); void sctp_assoc_migrate(struct sctp_association *, struct sock *); int sctp_assoc_update(struct sctp_association *old, struct sctp_association *new); -- cgit v1.2.3 From 1cd7884dfd78df6284d27b008823b0b4a808f196 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:15 -0400 Subject: udp: expose inet cork to udp UDP segmentation offload needs access to inet_cork in the udp layer. Pass the struct to ip(6)_make_skb instead of allocating it on the stack in that function itself. This patch is a noop otherwise. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- include/net/ipv6.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index dc4a2d6e58a5..7ec543a64bbc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -171,7 +171,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, - unsigned int flags); + struct inet_cork *cork, unsigned int flags); static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 68b167d98879..0dd722cab037 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -950,6 +950,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, + struct inet_cork_full *cork, const struct sockcm_cookie *sockc); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) -- cgit v1.2.3 From ee80d1ebe5ba7f4bd74959c873119175a4fc08d3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:16 -0400 Subject: udp: add udp gso Implement generic segmentation offload support for udp datagrams. A follow-up patch adds support to the protocol stack to generate such packets. UDP GSO is not UFO. UFO fragments a single large datagram. GSO splits a large payload into a number of discrete UDP datagrams. The implementation adds a GSO type SKB_UDP_GSO_L4 to differentiate it from UFO (SKB_UDP_GSO). IPPROTO_UDPLITE is excluded, as that protocol has no gso handler registered. [ Export __udp_gso_segment for ipv6. -DaveM ] Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ include/net/udp.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d274059529eb..a4a5c0c5cba8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -573,6 +573,8 @@ enum { SKB_GSO_ESP = 1 << 15, SKB_GSO_UDP = 1 << 16, + + SKB_GSO_UDP_L4 = 1 << 17, }; #if BITS_PER_LONG > 32 diff --git a/include/net/udp.h b/include/net/udp.h index 0676b272f6ac..741d888d0fdb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -174,6 +174,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, struct udphdr *uh, udp_lookup_t lookup); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); +struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, + netdev_features_t features, + unsigned int mss, __sum16 check); + static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; -- cgit v1.2.3 From bec1f6f697362c5bc635dacd7ac8499d0a10a4e7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:17 -0400 Subject: udp: generate gso with UDP_SEGMENT Support generic segmentation offload for udp datagrams. Callers can concatenate and send at once the payload of multiple datagrams with the same destination. To set segment size, the caller sets socket option UDP_SEGMENT to the length of each discrete payload. This value must be smaller than or equal to the relevant MTU. A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a per send call basis. Total byte length may then exceed MTU. If not an exact multiple of segment size, the last segment will be shorter. The implementation adds a gso_size field to the udp socket, ip(v6) cmsg cookie and inet_cork structure to be able to set the value at setsockopt or cmsg time and to work with both lockless and corked paths. Initial benchmark numbers show UDP GSO about as expensive as TCP GSO. tcp tso 3197 MB/s 54232 msg/s 54232 calls/s 6,457,754,262 cycles tcp gso 1765 MB/s 29939 msg/s 29939 calls/s 11,203,021,806 cycles tcp without tso/gso * 739 MB/s 12548 msg/s 12548 calls/s 11,205,483,630 cycles udp 876 MB/s 14873 msg/s 624666 calls/s 11,205,777,429 cycles udp gso 2139 MB/s 36282 msg/s 36282 calls/s 11,204,374,561 cycles [*] after reverting commit 0a6b2a1dc2a2 ("tcp: switch to GSO being always on") Measured total system cycles ('-a') for one core while pinning both the network receive path and benchmark process to that core: perf stat -a -C 12 -e cycles \ ./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4 Note the reduction in calls/s with GSO. Bytes per syscall drops increases from 1470 to 61818. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 3 +++ include/net/inet_sock.h | 1 + include/net/ip.h | 1 + include/net/ipv6.h | 1 + include/uapi/linux/udp.h | 1 + 5 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index eaea63bc79bb..ca840345571b 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -55,6 +55,7 @@ struct udp_sock { * when the socket is uncorked. */ __u16 len; /* total length of pending frames */ + __u16 gso_size; /* * Fields specific to UDP-Lite. */ @@ -87,6 +88,8 @@ struct udp_sock { int forward_deficit; }; +#define UDP_MAX_SEGMENTS (1 << 6UL) + static inline struct udp_sock *udp_sk(const struct sock *sk) { return (struct udp_sock *)sk; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 0a671c32d6b9..83d5b3c2ac42 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -147,6 +147,7 @@ struct inet_cork { __u8 ttl; __s16 tos; char priority; + __u16 gso_size; }; struct inet_cork_full { diff --git a/include/net/ip.h b/include/net/ip.h index 7ec543a64bbc..bada1f1f871e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -76,6 +76,7 @@ struct ipcm_cookie { __u8 ttl; __s16 tos; char priority; + __u16 gso_size; }; #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 0dd722cab037..0a872a7c33c8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -298,6 +298,7 @@ struct ipcm6_cookie { __s16 tclass; __s8 dontfrag; struct ipv6_txoptions *opt; + __u16 gso_size; }; static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index efb7b5991c2f..09d00f8c442b 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -32,6 +32,7 @@ struct udphdr { #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ #define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */ +#define UDP_SEGMENT 103 /* Set GSO segmentation size */ /* UDP encapsulation types */ #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ -- cgit v1.2.3 From 2e8de8576343ab540856082916bfb84d17288b08 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:20 -0400 Subject: udp: add gso segment cmsg Allow specifying segment size in the send call. The new control message performs the same function as socket option UDP_SEGMENT while avoiding the extra system call. [ Export udp_cmsg_send for ipv6. -DaveM ] Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/udp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 741d888d0fdb..05990746810e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -273,6 +273,7 @@ int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); +int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); -- cgit v1.2.3 From 83aa025f535f76733e334e3d2a4d8577c8441a7e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:21 -0400 Subject: udp: add gso support to virtual devices Virtual devices such as tunnels and bonding can handle large packets. Only segment packets when reaching a physical or loopback device. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 5 ++++- include/linux/netdevice.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 35b79f47a13d..fe2f3b30960e 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -55,8 +55,9 @@ enum { NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */ + NETIF_F_GSO_UDP_L4_BIT, /* ... UDP payload GSO (not UFO) */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_UDP_BIT, + NETIF_F_GSO_UDP_L4_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -147,6 +148,7 @@ enum { #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) #define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) #define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) +#define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) @@ -216,6 +218,7 @@ enum { NETIF_F_GSO_GRE_CSUM | \ NETIF_F_GSO_IPXIP4 | \ NETIF_F_GSO_IPXIP6 | \ + NETIF_F_GSO_UDP_L4 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 14e0777ffcfb..366c32891158 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4186,6 +4186,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } -- cgit v1.2.3 From b85fab0e67b162014cd328cb4e2a8e8ae382cb8a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 25 Apr 2018 19:41:06 +0200 Subject: bpf: Add gpl_compatible flag to struct bpf_prog_info Adding gpl_compatible flag to struct bpf_prog_info so it can be dumped via bpf_prog_get_info_by_fd and displayed via bpftool progs dump. Alexei noticed 4-byte hole in struct bpf_prog_info, so we put the u32 flags field in there, and we can keep adding bit fields in there without breaking user space. Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e6679393b687..da8801860c7d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1060,6 +1060,7 @@ struct bpf_prog_info { __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; __u32 ifindex; + __u32 gpl_compatible:1; __u64 netns_dev; __u64 netns_ino; } __attribute__((aligned(8))); -- cgit v1.2.3 From 8a22543c8e7093104d7c22190746b9492bfbd897 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Apr 2018 19:15:54 +0200 Subject: netfilter: nf_tables: make meta expression builtin size net/netfilter/nft_meta.ko text data bss dec hex filename 5826 936 1 6763 1a6b net/netfilter/nft_meta.ko 96407 2064 400 98871 18237 net/netfilter/nf_tables.ko after: 100826 2240 401 103467 1942b net/netfilter/nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index ea5aab568be8..3339cce8f585 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -10,6 +10,7 @@ extern struct nft_expr_type nft_byteorder_type; extern struct nft_expr_type nft_payload_type; extern struct nft_expr_type nft_dynset_type; extern struct nft_expr_type nft_range_type; +extern struct nft_expr_type nft_meta_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); -- cgit v1.2.3 From ae1bc6a9f398d5e0310387eb077a0d9ce1fb21f5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Apr 2018 19:15:55 +0200 Subject: netfilter: nf_tables: merge rt expression into nft core before: text data bss dec hex filename 2657 844 0 3501 dad net/netfilter/nft_rt.ko 100826 2240 401 103467 1942b net/netfilter/nf_tables.ko after: 2657 844 0 3501 dad net/netfilter/nft_rt.ko 102456 2316 401 105173 19ad5 net/netfilter/nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 3339cce8f585..d6a358ae3749 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -11,6 +11,7 @@ extern struct nft_expr_type nft_payload_type; extern struct nft_expr_type nft_dynset_type; extern struct nft_expr_type nft_range_type; extern struct nft_expr_type nft_meta_type; +extern struct nft_expr_type nft_rt_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); -- cgit v1.2.3 From d0103158cf7c9190860dabd12b85ccad3c6e3455 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Apr 2018 19:15:56 +0200 Subject: netfilter: nf_tables: merge exthdr expression into nft core before: text data bss dec hex filename 5056 844 0 5900 170c net/netfilter/nft_exthdr.ko 102456 2316 401 105173 19ad5 net/netfilter/nf_tables.ko after: 106410 2392 401 109203 1aa93 net/netfilter/nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index d6a358ae3749..cd6915b6c054 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -12,6 +12,7 @@ extern struct nft_expr_type nft_dynset_type; extern struct nft_expr_type nft_range_type; extern struct nft_expr_type nft_meta_type; extern struct nft_expr_type nft_rt_type; +extern struct nft_expr_type nft_exthdr_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); -- cgit v1.2.3 From 56a092c895054a6b423781d788339775bd2bda10 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:52 +0100 Subject: bpf: add script and prepare bpf.h for new helpers documentation Remove previous "overview" of eBPF helpers from user bpf.h header. Replace it by a comment explaining how to process the new documentation (to come in following patches) with a Python script to produce RST, then man page documentation. Also add the aforementioned Python script under scripts/. It is used to process include/uapi/linux/bpf.h and to extract helper descriptions, to turn it into a RST document that can further be processed with rst2man to produce a man page. The script takes one "--filename " option. If the script is launched from scripts/ in the kernel root directory, it should be able to find the location of the header to parse, and "--filename " is then optional. If it cannot find the file, then the option becomes mandatory. RST-formatted documentation is printed to standard output. Typical workflow for producing the final man page would be: $ ./scripts/bpf_helpers_doc.py \ --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 $ man /tmp/bpf-helpers.7 Note that the tool kernel-doc cannot be used to document eBPF helpers, whose signatures are not available directly in the header files (pre-processor directives are used to produce them at the beginning of the compilation process). v4: - Also remove overviews for newly added bpf_xdp_adjust_tail() and bpf_skb_get_xfrm_state(). - Remove vague statement about what helpers are restricted to GPL programs in "LICENSE" section for man page footer. - Replace license boilerplate with SPDX tag for Python script. v3: - Change license for man page. - Remove "for safety reasons" from man page header text. - Change "packets metadata" to "packets" in man page header text. - Move and fix comment on helpers introducing no overhead. - Remove "NOTES" section from man page footer. - Add "LICENSE" section to man page footer. - Edit description of file include/uapi/linux/bpf.h in man page footer. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 422 ++--------------------------------------------- 1 file changed, 16 insertions(+), 406 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da8801860c7d..dd43fc98c982 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -377,412 +377,22 @@ union bpf_attr { }; } __attribute__((aligned(8))); -/* BPF helper function descriptions: - * - * void *bpf_map_lookup_elem(&map, &key) - * Return: Map value or NULL - * - * int bpf_map_update_elem(&map, &key, &value, flags) - * Return: 0 on success or negative error - * - * int bpf_map_delete_elem(&map, &key) - * Return: 0 on success or negative error - * - * int bpf_probe_read(void *dst, int size, void *src) - * Return: 0 on success or negative error - * - * u64 bpf_ktime_get_ns(void) - * Return: current ktime - * - * int bpf_trace_printk(const char *fmt, int fmt_size, ...) - * Return: length of buffer written or negative error - * - * u32 bpf_prandom_u32(void) - * Return: random value - * - * u32 bpf_raw_smp_processor_id(void) - * Return: SMP processor ID - * - * int bpf_skb_store_bytes(skb, offset, from, len, flags) - * store bytes into packet - * @skb: pointer to skb - * @offset: offset within packet from skb->mac_header - * @from: pointer where to copy bytes from - * @len: number of bytes to store into packet - * @flags: bit 0 - if true, recompute skb->csum - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_l3_csum_replace(skb, offset, from, to, flags) - * recompute IP checksum - * @skb: pointer to skb - * @offset: offset within packet where IP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_l4_csum_replace(skb, offset, from, to, flags) - * recompute TCP/UDP checksum - * @skb: pointer to skb - * @offset: offset within packet where TCP/UDP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * bit 4 - is pseudo header - * other bits - reserved - * Return: 0 on success or negative error - * - * int bpf_tail_call(ctx, prog_array_map, index) - * jump into another BPF program - * @ctx: context pointer passed to next program - * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: 32-bit index inside array that selects specific program to run - * Return: 0 on success or negative error - * - * int bpf_clone_redirect(skb, ifindex, flags) - * redirect to another netdev - * @skb: pointer to skb - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: 0 on success or negative error - * - * u64 bpf_get_current_pid_tgid(void) - * Return: current->tgid << 32 | current->pid - * - * u64 bpf_get_current_uid_gid(void) - * Return: current_gid << 32 | current_uid - * - * int bpf_get_current_comm(char *buf, int size_of_buf) - * stores current->comm into buf - * Return: 0 on success or negative error - * - * u32 bpf_get_cgroup_classid(skb) - * retrieve a proc's classid - * @skb: pointer to skb - * Return: classid if != 0 - * - * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) - * Return: 0 on success or negative error - * - * int bpf_skb_vlan_pop(skb) - * Return: 0 on success or negative error - * - * int bpf_skb_get_tunnel_key(skb, key, size, flags) - * int bpf_skb_set_tunnel_key(skb, key, size, flags) - * retrieve or populate tunnel metadata - * @skb: pointer to skb - * @key: pointer to 'struct bpf_tunnel_key' - * @size: size of 'struct bpf_tunnel_key' - * @flags: room for future extensions - * Return: 0 on success or negative error - * - * u64 bpf_perf_event_read(map, flags) - * read perf event counter value - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * Return: value of perf event counter read or error code - * - * int bpf_redirect(ifindex, flags) - * redirect to another netdev - * @ifindex: ifindex of the net device - * @flags: - * cls_bpf: - * bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * xdp_bpf: - * all bits - reserved - * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error - * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error - * int bpf_redirect_map(map, key, flags) - * redirect to endpoint in map - * @map: pointer to dev map - * @key: index in map to lookup - * @flags: -- - * Return: XDP_REDIRECT on success or XDP_ABORT on error - * - * u32 bpf_get_route_realm(skb) - * retrieve a dst's tclassid - * @skb: pointer to skb - * Return: realm if != 0 - * - * int bpf_perf_event_output(ctx, map, flags, data, size) - * output perf raw sample - * @ctx: struct pt_regs* - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @data: data on stack to be output as raw data - * @size: size of data - * Return: 0 on success or negative error - * - * int bpf_get_stackid(ctx, map, flags) - * walk user or kernel stack and return id - * @ctx: struct pt_regs* - * @map: pointer to stack_trace map - * @flags: bits 0-7 - numer of stack frames to skip - * bit 8 - collect user stack instead of kernel - * bit 9 - compare stacks by hash only - * bit 10 - if two different stacks hash into the same stackid - * discard old - * other bits - reserved - * Return: >= 0 stackid on success or negative error - * - * s64 bpf_csum_diff(from, from_size, to, to_size, seed) - * calculate csum diff - * @from: raw from buffer - * @from_size: length of from buffer - * @to: raw to buffer - * @to_size: length of to buffer - * @seed: optional seed - * Return: csum result or negative error code - * - * int bpf_skb_get_tunnel_opt(skb, opt, size) - * retrieve tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: option size - * - * int bpf_skb_set_tunnel_opt(skb, opt, size) - * populate tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: 0 on success or negative error - * - * int bpf_skb_change_proto(skb, proto, flags) - * Change protocol of the skb. Currently supported is v4 -> v6, - * v6 -> v4 transitions. The helper will also resize the skb. eBPF - * program is expected to fill the new headers via skb_store_bytes - * and lX_csum_replace. - * @skb: pointer to skb - * @proto: new skb->protocol type - * @flags: reserved - * Return: 0 on success or negative error - * - * int bpf_skb_change_type(skb, type) - * Change packet type of skb. - * @skb: pointer to skb - * @type: new skb->pkt_type type - * Return: 0 on success or negative error - * - * int bpf_skb_under_cgroup(skb, map, index) - * Check cgroup2 membership of skb - * @skb: pointer to skb - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 skb failed the cgroup2 descendant test - * == 1 skb succeeded the cgroup2 descendant test - * < 0 error - * - * u32 bpf_get_hash_recalc(skb) - * Retrieve and possibly recalculate skb->hash. - * @skb: pointer to skb - * Return: hash - * - * u64 bpf_get_current_task(void) - * Returns current task_struct - * Return: current - * - * int bpf_probe_write_user(void *dst, void *src, int len) - * safely attempt to write to a location - * @dst: destination address in userspace - * @src: source address on stack - * @len: number of bytes to copy - * Return: 0 on success or negative error - * - * int bpf_current_task_under_cgroup(map, index) - * Check cgroup2 membership of current task - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 current failed the cgroup2 descendant test - * == 1 current succeeded the cgroup2 descendant test - * < 0 error - * - * int bpf_skb_change_tail(skb, len, flags) - * The helper will resize the skb to the given new size, to be used f.e. - * with control messages. - * @skb: pointer to skb - * @len: new skb length - * @flags: reserved - * Return: 0 on success or negative error - * - * int bpf_skb_pull_data(skb, len) - * The helper will pull in non-linear data in case the skb is non-linear - * and not all of len are part of the linear section. Only needed for - * read/write with direct packet access. - * @skb: pointer to skb - * @len: len to make read/writeable - * Return: 0 on success or negative error - * - * s64 bpf_csum_update(skb, csum) - * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. - * @skb: pointer to skb - * @csum: csum to add - * Return: csum on success or negative error - * - * void bpf_set_hash_invalid(skb) - * Invalidate current skb->hash. - * @skb: pointer to skb - * - * int bpf_get_numa_node_id() - * Return: Id of current NUMA node. - * - * int bpf_skb_change_head() - * Grows headroom of skb and adjusts MAC header offset accordingly. - * Will extends/reallocae as required automatically. - * May change skb data pointer and will thus invalidate any check - * performed for direct packet access. - * @skb: pointer to skb - * @len: length of header to be pushed in front - * @flags: Flags (unused for now) - * Return: 0 on success or negative error - * - * int bpf_xdp_adjust_head(xdp_md, delta) - * Adjust the xdp_md.data by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data - * Return: 0 on success or negative on error - * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) - * Copy a NUL terminated string from unsafe address. In case the string - * length is smaller than size, the target is not padded with further NUL - * bytes. In case the string length is larger than size, just count-1 - * bytes are copied and the last byte is set to NUL. - * @dst: destination address - * @size: maximum number of bytes to copy, including the trailing NUL - * @unsafe_ptr: unsafe address - * Return: - * > 0 length of the string including the trailing NUL on success - * < 0 error - * - * u64 bpf_get_socket_cookie(skb) - * Get the cookie for the socket stored inside sk_buff. - * @skb: pointer to skb - * Return: 8 Bytes non-decreasing number on success or 0 if the socket - * field is missing inside sk_buff - * - * u32 bpf_get_socket_uid(skb) - * Get the owner uid of the socket stored inside sk_buff. - * @skb: pointer to skb - * Return: uid of the socket owner on success or overflowuid if failed. - * - * u32 bpf_set_hash(skb, hash) - * Set full skb->hash. - * @skb: pointer to skb - * @hash: hash to set - * - * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) - * Calls setsockopt. Not all opts are available, only those with - * integer optvals plus TCP_CONGESTION. - * Supported levels: SOL_SOCKET and IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: SOL_SOCKET or IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error - * - * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen) - * Calls getsockopt. Not all opts are available. - * Supported levels: IPPROTO_TCP - * @bpf_socket: pointer to bpf_socket - * @level: IPPROTO_TCP - * @optname: option name - * @optval: pointer to option value - * @optlen: length of optval in bytes - * Return: 0 or negative error - * - * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags) - * Set callback flags for sock_ops - * @bpf_sock_ops: pointer to bpf_sock_ops_kern struct - * @flags: flags value - * Return: 0 for no error - * -EINVAL if there is no full tcp socket - * bits in flags that are not supported by current kernel - * - * int bpf_skb_adjust_room(skb, len_diff, mode, flags) - * Grow or shrink room in sk_buff. - * @skb: pointer to skb - * @len_diff: (signed) amount of room to grow/shrink - * @mode: operation mode (enum bpf_adj_room_mode) - * @flags: reserved for future use - * Return: 0 on success or negative error code - * - * int bpf_sk_redirect_map(map, key, flags) - * Redirect skb to a sock in map using key as a lookup key for the - * sock in map. - * @map: pointer to sockmap - * @key: key to lookup sock in map - * @flags: reserved for future use - * Return: SK_PASS - * - * int bpf_sock_map_update(skops, map, key, flags) - * @skops: pointer to bpf_sock_ops - * @map: pointer to sockmap to update - * @key: key to insert/update sock in map - * @flags: same flags as map update elem - * - * int bpf_xdp_adjust_meta(xdp_md, delta) - * Adjust the xdp_md.data_meta by delta - * @xdp_md: pointer to xdp_md - * @delta: An positive/negative integer to be added to xdp_md.data_meta - * Return: 0 on success or negative on error - * - * int bpf_perf_event_read_value(map, flags, buf, buf_size) - * read perf event counter value and perf event enabled/running time - * @map: pointer to perf_event_array map - * @flags: index of event in the map or bitmask flags - * @buf: buf to fill - * @buf_size: size of the buf - * Return: 0 on success or negative error code - * - * int bpf_perf_prog_read_value(ctx, buf, buf_size) - * read perf prog attached perf event counter and enabled/running time - * @ctx: pointer to ctx - * @buf: buf to fill - * @buf_size: size of the buf - * Return : 0 on success or negative error code - * - * int bpf_override_return(pt_regs, rc) - * @pt_regs: pointer to struct pt_regs - * @rc: the return value to set - * - * int bpf_msg_redirect_map(map, key, flags) - * Redirect msg to a sock in map using key as a lookup key for the - * sock in map. - * @map: pointer to sockmap - * @key: key to lookup sock in map - * @flags: reserved for future use - * Return: SK_PASS - * - * int bpf_bind(ctx, addr, addr_len) - * Bind socket to address. Only binding to IP is supported, no port can be - * set in addr. - * @ctx: pointer to context of type bpf_sock_addr - * @addr: pointer to struct sockaddr to bind socket to - * @addr_len: length of sockaddr structure - * Return: 0 on success or negative error code - * - * int bpf_xdp_adjust_tail(xdp_md, delta) - * Adjust the xdp_md.data_end by delta. Only shrinking of packet's - * size is supported. - * @xdp_md: pointer to xdp_md - * @delta: A negative integer to be added to xdp_md.data_end - * Return: 0 on success or negative on error - * - * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags) - * retrieve XFRM state - * @skb: pointer to skb - * @index: index of the xfrm state in the secpath - * @key: pointer to 'struct bpf_xfrm_state' - * @size: size of 'struct bpf_xfrm_state' - * @flags: room for future extensions - * Return: 0 on success or negative error +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_helpers_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From ad4a522349e5c4fa6af63df376256749dc489fb8 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:53 +0100 Subject: bpf: add documentation for eBPF helpers (01-11) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions, all written by Alexei: - bpf_map_lookup_elem() - bpf_map_update_elem() - bpf_map_delete_elem() - bpf_probe_read() - bpf_ktime_get_ns() - bpf_trace_printk() - bpf_skb_store_bytes() - bpf_l3_csum_replace() - bpf_l4_csum_replace() - bpf_tail_call() - bpf_clone_redirect() v4: - bpf_map_lookup_elem(): Add "const" qualifier for key. - bpf_map_update_elem(): Add "const" qualifier for key and value. - bpf_map_lookup_elem(): Add "const" qualifier for key. - bpf_skb_store_bytes(): Clarify comment about invalidated verifier checks. - bpf_l3_csum_replace(): Mention L3 instead of just IP, and add a note about bpf_csum_diff(). - bpf_l4_csum_replace(): Mention L4 instead of just TCP/UDP, and add a note about bpf_csum_diff(). - bpf_tail_call(): Bring minor edits to description. - bpf_clone_redirect(): Add a note about the relation with bpf_redirect(). Also clarify comment about invalidated verifier checks. v3: - bpf_map_lookup_elem(): Fix description of restrictions for flags related to the existence of the entry. - bpf_trace_printk(): State that trace_pipe can be configured. Fix return value in case an unknown format specifier is met. Add a note on kernel log notice when the helper is used. Edit example. - bpf_tail_call(): Improve comment on stack inheritance. - bpf_clone_redirect(): Improve description of BPF_F_INGRESS flag. Cc: Alexei Starovoitov Signed-off-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 230 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd43fc98c982..96925e949a24 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -393,6 +393,236 @@ union bpf_attr { * intentional, removing them would break paragraphs for rst2man. * * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read(void *dst, u32 size, const void *src) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * address *src* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_ktime_get_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Return + * Current *ktime*. + * + * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. + * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * bloc (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 32. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From c456dec4d2656b70de7371e3a6ae60b7ec7ab74f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:54 +0100 Subject: bpf: add documentation for eBPF helpers (12-22) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions, all written by Alexei: - bpf_get_current_pid_tgid() - bpf_get_current_uid_gid() - bpf_get_current_comm() - bpf_skb_vlan_push() - bpf_skb_vlan_pop() - bpf_skb_get_tunnel_key() - bpf_skb_set_tunnel_key() - bpf_redirect() - bpf_perf_event_output() - bpf_get_stackid() - bpf_get_current_task() v4: - bpf_redirect(): Fix typo: "XDP_ABORT" changed to "XDP_ABORTED". Add note on bpf_redirect_map() providing better performance. Replace "Save for" with "Except for". - bpf_skb_vlan_push(): Clarify comment about invalidated verifier checks. - bpf_skb_vlan_pop(): Clarify comment about invalidated verifier checks. - bpf_skb_get_tunnel_key(): Add notes on tunnel_id, "collect metadata" mode, and example tunneling protocols with which it can be used. - bpf_skb_set_tunnel_key(): Add a reference to the description of bpf_skb_get_tunnel_key(). - bpf_perf_event_output(): Specify that, and for what purpose, the helper can be used with programs attached to TC and XDP. v3: - bpf_skb_get_tunnel_key(): Change and improve description and example. - bpf_redirect(): Improve description of BPF_F_INGRESS flag. - bpf_perf_event_output(): Fix first sentence of description. Delete wrong statement on context being evaluated as a struct pt_reg. Remove the long yet incomplete example. - bpf_get_stackid(): Add a note about PERF_MAX_STACK_DEPTH being configurable. Cc: Alexei Starovoitov Signed-off-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 254 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 96925e949a24..465ea273729a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -623,6 +623,260 @@ union bpf_attr { * direct packet access. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_current_pid_tgid(void) + * Return + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + * + * u64 bpf_get_current_uid_gid(void) + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * + * int bpf_get_current_comm(char *buf, u32 size_of_buf) + * Description + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * Description + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_vlan_pop(struct sk_buff *skb) + * Description + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_redirect(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can be attained with the more generic + * **bpf_redirect_map**\ (), which requires specific maps to be + * used but offers better performance. + * Return + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + * + * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * The positive or null stack id on success, or a negative error + * in case of failure. + * + * u64 bpf_get_current_task(void) + * Return + * A pointer to the current task struct. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From 1fdd08bedc56d2a927c5f230e2d63f17ca1068f1 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:55 +0100 Subject: bpf: add documentation for eBPF helpers (23-32) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions, all written by Daniel: - bpf_get_prandom_u32() - bpf_get_smp_processor_id() - bpf_get_cgroup_classid() - bpf_get_route_realm() - bpf_skb_load_bytes() - bpf_csum_diff() - bpf_skb_get_tunnel_opt() - bpf_skb_set_tunnel_opt() - bpf_skb_change_proto() - bpf_skb_change_type() v4: - bpf_get_prandom_u32(): Warn that the prng is not cryptographically secure. - bpf_get_smp_processor_id(): Fix a typo (case). - bpf_get_cgroup_classid(): Clarify description. Add notes on the helper being limited to cgroup v1, and to egress path. - bpf_get_route_realm(): Add comparison with bpf_get_cgroup_classid(). Add a note about usage with TC and advantage of clsact. Fix a typo in return value ("sdb" instead of "skb"). - bpf_skb_load_bytes(): Make explicit loading large data loads it to the eBPF stack. - bpf_csum_diff(): Add a note on seed that can be cascaded. Link to bpf_l3|l4_csum_replace(). - bpf_skb_get_tunnel_opt(): Add a note about usage with "collect metadata" mode, and example of this with Geneve. - bpf_skb_set_tunnel_opt(): Add a link to bpf_skb_get_tunnel_opt() description. - bpf_skb_change_proto(): Mention that the main use case is NAT64. Clarify comment about invalidated verifier checks. v3: - bpf_get_prandom_u32(): Fix helper name :(. Add description, including a note on the internal random state. - bpf_get_smp_processor_id(): Add description, including a note on the processor id remaining stable during program run. - bpf_get_cgroup_classid(): State that CONFIG_CGROUP_NET_CLASSID is required to use the helper. Add a reference to related documentation. State that placing a task in net_cls controller disables cgroup-bpf. - bpf_get_route_realm(): State that CONFIG_CGROUP_NET_CLASSID is required to use this helper. - bpf_skb_load_bytes(): Fix comment on current use cases for the helper. Cc: Daniel Borkmann Signed-off-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 197 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 465ea273729a..7352abf72f50 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -495,6 +495,27 @@ union bpf_attr { * The number of bytes written to the buffer, or a negative error * in case of failure. * + * u32 bpf_get_prandom_u32(void) + * Description + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * Return + * A random 32-bit unsigned value. + * + * u32 bpf_get_smp_processor_id(void) + * Description + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with preemption disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program. + * * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet @@ -647,6 +668,32 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u32 bpf_get_cgroup_classid(struct sk_buff *skb) + * Description + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/cgroup-v1/net_cls.txt*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * Return + * The classid, or 0 for the default unconfigured classid. + * * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol @@ -786,6 +833,30 @@ union bpf_attr { * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on * error. * + * u32 bpf_get_route_realm(struct sk_buff *skb) + * Description + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * indentifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by @@ -831,6 +902,23 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve @@ -874,6 +962,115 @@ union bpf_attr { * The positive or null stack id on success, or a negative error * in case of failure. * + * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) + * Description + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * Return + * The checksum result, or a negative error code in case of + * failure. + * + * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * Description + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * Return + * The size of the option data retrieved. + * + * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * Return + * 0 on success, or a negative error in case of failure. + * * u64 bpf_get_current_task(void) * Return * A pointer to the current task struct. -- cgit v1.2.3 From fa15601ab31e5a90bb2a7dbdb43083470a8e787b Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:56 +0100 Subject: bpf: add documentation for eBPF helpers (33-41) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions, all written by Daniel: - bpf_get_hash_recalc() - bpf_skb_change_tail() - bpf_skb_pull_data() - bpf_csum_update() - bpf_set_hash_invalid() - bpf_get_numa_node_id() - bpf_set_hash() - bpf_skb_adjust_room() - bpf_xdp_adjust_meta() v4: - bpf_skb_change_tail(): Clarify comment about invalidated verifier checks. - bpf_skb_pull_data(): Clarify the motivation for using this helper or bpf_skb_load_bytes(), on non-linear buffers. Fix RST formatting for *skb*. Clarify comment about invalidated verifier checks. - bpf_csum_update(): Fix description of checksum (entire packet, not IP checksum). Fix a typo: "header" instead of "helper". - bpf_set_hash_invalid(): Mention bpf_get_hash_recalc(). - bpf_get_numa_node_id(): State that the helper is not restricted to programs attached to sockets. - bpf_skb_adjust_room(): Clarify comment about invalidated verifier checks. - bpf_xdp_adjust_meta(): Clarify comment about invalidated verifier checks. Cc: Daniel Borkmann Signed-off-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 164 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7352abf72f50..b9c28f63f511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1071,9 +1071,173 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u32 bpf_get_hash_recalc(struct sk_buff *skb) + * Description + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * Return + * The 32-bit hash. + * * u64 bpf_get_current_task(void) * Return * A pointer to the current task struct. + * + * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * Description + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then the whole length of the *skb* is pulled. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. + * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * + * int bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * Return + * The id of current NUMA node. + * + * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * Description + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. + * Return + * 0 + * + * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * Description + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * There is a single supported mode at this time: + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed below the layer 3 header). + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From c6b5fb8690faccf770d2f344094db1a8e482148f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:57 +0100 Subject: bpf: add documentation for eBPF helpers (42-50) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions: Helper from Kaixu: - bpf_perf_event_read() Helpers from Martin: - bpf_skb_under_cgroup() - bpf_xdp_adjust_head() Helpers from Sargun: - bpf_probe_write_user() - bpf_current_task_under_cgroup() Helper from Thomas: - bpf_skb_change_head() Helper from Gianluca: - bpf_probe_read_str() Helpers from Chenbo: - bpf_get_socket_cookie() - bpf_get_socket_uid() v4: - bpf_perf_event_read(): State that bpf_perf_event_read_value() should be preferred over this helper. - bpf_skb_change_head(): Clarify comment about invalidated verifier checks. - bpf_xdp_adjust_head(): Clarify comment about invalidated verifier checks. - bpf_probe_write_user(): Add that dst must be a valid user space address. - bpf_get_socket_cookie(): Improve description by making clearer that the cockie belongs to the socket, and state that it remains stable for the life of the socket. v3: - bpf_perf_event_read(): Fix time of selection for perf event type in description. Remove occurences of "cores" to avoid confusion with "CPU". Cc: Martin KaFai Lau Cc: Sargun Dhillon Cc: Thomas Graf Cc: Gianluca Borello Cc: Chenbo Feng Signed-off-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau [for bpf_skb_under_cgroup(), bpf_xdp_adjust_head()] Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9c28f63f511..0d4b55e85e07 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -810,6 +810,35 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) + * Description + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read*\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with bpf_perf_event_read_value(), which at the same time + * provides more features over the **bpf_perf_event_read**\ () + * interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * Return + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + * * int bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. @@ -1071,6 +1100,17 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * Description + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + * * u32 bpf_get_hash_recalc(struct sk_buff *skb) * Description * Retrieve the hash of the packet, *skb*\ **->hash**. If it is @@ -1091,6 +1131,37 @@ union bpf_attr { * Return * A pointer to the current task struct. * + * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * Description + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * Description + * Check whether the probe is being run is the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* task belongs to the cgroup2. + * * 1, if the *skb* task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + * * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the @@ -1182,6 +1253,107 @@ union bpf_attr { * Return * The id of current NUMA node. * + * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read()** helper here instead + * to read the string would require to estimate the length at + * compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * u64 bpf_get_socket_cookie(struct sk_buff *skb) + * Description + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a unique socket + * identifier per namespace. + * Return + * A 8-byte long non-decreasing number on success, or 0 if the + * socket field is missing inside *skb*. + * + * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Return + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + * * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) -- cgit v1.2.3 From 7aa79a869dd6d3aa345ea8ffd54085c980f5b1c3 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:58 +0100 Subject: bpf: add documentation for eBPF helpers (51-57) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions: Helpers from Lawrence: - bpf_setsockopt() - bpf_getsockopt() - bpf_sock_ops_cb_flags_set() Helpers from Yonghong: - bpf_perf_event_read_value() - bpf_perf_prog_read_value() Helper from Josef: - bpf_override_return() Helper from Andrey: - bpf_bind() v4: - bpf_perf_event_read_value(): State that this helper should be preferred over bpf_perf_event_read(). v3: - bpf_perf_event_read_value(): Fix time of selection for perf event type in description. Remove occurences of "cores" to avoid confusion with "CPU". - bpf_bind(): Remove last paragraph of description, which was off topic. Cc: Lawrence Brakmo Cc: Yonghong Song Cc: Josef Bacik Cc: Andrey Ignatov Signed-off-by: Quentin Monnet Acked-by: Yonghong Song [for bpf_perf_event_read_value(), bpf_perf_prog_read_value()] Acked-by: Andrey Ignatov [for bpf_bind()] Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 180 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0d4b55e85e07..4aaee0169421 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1361,6 +1361,28 @@ union bpf_attr { * Return * 0 * + * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * Description + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to @@ -1410,6 +1432,164 @@ union bpf_attr { * direct packet access. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For en eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the following *level*\ s: + * + * * **IPPROTO_TCP**, which supports *optname* + * **TCP_CONGESTION**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_override_return(struct pt_reg *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * The supported callback values that *argval* can combine are: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). Looking for a free port to bind to can be + * expensive, therefore binding to port is not permitted by the + * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) + * must be set to zero. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From ab12704099c2001915843a8b7885567ee6f1800e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:16:59 +0100 Subject: bpf: add documentation for eBPF helpers (58-64) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions, all written by John: - bpf_redirect_map() - bpf_sk_redirect_map() - bpf_sock_map_update() - bpf_msg_redirect_map() - bpf_msg_apply_bytes() - bpf_msg_cork_bytes() - bpf_msg_pull_data() v4: - bpf_redirect_map(): Fix typos: "XDP_ABORT" changed to "XDP_ABORTED", "his" to "this". Also add a paragraph on performance improvement over bpf_redirect() helper. v3: - bpf_sk_redirect_map(): Improve description of BPF_F_INGRESS flag. - bpf_msg_redirect_map(): Improve description of BPF_F_INGRESS flag. - bpf_redirect_map(): Fix note on CPU redirection, not fully implemented for generic XDP but supported on native XDP. - bpf_msg_pull_data(): Clarify comment about invalidated verifier checks. Cc: Jesper Dangaard Brouer Cc: John Fastabend Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 147 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4aaee0169421..27940a36501a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1404,6 +1404,56 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * When used to redirect packets to net devices, this helper + * provides a high performance increase over **bpf_redirect**\ (). + * This is due to various implementation details of the underlying + * mechanisms, one of which is the fact that **bpf_redirect_map**\ + * () tries to send packet as a "bulk" to the device. + * Return + * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * + * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by @@ -1574,6 +1624,103 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * + * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * Return + * 0 + * + * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by -- cgit v1.2.3 From 2d020dd771762d96d158a9deed41bc6b7480a8fe Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 25 Apr 2018 18:17:00 +0100 Subject: bpf: add documentation for eBPF helpers (65-66) Add documentation for eBPF helper functions to bpf.h user header file. This documentation can be parsed with the Python script provided in another commit of the patch series, in order to provide a RST document that can later be converted into a man page. The objective is to make the documentation easily understandable and accessible to all eBPF developers, including beginners. This patch contains descriptions for the following helper functions: Helper from Nikita: - bpf_xdp_adjust_tail() Helper from Eyal: - bpf_skb_get_xfrm_state() v4: - New patch (helpers did not exist yet for previous versions). Cc: Nikita V. Shirokov Cc: Eyal Birger Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 27940a36501a..da77a9388947 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1737,6 +1737,36 @@ union bpf_attr { * must be set to zero. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * only possible to shrink the packet as of this writing, + * therefore *delta* must be a negative integer. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From 3e5cf362c34b14c8d01d19d4b821fb35e1779862 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 25 Apr 2018 19:29:36 +0200 Subject: tipc: introduce ioctl for fetching node identity After the introduction of a 128-bit node identity it may be difficult for a user to correlate between this identity and the generated node hash address. We now try to make this easier by introducing a new ioctl() call for fetching a node identity by using the hash value as key. This will be particularly useful when we extend some of the commands in the 'tipc' tool, but we also expect regular user applications to need this feature. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index bf6d28677cfe..6b2fd4d9655f 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -209,16 +209,16 @@ struct tipc_group_req { * The string formatting for each name element is: * media: media * interface: media:interface name - * link: Z.C.N:interface-Z.C.N:interface - * + * link: node:interface-node:interface */ - +#define TIPC_NODEID_LEN 16 #define TIPC_MAX_MEDIA_NAME 16 #define TIPC_MAX_IF_NAME 16 #define TIPC_MAX_BEARER_NAME 32 #define TIPC_MAX_LINK_NAME 68 -#define SIOCGETLINKNAME SIOCPROTOPRIVATE +#define SIOCGETLINKNAME SIOCPROTOPRIVATE +#define SIOCGETNODEID (SIOCPROTOPRIVATE + 1) struct tipc_sioc_ln_req { __u32 peer; @@ -226,6 +226,10 @@ struct tipc_sioc_ln_req { char linkname[TIPC_MAX_LINK_NAME]; }; +struct tipc_sioc_nodeid_req { + __u32 peer; + char node_id[TIPC_NODEID_LEN]; +}; /* The macros and functions below are deprecated: */ -- cgit v1.2.3 From c59530d0d5dccc96795af12c139f618182cf98db Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 25 Apr 2018 12:12:47 -0700 Subject: net: Move PHY statistics code into PHY library helpers In order to make it possible for network device drivers that do not necessarily have a phy_device attached, but still report PHY statistics, have a preliminary refactoring consisting in creating helper functions that encapsulate the PHY device driver knowledge within PHYLIB. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index f0b5870a6d40..6ca81395c545 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1066,6 +1066,26 @@ int phy_ethtool_nway_reset(struct net_device *ndev); #if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); void mdio_bus_exit(void); +int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); +int phy_ethtool_get_sset_count(struct phy_device *phydev); +int phy_ethtool_get_stats(struct phy_device *phydev, + struct ethtool_stats *stats, u64 *data); +#else +int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data) +{ + return -EOPNOTSUPP; +} + +int phy_ethtool_get_sset_count(struct phy_device *phydev) +{ + return -EOPNOTSUPP; +} + +int phy_ethtool_get_stats(struct phy_device *phydev, + struct ethtool_stats *stats, u64 *data) +{ + return -EOPNOTSUPP; +} #endif extern struct bus_type mdio_bus_type; -- cgit v1.2.3 From 9994338227179eaf8db6f4493504e108b1fae5fc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 25 Apr 2018 12:12:48 -0700 Subject: net: Allow network devices to have PHY statistics Add a new callback: get_ethtool_phy_stats() which allows network device drivers not making use of the PHY library to return PHY statistics. Update ethtool_get_phy_stats(), __ethtool_get_sset_count() and __ethtool_get_strings() accordingly to interogate the network device about ETH_SS_PHY_STATS. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/ethtool.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index b32cd2062f18..f8a2245b70ac 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -312,6 +312,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * by kernel. Returns a negative error code or zero. * @get_fecparam: Get the network device Forward Error Correction parameters. * @set_fecparam: Set the network device Forward Error Correction parameters. + * @get_ethtool_phy_stats: Return extended statistics about the PHY device. + * This is only useful if the device maintains PHY statistics and + * cannot use the standard PHY library helpers. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -407,5 +410,7 @@ struct ethtool_ops { struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, struct ethtool_fecparam *); + void (*get_ethtool_phy_stats)(struct net_device *, + struct ethtool_stats *, u64 *); }; #endif /* _LINUX_ETHTOOL_H */ -- cgit v1.2.3 From 89f09048348936a9a8c5131c8538cc6ed26fd44c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 25 Apr 2018 12:12:50 -0700 Subject: net: dsa: Pass stringset to ethtool operations Up until now we largely assumed that we were interested in ETH_SS_STATS type of strings for all ethtool operations, this is about to change with the introduction of additional string sets, e.g: ETH_SS_PHY_STATS. Update all functions to take an appropriate stringset argument and act on it when it is different than ETH_SS_STATS for now. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 60fb4ec8ba61..0bc0aad1b02e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -356,10 +356,11 @@ struct dsa_switch_ops { /* * ethtool hardware statistics. */ - void (*get_strings)(struct dsa_switch *ds, int port, uint8_t *data); + void (*get_strings)(struct dsa_switch *ds, int port, + u32 stringset, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); - int (*get_sset_count)(struct dsa_switch *ds, int port); + int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); /* * ethtool Wake-on-LAN -- cgit v1.2.3 From cf963573039a333eff7156629e61a06e59da61cf Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 25 Apr 2018 12:12:52 -0700 Subject: net: dsa: Allow providing PHY statistics from CPU port Implement the same type of ethtool diversion that we have for ETH_SS_STATS and make it work with ETH_SS_PHY_STATS. This allows providing PHY level statistics for CPU ports that are directly connecting to a PHY device. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0bc0aad1b02e..462e9741b210 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -361,6 +361,8 @@ struct dsa_switch_ops { void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds, int port, int sset); + void (*get_ethtool_phy_stats)(struct dsa_switch *ds, + int port, uint64_t *data); /* * ethtool Wake-on-LAN @@ -589,4 +591,9 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, #define BRCM_TAG_GET_PORT(v) ((v) >> 8) #define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) + +int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); +int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); +int dsa_port_get_phy_sset_count(struct dsa_port *dp); + #endif -- cgit v1.2.3 From c45698f89626177f8c51409142cbe9c5bbed4af7 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 26 Apr 2018 16:58:50 -0300 Subject: sctp: remove old and unused SCTP_MIN_PMTU This value is not used anywhere in the code. In essence it is a duplicate of SCTP_DEFAULT_MINSEGMENT, which is used by the stack. SCTP_MIN_PMTU value makes more sense, but we should not change to it now as it would risk breaking applications. So this patch removes SCTP_MIN_PMTU and adjust the comment above it. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 20ff237c5eb2..86f034b524d4 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -254,11 +254,10 @@ enum { SCTP_ARBITRARY_COOKIE_ECHO_LEN = 200 }; #define SCTP_TSN_MAP_SIZE 4096 /* We will not record more than this many duplicate TSNs between two - * SACKs. The minimum PMTU is 576. Remove all the headers and there - * is enough room for 131 duplicate reports. Round down to the + * SACKs. The minimum PMTU is 512. Remove all the headers and there + * is enough room for 117 duplicate reports. Round down to the * nearest power of 2. */ -enum { SCTP_MIN_PMTU = 576 }; enum { SCTP_MAX_DUP_TSNS = 16 }; enum { SCTP_MAX_GABS = 16 }; -- cgit v1.2.3 From c4b2893dae6427ce1e528033383c94cbf81e80d8 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 26 Apr 2018 16:58:53 -0300 Subject: sctp: introduce sctp_assoc_set_pmtu All changes to asoc PMTU should now go through this wrapper, making it easier to track them and to do other actions upon it. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 05594b248e52..c5e244be8f1e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2097,6 +2097,7 @@ int sctp_assoc_update(struct sctp_association *old, __u32 sctp_association_get_next_tsn(struct sctp_association *); +void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu); void sctp_assoc_sync_pmtu(struct sctp_association *asoc); void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); -- cgit v1.2.3 From feddd6c1af30ab11d73ce0e4e76b40dfc899dbda Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 26 Apr 2018 16:58:54 -0300 Subject: sctp: introduce sctp_mtu_payload When given a MTU, this function calculates how much payload we can carry on it. Without a MTU, it calculates the amount of header overhead we have. So that when we have extra overhead, like the one added for IP options on SELinux patches, it is easier to handle it. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 28b996d63490..0b98e4683f10 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -607,6 +607,25 @@ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport * return t->dst; } +/* Calculate max payload size given a MTU, or the total overhead if + * given MTU is zero + */ +static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, + __u32 mtu, __u32 extra) +{ + __u32 overhead = sizeof(struct sctphdr) + extra; + + if (sp) + overhead += sp->pf->af->net_header_len; + else + overhead += sizeof(struct ipv6hdr); + + if (WARN_ON_ONCE(mtu && mtu <= overhead)) + mtu = overhead; + + return mtu ? mtu - overhead : overhead; +} + static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) { __u32 pmtu = max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)), -- cgit v1.2.3 From 2f5e3c9df6938b823664869ec87af3da8df272f6 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 26 Apr 2018 16:58:55 -0300 Subject: sctp: introduce sctp_assoc_update_frag_point and avoid the open-coded versions of it. Now sctp_datamsg_from_user can just re-use asoc->frag_point as it will always be updated. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 20 -------------------- include/net/sctp/structs.h | 1 + 2 files changed, 1 insertion(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 0b98e4683f10..350c65620a4e 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -428,26 +428,6 @@ static inline int sctp_list_single_entry(struct list_head *head) return (head->next != head) && (head->next == head->prev); } -/* Break down data chunks at this point. */ -static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) -{ - struct sctp_sock *sp = sctp_sk(asoc->base.sk); - struct sctp_af *af = sp->pf->af; - int frag = pmtu; - - frag -= af->ip_options_len(asoc->base.sk); - frag -= af->net_header_len; - frag -= sizeof(struct sctphdr) + sctp_datachk_len(&asoc->stream); - - if (asoc->user_frag) - frag = min_t(int, frag, asoc->user_frag); - - frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN - - sctp_datachk_len(&asoc->stream))); - - return frag; -} - static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc) { sctp_assoc_sync_pmtu(asoc); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index c5e244be8f1e..ebf809eed33a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2097,6 +2097,7 @@ int sctp_assoc_update(struct sctp_association *old, __u32 sctp_association_get_next_tsn(struct sctp_association *); +void sctp_assoc_update_frag_point(struct sctp_association *asoc); void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu); void sctp_assoc_sync_pmtu(struct sctp_association *asoc); void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); -- cgit v1.2.3 From 2521680e1830c21033efe48322829044c6e6b32b Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 26 Apr 2018 16:58:56 -0300 Subject: sctp: remove sctp_assoc_pending_pmtu No need for this helper. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 350c65620a4e..e327acad8e7d 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -428,12 +428,6 @@ static inline int sctp_list_single_entry(struct list_head *head) return (head->next != head) && (head->next == head->prev); } -static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc) -{ - sctp_assoc_sync_pmtu(asoc); - asoc->pmtu_pending = 0; -} - static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk) { return !list_empty(&chunk->list); -- cgit v1.2.3 From 6ff0f871c20ec1769a481edca86f23c76b2b06d3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 26 Apr 2018 16:58:57 -0300 Subject: sctp: introduce sctp_dst_mtu Which makes sure that the MTU respects the minimum value of SCTP_DEFAULT_MINSEGMENT and that it is correctly aligned. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index e327acad8e7d..4965cbfa7d92 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -600,10 +600,15 @@ static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp, return mtu ? mtu - overhead : overhead; } +static inline __u32 sctp_dst_mtu(const struct dst_entry *dst) +{ + return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst), + SCTP_DEFAULT_MINSEGMENT)); +} + static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) { - __u32 pmtu = max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)), - SCTP_DEFAULT_MINSEGMENT); + __u32 pmtu = sctp_dst_mtu(t->dst); if (t->pathmtu == pmtu) return true; -- cgit v1.2.3 From 22d7be267eaa8114dcc28d66c1c347f667d7878a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 26 Apr 2018 16:58:58 -0300 Subject: sctp: remove sctp_transport_pmtu_check We are now keeping the MTU information synced between asoc, transport and dst, which makes the check at sctp_packet_config() not needed anymore. As it was the sole caller to this function, lets remove it. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 4965cbfa7d92..f66d44350007 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -606,16 +606,4 @@ static inline __u32 sctp_dst_mtu(const struct dst_entry *dst) SCTP_DEFAULT_MINSEGMENT)); } -static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) -{ - __u32 pmtu = sctp_dst_mtu(t->dst); - - if (t->pathmtu == pmtu) - return true; - - t->pathmtu = pmtu; - - return false; -} - #endif /* __net_sctp_h__ */ -- cgit v1.2.3 From 9e8d438e8ba43a38def2890a65a26917f2233a83 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 27 Apr 2018 12:41:49 -0700 Subject: net: phy: Fix modular PHYLIB build After commit c59530d0d5dc ("net: Move PHY statistics code into PHY library helpers") we made net/core/ethtool.c reference symbols which are part of the library which can be modular. David introduced a temporary fix with 1ecd6e8ad996 ("phy: Temporary build fix after phylib changes.") which would prevent such modularity. This is not desireable of course, so instead, just inline the functions into include/linux/phy.h to keep both options available. Fixes: c59530d0d5dc ("net: Move PHY statistics code into PHY library helpers") Fixes: 1ecd6e8ad996 ("phy: Temporary build fix after phylib changes.") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 6ca81395c545..073235e70442 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1066,27 +1066,53 @@ int phy_ethtool_nway_reset(struct net_device *ndev); #if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); void mdio_bus_exit(void); -int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); -int phy_ethtool_get_sset_count(struct phy_device *phydev); -int phy_ethtool_get_stats(struct phy_device *phydev, - struct ethtool_stats *stats, u64 *data); -#else -int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data) +#endif + +/* Inline function for use within net/core/ethtool.c (built-in) */ +static inline int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data) { - return -EOPNOTSUPP; + if (!phydev->drv) + return -EIO; + + mutex_lock(&phydev->lock); + phydev->drv->get_strings(phydev, data); + mutex_unlock(&phydev->lock); + + return 0; } -int phy_ethtool_get_sset_count(struct phy_device *phydev) +static inline int phy_ethtool_get_sset_count(struct phy_device *phydev) { + int ret; + + if (!phydev->drv) + return -EIO; + + if (phydev->drv->get_sset_count && + phydev->drv->get_strings && + phydev->drv->get_stats) { + mutex_lock(&phydev->lock); + ret = phydev->drv->get_sset_count(phydev); + mutex_unlock(&phydev->lock); + + return ret; + } + return -EOPNOTSUPP; } -int phy_ethtool_get_stats(struct phy_device *phydev, - struct ethtool_stats *stats, u64 *data) +static inline int phy_ethtool_get_stats(struct phy_device *phydev, + struct ethtool_stats *stats, u64 *data) { - return -EOPNOTSUPP; + if (!phydev->drv) + return -EIO; + + mutex_lock(&phydev->lock); + phydev->drv->get_stats(phydev, stats, data); + mutex_unlock(&phydev->lock); + + return 0; } -#endif extern struct bus_type mdio_bus_type; -- cgit v1.2.3 From c195651e565ae7f41a68acb7d4aa7390ad215de1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:08 -0700 Subject: bpf: add bpf_get_stack helper Currently, stackmap and bpf_get_stackid helper are provided for bpf program to get the stack trace. This approach has a limitation though. If two stack traces have the same hash, only one will get stored in the stackmap table, so some stack traces are missing from user perspective. This patch implements a new helper, bpf_get_stack, will send stack traces directly to bpf program. The bpf program is able to see all stack traces, and then can do in-kernel processing or send stack traces to user space through shared map or bpf_perf_event_output. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/filter.h | 3 ++- include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 38ebbc61ed99..c553f6f9c6b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -692,6 +692,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; +extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; /* Shared helpers among cBPF and eBPF. */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 4da8b2308174..64899c04c1a6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -468,7 +468,8 @@ struct bpf_prog { dst_needed:1, /* Do we need dst entry? */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ - kprobe_override:1; /* Do we override a kprobe? */ + kprobe_override:1, /* Do we override a kprobe? */ + has_callchain_buf:1; /* callchain buffer allocated? */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da77a9388947..1afb606a18b9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1767,6 +1767,40 @@ union bpf_attr { * **CONFIG_XFRM** configuration option. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * a non-negative value equal to or less than size on success, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1835,7 +1869,8 @@ union bpf_attr { FN(msg_pull_data), \ FN(bind), \ FN(xdp_adjust_tail), \ - FN(skb_get_xfrm_state), + FN(skb_get_xfrm_state), \ + FN(get_stack), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -1869,11 +1904,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) -/* BPF_FUNC_get_stackid flags. */ +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ #define BPF_F_SKIP_FIELD_MASK 0xffULL #define BPF_F_USER_STACK (1ULL << 8) +/* flags used by BPF_FUNC_get_stackid only. */ #define BPF_F_FAST_STACK_CMP (1ULL << 9) #define BPF_F_REUSE_STACKID (1ULL << 10) +/* flags used by BPF_FUNC_get_stack only. */ +#define BPF_F_USER_BUILD_ID (1ULL << 11) /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) -- cgit v1.2.3 From 9cbe1f5a32dcd6d0508326f7d9098e5bc380a4fe Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:11 -0700 Subject: bpf/verifier: improve register value range tracking with ARSH When helpers like bpf_get_stack returns an int value and later on used for arithmetic computation, the LSH and ARSH operations are often required to get proper sign extension into 64-bit. For example, without this patch: 54: R0=inv(id=0,umax_value=800) 54: (bf) r8 = r0 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800) 55: (67) r8 <<= 32 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000)) 56: (c7) r8 s>>= 32 57: R8=inv(id=0) With this patch: 54: R0=inv(id=0,umax_value=800) 54: (bf) r8 = r0 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800) 55: (67) r8 <<= 32 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000)) 56: (c7) r8 s>>= 32 57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff)) With better range of "R8", later on when "R8" is added to other register, e.g., a map pointer or scalar-value register, the better register range can be derived and verifier failure may be avoided. In our later example, ...... usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK); if (usize < 0) return 0; ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0); ...... Without improving ARSH value range tracking, the register representing "max_len - usize" will have smin_value equal to S64_MIN and will be rejected by verifier. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 0d2d3da46139..c7dc2b5902c0 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -23,8 +23,10 @@ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ /* Shift a tnum left (by a fixed shift) */ struct tnum tnum_lshift(struct tnum a, u8 shift); -/* Shift a tnum right (by a fixed shift) */ +/* Shift (rsh) a tnum right (by a fixed shift) */ struct tnum tnum_rshift(struct tnum a, u8 shift); +/* Shift (arsh) a tnum right (by a fixed min_shift) */ +struct tnum tnum_arshift(struct tnum a, u8 min_shift); /* Add two tnums, return @a + @b */ struct tnum tnum_add(struct tnum a, struct tnum b); /* Subtract two tnums, return @a - @b */ -- cgit v1.2.3 From a3ef8e9a4d7cc26d7528d50d10c5720b523b07c9 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Sat, 28 Apr 2018 16:06:19 -0700 Subject: bpf: Fix helpers ctx struct types in uapi doc Helpers may operate on two types of ctx structures: user visible ones (e.g. `struct bpf_sock_ops`) when used in user programs, and kernel ones (e.g. `struct bpf_sock_ops_kern`) in kernel implementation. UAPI documentation must refer to only user visible structures. The patch replaces references to `_kern` structures in BPF helpers description by corresponding user visible structures. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1afb606a18b9..23b334bba1a6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1361,7 +1361,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1435,7 +1435,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1533,7 +1533,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For en eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1544,7 +1544,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1588,7 +1588,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval) + * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1721,7 +1721,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len) + * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing -- cgit v1.2.3 From 05255b823a6173525587f29c4e8f1ca33fd7677d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Apr 2018 08:58:08 -0700 Subject: tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive When adding tcp mmap() implementation, I forgot that socket lock had to be taken before current->mm->mmap_sem. syzbot eventually caught the bug. Since we can not lock the socket in tcp mmap() handler we have to split the operation in two phases. 1) mmap() on a tcp socket simply reserves VMA space, and nothing else. This operation does not involve any TCP locking. 2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements the transfert of pages from skbs to one VMA. This operation only uses down_read(¤t->mm->mmap_sem) after holding TCP lock, thus solving the lockdep issue. This new implementation was suggested by Andy Lutomirski with great details. Benefits are : - Better scalability, in case multiple threads reuse VMAS (without mmap()/munmap() calls) since mmap_sem wont be write locked. - Better error recovery. The previous mmap() model had to provide the expected size of the mapping. If for some reason one part could not be mapped (partial MSS), the whole operation had to be aborted. With the tcp_zerocopy_receive struct, kernel can report how many bytes were successfuly mapped, and how many bytes should be read to skip the problematic sequence. - No more memory allocation to hold an array of page pointers. 16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/ - skbs are freed while mmap_sem has been released Following patch makes the change in tcp_mmap tool to demonstrate one possible use of mmap() and setsockopt(... TCP_ZEROCOPY_RECEIVE ...) Note that memcg might require additional changes. Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive") Signed-off-by: Eric Dumazet Reported-by: syzbot Suggested-by: Andy Lutomirski Cc: linux-mm@kvack.org Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 379b08700a54..e9e8373b34b9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -122,6 +122,7 @@ enum { #define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ #define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ +#define TCP_ZEROCOPY_RECEIVE 35 struct tcp_repair_opt { __u32 opt_code; @@ -276,4 +277,11 @@ struct tcp_diag_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; }; +/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */ + +struct tcp_zerocopy_receive { + __u64 address; /* in: address of mapping */ + __u32 length; /* in/out: number of bytes to map/mapped */ + __u32 recv_skip_hint; /* out: amount of bytes to skip */ +}; #endif /* _UAPI_LINUX_TCP_H */ -- cgit v1.2.3 From 1b837d489e06a5289753ddbee99cfbc26d251d6d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 27 Apr 2018 14:06:53 -0400 Subject: net: Revoke export for __skb_tx_hash, update it to just be static skb_tx_hash I am dropping the export of __skb_tx_hash as after my patches nobody is using it outside of the net/core/dev.c file. In addition I am renaming and repurposing it to just be a static declaration of skb_tx_hash since that was the only user for it at this point. By doing this the compiler can inline it into __netdev_pick_tx as that will improve performance. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 366c32891158..82f5a9aba578 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3213,19 +3213,6 @@ static inline int netif_set_xps_queue(struct net_device *dev, } #endif -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues); - -/* - * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used - * as a distribution range limit for the returned value. - */ -static inline u16 skb_tx_hash(const struct net_device *dev, - struct sk_buff *skb) -{ - return __skb_tx_hash(dev, skb, dev->real_num_tx_queues); -} - /** * netif_is_multiqueue - test if device has multiple transmit queues * @dev: network device -- cgit v1.2.3 From 3ac305c386f698abccd0523c64a8aef248c89bc6 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 27 Apr 2018 13:11:14 -0700 Subject: net: core: Assert the size of netdev_featres_t We have about 53 netdev_features_t bits defined and counting, add a build time check to catch when an u64 type will not be enough and we will have to convert that to a bitmap. This is done in register_netdevice() for convenience. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 82f5a9aba578..9e09dd897b74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4108,6 +4108,12 @@ const char *netdev_drivername(const struct net_device *dev); void linkwatch_run_queue(void); +static inline void netdev_features_size_check(void) +{ + BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < + NETDEV_FEATURE_COUNT); +} + static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { -- cgit v1.2.3 From 4d220ed0f8140c478ab7b0a14d96821da639b646 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 28 Apr 2018 19:56:37 -0700 Subject: bpf: remove tracepoints from bpf core tracepoints to bpf core were added as a way to provide introspection to bpf programs and maps, but after some time it became clear that this approach is inadequate, so prog_id, map_id and corresponding get_next_id, get_fd_by_id, get_info_by_fd, prog_query APIs were introduced and fully adopted by bpftool and other applications. The tracepoints in bpf core started to rot and causing syzbot warnings: WARNING: CPU: 0 PID: 3008 at kernel/trace/trace_event_perf.c:274 Kernel panic - not syncing: panic_on_warn set ... perf_trace_bpf_map_keyval+0x260/0xbd0 include/trace/events/bpf.h:228 trace_bpf_map_update_elem include/trace/events/bpf.h:274 [inline] map_update_elem kernel/bpf/syscall.c:597 [inline] SYSC_bpf kernel/bpf/syscall.c:1478 [inline] Hence this patch deletes tracepoints in bpf core. Reported-by: Eric Biggers Reported-by: syzbot Signed-off-by: Alexei Starovoitov Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/linux/bpf_trace.h | 1 - include/trace/events/bpf.h | 355 --------------------------------------------- 2 files changed, 356 deletions(-) delete mode 100644 include/trace/events/bpf.h (limited to 'include') diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h index e6fe98ae3794..ddf896abcfb6 100644 --- a/include/linux/bpf_trace.h +++ b/include/linux/bpf_trace.h @@ -2,7 +2,6 @@ #ifndef __LINUX_BPF_TRACE_H__ #define __LINUX_BPF_TRACE_H__ -#include #include #endif /* __LINUX_BPF_TRACE_H__ */ diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h deleted file mode 100644 index 150185647e6b..000000000000 --- a/include/trace/events/bpf.h +++ /dev/null @@ -1,355 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM bpf - -#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BPF_H - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL - -#include -#include -#include -#include - -#define __PROG_TYPE_MAP(FN) \ - FN(SOCKET_FILTER) \ - FN(KPROBE) \ - FN(SCHED_CLS) \ - FN(SCHED_ACT) \ - FN(TRACEPOINT) \ - FN(XDP) \ - FN(PERF_EVENT) \ - FN(CGROUP_SKB) \ - FN(CGROUP_SOCK) \ - FN(LWT_IN) \ - FN(LWT_OUT) \ - FN(LWT_XMIT) - -#define __MAP_TYPE_MAP(FN) \ - FN(HASH) \ - FN(ARRAY) \ - FN(PROG_ARRAY) \ - FN(PERF_EVENT_ARRAY) \ - FN(PERCPU_HASH) \ - FN(PERCPU_ARRAY) \ - FN(STACK_TRACE) \ - FN(CGROUP_ARRAY) \ - FN(LRU_HASH) \ - FN(LRU_PERCPU_HASH) \ - FN(LPM_TRIE) - -#define __PROG_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x); -#define __PROG_TYPE_SYM_FN(x) \ - { BPF_PROG_TYPE_##x, #x }, -#define __PROG_TYPE_SYM_TAB \ - __PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 } -__PROG_TYPE_MAP(__PROG_TYPE_TP_FN) - -#define __MAP_TYPE_TP_FN(x) \ - TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x); -#define __MAP_TYPE_SYM_FN(x) \ - { BPF_MAP_TYPE_##x, #x }, -#define __MAP_TYPE_SYM_TAB \ - __MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 } -__MAP_TYPE_MAP(__MAP_TYPE_TP_FN) - -DECLARE_EVENT_CLASS(bpf_prog_event, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - ), - - TP_printk("prog=%s type=%s", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB)) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu, - - TP_PROTO(const struct bpf_prog *prg), - - TP_ARGS(prg) -); - -TRACE_EVENT(bpf_prog_load, - - TP_PROTO(const struct bpf_prog *prg, int ufd), - - TP_ARGS(prg, ufd), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(u32, type) - __field(int, ufd) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __entry->type = prg->type; - __entry->ufd = ufd; - ), - - TP_printk("prog=%s type=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB), - __entry->ufd) -); - -TRACE_EVENT(bpf_map_create, - - TP_PROTO(const struct bpf_map *map, int ufd), - - TP_ARGS(map, ufd), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, size_key) - __field(u32, size_value) - __field(u32, max_entries) - __field(u32, flags) - __field(int, ufd) - ), - - TP_fast_assign( - __entry->type = map->map_type; - __entry->size_key = map->key_size; - __entry->size_value = map->value_size; - __entry->max_entries = map->max_entries; - __entry->flags = map->map_flags; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __entry->size_key, __entry->size_value, - __entry->max_entries, __entry->flags) -); - -DECLARE_EVENT_CLASS(bpf_obj_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname), - - TP_STRUCT__entry( - __array(u8, prog_tag, 8) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag)); - memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag)); - __assign_str(path, pname->name); - __entry->ufd = ufd; - ), - - TP_printk("prog=%s path=%s ufd=%d", - __print_hex_str(__entry->prog_tag, 8), - __get_str(path), __entry->ufd) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog, - - TP_PROTO(const struct bpf_prog *prg, int ufd, - const struct filename *pname), - - TP_ARGS(prg, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_obj_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname), - - TP_STRUCT__entry( - __field(u32, type) - __field(int, ufd) - __string(path, pname->name) - ), - - TP_fast_assign( - __assign_str(path, pname->name); - __entry->type = map->map_type; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d path=%s", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, __get_str(path)) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map, - - TP_PROTO(const struct bpf_map *map, int ufd, - const struct filename *pname), - - TP_ARGS(map, ufd, pname) -); - -DECLARE_EVENT_CLASS(bpf_map_keyval, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(u32, val_len) - __dynamic_array(u8, val, map->value_size) - __field(bool, val_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - memcpy(__get_dynamic_array(val), val, map->value_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->val_len = min(map->value_size, 16U); - __entry->val_trunc = map->value_size != __entry->val_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "", - __print_hex(__get_dynamic_array(val), __entry->val_len), - __entry->val_trunc ? " ..." : "") -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *val), - - TP_ARGS(map, ufd, key, val) -); - -TRACE_EVENT(bpf_map_delete_elem, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key), - - TP_ARGS(map, ufd, key), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __field(bool, key_trunc) - __field(int, ufd) - ), - - TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "") -); - -TRACE_EVENT(bpf_map_next_key, - - TP_PROTO(const struct bpf_map *map, int ufd, - const void *key, const void *key_next), - - TP_ARGS(map, ufd, key, key_next), - - TP_STRUCT__entry( - __field(u32, type) - __field(u32, key_len) - __dynamic_array(u8, key, map->key_size) - __dynamic_array(u8, nxt, map->key_size) - __field(bool, key_trunc) - __field(bool, key_null) - __field(int, ufd) - ), - - TP_fast_assign( - if (key) - memcpy(__get_dynamic_array(key), key, map->key_size); - __entry->key_null = !key; - memcpy(__get_dynamic_array(nxt), key_next, map->key_size); - __entry->type = map->map_type; - __entry->key_len = min(map->key_size, 16U); - __entry->key_trunc = map->key_size != __entry->key_len; - __entry->ufd = ufd; - ), - - TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]", - __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), - __entry->ufd, - __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key), - __entry->key_len), - __entry->key_trunc && !__entry->key_null ? " ..." : "", - __print_hex(__get_dynamic_array(nxt), __entry->key_len), - __entry->key_trunc ? " ..." : "") -); -#endif /* CONFIG_BPF_SYSCALL */ -#endif /* _TRACE_BPF_H */ - -#include -- cgit v1.2.3 From 3bd5a09b529c03bac354c9d48e688ed2aca934fd Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 30 Apr 2018 11:39:03 +0100 Subject: bpf: fix formatting for bpf_perf_event_read() helper doc Some edits brought to the last iteration of BPF helper functions documentation introduced an error with RST formatting. As a result, most of one paragraph is rendered in bold text when only the name of a helper should be. Fix it, and fix formatting of another function name in the same paragraph. Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 23b334bba1a6..530ff6588d8f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -828,12 +828,12 @@ union bpf_attr { * * Also, be aware that the newer helper * **bpf_perf_event_read_value**\ () is recommended over - * **bpf_perf_event_read*\ () in general. The latter has some ABI + * **bpf_perf_event_read**\ () in general. The latter has some ABI * quirks where error and counter value are used as a return code * (which is wrong to do since ranges may overlap). This issue is - * fixed with bpf_perf_event_read_value(), which at the same time - * provides more features over the **bpf_perf_event_read**\ () - * interface. Please refer to the description of + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of * **bpf_perf_event_read_value**\ () for details. * Return * The value of the perf event counter read from the map, or a -- cgit v1.2.3 From 79552fbc0f9dc20dc022be7cc48eb3761623fa56 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 30 Apr 2018 11:39:04 +0100 Subject: bpf: fix formatting for bpf_get_stack() helper doc Fix formatting (indent) for bpf_get_stack() helper documentation, so that the doc is rendered correctly with the Python script. Fixes: c195651e565a ("bpf: add bpf_get_stack helper") Cc: Yonghong Song Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 54 ++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 530ff6588d8f..8daef7326bb7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1770,33 +1770,33 @@ union bpf_attr { * * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) * Description - * Return a user or a kernel stack in bpf program provided buffer. - * To achieve this, the helper needs *ctx*, which is a pointer - * to the context on which the tracing program is executed. - * To store the stacktrace, the bpf program provides *buf* with - * a nonnegative *size*. - * - * The last argument, *flags*, holds the number of stack frames to - * skip (from 0 to 255), masked with - * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set - * the following flags: - * - * **BPF_F_USER_STACK** - * Collect a user space stack instead of a kernel stack. - * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. - * - * **bpf_get_stack**\ () can collect up to - * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject - * to sufficient large buffer size. Note that - * this limit can be controlled with the **sysctl** program, and - * that it should be manually increased in order to profile long - * user stacks (such as stacks for Java programs). To do so, use: - * - * :: - * - * # sysctl kernel.perf_event_max_stack= + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= * * Return * a non-negative value equal to or less than size on success, or -- cgit v1.2.3 From 4d4fd36126d66d6091ca5aaabab262b5da3849c5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sun, 29 Apr 2018 10:56:08 +0300 Subject: net: bridge: Publish bridge accessor functions Add a couple new functions to allow querying FDB and vlan settings of a bridge. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 02639ebea2f0..585d27182425 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -93,11 +93,39 @@ static inline bool br_multicast_router(const struct net_device *dev) #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) bool br_vlan_enabled(const struct net_device *dev); +int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid); +int br_vlan_get_info(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo); #else static inline bool br_vlan_enabled(const struct net_device *dev) { return false; } + +static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) +{ + return -1; +} + +static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo) +{ + return -1; +} +#endif + +#if IS_ENABLED(CONFIG_BRIDGE) +struct net_device *br_fdb_find_port(const struct net_device *br_dev, + const unsigned char *addr, + __u16 vid); +#else +static inline struct net_device * +br_fdb_find_port(const struct net_device *br_dev, + const unsigned char *addr, + __u16 vid) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From 6dac152355d9308c9e187bf1d38d98afefcaa315 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:10 +0300 Subject: tcp: Add clean acked data hook Called when a TCP segment is acknowledged. Could be used by application protocols who hold additional metadata associated with the stream data. This is required by TLS device offload to release metadata associated with acknowledged TLS records. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: Aviad Yehezkel Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 ++ include/net/tcp.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b68fea022a82..2ab6667275df 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops { * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data + * @icsk_clean_acked Clean acked data hook * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts @@ -102,6 +103,7 @@ struct inet_connection_sock { const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void *icsk_ulp_data; + void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:6, diff --git a/include/net/tcp.h b/include/net/tcp.h index 833154e3df17..cf803fe0fb86 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2105,4 +2105,12 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif + +#if IS_ENABLED(CONFIG_TLS_DEVICE) +void clean_acked_data_enable(struct inet_connection_sock *icsk, + void (*cad)(struct sock *sk, u32 ack_seq)); +void clean_acked_data_disable(struct inet_connection_sock *icsk); + +#endif + #endif /* _TCP_H */ -- cgit v1.2.3 From 08303c189581c985e60f588ad92a041e46b6e307 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:11 +0300 Subject: net: Rename and export copy_skb_header copy_skb_header is renamed to skb_copy_header and exported. Exposing this function give more flexibility in copying SKBs. skb_copy and skb_copy_expand do not give enough control over which parts are copied. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a4a5c0c5cba8..908d66e55b14 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1034,6 +1034,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); +void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone); -- cgit v1.2.3 From ebf4e808fa0b22e551baf862e17c26c325c068f4 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:12 +0300 Subject: net: Add Software fallback infrastructure for socket dependent offloads With socket dependent offloads we rely on the netdev to transform the transmitted packets before sending them to the wire. When a packet from an offloaded socket is rerouted to a different device we need to detect it and do the transformation in software. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/sock.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 74d725fdbe0f..3c568b36ee36 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -481,6 +481,11 @@ struct sock { void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; struct rcu_head sk_rcu; @@ -2332,6 +2337,22 @@ static inline bool sk_fullsock(const struct sock *sk) return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + */ +static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sock *sk = skb->sk; + + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) + skb = sk->sk_validate_xmit_skb(sk, dev, skb); +#endif + + return skb; +} + /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ -- cgit v1.2.3 From a5c37c63f71b044fbe1a3f893d6f6daa36fee5f5 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:13 +0300 Subject: net: Add TLS offload netdev ops Add new netdev ops to add and delete tls context Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: Aviad Yehezkel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e09dd897b74..a569bf5524fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -865,6 +865,26 @@ struct xfrmdev_ops { }; #endif +#if IS_ENABLED(CONFIG_TLS_DEVICE) +enum tls_offload_ctx_dir { + TLS_OFFLOAD_CTX_DIR_RX, + TLS_OFFLOAD_CTX_DIR_TX, +}; + +struct tls_crypto_info; +struct tls_context; + +struct tlsdev_ops { + int (*tls_dev_add)(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn); + void (*tls_dev_del)(struct net_device *netdev, + struct tls_context *ctx, + enum tls_offload_ctx_dir direction); +}; +#endif + struct dev_ifalias { struct rcu_head rcuhead; char ifalias[]; @@ -1750,6 +1770,10 @@ struct net_device { const struct xfrmdev_ops *xfrmdev_ops; #endif +#if IS_ENABLED(CONFIG_TLS_DEVICE) + const struct tlsdev_ops *tlsdev_ops; +#endif + const struct header_ops *header_ops; unsigned int flags; -- cgit v1.2.3 From 2342a8512a1e98c286b6b36094058e4ee2261c74 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:14 +0300 Subject: net: Add TLS TX offload features This patch adds a netdev feature to configure TLS TX offloads. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: Aviad Yehezkel Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index fe2f3b30960e..c87c3a3453c1 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -78,6 +78,7 @@ enum { NETIF_F_HW_ESP_BIT, /* Hardware ESP transformation offload */ NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ + NETIF_F_HW_TLS_TX_BIT, /* Hardware TLS TX offload */ NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ @@ -149,6 +150,7 @@ enum { #define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) #define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) #define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) +#define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) -- cgit v1.2.3 From f66de3ee2c161fcc2d66974e9671f8a2a471ab20 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Mon, 30 Apr 2018 10:16:15 +0300 Subject: net/tls: Split conf to rx + tx In TLS inline crypto, we can have one direction in software and another in hardware. Thus, we split the TLS configuration to separate structures for receive and transmit. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 3da8e13a6d96..95a8c60b36be 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,21 +83,10 @@ struct tls_device { void (*unhash)(struct tls_device *device, struct sock *sk); }; -struct tls_sw_context { +struct tls_sw_context_tx { struct crypto_aead *aead_send; - struct crypto_aead *aead_recv; struct crypto_wait async_wait; - /* Receive context */ - struct strparser strp; - void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); - struct sk_buff *recv_pkt; - u8 control; - bool decrypted; - - /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; unsigned int sg_plaintext_size; @@ -114,6 +103,19 @@ struct tls_sw_context { struct scatterlist sg_aead_out[2]; }; +struct tls_sw_context_rx { + struct crypto_aead *aead_recv; + struct crypto_wait async_wait; + + struct strparser strp; + void (*saved_data_ready)(struct sock *sk); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); + struct sk_buff *recv_pkt; + u8 control; + bool decrypted; +}; + enum { TLS_PENDING_CLOSED_RECORD }; @@ -138,9 +140,15 @@ struct tls_context { struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; }; - void *priv_ctx; + struct list_head list; + struct net_device *netdev; + refcount_t refcount; + + void *priv_ctx_tx; + void *priv_ctx_rx; - u8 conf:3; + u8 tx_conf:3; + u8 rx_conf:3; struct cipher_context tx; struct cipher_context rx; @@ -177,7 +185,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); -void tls_sw_free_resources(struct sock *sk); +void tls_sw_free_resources_tx(struct sock *sk); +void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); unsigned int tls_sw_poll(struct file *file, struct socket *sock, @@ -297,16 +306,22 @@ static inline struct tls_context *tls_get_ctx(const struct sock *sk) return icsk->icsk_ulp_data; } -static inline struct tls_sw_context *tls_sw_ctx( +static inline struct tls_sw_context_rx *tls_sw_ctx_rx( + const struct tls_context *tls_ctx) +{ + return (struct tls_sw_context_rx *)tls_ctx->priv_ctx_rx; +} + +static inline struct tls_sw_context_tx *tls_sw_ctx_tx( const struct tls_context *tls_ctx) { - return (struct tls_sw_context *)tls_ctx->priv_ctx; + return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } static inline struct tls_offload_context *tls_offload_ctx( const struct tls_context *tls_ctx) { - return (struct tls_offload_context *)tls_ctx->priv_ctx; + return (struct tls_offload_context *)tls_ctx->priv_ctx_tx; } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, -- cgit v1.2.3 From e8f69799810c32dd40c6724d829eccc70baad07f Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:16 +0300 Subject: net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Signed-off-by: Aviad Yehezkel Signed-off-by: David S. Miller --- include/net/tls.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 95a8c60b36be..8c56809eb384 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -116,6 +116,37 @@ struct tls_sw_context_rx { bool decrypted; }; +struct tls_record_info { + struct list_head list; + u32 end_seq; + int len; + int num_frags; + skb_frag_t frags[MAX_SKB_FRAGS]; +}; + +struct tls_offload_context { + struct crypto_aead *aead_send; + spinlock_t lock; /* protects records list */ + struct list_head records_list; + struct tls_record_info *open_record; + struct tls_record_info *retransmit_hint; + u64 hint_record_sn; + u64 unacked_record_sn; + + struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; + void (*sk_destruct)(struct sock *sk); + u8 driver_state[]; + /* The TLS layer reserves room for driver specific state + * Currently the belief is that there is not enough + * driver specific state to justify another layer of indirection + */ +#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) +}; + +#define TLS_OFFLOAD_CONTEXT_SIZE \ + (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ + TLS_DRIVER_STATE_SIZE) + enum { TLS_PENDING_CLOSED_RECORD }; @@ -195,9 +226,28 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); -void tls_icsk_clean_acked(struct sock *sk); +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tls_device_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); +void tls_device_sk_destruct(struct sock *sk); +void tls_device_init(void); +void tls_device_cleanup(void); + +struct tls_record_info *tls_get_record(struct tls_offload_context *context, + u32 seq, u64 *p_record_sn); + +static inline bool tls_record_is_start_marker(struct tls_record_info *rec) +{ + return rec->len == 0; +} + +static inline u32 tls_record_start_seq(struct tls_record_info *rec) +{ + return rec->end_seq - rec->len; +} +void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); @@ -234,6 +284,13 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) +{ + return sk_fullsock(sk) && + /* matches smp_store_release in tls_set_device_offload */ + smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct; +} + static inline void tls_err_abort(struct sock *sk, int err) { sk->sk_err = err; @@ -329,4 +386,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +struct sk_buff *tls_validate_xmit_skb(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); + +int tls_sw_fallback_init(struct sock *sk, + struct tls_offload_context *offload_ctx, + struct tls_crypto_info *crypto_info); + #endif /* _TLS_OFFLOAD_H */ -- cgit v1.2.3 From 1ae1732284895498b7119e42323cf12821423e6d Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 30 Apr 2018 10:16:18 +0300 Subject: net/mlx5: Accel, Add TLS tx offload interface Add routines for manipulating TLS TX offload contexts. In Innova TLS, TLS contexts are added or deleted via a command message over the SBU connection. The HW then sends a response message over the same connection. Add implementation for Innova TLS (FPGA-based) hardware. These routines will be used by the TLS offload support in a later patch mlx5/accel is a middle acceleration layer to allow mlx5e and other ULPs to work directly with mlx5_core rather than Innova FPGA or other mlx5 acceleration providers. In the future, when IPSec/TLS or any other acceleration gets integrated into ConnectX chip, mlx5/accel layer will provide the integrated acceleration, rather than the Innova one. Signed-off-by: Ilya Lesokhin Signed-off-by: Boris Pismenny Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 16 -------- include/linux/mlx5/mlx5_ifc_fpga.h | 77 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1aad455538f4..b8918a1da11f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -356,22 +356,6 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits { u8 reserved_at_6[0x1a]; }; -struct mlx5_ifc_ipv4_layout_bits { - u8 reserved_at_0[0x60]; - - u8 ipv4[0x20]; -}; - -struct mlx5_ifc_ipv6_layout_bits { - u8 ipv6[16][0x8]; -}; - -union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { - struct mlx5_ifc_ipv6_layout_bits ipv6_layout; - struct mlx5_ifc_ipv4_layout_bits ipv4_layout; - u8 reserved_at_0[0x80]; -}; - struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 smac_47_16[0x20]; diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index ec052491ba3d..193091537cb6 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -32,12 +32,29 @@ #ifndef MLX5_IFC_FPGA_H #define MLX5_IFC_FPGA_H +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_at_0[0x60]; + + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_at_0[0x80]; +}; + enum { MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX = 0x2c9, }; enum { MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC = 0x2, + MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_TLS = 0x3, }; struct mlx5_ifc_fpga_shell_caps_bits { @@ -370,6 +387,27 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_tls_extended_cap_bits { + u8 aes_gcm_128[0x1]; + u8 aes_gcm_256[0x1]; + u8 reserved_at_2[0x1e]; + u8 reserved_at_20[0x20]; + u8 context_capacity_total[0x20]; + u8 context_capacity_rx[0x20]; + u8 context_capacity_tx[0x20]; + u8 reserved_at_a0[0x10]; + u8 tls_counter_size[0x10]; + u8 tls_counters_addr_low[0x20]; + u8 tls_counters_addr_high[0x20]; + u8 rx[0x1]; + u8 tx[0x1]; + u8 tls_v12[0x1]; + u8 tls_v13[0x1]; + u8 lro[0x1]; + u8 ipv6[0x1]; + u8 reserved_at_106[0x1a]; +}; + struct mlx5_ifc_ipsec_extended_cap_bits { u8 encapsulation[0x20]; @@ -519,4 +557,43 @@ struct mlx5_ifc_fpga_ipsec_sa { __be16 reserved2; } __packed; +enum fpga_tls_cmds { + CMD_SETUP_STREAM = 0x1001, + CMD_TEARDOWN_STREAM = 0x1002, +}; + +#define MLX5_TLS_1_2 (0) + +#define MLX5_TLS_ALG_AES_GCM_128 (0) +#define MLX5_TLS_ALG_AES_GCM_256 (1) + +struct mlx5_ifc_tls_cmd_bits { + u8 command_type[0x20]; + u8 ipv6[0x1]; + u8 direction_sx[0x1]; + u8 tls_version[0x2]; + u8 reserved[0x1c]; + u8 swid[0x20]; + u8 src_port[0x10]; + u8 dst_port[0x10]; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; + u8 tls_rcd_sn[0x40]; + u8 tcp_sn[0x20]; + u8 tls_implicit_iv[0x20]; + u8 tls_xor_iv[0x40]; + u8 encryption_key[0x100]; + u8 alg[4]; + u8 reserved2[0x1c]; + u8 reserved3[0x4a0]; +}; + +struct mlx5_ifc_tls_resp_bits { + u8 syndrome[0x20]; + u8 stream_id[0x20]; + u8 reserverd[0x40]; +}; + +#define MLX5_TLS_COMMAND_SIZE (0x100) + #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From 6d3e8aa87622b30bfabbbfad7674d2464f2a9cb9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Sun, 29 Apr 2018 12:56:31 -0300 Subject: sctp: allow sctp_init_cause to return errors And do so if the skb doesn't have enough space for the payload. This is a preparation for the next patch. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index f4b657478a30..5ef1bad81ef5 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -215,7 +215,7 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk); -void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen); +int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen); struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint); -- cgit v1.2.3 From e283de3a4fa885aed11525129fd4570f92c1d1a9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 30 Apr 2018 14:20:05 -0700 Subject: net: core: Inline netdev_features_size_check() We do not require this inline function to be used in multiple different locations, just inline it where it gets used in register_netdevice(). Suggested-by: David Miller Suggested-by: Stephen Hemminger Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a569bf5524fa..46dcb5f7522f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4132,12 +4132,6 @@ const char *netdev_drivername(const struct net_device *dev); void linkwatch_run_queue(void); -static inline void netdev_features_size_check(void) -{ - BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < - NETDEV_FEATURE_COUNT); -} - static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { -- cgit v1.2.3 From b086ff87251b4a4c147bc3af20369514e9d0d9ad Mon Sep 17 00:00:00 2001 From: Stefan Strogin Date: Tue, 1 May 2018 01:04:29 +0300 Subject: connector: add parent pid and tgid to coredump and exit events The intention is to get notified of process failures as soon as possible, before a possible core dumping (which could be very long) (e.g. in some process-manager). Coredump and exit process events are perfect for such use cases (see 2b5faa4c553f "connector: Added coredumping event to the process connector"). The problem is that for now the process-manager cannot know the parent of a dying process using connectors. This could be useful if the process-manager should monitor for failures only children of certain parents, so we could filter the coredump and exit events by parent process and/or thread ID. Add parent pid and tgid to coredump and exit process connectors event data. Signed-off-by: Stefan Strogin Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- include/uapi/linux/cn_proc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index 68ff25414700..db210625cee8 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -116,12 +116,16 @@ struct proc_event { struct coredump_proc_event { __kernel_pid_t process_pid; __kernel_pid_t process_tgid; + __kernel_pid_t parent_pid; + __kernel_pid_t parent_tgid; } coredump; struct exit_proc_event { __kernel_pid_t process_pid; __kernel_pid_t process_tgid; __u32 exit_code, exit_signal; + __kernel_pid_t parent_pid; + __kernel_pid_t parent_tgid; } exit; } event_data; -- cgit v1.2.3 From b75eba76d3d72e2374fac999926dafef2997edd2 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Tue, 1 May 2018 15:39:15 -0400 Subject: tcp: send in-queue bytes in cmsg upon read Applications with many concurrent connections, high variance in receive queue length and tight memory bounds cannot allocate worst-case buffer size to drain sockets. Knowing the size of receive queue length, applications can optimize how they allocate buffers to read from the socket. The number of bytes pending on the socket is directly available through ioctl(FIONREAD/SIOCINQ) and can be approximated using getsockopt(MEMINFO) (rmem_alloc includes skb overheads in addition to application data). But, both of these options add an extra syscall per recvmsg. Moreover, ioctl(FIONREAD/SIOCINQ) takes the socket lock. Add the TCP_INQ socket option to TCP. When this socket option is set, recvmsg() relays the number of bytes available on the socket for reading to the application via the TCP_CM_INQ control message. Calculate the number of bytes after releasing the socket lock to include the processed backlog, if any. To avoid an extra branch in the hot path of recvmsg() for this new control message, move all cmsg processing inside an existing branch for processing receive timestamps. Since the socket lock is not held when calculating the size of receive queue, TCP_INQ is a hint. For example, it can overestimate the queue size by one byte, if FIN is received. With this method, applications can start reading from the socket using a small buffer, and then use larger buffers based on the remaining data when needed. V3 change-log: As suggested by David Miller, added loads with barrier to check whether we have multiple threads calling recvmsg in parallel. When that happens we lock the socket to calculate inq. V4 change-log: Removed inline from a static function. Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Yuchung Cheng Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Suggested-by: David Miller Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- include/uapi/linux/tcp.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 20585d5c4e1c..807776928cb8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -228,7 +228,7 @@ struct tcp_sock { unused:2; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ - unused1 : 1, + recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index e9e8373b34b9..29eb659aa77a 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -123,6 +123,9 @@ enum { #define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ #define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ #define TCP_ZEROCOPY_RECEIVE 35 +#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ + +#define TCP_CM_INQ TCP_INQ struct tcp_repair_opt { __u32 opt_code; -- cgit v1.2.3 From 816a3bed9549fcc0c90ba97c9077e64e734f0df6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 3 May 2018 14:43:46 +0200 Subject: switchdev: Add fdb.added_by_user to switchdev notifications The following patch enables sending notifications also for events on FDB entries that weren't added by the user. Give the drivers the information necessary to distinguish between the two origins of FDB entries. To maintain the current behavior, have switchdev-implementing drivers bail out on notifications about non-user-added FDB entries. In case of mlxsw driver, allow a call to mlxsw_sp_span_respin() so that SPAN over bridge catches up with the changed FDB. Signed-off-by: Petr Machata Reviewed-by: Nikolay Aleksandrov Acked-by: Ivan Vecera Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 39bc855d7fee..d574ce63bf22 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -155,6 +155,7 @@ struct switchdev_notifier_fdb_info { struct switchdev_notifier_info info; /* must be first */ const unsigned char *addr; u16 vid; + bool added_by_user; }; static inline struct net_device * -- cgit v1.2.3 From 68e8b849b221b37a78a110a0307717d45e3593a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:22 +0200 Subject: net: initial AF_XDP skeleton MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buildable skeleton of AF_XDP without any functionality. Just what it takes to register a new address family. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/socket.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index ea50f4a65816..7ed4713d5337 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -207,8 +207,9 @@ struct ucred { * PF_SMC protocol family that * reuses AF_INET address family */ +#define AF_XDP 44 /* XDP sockets */ -#define AF_MAX 44 /* For now.. */ +#define AF_MAX 45 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -257,6 +258,7 @@ struct ucred { #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC +#define PF_XDP AF_XDP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ @@ -338,6 +340,7 @@ struct ucred { #define SOL_NFC 280 #define SOL_KCM 281 #define SOL_TLS 282 +#define SOL_XDP 283 /* IPX options */ #define IPX_TYPE 1 -- cgit v1.2.3 From c0c77d8fb787cfe0c3fca689c2a30d1dad4eaba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:23 +0200 Subject: xsk: add user memory registration support sockopt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit the base structure of the AF_XDP address family is set up. Further, we introduce the abilty register a window of user memory to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory window is viewed by an AF_XDP socket as a set of equally large frames. After a user memory registration all frames are "owned" by the user application, and not the kernel. v2: More robust checks on umem creation and unaccount on error. Call set_page_dirty_lock on cleanup. Simplified xdp_umem_reg. Co-authored-by: Magnus Karlsson Signed-off-by: Magnus Karlsson Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 31 +++++++++++++++++++++++++++++++ include/uapi/linux/if_xdp.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 include/net/xdp_sock.h create mode 100644 include/uapi/linux/if_xdp.h (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h new file mode 100644 index 000000000000..94785f5db13e --- /dev/null +++ b/include/net/xdp_sock.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * AF_XDP internal functions + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_XDP_SOCK_H +#define _LINUX_XDP_SOCK_H + +#include +#include + +struct xdp_umem; + +struct xdp_sock { + /* struct sock must be the first member of struct xdp_sock */ + struct sock sk; + struct xdp_umem *umem; + /* Protects multiple processes in the control path */ + struct mutex mutex; +}; + +#endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h new file mode 100644 index 000000000000..41252135a0fe --- /dev/null +++ b/include/uapi/linux/if_xdp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note + * + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* XDP socket options */ +#define XDP_UMEM_REG 3 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 frame_size; /* Frame size */ + __u32 frame_headroom; /* Frame head room */ +}; + +#endif /* _LINUX_IF_XDP_H */ -- cgit v1.2.3 From 423f38329d267969130fb6f2c685f73d72687558 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:24 +0200 Subject: xsk: add umem fill queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_FILL_QUEUE. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_FILL_QUEUE) into the process. The queue is used to explicitly pass ownership of umem frames from the user process to the kernel. These frames will in a later patch be filled in with Rx packet data by the kernel. v2: Fixed potential crash in xsk_mmap. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 41252135a0fe..975661e1baca 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -23,6 +23,7 @@ /* XDP socket options */ #define XDP_UMEM_REG 3 +#define XDP_UMEM_FILL_RING 4 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -31,4 +32,18 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +/* Pgoff for mmaping the rings */ +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 + +struct xdp_ring { + __u32 producer __attribute__((aligned(64))); + __u32 consumer __attribute__((aligned(64))); +}; + +/* Used for the fill and completion queues for buffers */ +struct xdp_umem_ring { + struct xdp_ring ptrs; + __u32 desc[0] __attribute__((aligned(64))); +}; + #endif /* _LINUX_IF_XDP_H */ -- cgit v1.2.3 From b9b6b68e8abd101be6eb5330e4999218c696d1e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:25 +0200 Subject: xsk: add Rx queue setup and mmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Another setsockopt (XDP_RX_QUEUE) is added to let the process allocate a queue, where the kernel can pass completed Rx frames from the kernel to user process. The mmapping of the queue is done using the XDP_PGOFF_RX_QUEUE offset. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ include/uapi/linux/if_xdp.h | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 94785f5db13e..db9a321de087 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,11 +18,15 @@ #include #include +struct net_device; +struct xsk_queue; struct xdp_umem; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; + struct xsk_queue *rx; + struct net_device *dev; struct xdp_umem *umem; /* Protects multiple processes in the control path */ struct mutex mutex; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 975661e1baca..65324558829d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -22,6 +22,7 @@ #include /* XDP socket options */ +#define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 @@ -33,13 +34,28 @@ struct xdp_umem_reg { }; /* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +struct xdp_desc { + __u32 idx; + __u32 len; + __u16 offset; + __u8 flags; + __u8 padding[5]; +}; + struct xdp_ring { __u32 producer __attribute__((aligned(64))); __u32 consumer __attribute__((aligned(64))); }; +/* Used for the RX and TX queues for packets */ +struct xdp_rxtx_ring { + struct xdp_ring ptrs; + struct xdp_desc desc[0] __attribute__((aligned(64))); +}; + /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; -- cgit v1.2.3 From 965a990984432cd01a9eb3514c64d86f56704295 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:26 +0200 Subject: xsk: add support for bind for Rx Here, the bind syscall is added. Binding an AF_XDP socket, means associating the socket to an umem, a netdev and a queue index. This can be done in two ways. The first way, creating a "socket from scratch". Create the umem using the XDP_UMEM_REG setsockopt and an associated fill queue with XDP_UMEM_FILL_QUEUE. Create the Rx queue using the XDP_RX_QUEUE setsockopt. Call bind passing ifindex and queue index ("channel" in ethtool speak). The second way to bind a socket, is simply skipping the umem/netdev/queue index, and passing another already setup AF_XDP socket. The new socket will then have the same umem/netdev/queue index as the parent so it will share the same umem. You must also set the flags field in the socket address to XDP_SHARED_UMEM. v2: Use PTR_ERR instead of passing error variable explicitly. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index db9a321de087..85d02512f59b 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; }; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 65324558829d..e5091881f776 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -21,6 +21,17 @@ #include +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM 1 + +struct sockaddr_xdp { + __u16 sxdp_family; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; + __u16 sxdp_flags; +}; + /* XDP socket options */ #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 -- cgit v1.2.3 From c497176cb2e478f0a5713b0e05f242276e3194b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:27 +0200 Subject: xsk: add Rx receive functions and poll support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here the actual receive functions of AF_XDP are implemented, that in a later commit, will be called from the XDP layers. There's one set of functions for the XDP_DRV side and another for XDP_SKB (generic). A new XDP API, xdp_return_buff, is also introduced. Adding xdp_return_buff, which is analogous to xdp_return_frame, but acts upon an struct xdp_buff. The API will be used by AF_XDP in future commits. Support for the poll syscall is also implemented. v2: xskq_validate_id did not update cons_tail. The entries variable was calculated twice in xskq_nb_avail. Squashed xdp_return_buff commit. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 + include/net/xdp_sock.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 137ad5f9f40f..0b689cf561c7 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 85d02512f59b..a0342dff6a4d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,6 +31,28 @@ struct xdp_sock { u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; + u64 rx_dropped; }; +struct xdp_buff; +#ifdef CONFIG_XDP_SOCKETS +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +void xsk_flush(struct xdp_sock *xs); +#else +static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline void xsk_flush(struct xdp_sock *xs) +{ +} +#endif /* CONFIG_XDP_SOCKETS */ + #endif /* _LINUX_XDP_SOCK_H */ -- cgit v1.2.3 From fbfc504a24f53f7ebe128ab55cb5dba634f4ece8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:28 +0200 Subject: bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets into the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to certain ifindex/queue index will *only* accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than what the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed race and simplified code. v2: Removed one indirection in map lookup. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 25 +++++++++++++++++++++++++ include/linux/bpf_types.h | 3 +++ include/net/xdp_sock.h | 7 +++++++ include/uapi/linux/bpf.h | 1 + 4 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c553f6f9c6b0..68ecdb4eea09 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -676,6 +676,31 @@ static inline int sock_map_prog(struct bpf_map *map, } #endif +#if defined(CONFIG_XDP_SOCKETS) +struct xdp_sock; +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key); +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs); +void __xsk_map_flush(struct bpf_map *map); +#else +struct xdp_sock; +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + +static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + return -EOPNOTSUPP; +} + +static inline void __xsk_map_flush(struct bpf_map *map) +{ +} +#endif + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b28fcf6f6ae..d7df1b323082 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) +#if defined(CONFIG_XDP_SOCKETS) +BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) +#endif #endif diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index a0342dff6a4d..ce3a2ab16b8f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -28,6 +28,7 @@ struct xdp_sock { struct xsk_queue *rx; struct net_device *dev; struct xdp_umem *umem; + struct list_head flush_node; u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; @@ -39,6 +40,7 @@ struct xdp_buff; int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static inline void xsk_flush(struct xdp_sock *xs) { } + +static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return false; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8daef7326bb7..a3a495052511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -116,6 +116,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, }; enum bpf_prog_type { -- cgit v1.2.3 From 02671e23e7b383763fe1ae4f20b56d8029f9dfc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 2 May 2018 13:01:30 +0200 Subject: xsk: wire up XDP_SKB side of AF_XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit wires up the xskmap to XDP_SKB layer. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 64899c04c1a6..b7f81e3a70cb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -760,7 +760,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *prog); + struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); -- cgit v1.2.3 From fe2308328cd2f26ebc986f543796e7d13ae00bc4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:31 +0200 Subject: xsk: add umem completion queue support and mmap Here, we add another setsockopt for registered user memory (umem) called XDP_UMEM_COMPLETION_QUEUE. Using this socket option, the process can ask the kernel to allocate a queue (ring buffer) and also mmap it (XDP_UMEM_PGOFF_COMPLETION_QUEUE) into the process. The queue is used to explicitly pass ownership of umem frames from the kernel to user process. This will be used by the TX path to tell user space that a certain frame has been transmitted and user space can use it for something else, if it wishes. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e5091881f776..71581a139f26 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -36,6 +36,7 @@ struct sockaddr_xdp { #define XDP_RX_RING 1 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 +#define XDP_UMEM_COMPLETION_RING 5 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -47,6 +48,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 struct xdp_desc { __u32 idx; -- cgit v1.2.3 From f61459030ec7fffdaa3c462cc0f728eef11b4d05 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:32 +0200 Subject: xsk: add Tx queue setup and mmap support Another setsockopt (XDP_TX_QUEUE) is added to let the process allocate a queue, where the user process can pass frames to be transmitted by the kernel. The mmapping of the queue is done using the XDP_PGOFF_TX_QUEUE offset. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ce3a2ab16b8f..185f4928fbda 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_sock { struct xdp_umem *umem; struct list_head flush_node; u16 queue_id; + struct xsk_queue *tx ____cacheline_aligned_in_smp; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 71581a139f26..e2ea878d025c 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -34,6 +34,7 @@ struct sockaddr_xdp { /* XDP socket options */ #define XDP_RX_RING 1 +#define XDP_TX_RING 2 #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 @@ -47,6 +48,7 @@ struct xdp_umem_reg { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 -- cgit v1.2.3 From 865b03f21162e4edfda51fc08693c864b1d4fdaf Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:33 +0200 Subject: dev: packet: make packet_direct_xmit a common function The new dev_direct_xmit will be used by AF_XDP in later commits. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 366c32891158..a30435118530 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2486,6 +2486,7 @@ void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); -- cgit v1.2.3 From af75d9e02d08dc55ce6a1e42e485465c630d7349 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:35 +0200 Subject: xsk: statistics support In this commit, a new getsockopt is added: XDP_STATISTICS. This is used to obtain stats from the sockets. v2: getsockopt now returns size of stats structure. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/uapi/linux/if_xdp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e2ea878d025c..77b88c4efe98 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -38,6 +38,7 @@ struct sockaddr_xdp { #define XDP_UMEM_REG 3 #define XDP_UMEM_FILL_RING 4 #define XDP_UMEM_COMPLETION_RING 5 +#define XDP_STATISTICS 6 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -46,6 +47,12 @@ struct xdp_umem_reg { __u32 frame_headroom; /* Frame head room */ }; +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ +}; + /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 -- cgit v1.2.3 From 93731ef086cee90af594e62874bb98ae6d6eee91 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:13 +0200 Subject: bpf: migrate ebpf ld_abs/ld_ind tests to test_verifier Remove all eBPF tests involving LD_ABS/LD_IND from test_bpf.ko. Reason is that the eBPF tests from test_bpf module do not go via BPF verifier and therefore any instruction rewrites from verifier cannot take place. Therefore, move them into test_verifier which runs out of user space, so that verfier can rewrite LD_ABS/LD_IND internally in upcoming patches. It will have the same effect since runtime tests are also performed from there. This also allows to finally unexport bpf_skb_vlan_{push,pop}_proto and keep it internal to core kernel. Additionally, also add further cBPF LD_ABS/LD_IND test coverage into test_bpf.ko suite. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 68ecdb4eea09..d0e3d7ef36a8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -714,8 +714,6 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; -extern const struct bpf_func_proto bpf_skb_vlan_push_proto; -extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; -- cgit v1.2.3 From e0cea7ce988cf48cc4052235d2ad2550b3bc4fa0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:14 +0200 Subject: bpf: implement ld_abs/ld_ind in native bpf The main part of this work is to finally allow removal of LD_ABS and LD_IND from the BPF core by reimplementing them through native eBPF instead. Both LD_ABS/LD_IND were carried over from cBPF and keeping them around in native eBPF caused way more trouble than actually worth it. To just list some of the security issues in the past: * fdfaf64e7539 ("x86: bpf_jit: support negative offsets") * 35607b02dbef ("sparc: bpf_jit: fix loads from negative offsets") * e0ee9c12157d ("x86: bpf_jit: fix two bugs in eBPF JIT compiler") * 07aee9439454 ("bpf, sparc: fix usage of wrong reg for load_skb_regs after call") * 6d59b7dbf72e ("bpf, s390x: do not reload skb pointers in non-skb context") * 87338c8e2cbb ("bpf, ppc64: do not reload skb pointers in non-skb context") For programs in native eBPF, LD_ABS/LD_IND are pretty much legacy these days due to their limitations and more efficient/flexible alternatives that have been developed over time such as direct packet access. LD_ABS/LD_IND only cover 1/2/4 byte loads into a register, the load happens in host endianness and its exception handling can yield unexpected behavior. The latter is explained in depth in f6b1b3bf0d5f ("bpf: fix subprog verifier bypass by div/mod by 0 exception") with similar cases of exceptions we had. In native eBPF more recent program types will disable LD_ABS/LD_IND altogether through may_access_skb() in verifier, and given the limitations in terms of exception handling, it's also disabled in programs that use BPF to BPF calls. In terms of cBPF, the LD_ABS/LD_IND is used in networking programs to access packet data. It is not used in seccomp-BPF but programs that use it for socket filtering or reuseport for demuxing with cBPF. This is mostly relevant for applications that have not yet migrated to native eBPF. The main complexity and source of bugs in LD_ABS/LD_IND is coming from their implementation in the various JITs. Most of them keep the model around from cBPF times by implementing a fastpath written in asm. They use typically two from the BPF program hidden CPU registers for caching the skb's headlen (skb->len - skb->data_len) and skb->data. Throughout the JIT phase this requires to keep track whether LD_ABS/LD_IND are used and if so, the two registers need to be recached each time a BPF helper would change the underlying packet data in native eBPF case. At least in eBPF case, available CPU registers are rare and the additional exit path out of the asm written JIT helper makes it also inflexible since not all parts of the JITer are in control from plain C. A LD_ABS/LD_IND implementation in eBPF therefore allows to significantly reduce the complexity in JITs with comparable performance results for them, e.g.: test_bpf tcpdump port 22 tcpdump complex x64 - before 15 21 10 14 19 18 - after 7 10 10 7 10 15 arm64 - before 40 91 92 40 91 151 - after 51 64 73 51 62 113 For cBPF we now track any usage of LD_ABS/LD_IND in bpf_convert_filter() and cache the skb's headlen and data in the cBPF prologue. The BPF_REG_TMP gets remapped from R8 to R2 since it's mainly just used as a local temporary variable. This allows to shrink the image on x86_64 also for seccomp programs slightly since mapping to %rsi is not an ereg. In callee-saved R8 and R9 we now track skb data and headlen, respectively. For normal prologue emission in the JITs this does not add any extra instructions since R8, R9 are pushed to stack in any case from eBPF side. cBPF uses the convert_bpf_ld_abs() emitter which probes the fast path inline already and falls back to bpf_skb_load_helper_{8,16,32}() helper relying on the cached skb data and headlen as well. R8 and R9 never need to be reloaded due to bpf_helper_changes_pkt_data() since all skb access in cBPF is read-only. Then, for the case of native eBPF, we use the bpf_gen_ld_abs() emitter, which calls the bpf_skb_load_helper_{8,16,32}_no_cache() helper unconditionally, does neither cache skb data and headlen nor has an inlined fast path. The reason for the latter is that native eBPF does not have any extra registers available anyway, but even if there were, it avoids any reload of skb data and headlen in the first place. Additionally, for the negative offsets, we provide an alternative bpf_skb_load_bytes_relative() helper in eBPF which operates similarly as bpf_skb_load_bytes() and allows for more flexibility. Tested myself on x64, arm64, s390x, from Sandipan on ppc64. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/filter.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0e3d7ef36a8..0e00a13ff01b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -235,6 +235,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_ld_abs)(const struct bpf_insn *orig, + struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, diff --git a/include/linux/filter.h b/include/linux/filter.h index b7f81e3a70cb..da7e16523128 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -47,7 +47,9 @@ struct xdp_buff; /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 -#define BPF_REG_TMP BPF_REG_8 +#define BPF_REG_TMP BPF_REG_2 /* scratch reg */ +#define BPF_REG_D BPF_REG_8 /* data, callee-saved */ +#define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register for hardening step. * Only used by eBPF JITs. It's nothing more than a temporary -- cgit v1.2.3 From 4e1ec56cdc59746943b2acfab3c171b930187bbe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:15 +0200 Subject: bpf: add skb_load_bytes_relative helper This adds a small BPF helper similar to bpf_skb_load_bytes() that is able to load relative to mac/net header offset from the skb's linear data. Compared to bpf_skb_load_bytes(), it takes a fifth argument namely start_header, which is either BPF_HDR_START_MAC or BPF_HDR_START_NET. This allows for a more flexible alternative compared to LD_ABS/LD_IND with negative offset. It's enabled for tc BPF programs as well as sock filter program types where it's mainly useful in reuseport programs to ease access to lower header data. Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2017-March/000698.html Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a3a495052511..93d5a4eeec2a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1802,6 +1802,30 @@ union bpf_attr { * Return * a non-negative value equal to or less than size on success, or * a negative error in case of failure. + * + * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Return + * 0 on success, or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1871,7 +1895,8 @@ union bpf_attr { FN(bind), \ FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ - FN(get_stack), + FN(get_stack), \ + FN(skb_load_bytes_relative), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -1932,6 +1957,12 @@ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, }; +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ -- cgit v1.2.3 From f910cefa32b6cdabc96b126bcfc46d8940b1dc45 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 2 May 2018 16:17:17 -0400 Subject: bpf: unify main prog and subprog Currently, verifier treat main prog and subprog differently. All subprogs detected are kept in env->subprog_starts while main prog is not kept there. Instead, main prog is implicitly defined as the prog start at 0. There is actually no difference between main prog and subprog, it is better to unify them, and register all progs detected into env->subprog_starts. This could also help simplifying some code logic. Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7e61c395fddf..f655b926e432 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -191,7 +191,7 @@ struct bpf_verifier_env { bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; - u32 subprog_starts[BPF_MAX_SUBPROGS]; + u32 subprog_starts[BPF_MAX_SUBPROGS + 1]; /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; -- cgit v1.2.3 From 9c8105bd4402236b1bb0f8f10709c5cec1440a0c Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Wed, 2 May 2018 16:17:18 -0400 Subject: bpf: centre subprog information fields It is better to centre all subprog information fields into one structure. This structure could later serve as function node in call graph. Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f655b926e432..8f70dc181e23 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -173,6 +173,11 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 +struct bpf_subprog_info { + u32 start; /* insn idx of function entry point */ + u16 stack_depth; /* max. stack depth used by this function */ +}; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -191,9 +196,7 @@ struct bpf_verifier_env { bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifier_log log; - u32 subprog_starts[BPF_MAX_SUBPROGS + 1]; - /* computes the stack depth of each bpf function */ - u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; + struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; -- cgit v1.2.3 From 0cd3cbed3caf6eae3bc0fa4afa4f26a9babfe55a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 May 2018 18:37:08 -0700 Subject: bpf: offload: allow offloaded programs to use perf event arrays BPF_MAP_TYPE_PERF_EVENT_ARRAY is special as far as offload goes. The map only holds glue to perf ring, not actual data. Allow non-offloaded perf event arrays to be used in offloaded programs. Offload driver can extract the events from HW and put them in the map for user space to retrieve. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0e00a13ff01b..321969da67b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -110,6 +110,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) return container_of(map, struct bpf_offloaded_map, map); } +static inline bool bpf_map_offload_neutral(const struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; +} + static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { return map->ops->map_seq_show_elem && map->ops->map_check_btf; -- cgit v1.2.3 From 8fb11a9a8d51df9a314a6d970436963c127ff1bd Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 4 May 2018 13:54:24 -0700 Subject: net/ipv6: rename rt6_next to fib6_next This slipped through the cracks in the followup set to the fib6_info flip. Rename rt6_next to fib6_next. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1af450d4e923..a3ec08d05756 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -135,7 +135,7 @@ struct fib6_nh { struct fib6_info { struct fib6_table *fib6_table; - struct fib6_info __rcu *rt6_next; + struct fib6_info __rcu *fib6_next; struct fib6_node __rcu *fib6_node; /* Multipath routes: @@ -192,11 +192,11 @@ struct rt6_info { #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ - rt = rcu_dereference(rt->rt6_next)) + rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ - rt = rcu_dereference_protected(rt->rt6_next, 1)) + rt = rcu_dereference_protected(rt->fib6_next, 1)) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { -- cgit v1.2.3 From d734a2888922efa521f9eb8dfe6790ced8f62bb7 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Sun, 22 Apr 2018 11:03:23 +0200 Subject: netfilter: nft_numgen: add map lookups for numgen statements This patch includes a new attribute in the numgen structure to allow the lookup of an element based on the number generator as a key. For this purpose, different ops have been included to extend the current numgen inc functions. Currently, only supported for numgen incremental operations, but it will be supported for random in a follow-up patch. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6a3d653d5b27..5a5551a580f7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1450,6 +1450,8 @@ enum nft_trace_types { * @NFTA_NG_MODULUS: maximum counter value (NLA_U32) * @NFTA_NG_TYPE: operation type (NLA_U32) * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32) + * @NFTA_NG_SET_NAME: name of the map to lookup (NLA_STRING) + * @NFTA_NG_SET_ID: id of the map (NLA_U32) */ enum nft_ng_attributes { NFTA_NG_UNSPEC, @@ -1457,6 +1459,8 @@ enum nft_ng_attributes { NFTA_NG_MODULUS, NFTA_NG_TYPE, NFTA_NG_OFFSET, + NFTA_NG_SET_NAME, + NFTA_NG_SET_ID, __NFTA_NG_MAX }; #define NFTA_NG_MAX (__NFTA_NG_MAX - 1) -- cgit v1.2.3 From c1c7e44b4f62e9c07fea8aaa58ed46cfae48ba3d Mon Sep 17 00:00:00 2001 From: Ahmed Abdelsalam Date: Wed, 25 Apr 2018 05:30:24 -0500 Subject: netfilter: ip6t_srh: extend SRH matching for previous, next and last SID IPv6 Segment Routing Header (SRH) contains a list of SIDs to be crossed by SR encapsulated packet. Each SID is encoded as an IPv6 prefix. When a Firewall receives an SR encapsulated packet, it should be able to identify which node previously processed the packet (previous SID), which node is going to process the packet next (next SID), and which node is the last to process the packet (last SID) which represent the final destination of the packet in case of inline SR mode. An example use-case of using these features could be SID list that includes two firewalls. When the second firewall receives a packet, it can check whether the packet has been processed by the first firewall or not. Based on that check, it decides to apply all rules, apply just subset of the rules, or totally skip all rules and forward the packet to the next SID. This patch extends SRH match to support matching previous SID, next SID, and last SID. Signed-off-by: Ahmed Abdelsalam Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 43 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h index f3cc0ef514a7..54ed83360dac 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h @@ -17,7 +17,10 @@ #define IP6T_SRH_LAST_GT 0x0100 #define IP6T_SRH_LAST_LT 0x0200 #define IP6T_SRH_TAG 0x0400 -#define IP6T_SRH_MASK 0x07FF +#define IP6T_SRH_PSID 0x0800 +#define IP6T_SRH_NSID 0x1000 +#define IP6T_SRH_LSID 0x2000 +#define IP6T_SRH_MASK 0x3FFF /* Values for "mt_invflags" field in struct ip6t_srh */ #define IP6T_SRH_INV_NEXTHDR 0x0001 @@ -31,7 +34,10 @@ #define IP6T_SRH_INV_LAST_GT 0x0100 #define IP6T_SRH_INV_LAST_LT 0x0200 #define IP6T_SRH_INV_TAG 0x0400 -#define IP6T_SRH_INV_MASK 0x07FF +#define IP6T_SRH_INV_PSID 0x0800 +#define IP6T_SRH_INV_NSID 0x1000 +#define IP6T_SRH_INV_LSID 0x2000 +#define IP6T_SRH_INV_MASK 0x3FFF /** * struct ip6t_srh - SRH match options @@ -54,4 +60,37 @@ struct ip6t_srh { __u16 mt_invflags; }; +/** + * struct ip6t_srh1 - SRH match options (revision 1) + * @ next_hdr: Next header field of SRH + * @ hdr_len: Extension header length field of SRH + * @ segs_left: Segments left field of SRH + * @ last_entry: Last entry field of SRH + * @ tag: Tag field of SRH + * @ psid_addr: Address of previous SID in SRH SID list + * @ nsid_addr: Address of NEXT SID in SRH SID list + * @ lsid_addr: Address of LAST SID in SRH SID list + * @ psid_msk: Mask of previous SID in SRH SID list + * @ nsid_msk: Mask of next SID in SRH SID list + * @ lsid_msk: MAsk of last SID in SRH SID list + * @ mt_flags: match options + * @ mt_invflags: Invert the sense of match options + */ + +struct ip6t_srh1 { + __u8 next_hdr; + __u8 hdr_len; + __u8 segs_left; + __u8 last_entry; + __u16 tag; + struct in6_addr psid_addr; + struct in6_addr nsid_addr; + struct in6_addr lsid_addr; + struct in6_addr psid_msk; + struct in6_addr nsid_msk; + struct in6_addr lsid_msk; + __u16 mt_flags; + __u16 mt_invflags; +}; + #endif /*_IP6T_SRH_H*/ -- cgit v1.2.3 From 3a2e86f64577be9a6e2c13ee06834e769cd3824f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Apr 2018 17:42:15 +0200 Subject: netfilter: nf_nat: remove unused ct arg from lookup functions Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l3proto.h | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index ac47098a61dc..8bad2560576f 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -48,30 +48,26 @@ unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -81,29 +77,25 @@ unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)); + const struct nf_hook_state *state)); #endif /* _NF_NAT_L3PROTO_H */ -- cgit v1.2.3 From 3f9c56a581b96d8117922c4fd8221687fd649f9b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 27 Apr 2018 12:47:01 +0200 Subject: netfilter: nf_tables: Provide NFT_{RT,CT}_MAX for userspace These macros allow conveniently declaring arrays which use NFT_{RT,CT}_* values as indexes. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 5a5551a580f7..ce031cf72288 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -831,7 +831,9 @@ enum nft_rt_keys { NFT_RT_NEXTHOP4, NFT_RT_NEXTHOP6, NFT_RT_TCPMSS, + __NFT_RT_MAX }; +#define NFT_RT_MAX (__NFT_RT_MAX - 1) /** * enum nft_hash_types - nf_tables hash expression types @@ -949,7 +951,9 @@ enum nft_ct_keys { NFT_CT_DST_IP, NFT_CT_SRC_IP6, NFT_CT_DST_IP6, + __NFT_CT_MAX }; +#define NFT_CT_MAX (__NFT_CT_MAX - 1) /** * enum nft_ct_attributes - nf_tables ct expression netlink attributes -- cgit v1.2.3 From bfb15f2a95cbbc548b59abf8007d0fdb35fdfee5 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Thu, 3 May 2018 14:05:40 +0200 Subject: netfilter: extract Passive OS fingerprint infrastructure from xt_osf Add nf_osf_ttl() and nf_osf_match() into nf_osf.c to prepare for nf_tables support. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 27 +++++++++ include/uapi/linux/netfilter/nf_osf.h | 90 +++++++++++++++++++++++++++++ include/uapi/linux/netfilter/xt_osf.h | 106 ++++++---------------------------- 3 files changed, 134 insertions(+), 89 deletions(-) create mode 100644 include/linux/netfilter/nf_osf.h create mode 100644 include/uapi/linux/netfilter/nf_osf.h (limited to 'include') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h new file mode 100644 index 000000000000..a2b39602e87d --- /dev/null +++ b/include/linux/netfilter/nf_osf.h @@ -0,0 +1,27 @@ +#include + +/* Initial window size option state machine: multiple of mss, mtu or + * plain numeric value. Can also be made as plain numeric value which + * is not a multiple of specified value. + */ +enum nf_osf_window_size_options { + OSF_WSS_PLAIN = 0, + OSF_WSS_MSS, + OSF_WSS_MTU, + OSF_WSS_MODULO, + OSF_WSS_MAX, +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers); diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h new file mode 100644 index 000000000000..45376eae31ef --- /dev/null +++ b/include/uapi/linux/netfilter/nf_osf.h @@ -0,0 +1,90 @@ +#ifndef _NF_OSF_H +#define _NF_OSF_H + +#define MAXGENRELEN 32 + +#define NF_OSF_GENRE (1 << 0) +#define NF_OSF_TTL (1 << 1) +#define NF_OSF_LOG (1 << 2) +#define NF_OSF_INVERT (1 << 3) + +#define NF_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ +#define NF_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ +#define NF_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ + +#define NF_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ + +/* Do not compare ip and fingerprint TTL at all */ +#define NF_OSF_TTL_NOCHECK 2 + +/* Wildcard MSS (kind of). + * It is used to implement a state machine for the different wildcard values + * of the MSS and window sizes. + */ +struct nf_osf_wc { + __u32 wc; + __u32 val; +}; + +/* This struct represents IANA options + * http://www.iana.org/assignments/tcp-parameters + */ +struct nf_osf_opt { + __u16 kind, length; + struct nf_osf_wc wc; +}; + +struct nf_osf_info { + char genre[MAXGENRELEN]; + __u32 len; + __u32 flags; + __u32 loglevel; + __u32 ttl; +}; + +struct nf_osf_user_finger { + struct nf_osf_wc wss; + + __u8 ttl, df; + __u16 ss, mss; + __u16 opt_num; + + char genre[MAXGENRELEN]; + char version[MAXGENRELEN]; + char subtype[MAXGENRELEN]; + + /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ + struct nf_osf_opt opt[MAX_IPOPTLEN]; +}; + +struct nf_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct nf_osf_user_finger finger; +}; + +struct nf_osf_nlmsg { + struct nf_osf_user_finger f; + struct iphdr ip; + struct tcphdr tcp; +}; + +/* Defines for IANA option kinds */ +enum iana_options { + OSFOPT_EOL = 0, /* End of options */ + OSFOPT_NOP, /* NOP */ + OSFOPT_MSS, /* Maximum segment size */ + OSFOPT_WSO, /* Window scale option */ + OSFOPT_SACKP, /* SACK permitted */ + OSFOPT_SACK, /* SACK */ + OSFOPT_ECHO, + OSFOPT_ECHOREPLY, + OSFOPT_TS, /* Timestamp option */ + OSFOPT_POCP, /* Partial Order Connection Permitted */ + OSFOPT_POSP, /* Partial Order Service Profile */ + + /* Others are not used in the current OSF */ + OSFOPT_EMPTY = 255, +}; + +#endif /* _NF_OSF_H */ diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index dad197e2ab99..72956eceeb09 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -23,101 +23,29 @@ #include #include #include +#include -#define MAXGENRELEN 32 +#define XT_OSF_GENRE NF_OSF_GENRE +#define XT_OSF_INVERT NF_OSF_INVERT -#define XT_OSF_GENRE (1<<0) -#define XT_OSF_TTL (1<<1) -#define XT_OSF_LOG (1<<2) -#define XT_OSF_INVERT (1<<3) +#define XT_OSF_TTL NF_OSF_TTL +#define XT_OSF_LOG NF_OSF_LOG -#define XT_OSF_LOGLEVEL_ALL 0 /* log all matched fingerprints */ -#define XT_OSF_LOGLEVEL_FIRST 1 /* log only the first matced fingerprint */ -#define XT_OSF_LOGLEVEL_ALL_KNOWN 2 /* do not log unknown packets */ +#define XT_OSF_LOGLEVEL_ALL NF_OSF_LOGLEVEL_ALL +#define XT_OSF_LOGLEVEL_FIRST NF_OSF_LOGLEVEL_FIRST +#define XT_OSF_LOGLEVEL_ALL_KNOWN NF_OSF_LOGLEVEL_ALL_KNOWN -#define XT_OSF_TTL_TRUE 0 /* True ip and fingerprint TTL comparison */ -#define XT_OSF_TTL_LESS 1 /* Check if ip TTL is less than fingerprint one */ -#define XT_OSF_TTL_NOCHECK 2 /* Do not compare ip and fingerprint TTL at all */ +#define XT_OSF_TTL_TRUE NF_OSF_TTL_TRUE +#define XT_OSF_TTL_NOCHECK NF_OSF_TTL_NOCHECK -struct xt_osf_info { - char genre[MAXGENRELEN]; - __u32 len; - __u32 flags; - __u32 loglevel; - __u32 ttl; -}; - -/* - * Wildcard MSS (kind of). - * It is used to implement a state machine for the different wildcard values - * of the MSS and window sizes. - */ -struct xt_osf_wc { - __u32 wc; - __u32 val; -}; - -/* - * This struct represents IANA options - * http://www.iana.org/assignments/tcp-parameters - */ -struct xt_osf_opt { - __u16 kind, length; - struct xt_osf_wc wc; -}; - -struct xt_osf_user_finger { - struct xt_osf_wc wss; - - __u8 ttl, df; - __u16 ss, mss; - __u16 opt_num; - - char genre[MAXGENRELEN]; - char version[MAXGENRELEN]; - char subtype[MAXGENRELEN]; +#define XT_OSF_TTL_LESS 1 /* Check if ip TTL is less than fingerprint one */ - /* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */ - struct xt_osf_opt opt[MAX_IPOPTLEN]; -}; - -struct xt_osf_nlmsg { - struct xt_osf_user_finger f; - struct iphdr ip; - struct tcphdr tcp; -}; - -/* Defines for IANA option kinds */ - -enum iana_options { - OSFOPT_EOL = 0, /* End of options */ - OSFOPT_NOP, /* NOP */ - OSFOPT_MSS, /* Maximum segment size */ - OSFOPT_WSO, /* Window scale option */ - OSFOPT_SACKP, /* SACK permitted */ - OSFOPT_SACK, /* SACK */ - OSFOPT_ECHO, - OSFOPT_ECHOREPLY, - OSFOPT_TS, /* Timestamp option */ - OSFOPT_POCP, /* Partial Order Connection Permitted */ - OSFOPT_POSP, /* Partial Order Service Profile */ - - /* Others are not used in the current OSF */ - OSFOPT_EMPTY = 255, -}; - -/* - * Initial window size option state machine: multiple of mss, mtu or - * plain numeric value. Can also be made as plain numeric value which - * is not a multiple of specified value. - */ -enum xt_osf_window_size_options { - OSF_WSS_PLAIN = 0, - OSF_WSS_MSS, - OSF_WSS_MTU, - OSF_WSS_MODULO, - OSF_WSS_MAX, -}; +#define xt_osf_wc nf_osf_wc +#define xt_osf_opt nf_osf_opt +#define xt_osf_info nf_osf_info +#define xt_osf_user_finger nf_osf_user_finger +#define xt_osf_finger nf_osf_finger +#define xt_osf_nlmsg nf_osf_nlmsg /* * Add/remove fingerprint from the kernel. -- cgit v1.2.3 From 538c5672be6d67b7b10c15701588e20617374973 Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Sun, 6 May 2018 16:30:14 +0200 Subject: netfilter: ctnetlink: export nf_conntrack_max IPCTNL_MSG_CT_GET_STATS netlink command allow to monitor current number of conntrack entries. However, if one wants to compare it with the maximum (and detect exhaustion), the only solution is currently to read sysctl value. This patch add nf_conntrack_max value in netlink message, and simplify monitoring for application built on netlink API. Signed-off-by: Florent Fourcot Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 77987111cab0..1d41810d17e2 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -262,6 +262,7 @@ enum ctattr_stats_cpu { enum ctattr_stats_global { CTA_STATS_GLOBAL_UNSPEC, CTA_STATS_GLOBAL_ENTRIES, + CTA_STATS_GLOBAL_MAX_ENTRIES, __CTA_STATS_GLOBAL_MAX, }; #define CTA_STATS_GLOBAL_MAX (__CTA_STATS_GLOBAL_MAX - 1) -- cgit v1.2.3 From 03737001e463d458c13510fcb64179b3368aaeee Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Fri, 20 Apr 2018 13:49:24 +0300 Subject: mac80211: add api to set CSA counter in mac80211 Sometimes the most updated CSA counter values are known only to the device. Add an API to pass this data to mac80211. Signed-off-by: Gregory Greenman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d2279b2d61aa..52f36c43f35f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4449,6 +4449,19 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, */ u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif); +/** + * ieee80211_csa_set_counter - request mac80211 to set csa counter + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @counter: the new value for the counter + * + * The csa counter can be changed by the device, this API should be + * used by the device driver to update csa counter in mac80211. + * + * It should never be used together with ieee80211_csa_update_counter(), + * as it will cause a race condition around the counter value. + */ +void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter); + /** * ieee80211_csa_finish - notify mac80211 about channel switch * @vif: &struct ieee80211_vif pointer from the add_interface callback. -- cgit v1.2.3 From 50f32718e125c3be5b0528bfa3868e88d677d8ce Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Fri, 20 Apr 2018 13:49:26 +0300 Subject: nl80211: Add wmm rule attribute to NL80211_CMD_GET_WIPHY dump command This will serve userspace entity to maintain its regulatory limitation. More specifcally APs can use this data to calculate the WMM IE when building: beacons, probe responses, assoc responses etc... Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 15daf5e2638d..04c9b97aa5fc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,6 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -3141,6 +3142,29 @@ enum nl80211_band_attr { #define NL80211_BAND_ATTR_HT_CAPA NL80211_BAND_ATTR_HT_CAPA +/** + * enum nl80211_wmm_rule - regulatory wmm rule + * + * @__NL80211_WMMR_INVALID: attribute number 0 is reserved + * @NL80211_WMMR_CW_MIN: Minimum contention window slot. + * @NL80211_WMMR_CW_MAX: Maximum contention window slot. + * @NL80211_WMMR_AIFSN: Arbitration Inter Frame Space. + * @NL80211_WMMR_TXOP: Maximum allowed tx operation time. + * @nl80211_WMMR_MAX: highest possible wmm rule. + * @__NL80211_WMMR_LAST: Internal use. + */ +enum nl80211_wmm_rule { + __NL80211_WMMR_INVALID, + NL80211_WMMR_CW_MIN, + NL80211_WMMR_CW_MAX, + NL80211_WMMR_AIFSN, + NL80211_WMMR_TXOP, + + /* keep last */ + __NL80211_WMMR_LAST, + NL80211_WMMR_MAX = __NL80211_WMMR_LAST - 1 +}; + /** * enum nl80211_frequency_attr - frequency attributes * @__NL80211_FREQUENCY_ATTR_INVALID: attribute number 0 is reserved @@ -3190,6 +3214,9 @@ enum nl80211_band_attr { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_NO_10MHZ: 10 MHz operation is not allowed * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_WMM: this channel has wmm limitations. + * This is a nested attribute that contains the wmm limitation per AC. + * (see &enum nl80211_wmm_rule) * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3218,6 +3245,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_IR_CONCURRENT, NL80211_FREQUENCY_ATTR_NO_20MHZ, NL80211_FREQUENCY_ATTR_NO_10MHZ, + NL80211_FREQUENCY_ATTR_WMM, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, -- cgit v1.2.3 From 81d5439da84419ee35bea54309a9f2c3871b6605 Mon Sep 17 00:00:00 2001 From: Balaji Pothunoori Date: Mon, 16 Apr 2018 20:18:40 +0530 Subject: cfg80211: average ack rssi support for data frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Average ack rssi will be given to userspace via NL80211 interface if firmware is capable. Userspace tool ‘iw’ can process this information and give the output as one of the fields in ‘iw dev wlanX station dump’. Example output : localhost ~ #iw dev wlan-5000mhz station dump Station 34:f3:9a:aa:3b:29 (on wlan-5000mhz) inactive time: 5370 ms rx bytes: 85321 rx packets: 576 tx bytes: 14225 tx packets: 71 tx retries: 0 tx failed: 2 beacon loss: 0 rx drop misc: 0 signal: -54 dBm signal avg: -53 dBm tx bitrate: 866.7 MBit/s VHT-MCS 9 80MHz short GI VHT-NSS 2 rx bitrate: 866.7 MBit/s VHT-MCS 9 80MHz short GI VHT-NSS 2 avg ack signal: -56 dBm authorized: yes authenticated: yes associated: yes preamble: short WMM/WME: yes MFP: no TDLS peer: no DTIM period: 2 beacon interval:100 short preamble: yes short slot time:yes connected time: 203 seconds Main use case is to measure the signal strength of a connected station to AP. Data packet transmit rates and bandwidth used by station can vary a lot even if the station is at fixed location, especially if the rates used are multi stream(2stream, 3stream) rates with different bandwidth(20/40/80 Mhz). These multi stream rates are sensitive and station can use different transmit power for each of the rate and bandwidth combinations. RSSI measured from these RX packets on AP will be not stable and can vary a lot with in a short time. Whereas 802.11 ack frames from station are sent relatively at a constant rate (6/12/24 Mbps) with constant bandwidth(20 Mhz). So average rssi of the ack packets is good and more accurate. Signed-off-by: Balaji Pothunoori Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 250dac390806..5e888ec62811 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1152,6 +1152,8 @@ struct cfg80211_tid_stats { * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. * @ack_signal: signal strength (in dBm) of the last ACK frame. + * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has + * been sent. */ struct station_info { u64 filled; @@ -1197,6 +1199,7 @@ struct station_info { u8 rx_beacon_signal_avg; struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1]; s8 ack_signal; + s8 avg_ack_signal; }; #if IS_ENABLED(CONFIG_CFG80211) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 04c9b97aa5fc..8e55b63ed3ff 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2981,6 +2981,8 @@ enum nl80211_sta_bss_param { * received from the station (u64, usec) * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm) + * @NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG: avg signal strength of (data) + * ACK frame (s8, dBm) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3020,6 +3022,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_RX_DURATION, NL80211_STA_INFO_PAD, NL80211_STA_INFO_ACK_SIGNAL, + NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, @@ -5066,6 +5069,9 @@ enum nl80211_feature_flags { * "radar detected" event. * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and * receiving control port frames over nl80211 instead of the netdevice. + * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack + * rssi if firmware support, this flag is to intimate about ack rssi + * support to nl80211. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -5098,6 +5104,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, NL80211_EXT_FEATURE_DFS_OFFLOAD, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211, + NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From c36a68fab1c3ddf71918cd1e5937a647cd80ba2b Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Fri, 4 May 2018 17:17:46 +0200 Subject: net: u64_stats_sync: Remove functions without user Commit 67db3e4bfbc9 ("tcp: no longer hold ehash lock while calling tcp_get_info()") removes the only users of u64_stats_update_end/begin_raw() without removing the function in header file. Remove no longer used functions. Cc: Eric Dumazet Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/u64_stats_sync.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 07ee0f84a46c..a27604f99ed0 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -112,20 +112,6 @@ u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, #endif } -static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - raw_write_seqcount_begin(&syncp->seq); -#endif -} - -static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - raw_write_seqcount_end(&syncp->seq); -#endif -} - static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) -- cgit v1.2.3 From 0bc5fe857274133ca028ebb15ff2e8549a369916 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Sat, 5 May 2018 18:42:59 -0700 Subject: qed*: Refactor mf_mode to consist of bits. `mf_mode' field indicates the multi-partitioning mode the device is configured to. This method doesn't scale very well, adding a new MF mode requires going over all the existing conditions, and deciding whether those are needed for the new mode or not. The patch defines a set of bit-fields for modes which are derived according to the mode info shared by the MFW and all the configuration would be made according to those. To add a new mode, there would be a single place where we'll need to go and choose which bits apply and which don't. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index e53f9c7c2809..5dac56147a07 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -359,7 +359,7 @@ struct qed_dev_info { #define QED_MFW_VERSION_3_OFFSET 24 u32 flash_size; - u8 mf_mode; + bool b_inter_pf_switch; bool tx_switching; bool rdma_supported; u16 mtu; -- cgit v1.2.3 From 27bf96e32c92599dc7523b36d6c761fc8312c8c0 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Sat, 5 May 2018 18:43:00 -0700 Subject: qed: Remove unused data member 'is_mf_default'. The data member 'is_mf_default' is not used by the qed/qede drivers, removing the same. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 5dac56147a07..907976fd56f7 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -339,7 +339,6 @@ struct qed_dev_info { u8 num_hwfns; u8 hw_mac[ETH_ALEN]; - bool is_mf_default; /* FW version */ u16 fw_major; -- cgit v1.2.3 From cac6f691546b9efd50c31c0db97fe50d0357104a Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Sat, 5 May 2018 18:43:02 -0700 Subject: qed: Add support for Unified Fabric Port. This patch adds driver changes for supporting the Unified Fabric Port (UFP). This is a new paritioning mode wherein MFW provides the set of parameters to be used by the device such as traffic class, outer-vlan tag value, priority type etc. Drivers receives this info via notifications from mfw and configures the hardware accordingly. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_ll2_if.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 266c1fb45387..5eb022953aca 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -202,6 +202,7 @@ struct qed_ll2_tx_pkt_info { bool enable_ip_cksum; bool enable_l4_cksum; bool calc_ip_len; + bool remove_stag; }; #define QED_LL2_UNUSED_HANDLE (0xff) @@ -220,6 +221,11 @@ struct qed_ll2_params { u8 ll2_mac_address[ETH_ALEN]; }; +enum qed_ll2_xmit_flags { + /* FIP discovery packet */ + QED_LL2_XMIT_FLAGS_FIP_DISCOVERY +}; + struct qed_ll2_ops { /** * @brief start - initializes ll2 @@ -245,10 +251,12 @@ struct qed_ll2_ops { * * @param cdev * @param skb + * @param xmit_flags - Transmit options defined by the enum qed_ll2_xmit_flags. * * @return 0 on success, otherwise error value. */ - int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb); + int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb, + unsigned long xmit_flags); /** * @brief register_cb_ops - protocol driver register the callback for Rx/Tx -- cgit v1.2.3 From 72a338bcc6ae51e01c95d687e5d775e3fe52eff1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 May 2018 11:32:59 +0200 Subject: net: core: rework basic flow dissection helper When the core networking needs to detect the transport offset in a given packet and parse it explicitly, a full-blown flow_keys struct is used for storage. This patch introduces a smaller keys store, rework the basic flow dissect helper to use it, and apply this new helper where possible - namely in skb_probe_transport_header(). The used flow dissector data structures are renamed to match more closely the new role. The above gives ~50% performance improvement in micro benchmarking around skb_probe_transport_header() and ~30% around eth_get_headlen(), mostly due to the smaller memset. Small, but measurable improvement is measured also in macro benchmarking. v1 -> v2: use the new helper in eth_get_headlen() and skb_get_poff(), as per DaveM suggestion Suggested-by: David Miller Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++-------- include/net/flow_dissector.h | 7 ++++++- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 908d66e55b14..693564a9a979 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1171,7 +1171,7 @@ void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen); + const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, void *data, int hlen_proto); @@ -1208,13 +1208,14 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, NULL, 0, 0, 0, flags); } -static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, - void *data, __be16 proto, - int nhoff, int hlen, - unsigned int flags) +static inline bool +skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb, + struct flow_keys_basic *flow, void *data, + __be16 proto, int nhoff, int hlen, + unsigned int flags) { memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, + return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } @@ -2350,11 +2351,12 @@ static inline void skb_pop_mac_header(struct sk_buff *skb) static inline void skb_probe_transport_header(struct sk_buff *skb, const int offset_hint) { - struct flow_keys keys; + struct flow_keys_basic keys; if (skb_transport_header_was_set(skb)) return; - else if (skb_flow_dissect_flow_keys(skb, &keys, 0)) + + if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 9a074776f70b..e2f6e5c928bb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -226,6 +226,11 @@ struct flow_dissector { unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; +struct flow_keys_basic { + struct flow_dissector_key_control control; + struct flow_dissector_key_basic basic; +}; + struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic @@ -244,7 +249,7 @@ __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; -extern struct flow_dissector flow_keys_buf_dissector; +extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * -- cgit v1.2.3 From d869dea664e662a4380ccf59cb9be9888a7af53a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 May 2018 12:06:03 +0200 Subject: flow_dissector: do not rely on implicit casts This change fixes a couple of type mismatch reported by the sparse tool, explicitly using the requested type for the offending arguments. Signed-off-by: Paolo Abeni Acked-by: Jon Maloy Signed-off-by: David S. Miller --- include/net/tipc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tipc.h b/include/net/tipc.h index 07670ec022a7..f0e7e6bc1bef 100644 --- a/include/net/tipc.h +++ b/include/net/tipc.h @@ -44,11 +44,11 @@ struct tipc_basic_hdr { __be32 w[4]; }; -static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) +static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) { u32 w0 = ntohl(hdr->w[0]); bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; - int key; + __be32 key; /* Return source node identity as key */ if (likely(!keepalive_msg)) -- cgit v1.2.3 From 52539ca89f365d3db530535fbffa88a3cca4d2ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 8 May 2018 13:03:50 +0200 Subject: cfg80211: Expose TXQ stats and parameters to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for exporting the mac80211 TXQ stats via nl80211 by way of a nested TXQ stats attribute, as well as for configuring the quantum and limits that were previously only changeable through debugfs. This commit adds just the nl80211 API, a subsequent commit adds support to mac80211 itself. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 50 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 58 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5e888ec62811..8db6071b6063 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1079,6 +1079,37 @@ struct sta_bss_parameters { u16 beacon_interval; }; +/** + * struct cfg80211_txq_stats - TXQ statistics for this TID + * @filled: bitmap of flags using the bits of &enum nl80211_txq_stats to + * indicate the relevant values in this struct are filled + * @backlog_bytes: total number of bytes currently backlogged + * @backlog_packets: total number of packets currently backlogged + * @flows: number of new flows seen + * @drops: total number of packets dropped + * @ecn_marks: total number of packets marked with ECN CE + * @overlimit: number of drops due to queue space overflow + * @overmemory: number of drops due to memory limit overflow + * @collisions: number of hash collisions + * @tx_bytes: total number of bytes dequeued + * @tx_packets: total number of packets dequeued + * @max_flows: maximum number of flows supported + */ +struct cfg80211_txq_stats { + u32 filled; + u32 backlog_bytes; + u32 backlog_packets; + u32 flows; + u32 drops; + u32 ecn_marks; + u32 overlimit; + u32 overmemory; + u32 collisions; + u32 tx_bytes; + u32 tx_packets; + u32 max_flows; +}; + /** * struct cfg80211_tid_stats - per-TID statistics * @filled: bitmap of flags using the bits of &enum nl80211_tid_stats to @@ -1088,6 +1119,7 @@ struct sta_bss_parameters { * @tx_msdu_retries: number of retries (not counting the first) for * transmitted MSDUs * @tx_msdu_failed: number of failed transmitted MSDUs + * @txq_stats: TXQ statistics */ struct cfg80211_tid_stats { u32 filled; @@ -1095,6 +1127,7 @@ struct cfg80211_tid_stats { u64 tx_msdu; u64 tx_msdu_retries; u64 tx_msdu_failed; + struct cfg80211_txq_stats txq_stats; }; #define IEEE80211_MAX_CHAINS 4 @@ -2204,6 +2237,9 @@ enum cfg80211_connect_params_changed { * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed * @WIPHY_PARAM_DYN_ACK: dynack has been enabled + * @WIPHY_PARAM_TXQ_LIMIT: TXQ packet limit has been changed + * @WIPHY_PARAM_TXQ_MEMORY_LIMIT: TXQ memory limit has been changed + * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum */ enum wiphy_params_flags { WIPHY_PARAM_RETRY_SHORT = 1 << 0, @@ -2212,6 +2248,9 @@ enum wiphy_params_flags { WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, WIPHY_PARAM_DYN_ACK = 1 << 5, + WIPHY_PARAM_TXQ_LIMIT = 1 << 6, + WIPHY_PARAM_TXQ_MEMORY_LIMIT = 1 << 7, + WIPHY_PARAM_TXQ_QUANTUM = 1 << 8, }; /** @@ -2964,6 +3003,9 @@ struct cfg80211_external_auth_params { * * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS * + * @get_txq_stats: Get TXQ stats for interface or phy. If wdev is %NULL, this + * function should return phy stats, and interface stats otherwise. + * * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake. * If not deleted through @del_pmk the PMK remains valid until disconnect * upon which the driver should clear it. @@ -3265,6 +3307,10 @@ struct cfg80211_ops { struct net_device *dev, const bool enabled); + int (*get_txq_stats)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_txq_stats *txqstats); + int (*set_pmk)(struct wiphy *wiphy, struct net_device *dev, const struct cfg80211_pmk_conf *conf); int (*del_pmk)(struct wiphy *wiphy, struct net_device *dev, @@ -3943,6 +3989,10 @@ struct wiphy { u8 nan_supported_bands; + u32 txq_limit; + u32 txq_memory_limit; + u32 txq_quantum; + char priv[0] __aligned(NETDEV_ALIGN); }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8e55b63ed3ff..5e67e3444aba 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2226,6 +2226,16 @@ enum nl80211_commands { * @NL80211_ATTR_NSS: Station's New/updated RX_NSS value notified using this * u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED. * + * @NL80211_ATTR_TXQ_STATS: TXQ statistics (nested attribute, see &enum + * nl80211_txq_stats) + * @NL80211_ATTR_TXQ_LIMIT: Total packet limit for the TXQ queues for this phy. + * The smaller of this and the memory limit is enforced. + * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory memory limit (in bytes) for the + * TXQ queues for this phy. The smaller of this and the packet limit is + * enforced. + * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes + * a flow is assigned on each round of the DRR scheduler. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2660,6 +2670,11 @@ enum nl80211_attrs { NL80211_ATTR_CONTROL_PORT_OVER_NL80211, + NL80211_ATTR_TXQ_STATS, + NL80211_ATTR_TXQ_LIMIT, + NL80211_ATTR_TXQ_MEMORY_LIMIT, + NL80211_ATTR_TXQ_QUANTUM, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3040,6 +3055,7 @@ enum nl80211_sta_info { * @NL80211_TID_STATS_TX_MSDU_FAILED: number of failed transmitted * MSDUs (u64) * @NL80211_TID_STATS_PAD: attribute used for padding for 64-bit alignment + * @NL80211_TID_STATS_TXQ_STATS: TXQ stats (nested attribute) * @NUM_NL80211_TID_STATS: number of attributes here * @NL80211_TID_STATS_MAX: highest numbered attribute here */ @@ -3050,12 +3066,51 @@ enum nl80211_tid_stats { NL80211_TID_STATS_TX_MSDU_RETRIES, NL80211_TID_STATS_TX_MSDU_FAILED, NL80211_TID_STATS_PAD, + NL80211_TID_STATS_TXQ_STATS, /* keep last */ NUM_NL80211_TID_STATS, NL80211_TID_STATS_MAX = NUM_NL80211_TID_STATS - 1 }; +/** + * enum nl80211_txq_stats - per TXQ statistics attributes + * @__NL80211_TXQ_STATS_INVALID: attribute number 0 is reserved + * @NUM_NL80211_TXQ_STATS: number of attributes here + * @NL80211_TXQ_STATS_BACKLOG_BYTES: number of bytes currently backlogged + * @NL80211_TXQ_STATS_BACKLOG_PACKETS: number of packets currently + * backlogged + * @NL80211_TXQ_STATS_FLOWS: total number of new flows seen + * @NL80211_TXQ_STATS_DROPS: total number of packet drops + * @NL80211_TXQ_STATS_ECN_MARKS: total number of packet ECN marks + * @NL80211_TXQ_STATS_OVERLIMIT: number of drops due to queue space overflow + * @NL80211_TXQ_STATS_OVERMEMORY: number of drops due to memory limit overflow + * (only for per-phy stats) + * @NL80211_TXQ_STATS_COLLISIONS: number of hash collisions + * @NL80211_TXQ_STATS_TX_BYTES: total number of bytes dequeued from TXQ + * @NL80211_TXQ_STATS_TX_PACKETS: total number of packets dequeued from TXQ + * @NL80211_TXQ_STATS_MAX_FLOWS: number of flow buckets for PHY + * @NL80211_TXQ_STATS_MAX: highest numbered attribute here + */ +enum nl80211_txq_stats { + __NL80211_TXQ_STATS_INVALID, + NL80211_TXQ_STATS_BACKLOG_BYTES, + NL80211_TXQ_STATS_BACKLOG_PACKETS, + NL80211_TXQ_STATS_FLOWS, + NL80211_TXQ_STATS_DROPS, + NL80211_TXQ_STATS_ECN_MARKS, + NL80211_TXQ_STATS_OVERLIMIT, + NL80211_TXQ_STATS_OVERMEMORY, + NL80211_TXQ_STATS_COLLISIONS, + NL80211_TXQ_STATS_TX_BYTES, + NL80211_TXQ_STATS_TX_PACKETS, + NL80211_TXQ_STATS_MAX_FLOWS, + + /* keep last */ + NUM_NL80211_TXQ_STATS, + NL80211_TXQ_STATS_MAX = NUM_NL80211_TXQ_STATS - 1 +}; + /** * enum nl80211_mpath_flags - nl80211 mesh path flags * @@ -5072,6 +5127,8 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack * rssi if firmware support, this flag is to intimate about ack rssi * support to nl80211. + * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate + * TXQs. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -5105,6 +5162,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_DFS_OFFLOAD, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211, NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT, + NL80211_EXT_FEATURE_TXQS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From b21c034b3df833b5d9db1cfdc3938dbb0d7995c6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 7 May 2018 11:08:28 -0700 Subject: udp: Do not pass MSS as parameter to GSO segmentation There is no point in passing MSS as a parameter for for the GSO segmentation call as it is already available via the shared info for the skb itself. Reviewed-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/udp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 05990746810e..8bd83b044ecd 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -176,7 +176,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, - unsigned int mss, __sum16 check); + __sum16 check); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { -- cgit v1.2.3 From 9a0d41b3598ff62ecb26661bbfb1d523586cdea3 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 7 May 2018 11:08:34 -0700 Subject: udp: Do not pass checksum as a parameter to GSO segmentation This patch is meant to allow us to avoid having to recompute the checksum from scratch and have it passed as a parameter. Instead of taking that approach we can take advantage of the fact that the length that was used to compute the existing checksum is included in the UDP header. Finally to avoid the need to invert the result we can just call csum16_add and csum16_sub directly. By doing this we can avoid a number of instructions in the loop that is handling segmentation. Signed-off-by: Alexander Duyck Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/udp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 8bd83b044ecd..9289b6425032 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -175,8 +175,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, - netdev_features_t features, - __sum16 check); + netdev_features_t features); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { -- cgit v1.2.3 From 9c4a121e82634aa000a702c98cd6f05b27d6e186 Mon Sep 17 00:00:00 2001 From: Sean Lanigan Date: Fri, 4 May 2018 16:48:23 +1000 Subject: brcmfmac: Add support for bcm43364 wireless chipset Add support for the BCM43364 chipset via an SDIO interface, as used in e.g. the Murata 1FX module. The BCM43364 uses the same firmware as the BCM43430 (which is already included), the only difference is the omission of Bluetooth. However, the SDIO_ID for the BCM43364 is 02D0:A9A4, giving it a MODALIAS of sdio:c00v02D0dA9A4, which doesn't get recognised and hence doesn't load the brcmfmac module. Adding the 'A9A4' ID in the appropriate place triggers the brcmfmac driver to load, and then correctly use the firmware file 'brcmfmac43430-sdio.bin'. Signed-off-by: Sean Lanigan Acked-by: Ulf Hansson Signed-off-by: Kalle Valo --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index cdd66a5fbd5e..0a7abe8a407f 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -35,6 +35,7 @@ #define SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 #define SDIO_DEVICE_ID_BROADCOM_4339 0x4339 #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 +#define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 #define SDIO_DEVICE_ID_BROADCOM_4345 0x4345 #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf -- cgit v1.2.3 From 78958fca7ead2f81b60a6827881c4866d1ed0c52 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 4 May 2018 14:49:51 -0700 Subject: bpf: btf: Introduce BTF ID This patch gives an ID to each loaded BTF. The ID is allocated by the idr like the existing prog-id and map-id. The bpf_put(map->btf) is moved to __bpf_map_put() so that the userspace can stop seeing the BTF ID ASAP when the last BTF refcnt is gone. It also makes BTF accessible from userspace through the 1. new BPF_BTF_GET_FD_BY_ID command. It is limited to CAP_SYS_ADMIN which is inline with the BPF_BTF_LOAD cmd and the existing BPF_[MAP|PROG]_GET_FD_BY_ID cmd. 2. new btf_id (and btf_key_id + btf_value_id) in "struct bpf_map_info" Once the BTF ID handler is accessible from userspace, freeing a BTF object has to go through a rcu period. The BPF_BTF_GET_FD_BY_ID cmd can then be done under a rcu_read_lock() instead of taking spin_lock. [Note: A similar rcu usage can be done to the existing bpf_prog_get_fd_by_id() in a follow up patch] When processing the BPF_BTF_GET_FD_BY_ID cmd, refcount_inc_not_zero() is needed because the BTF object could be already in the rcu dead row . btf_get() is removed since its usage is currently limited to btf.c alone. refcount_inc() is used directly instead. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 2 ++ include/uapi/linux/bpf.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index a966dc6d61ee..e076c4697049 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -44,5 +44,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *ret_size); void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); +int btf_get_fd_by_id(u32 id); +u32 btf_id(const struct btf *btf); #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 93d5a4eeec2a..6106f23a9a8a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_cmd { BPF_PROG_QUERY, BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, }; enum bpf_map_type { @@ -344,6 +345,7 @@ union bpf_attr { __u32 start_id; __u32 prog_id; __u32 map_id; + __u32 btf_id; }; __u32 next_id; __u32 open_flags; @@ -2130,6 +2132,9 @@ struct bpf_map_info { __u32 ifindex; __u64 netns_dev; __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_id; + __u32 btf_value_id; } __attribute__((aligned(8))); /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed -- cgit v1.2.3 From 62dab84c81a487d946a5fc37c6df541dd95cca38 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 4 May 2018 14:49:52 -0700 Subject: bpf: btf: Add struct bpf_btf_info During BPF_OBJ_GET_INFO_BY_FD on a btf_fd, the current bpf_attr's info.info is directly filled with the BTF binary data. It is not extensible. In this case, we want to add BTF ID. This patch adds "struct bpf_btf_info" which has the BTF ID as one of its member. The BTF binary data itself is exposed through the "btf" and "btf_size" members. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6106f23a9a8a..d615c777b573 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2137,6 +2137,12 @@ struct bpf_map_info { __u32 btf_value_id; } __attribute__((aligned(8))); +struct bpf_btf_info { + __aligned_u64 btf; + __u32 btf_size; + __u32 id; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach attach type). -- cgit v1.2.3 From 0d8300325660f81787892a1c58dc1f9428a67143 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 May 2018 19:37:06 -0700 Subject: bpf: xdp: allow offloads to store into rx_queue_index It's fairly easy for offloaded XDP programs to select the RX queue packets go to. We need a way of expressing this in the software. Allow write to the rx_queue_index field of struct xdp_md for device-bound programs. Skip convert_ctx_access callback entirely for offloads. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 321969da67b7..a38e474bf7ee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -627,7 +627,7 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); -static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) { return aux->offload_requested; } -- cgit v1.2.3 From 5263a98f162f7a46e1292632c87f1e3444eb8fbf Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 8 May 2018 09:06:58 -0700 Subject: net/ipv4: Update ip_tunnel_metadata_cnt static key to modern api No changes in refcount semantics -- key init is false; replace static_key_slow_inc|dec with static_branch_inc|dec static_key_false with static_branch_unlikely Signed-off-by: Davidlohr Bueso Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 751646adc769..90ff430f5e9d 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -477,12 +477,12 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat return (struct ip_tunnel_info *)lwtstate->data; } -extern struct static_key ip_tunnel_metadata_cnt; +DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { - return static_key_false(&ip_tunnel_metadata_cnt); + return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); -- cgit v1.2.3 From a7950ae8213cf38343fd27ad1fb58f3f04e3130f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 8 May 2018 09:06:59 -0700 Subject: net/sock: Update memalloc_socks static key to modern api No changes in refcount semantics -- key init is false; replace static_key_slow_inc|dec with static_branch_inc|dec static_key_false with static_branch_unlikely Added a '_key' suffix to memalloc_socks, for better self documentation. Signed-off-by: Davidlohr Bueso Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 3c568b36ee36..4f7c584e9765 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -808,10 +808,10 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) } #ifdef CONFIG_NET -extern struct static_key memalloc_socks; +DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { - return static_key_false(&memalloc_socks); + return static_branch_unlikely(&memalloc_socks_key); } #else -- cgit v1.2.3 From e5c9a705452acf59efe00cb84f086624fda010ab Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 9 May 2018 18:29:02 +0300 Subject: net/mlx4_core: Report driver version to FW If supported, write a driver version string to FW as part of the INIT_HCA command. Example of driver version: "Linux,mlx4_core,4.0-0" Signed-off-by: Eran Ben Elisha Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 81d0799b6091..122e7e9d3091 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -225,6 +225,7 @@ enum { MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP = 1ULL << 36, MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT = 1ULL << 37, MLX4_DEV_CAP_FLAG2_USER_MAC_EN = 1ULL << 38, + MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW = 1ULL << 39, }; enum { -- cgit v1.2.3 From 03bdfc001c951cb04ad3d28aecee4ec0e18e9664 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 May 2018 23:24:07 -0700 Subject: net: ipv4: remove define INET_CSK_DEBUG and unnecessary EXPORT_SYMBOL INET_CSK_DEBUG is always set and only is used for 2 pr_debug calls. EXPORT_SYMBOL(inet_csk_timer_bug_msg) is only used by these 2 pr_debug calls and is also unnecessary as the exported string can be used directly by these calls. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 2ab6667275df..0a6c9e0f2b5a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -23,8 +23,6 @@ #include #include -#define INET_CSK_DEBUG 1 - /* Cancel timers, when they are not required. */ #undef INET_CSK_CLEAR_TIMERS @@ -196,10 +194,6 @@ static inline void inet_csk_delack_init(struct sock *sk) void inet_csk_delete_keepalive_timer(struct sock *sk); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); -#ifdef INET_CSK_DEBUG -extern const char inet_csk_timer_bug_msg[]; -#endif - static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -214,12 +208,9 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); #endif + } else { + pr_debug("inet_csk BUG: unknown timer value\n"); } -#ifdef INET_CSK_DEBUG - else { - pr_debug("%s", inet_csk_timer_bug_msg); - } -#endif } /* @@ -232,10 +223,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, struct inet_connection_sock *icsk = inet_csk(sk); if (when > max_when) { -#ifdef INET_CSK_DEBUG pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr()); -#endif when = max_when; } @@ -249,12 +238,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, icsk->icsk_ack.pending |= ICSK_ACK_TIMER; icsk->icsk_ack.timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + } else { + pr_debug("inet_csk BUG: unknown timer value\n"); } -#ifdef INET_CSK_DEBUG - else { - pr_debug("%s", inet_csk_timer_bug_msg); - } -#endif } static inline unsigned long -- cgit v1.2.3 From 00483690552c5fb6aa30bf3acb75b0ee89b4c0fd Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Thu, 10 May 2018 16:53:51 +1000 Subject: tcp: Add mark for TIMEWAIT sockets This version has some suggestions by Eric Dumazet: - Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP races. - Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark" statement. - Factorize code as sk_fullsock() check is not necessary. Aidan McGurn from Openwave Mobility systems reported the following bug: "Marked routing is broken on customer deployment. Its effects are large increase in Uplink retransmissions caused by the client never receiving the final ACK to their FINACK - this ACK misses the mark and routes out of the incorrect route." Currently marks are added to sk_buffs for replies when the "fwmark_reflect" sysctl is enabled. But not for TW sockets that had sk->sk_mark set via setsockopt(SO_MARK..). Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the the original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location. Then progate this so that the skb gets sent with the correct mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so that netfilter rules are still honored. Signed-off-by: Jon Maxwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c7be1ca8e562..659d8ed5a3bc 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -62,6 +62,7 @@ struct inet_timewait_sock { #define tw_dr __tw_common.skc_tw_dr int tw_timeout; + __u32 tw_mark; volatile unsigned char tw_substate; unsigned char tw_rcv_wscale; -- cgit v1.2.3 From 6454743bc13e7dfd4f2720758ca3fcdea76b82a4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:19 -0700 Subject: net/ipv6: Rename fib6_lookup to fib6_node_lookup Rename fib6_lookup to fib6_node_lookup to better reflect what it returns. The fib6_lookup name will be used in a later patch for an IPv6 equivalent to IPv4's fib_lookup. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a3ec08d05756..43ab545e64ea 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); -struct fib6_node *fib6_lookup(struct fib6_node *root, - const struct in6_addr *daddr, - const struct in6_addr *saddr); +struct fib6_node *fib6_node_lookup(struct fib6_node *root, + const struct in6_addr *daddr, + const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, -- cgit v1.2.3 From 3b290a31bbc5969f9193f73d547a6dc8a25c6f9e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:20 -0700 Subject: net/ipv6: Rename rt6_multipath_select Rename rt6_multipath_select to fib6_multipath_select and export it. A later patch wants access to it similar to IPv4's fib_select_path. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 43ab545e64ea..2597d8fdd92f 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +struct fib6_info *fib6_multipath_select(const struct net *net, + struct fib6_info *match, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict); + struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); -- cgit v1.2.3 From 1d053da910947afccec96d90892c0f5488c7a9cf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:21 -0700 Subject: net/ipv6: Extract table lookup from ip6_pol_route ip6_pol_route is used for ingress and egress FIB lookups. Refactor it moving the table lookup into a separate fib6_table_lookup that can be invoked separately and export the new function. ip6_pol_route now calls fib6_table_lookup and uses the result to generate a dst based rt6_info. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2597d8fdd92f..c70705f2647a 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +/* called with rcu lock held; caller needs to select path */ +struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int strict); + struct fib6_info *fib6_multipath_select(const struct net *net, struct fib6_info *match, struct flowi6 *fl6, int oif, -- cgit v1.2.3 From 138118ec96cbfc303c1d7cc05fbb2caf8382c95b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:23 -0700 Subject: net/ipv6: Add fib6_lookup Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules, but returns a FIB entry, fib6_info, rather than a dst based rt6_info. fib6_lookup is any where from 140% (MULTIPLE_TABLES config disabled) to 60% faster than any of the dst based lookup methods (without custom rules) and 25% faster with custom rules (e.g., l3mdev rule). Since the lookup function has a completely different signature, fib6_rule_action is split into 2 paths: the existing one is renamed __fib6_rule_action and a new one for the fib6_info path is added. fib6_rule_action decides which to call based on the lookup_ptr. If it is fib6_table_lookup then the new path is taken. Caller must hold rcu lock as no reference is taken on the returned fib entry. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c70705f2647a..cc70f6da8462 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +/* called with rcu lock held; can return error pointer + * caller needs to select path + */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags); + /* called with rcu lock held; caller needs to select path */ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int strict); -- cgit v1.2.3 From d4bea421f7322400d804c2284739e42e61f78349 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:24 -0700 Subject: net/ipv6: Update fib6 tracepoint to take fib6_info Similar to IPv4, IPv6 should use the FIB lookup result in the tracepoint. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/trace/events/fib6.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 7e8d48a81b91..1b8d951e3c12 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -12,10 +12,10 @@ TRACE_EVENT(fib6_table_lookup, - TP_PROTO(const struct net *net, const struct rt6_info *rt, + TP_PROTO(const struct net *net, const struct fib6_info *f6i, struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, rt, table, flp), + TP_ARGS(net, f6i, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup, in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; - if (rt->rt6i_idev) { - __assign_str(name, rt->rt6i_idev->dev->name); + if (f6i->fib6_nh.nh_dev) { + __assign_str(name, f6i->fib6_nh.nh_dev); } else { __assign_str(name, ""); } - if (rt == net->ipv6.ip6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { struct in6_addr in6_zero = {}; in6 = (struct in6_addr *)__entry->gw; *in6 = in6_zero; - } else if (rt) { + } else if (f6i) { in6 = (struct in6_addr *)__entry->gw; - *in6 = rt->rt6i_gateway; + *in6 = f6i->fib6_nh.nh_gw; } ), -- cgit v1.2.3 From 65a2022e89a4760f9702837e2d9d15a39a9c68a3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:25 -0700 Subject: net/ipv6: Add fib lookup stubs for use in bpf helper Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table, a stub to do a lookup in a specific table, fib6_table_lookup, and a stub for a full route lookup. The stubs are needed for core bpf code to handle the case when the IPv6 module is not builtin. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/addrconf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 8312cc25a3af..ff766ab207e0 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -223,6 +223,20 @@ struct ipv6_stub { const struct in6_addr *addr); int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); + + struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); + struct fib6_info *(*fib6_lookup)(struct net *net, int oif, + struct flowi6 *fl6, int flags); + struct fib6_info *(*fib6_table_lookup)(struct net *net, + struct fib6_table *table, + int oif, struct flowi6 *fl6, + int flags); + struct fib6_info *(*fib6_multipath_select)(const struct net *net, + struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict); + void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, -- cgit v1.2.3 From 87f5fc7e48dd3175b30dd03b41564e1a8e136323 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:26 -0700 Subject: bpf: Provide helper to do forwarding lookups in kernel FIB table Provide a helper for doing a FIB and neighbor lookup in the kernel tables from an XDP program. The helper provides a fastpath for forwarding packets. If the packet is a local delivery or for any reason is not a simple lookup and forward, the packet continues up the stack. If it is to be forwarded, the forwarding can be done directly if the neighbor is already known. If the neighbor does not exist, the first few packets go up the stack for neighbor resolution. Once resolved, the xdp program provides the fast path. On successful lookup the nexthop dmac, current device smac and egress device index are returned. The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6 are implemented in this patch. The API includes layer 4 parameters if the XDP program chooses to do deep packet inspection to allow compare against ACLs implemented as FIB rules. Header rewrite is left to the XDP program. The lookup takes 2 flags: - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes straight to the table associated with the device (expert setting for those looking to maximize throughput) - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective. Default is an ingress lookup. Initial performance numbers collected by Jesper, forwarded packets/sec: Full stack XDP FIB lookup XDP Direct lookup IPv4 1,947,969 7,074,156 7,415,333 IPv6 1,728,000 6,165,504 7,262,720 These number are single CPU core forwarding on a Broadwell E5-1650 v4 @ 3.60GHz. Signed-off-by: David Ahern Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d615c777b573..02e4112510f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1828,6 +1828,33 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * + * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst, + * ipv6_dst or mpls_out based on family, smac is set to mac + * address of egress device, dmac is set to nexthop mac address, + * rt_metric is set to metric from route. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * + * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs + * full lookup using FIB rules + * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress + * perspective (default is ingress) + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * + * Return + * Egress device index on success, 0 if packet needs to continue + * up the stack for further processing or a negative error in case + * of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1898,7 +1925,8 @@ union bpf_attr { FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ FN(get_stack), \ - FN(skb_load_bytes_relative), + FN(skb_load_bytes_relative), \ + FN(fib_lookup), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2321,4 +2349,55 @@ struct bpf_raw_tracepoint_args { __u64 args[0]; }; +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +#define BPF_FIB_LOOKUP_DIRECT BIT(0) +#define BPF_FIB_LOOKUP_OUTPUT BIT(1) + +struct bpf_fib_lookup { + /* input */ + __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + /* total length of packet from network header - used for MTU check */ + __u16 tot_len; + __u32 ifindex; /* L3 device index for lookup */ + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowlabel; /* AF_INET6 */ + + /* output: metric of fib result */ + __u32 rt_metric; + }; + + union { + __be32 mpls_in; + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, *dst is destination address. + * output: bpf_fib_lookup sets to gateway address + */ + union { + /* return for MPLS lookups */ + __be32 mpls_out[4]; /* support up to 4 labels */ + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 11d8f3ddab1e2b0f148def287859d0903b7f8ac5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 10 May 2018 13:17:32 -0700 Subject: net: dsa: Add PHYLINK switch operations In preparation for adding support for PHYLINK within DSA, define a number of operations that we will need and that switch drivers can start implementing. Proper integration with PHYLINK will follow in subsequent patches. We start selecting PHYLINK (which implies PHYLIB) in net/dsa/Kconfig such that drivers can be guaranteed that this dependency is properly taken care of and can start referencing PHYLINK helper functions without requiring stubs or anything. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 462e9741b210..ed64c1f3f117 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -20,12 +20,14 @@ #include #include #include +#include #include #include struct tc_action; struct phy_device; struct fixed_phy_status; +struct phylink_link_state; enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, @@ -353,6 +355,27 @@ struct dsa_switch_ops { void (*fixed_link_update)(struct dsa_switch *ds, int port, struct fixed_phy_status *st); + /* + * PHYLINK integration + */ + void (*phylink_validate)(struct dsa_switch *ds, int port, + unsigned long *supported, + struct phylink_link_state *state); + int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, + struct phylink_link_state *state); + void (*phylink_mac_config)(struct dsa_switch *ds, int port, + unsigned int mode, + const struct phylink_link_state *state); + void (*phylink_mac_an_restart)(struct dsa_switch *ds, int port); + void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, + unsigned int mode, + phy_interface_t interface); + void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, + unsigned int mode, + phy_interface_t interface, + struct phy_device *phydev); + void (*phylink_fixed_state)(struct dsa_switch *ds, int port, + struct phylink_link_state *state); /* * ethtool hardware statistics. */ @@ -595,5 +618,6 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); int dsa_port_get_phy_sset_count(struct dsa_port *dp); +void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); #endif -- cgit v1.2.3 From aab9c4067d2389d0adfc9c53806437df7b0fe3d5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 10 May 2018 13:17:36 -0700 Subject: net: dsa: Plug in PHYLINK support Add support for PHYLINK within the DSA subsystem in order to support more complex devices such as pluggable (SFP) and non-pluggable (SFF) modules, 10G PHYs, and traditional PHYs. Using PHYLINK allows us to drop some amount of complexity we had while probing fixed and non-fixed PHYs using Device Tree. Because PHYLINK separates the Ethernet MAC/port configuration into different stages, we let switch drivers implement those, and for now, we maintain functionality by calling dsa_slave_adjust_link() during phylink_mac_link_{up,down} which provides semantically equivalent steps. Drivers willing to take advantage of PHYLINK should implement the phylink_mac_* operations that DSA wraps. We cannot quite remove the adjust_link() callback just yet, because a number of drivers rely on that for configuring their "CPU" and "DSA" ports, this is done dsa_port_setup_phy_of() and dsa_port_fixed_link_register_of() still. Drivers that utilize fixed links for user-facing ports (e.g: bcm_sf2) will need to implement phylink_mac_ops from now on to preserve functionality, since PHYLINK *does not* create a phy_device instance for fixed links. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ed64c1f3f117..fdbd6082945d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -201,6 +201,7 @@ struct dsa_port { u8 stp_state; struct net_device *bridge_dev; struct devlink_port devlink_port; + struct phylink *pl; /* * Original copy of the master netdev ethtool_ops */ -- cgit v1.2.3 From 73a6bab5aa2a83cb7df85805e08bc03b4065aea7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 May 2018 14:59:43 -0700 Subject: tcp: switch pacing timer to softirq based hrtimer linux-4.16 got support for softirq based hrtimers. TCP can switch its pacing hrtimer to this variant, since this avoids going through a tasklet and some atomic operations. pacing timer logic looks like other (jiffies based) tcp timers. v2: use hrtimer_try_to_cancel() in tcp_clear_xmit_timers() to correctly release reference on socket if needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index cf803fe0fb86..3b1d617b0110 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -557,7 +557,9 @@ void tcp_fin(struct sock *sk); void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { - hrtimer_cancel(&tcp_sk(sk)->pacing_timer); + if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) + sock_put(sk); + inet_csk_clear_xmit_timers(sk); } -- cgit v1.2.3 From 0fccb85ad21d65c6ee1ea912acc3f0ae4df31876 Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Thu, 10 May 2018 05:59:39 -0700 Subject: virtchnl: Whitespace and parenthesis cleanup Clean up existing instances of unnecessary parentheses in if statement and change order of conditionals to make it easier to read The opening /* should be followed by a single space and the closing */ should be preceded with a single space. Signed-off-by: Bruce Allan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/avf/virtchnl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index b0a7f315bfbe..212b3822d180 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -485,7 +485,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_key); struct virtchnl_rss_lut { u16 vsi_id; u16 lut_entries; - u8 lut[1]; /* RSS lookup table*/ + u8 lut[1]; /* RSS lookup table */ }; VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_lut); @@ -819,7 +819,7 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, return VIRTCHNL_ERR_PARAM; } /* few more checks */ - if ((valid_len != msglen) || (err_msg_format)) + if (err_msg_format || valid_len != msglen) return VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH; return 0; -- cgit v1.2.3 From 2724273e8fd00b512596a77ee063f49b25f36507 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 2 May 2018 15:17:17 +0530 Subject: vmcore: add API to collect hardware dump in second kernel The sequence of actions done by device drivers to append their device specific hardware/firmware logs to /proc/vmcore are as follows: 1. During probe (before hardware is initialized), device drivers register to the vmcore module (via vmcore_add_device_dump()), with callback function, along with buffer size and log name needed for firmware/hardware log collection. 2. vmcore module allocates the buffer with requested size. It adds an Elf note and invokes the device driver's registered callback function. 3. Device driver collects all hardware/firmware logs into the buffer and returns control back to vmcore module. Ensure that the device dump buffer size is always aligned to page size so that it can be mmaped. Also, rename alloc_elfnotes_buf() to vmcore_alloc_buf() to make it more generic and reserve NT_VMCOREDD note type to indicate vmcore device dump. Suggested-by: Eric Biederman . Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- include/linux/crash_dump.h | 18 ++++++++++++++++++ include/linux/kcore.h | 6 ++++++ include/uapi/linux/elf.h | 1 + include/uapi/linux/vmcore.h | 18 ++++++++++++++++++ 4 files changed, 43 insertions(+) create mode 100644 include/uapi/linux/vmcore.h (limited to 'include') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index f7ac2aa93269..3e4ba9d753c8 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -5,6 +5,7 @@ #include #include #include +#include #include /* for pgprot_t */ @@ -93,4 +94,21 @@ static inline bool is_kdump_kernel(void) { return 0; } #endif /* CONFIG_CRASH_DUMP */ extern unsigned long saved_max_pfn; + +/* Device Dump information to be filled by drivers */ +struct vmcoredd_data { + char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */ + unsigned int size; /* Size of the dump */ + /* Driver's registered callback to be invoked to collect dump */ + int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf); +}; + +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +int vmcore_add_device_dump(struct vmcoredd_data *data); +#else +static inline int vmcore_add_device_dump(struct vmcoredd_data *data) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ #endif /* LINUX_CRASHDUMP_H */ diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 80db19d3a505..8de55e4b5ee9 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -28,6 +28,12 @@ struct vmcore { loff_t offset; }; +struct vmcoredd_node { + struct list_head list; /* List of dumps */ + void *buf; /* Buffer containing device's dump */ + unsigned int size; /* Size of the buffer */ +}; + #ifdef CONFIG_PROC_KCORE extern void kclist_add(struct kcore_list *, void *, size_t, int type); #else diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index e2535d6dcec7..4e12c423b9fe 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -421,6 +421,7 @@ typedef struct elf64_shdr { #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ +#define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ /* Note header in a PT_NOTE section */ typedef struct elf32_note { diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h new file mode 100644 index 000000000000..022619668e0e --- /dev/null +++ b/include/uapi/linux/vmcore.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _UAPI_VMCORE_H +#define _UAPI_VMCORE_H + +#include + +#define VMCOREDD_NOTE_NAME "LINUX" +#define VMCOREDD_MAX_NAME_BYTES 44 + +struct vmcoredd_header { + __u32 n_namesz; /* Name size */ + __u32 n_descsz; /* Content size */ + __u32 n_type; /* NT_VMCOREDD */ + __u8 name[8]; /* LINUX\0\0\0 */ + __u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */ +}; + +#endif /* _UAPI_VMCORE_H */ -- cgit v1.2.3 From 81c7288b170a59d01427fa33d6591da1dbf59277 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Sun, 13 May 2018 17:44:27 -0300 Subject: sched: cls: enable verbose logging Currently, when the rule is not to be exclusively executed by the hardware, extack is not passed along and offloading failures don't get logged. The idea was that hardware failures are okay because the rule will get executed in software then and this way it doesn't confuse unware users. But this is not helpful in case one needs to understand why a certain rule failed to get offloaded. Considering it may have been a temporary failure, like resources exceeded or so, reproducing it later and knowing that it is triggering the same reason may be challenging. The ultimate goal is to improve Open vSwitch debuggability when using flower offloading. This patch adds a new flag to enable verbose logging. With the flag set, extack will be passed to the driver, which will be able to log the error. As the operation itself probably won't fail (not because of this, at least), current iproute will already log it as a Warning. The flag is generic, so it can be reused later. No need to restrict it just for HW offloading. The command line will follow the syntax that tc-ebpf already uses, tc ... [ verbose ] ... , and extend its meaning. For example: # ./tc qdisc add dev p7p1 ingress # ./tc filter add dev p7p1 parent ffff: protocol ip prio 1 \ flower verbose \ src_mac ed:13:db:00:00:00 dst_mac 01:80:c2:00:00:d0 \ src_ip 56.0.0.0 dst_ip 55.0.0.0 action drop Warning: TC offload is disabled on net device. # echo $? 0 # ./tc filter add dev p7p1 parent ffff: protocol ip prio 1 \ flower \ src_mac ff:13:db:00:00:00 dst_mac 01:80:c2:00:00:d0 \ src_ip 56.0.0.0 dst_ip 55.0.0.0 action drop # echo $? 0 Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 6 ++++-- include/uapi/linux/pkt_cls.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e828d31be5da..0005f0b40fe9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -683,9 +683,11 @@ static inline bool tc_skip_sw(u32 flags) /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { - if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)) + if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | + TCA_CLS_FLAGS_VERBOSE)) return false; + flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; @@ -705,7 +707,7 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; - if (tc_skip_sw(flags)) + if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index be05e66c167b..84e4c1d0f874 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -129,6 +129,7 @@ enum { #define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ #define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ #define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ /* U32 filters */ -- cgit v1.2.3 From e5cd3abcb31a48d4ea91bd32f0618802ca5f3592 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 14 May 2018 10:00:16 -0700 Subject: bpf: sockmap, refactor sockmap routines to work with hashmap This patch only refactors the existing sockmap code. This will allow much of the psock initialization code path and bpf helper codes to work for both sockmap bpf map types that are backed by an array, the currently supported type, and the new hash backed bpf map type sockhash. Most the fallout comes from three changes, - Pushing bpf programs into an independent structure so we can use it from the htab struct in the next patch. - Generalizing helpers to use void *key instead of the hardcoded u32. - Instead of passing map/key through the metadata we now do the lookup inline. This avoids storing the key in the metadata which will be useful when keys can be longer than 4 bytes. We rename the sk pointers to sk_redir at this point as well to avoid any confusion between the current sk pointer and the redirect pointer sk_redir. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 3 +-- include/net/tcp.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index da7e16523128..9dbcb9d55921 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -515,9 +515,8 @@ struct sk_msg_buff { int sg_end; struct scatterlist sg_data[MAX_SKB_FRAGS]; bool sg_copy[MAX_SKB_FRAGS]; - __u32 key; __u32 flags; - struct bpf_map *map; + struct sock *sk_redir; struct sk_buff *skb; struct list_head list; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index cf803fe0fb86..059287374ba0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -814,9 +814,8 @@ struct tcp_skb_cb { #endif } header; /* For incoming skbs */ struct { - __u32 key; __u32 flags; - struct bpf_map *map; + struct sock *sk_redir; void *data_end; } bpf; }; -- cgit v1.2.3 From 81110384441a59cff47430f20f049e69b98c17f4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 14 May 2018 10:00:17 -0700 Subject: bpf: sockmap, add hash map support Sockmap is currently backed by an array and enforces keys to be four bytes. This works well for many use cases and was originally modeled after devmap which also uses four bytes keys. However, this has become limiting in larger use cases where a hash would be more appropriate. For example users may want to use the 5-tuple of the socket as the lookup key. To support this add hash support. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 8 +++++++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 54 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a38e474bf7ee..ed0122b45b63 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) @@ -675,6 +676,12 @@ static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) return NULL; } +static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map, + void *key) +{ + return NULL; +} + static inline int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) @@ -724,6 +731,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; +extern const struct bpf_func_proto bpf_sock_hash_update_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d7df1b323082..b67f8793de0d 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 02e4112510f8..d94d333a8225 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, }; enum bpf_prog_type { @@ -1828,7 +1829,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. @@ -1855,6 +1855,53 @@ union bpf_attr { * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case * of failure. + * + * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdeict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1926,7 +1973,10 @@ union bpf_attr { FN(skb_get_xfrm_state), \ FN(get_stack), \ FN(skb_load_bytes_relative), \ - FN(fib_lookup), + FN(fib_lookup), \ + FN(sock_hash_update), \ + FN(msg_redirect_hash), \ + FN(sk_redirect_hash), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From e79c1055749e3183a2beee04a24da378623329c5 Mon Sep 17 00:00:00 2001 From: Debabrata Banerjee Date: Mon, 14 May 2018 14:48:09 -0400 Subject: bonding: allow use of tx hashing in balance-alb The rx load balancing provided by balance-alb is not mutually exclusive with using hashing for tx selection, and should provide a decent speed increase because this eliminates spinlocks and cache contention. Signed-off-by: Debabrata Banerjee Signed-off-by: David S. Miller --- include/net/bonding.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index b52235158836..808f1d167349 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -285,8 +285,15 @@ static inline bool bond_needs_speed_duplex(const struct bonding *bond) static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { - return (BOND_MODE(bond) == BOND_MODE_TLB) && - (bond->params.tlb_dynamic_lb == 0); + return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0); +} + +static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond) +{ + return (BOND_MODE(bond) == BOND_MODE_8023AD || + BOND_MODE(bond) == BOND_MODE_XOR || + BOND_MODE(bond) == BOND_MODE_TLB || + BOND_MODE(bond) == BOND_MODE_ALB); } static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond) -- cgit v1.2.3 From 32f7b44d0f5661044fcfa84e9ad402ed9d759107 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 15 May 2018 10:50:31 +0200 Subject: sched: manipulate __QDISC_STATE_RUNNING in qdisc_run_* helpers Currently NOLOCK qdiscs pay a measurable overhead to atomically manipulate the __QDISC_STATE_RUNNING. Such bit is flipped twice per packet in the uncontended scenario with packet rate below the line rate: on packed dequeue and on the next, failing dequeue attempt. This changeset moves the bit manipulation into the qdisc_run_{begin,end} helpers, so that the bit is now flipped only once per packet, with measurable performance improvement in the uncontended scenario. This also allows simplifying the qdisc teardown code path - since qdisc_is_running() is now effective for each qdisc type - and avoid a possible race between qdisc_run() and dev_deactivate_many(), as now the some_qdisc_is_busy() can properly detect NOLOCK qdiscs being busy dequeuing packets. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 5154c8300262..4d2b37226e75 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -113,13 +113,19 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc) static inline bool qdisc_is_running(const struct Qdisc *qdisc) { + if (qdisc->flags & TCQ_F_NOLOCK) + return test_bit(__QDISC_STATE_RUNNING, &qdisc->state); return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } static inline bool qdisc_run_begin(struct Qdisc *qdisc) { - if (qdisc_is_running(qdisc)) + if (qdisc->flags & TCQ_F_NOLOCK) { + if (test_and_set_bit(__QDISC_STATE_RUNNING, &qdisc->state)) + return false; + } else if (qdisc_is_running(qdisc)) { return false; + } /* Variant of write_seqcount_begin() telling lockdep a trylock * was attempted. */ @@ -131,6 +137,8 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) static inline void qdisc_run_end(struct Qdisc *qdisc) { write_seqcount_end(&qdisc->running); + if (qdisc->flags & TCQ_F_NOLOCK) + clear_bit(__QDISC_STATE_RUNNING, &qdisc->state); } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) -- cgit v1.2.3 From be2d04d11fd33bd46622f94619aae1596d9f9303 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 16 May 2018 22:27:41 +0200 Subject: bpf: add __printf verification to bpf_verifier_vlog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __printf is useful to verify format and arguments. ‘bpf_verifier_vlog’ function is used twice in verifier.c in both cases the caller function already uses the __printf gcc attribute. Remove the following warning, triggered with W=1: kernel/bpf/verifier.c:176:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format] Signed-off-by: Mathieu Malaterre Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8f70dc181e23..c286813deaeb 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -200,8 +200,8 @@ struct bpf_verifier_env { u32 subprog_cnt; }; -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, - va_list args); +__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, + const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); -- cgit v1.2.3 From 01cd267bff52619a53fa05c930ea5ed53493d21a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 May 2018 10:05:38 +0200 Subject: netfilter: fix fallout from xt/nf osf separation Stephen Rothwell says: today's linux-next build (x86_64 allmodconfig) produced this warning: ./usr/include/linux/netfilter/nf_osf.h:25: found __[us]{8,16,32,64} type without #include Fix that up and also move kernel-private struct out of uapi (it was not exposed in any released kernel version). tested via allmodconfig build + make headers_check. Reported-by: Stephen Rothwell Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_osf.h | 6 ++++++ include/uapi/linux/netfilter/nf_osf.h | 8 ++------ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h index a2b39602e87d..0e114c492fb8 100644 --- a/include/linux/netfilter/nf_osf.h +++ b/include/linux/netfilter/nf_osf.h @@ -21,6 +21,12 @@ enum osf_fmatch_states { FMATCH_OPT_WRONG, }; +struct nf_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct nf_osf_user_finger finger; +}; + bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h index 45376eae31ef..8f2f2f403183 100644 --- a/include/uapi/linux/netfilter/nf_osf.h +++ b/include/uapi/linux/netfilter/nf_osf.h @@ -1,6 +1,8 @@ #ifndef _NF_OSF_H #define _NF_OSF_H +#include + #define MAXGENRELEN 32 #define NF_OSF_GENRE (1 << 0) @@ -57,12 +59,6 @@ struct nf_osf_user_finger { struct nf_osf_opt opt[MAX_IPOPTLEN]; }; -struct nf_osf_finger { - struct rcu_head rcu_head; - struct list_head finger_entry; - struct nf_osf_user_finger finger; -}; - struct nf_osf_nlmsg { struct nf_osf_user_finger f; struct iphdr ip; -- cgit v1.2.3 From b9ccc07e3f31ad8073697982bac014fbceef7ecb Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Fri, 11 May 2018 00:14:48 +0200 Subject: netfilter: nft_hash: add map lookups for hashing operations This patch creates new attributes to accept a map as argument and then perform the lookup with the generated hash accordingly. Both current hash functions are supported: Jenkins and Symmetric Hash. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ce031cf72288..9c71f024f9cc 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -856,6 +856,8 @@ enum nft_hash_types { * @NFTA_HASH_SEED: seed value (NLA_U32) * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32) * @NFTA_HASH_TYPE: hash operation (NLA_U32: nft_hash_types) + * @NFTA_HASH_SET_NAME: name of the map to lookup (NLA_STRING) + * @NFTA_HASH_SET_ID: id of the map (NLA_U32) */ enum nft_hash_attributes { NFTA_HASH_UNSPEC, @@ -866,6 +868,8 @@ enum nft_hash_attributes { NFTA_HASH_SEED, NFTA_HASH_OFFSET, NFTA_HASH_TYPE, + NFTA_HASH_SET_NAME, + NFTA_HASH_SET_ID, __NFTA_HASH_MAX, }; #define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) -- cgit v1.2.3 From 96009c7d500efdd5534e83b2e3eb2c58d4b137ae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 15 May 2018 16:24:36 +0200 Subject: sched: replace __QDISC_STATE_RUNNING bit with a spin lock So that we can use lockdep on it. The newly introduced sequence lock has the same scope of busylock, so it shares the same lockdep annotation, but it's only used for NOLOCK qdiscs. With this changeset we acquire such lock in the control path around flushing operation (qdisc reset), to allow more NOLOCK qdisc perf improvement in the next patch. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4d2b37226e75..98c10a28cd01 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -30,7 +30,6 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, - __QDISC_STATE_RUNNING, }; struct qdisc_size_table { @@ -102,6 +101,7 @@ struct Qdisc { refcount_t refcnt; spinlock_t busylock ____cacheline_aligned_in_smp; + spinlock_t seqlock; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) @@ -111,17 +111,17 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc) refcount_inc(&qdisc->refcnt); } -static inline bool qdisc_is_running(const struct Qdisc *qdisc) +static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) - return test_bit(__QDISC_STATE_RUNNING, &qdisc->state); + return spin_is_locked(&qdisc->seqlock); return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { - if (test_and_set_bit(__QDISC_STATE_RUNNING, &qdisc->state)) + if (!spin_trylock(&qdisc->seqlock)) return false; } else if (qdisc_is_running(qdisc)) { return false; @@ -138,7 +138,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) { write_seqcount_end(&qdisc->running); if (qdisc->flags & TCQ_F_NOLOCK) - clear_bit(__QDISC_STATE_RUNNING, &qdisc->state); + spin_unlock(&qdisc->seqlock); } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) -- cgit v1.2.3 From 021a17ed796b62383f7623f4fea73787abddad77 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 15 May 2018 16:24:37 +0200 Subject: pfifo_fast: drop unneeded additional lock on dequeue After the previous patch, for NOLOCK qdiscs, q->seqlock is always held when the dequeue() is invoked, we can drop any additional locking to protect such operation. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skb_array.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index a6b6e8bb3d7b..62d9b0a6329f 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -97,6 +97,11 @@ static inline bool skb_array_empty_any(struct skb_array *a) return ptr_ring_empty_any(&a->ring); } +static inline struct sk_buff *__skb_array_consume(struct skb_array *a) +{ + return __ptr_ring_consume(&a->ring); +} + static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); -- cgit v1.2.3 From 20b654dfe1beaca60ab51894ff405a049248433d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 May 2018 16:40:10 -0700 Subject: tcp: support DUPACK threshold in RACK This patch adds support for the classic DUPACK threshold rule (#DupThresh) in RACK. When the number of packets SACKed is greater or equal to the threshold, RACK sets the reordering window to zero which would immediately mark all the unsacked packets below the highest SACKed sequence lost. Since this approach is known to not work well with reordering, RACK only uses it if no reordering has been observed. The DUPACK threshold rule is a particularly useful extension to the fast recoveries triggered by RACK reordering timer. For example data-center transfers where the RTT is much smaller than a timer tick, or high RTT path where the default RTT/4 may take too long. Note that this patch differs slightly from RFC6675. RFC6675 considers a packet lost when at least #DupThresh higher-sequence packets are SACKed. With RACK, for connections that have seen reordering, RACK continues to use a dynamically-adaptive time-based reordering window to detect losses. But for connections on which we have not yet seen reordering, this patch considers a packet lost when at least one higher sequence packet is SACKed and the total number of SACKed packets is at least DupThresh. For example, suppose a connection has not seen reordering, and sends 10 packets, and packets 3, 5, 7 are SACKed. RFC6675 considers packets 1 and 2 lost. RACK considers packets 1, 2, 4, 6 lost. There is some small risk of spurious retransmits here due to reordering. However, this is mostly limited to the first flight of a connection on which the sender receives SACKs from reordering. And RFC 6675 and FACK loss detection have a similar risk on the first flight with reordering (it's just that the risk of spurious retransmits from reordering was slightly narrower for those older algorithms due to the margin of 3*MSS). Also the minimum reordering window is reduced from 1 msec to 0 to recover quicker on short RTT transfers. Therefore RACK is more aggressive in marking packets lost during recovery to reduce the reordering window timeouts. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a08eab58ef70..994f869d793c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -245,6 +245,7 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ +#define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; -- cgit v1.2.3 From 6ac06ecd3a5d1dd1aaea5c2a8f6d6e4c81d5de6a Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 May 2018 16:40:12 -0700 Subject: tcp: simpler NewReno implementation This is a rewrite of NewReno loss recovery implementation that is simpler and standalone for readability and better performance by using less states. Note that NewReno refers to RFC6582 as a modification to the fast recovery algorithm. It is used only if the connection does not support SACK in Linux. It should not to be confused with the Reno (AIMD) congestion control. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 994f869d793c..9a3ce379b994 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1877,6 +1877,7 @@ void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ +void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); -- cgit v1.2.3 From d716bfdb10b4250617783c94253e48b0e85adcb1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 May 2018 16:40:13 -0700 Subject: tcp: account lost retransmit after timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous approach for the lost and retransmit bits was to wipe the slate clean: zero all the lost and retransmit bits, correspondingly zero the lost_out and retrans_out counters, and then add back the lost bits (and correspondingly increment lost_out). The new approach is to treat this very much like marking packets lost in fast recovery. We don’t wipe the slate clean. We just say that for all packets that were not yet marked sacked or lost, we now mark them as lost in exactly the same way we do for fast recovery. This fixes the lost retransmit accounting at RTO time and greatly simplifies the RTO code by sharing much of the logic with Fast Recovery. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9a3ce379b994..2b5372ef2e0e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1877,6 +1877,7 @@ void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, -- cgit v1.2.3 From b8fef65a8a76c2f887c56bf8e9684ff04e8d3b9f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 May 2018 16:40:16 -0700 Subject: tcp: new helper tcp_rack_skb_timeout Create and export a new helper tcp_rack_skb_timeout and move tcp_is_rack to prepare the final RTO change. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2b5372ef2e0e..6deb540297cc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1879,6 +1879,8 @@ void tcp_init(void); /* tcp_recovery.c */ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); +extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, + u32 reo_wnd); extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); -- cgit v1.2.3 From 5490b8725d80cdfd5266a3732e23d61d0e9f7d2f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 17 May 2018 10:29:32 +0200 Subject: phy: add 2.5G SGMII mode to the phy_mode enum This patch adds one more generic PHY mode to the phy_mode enum, to allow configuring generic PHYs to the 2.5G SGMII mode by using the set_mode callback. Signed-off-by: Antoine Tenart Acked-by: Kishon Vijay Abraham I Signed-off-by: David S. Miller --- include/linux/phy/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index c9d14eeee7f5..9713aebdd348 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -36,6 +36,7 @@ enum phy_mode { PHY_MODE_USB_DEVICE_SS, PHY_MODE_USB_OTG, PHY_MODE_SGMII, + PHY_MODE_2500SGMII, PHY_MODE_10GKR, PHY_MODE_UFS_HS_A, PHY_MODE_UFS_HS_B, -- cgit v1.2.3 From a6d04569120d53ebac1f0e6ffd5497b22c75f037 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 5 Dec 2017 10:38:58 +0200 Subject: net/mlx5: Add merged e-switch cap When merged e-switch is supported, the per-port e-switch is logically merged into one e-switch that spans both physical ports and all the VFs. Under merged eswitch, both the matching on source vport and setting destination vport can have a 2nd attribute which is the vhca id of the eswitch owner. For example: esw0: {match: action: fwd to } is a flow set on eswitch0 matching on source vport=1 from his eswitch and the action being fwd to dest vport=7 of eswitch1. Signed-off-by: Roi Dayan Reviewed-by: Shahar Klein Reviewed-by: Or Gerlitz Klein Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1aad455538f4..ef15f751a984 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -557,7 +557,8 @@ struct mlx5_ifc_e_switch_cap_bits { u8 vport_svlan_insert[0x1]; u8 vport_cvlan_insert_if_not_exist[0x1]; u8 vport_cvlan_insert_overwrite[0x1]; - u8 reserved_at_5[0x19]; + u8 reserved_at_5[0x18]; + u8 merged_eswitch[0x1]; u8 nic_vport_node_guid_modify[0x1]; u8 nic_vport_port_guid_modify[0x1]; -- cgit v1.2.3 From b17f7fc10facca9e1b2b9cef39af0a8179796c14 Mon Sep 17 00:00:00 2001 From: Shahar Klein Date: Thu, 22 Mar 2018 12:32:12 +0200 Subject: net/mlx5: Add destination e-switch owner The destination e-switch owner allows a rule in namespace of one e-switch owner to point to a vport that is natively associated with another e-switch owner. Signed-off-by: Shahar Klein Reviewed-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 6 +++++- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 47aecc4fa8c2..9f4d32e41c06 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -90,8 +90,12 @@ struct mlx5_flow_destination { union { u32 tir_num; struct mlx5_flow_table *ft; - u32 vport_num; struct mlx5_fc *counter; + struct { + u16 num; + u16 vhca_id; + bool vhca_id_valid; + } vport; }; }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ef15f751a984..3d17709bc30c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1148,8 +1148,9 @@ enum mlx5_flow_destination_type { struct mlx5_ifc_dest_format_struct_bits { u8 destination_type[0x8]; u8 destination_id[0x18]; - - u8 reserved_at_20[0x20]; + u8 destination_eswitch_owner_vhca_id_valid[0x1]; + u8 reserved_at_21[0xf]; + u8 destination_eswitch_owner_vhca_id[0x10]; }; struct mlx5_ifc_flow_counter_list_bits { -- cgit v1.2.3 From 3e99df87722874c669f6c24a79bd1b4ba5fec819 Mon Sep 17 00:00:00 2001 From: Shahar Klein Date: Sun, 18 Mar 2018 09:02:06 +0200 Subject: net/mlx5: Add source e-switch owner The source e-switch owner allows a vport on one e-switch port be associated with a rule defined on the second port e-switch. The role of the source eswitch owner valid bit in the flow group is to allow the firmware fail driver attempts to wild card the source eswitch match field. If this bit is not set, the firmware ignores the source eswitch owner field totally. Signed-off-by: Shahar Klein Reviewed-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3d17709bc30c..9c3538f1b8b9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -412,7 +412,7 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_0[0x8]; u8 source_sqn[0x18]; - u8 reserved_at_20[0x10]; + u8 source_eswitch_owner_vhca_id[0x10]; u8 source_port[0x10]; u8 outer_second_prio[0x3]; @@ -6995,7 +6995,9 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 reserved_at_a0[0x8]; u8 table_id[0x18]; - u8 reserved_at_c0[0x20]; + u8 source_eswitch_owner_vhca_id_valid[0x1]; + + u8 reserved_at_c1[0x1f]; u8 start_flow_index[0x20]; -- cgit v1.2.3 From d6ee6ad774a986d4faaa794a0980e7c50ed359c6 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 26 Apr 2018 13:13:26 +0200 Subject: Bluetooth: Add __hci_cmd_send function This function allows to send a HCI command without expecting any controller event/response in return. This is allowed for vendor- specific commands only. Signed-off-by: Loic Poulain Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b619a190ff12..893bbbb5d2fa 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1393,6 +1393,8 @@ struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout); struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout); +int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param); int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param); -- cgit v1.2.3 From 8689c051a20195b228e19acb155c7d6e48a86753 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Thu, 10 May 2018 13:50:12 +0200 Subject: cfg80211: dynamically allocate per-tid stats for station info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the addition of TXQ stats in the per-tid statistics the struct station_info grew significantly. This resulted in stack size warnings due to the structure itself being above the limit for the warnings. Add an allocation function that those who want to provide per-tid stats should use to allocate the tid array, i.e. struct station_info::pertid. Cc: Toke Høiland-Jørgensen Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace") Signed-off-by: Arend van Spriel [johannes: fix missing BIT() and logic by removing] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8db6071b6063..8984d24d68b7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1184,6 +1184,7 @@ struct cfg80211_tid_stats { * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * Note that this doesn't use the @filled bit, but is used if non-NULL. * @ack_signal: signal strength (in dBm) of the last ACK frame. * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has * been sent. @@ -1230,7 +1231,7 @@ struct station_info { u64 rx_beacon; u64 rx_duration; u8 rx_beacon_signal_avg; - struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1]; + struct cfg80211_tid_stats *pertid; s8 ack_signal; s8 avg_ack_signal; }; @@ -5701,6 +5702,13 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, struct ieee80211_channel *chan, gfp_t gfp); +/** + * cfg80211_sinfo_alloc_tid_stats - allocate per-tid statistics. + * + * @sinfo: the station information + * @gfp: allocation flags + */ +int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); /** * cfg80211_new_sta - notify userspace about station -- cgit v1.2.3 From 7ea3e110f2f8ba23f330c2f702f556acd539bcb8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 May 2018 11:40:44 +0200 Subject: cfg80211: release station info tidstats where needed This fixes memory leaks in cases where we got the station info but failed sending it out properly. Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info") Reviewed-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8984d24d68b7..11a218445448 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5710,6 +5710,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie, */ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); +/** + * cfg80211_sinfo_release_content - release contents of station info + * @sinfo: the station information + * + * Releases any potentially allocated sub-information of the station + * information, but not the struct itself (since it's typically on + * the stack.) + */ +static inline void cfg80211_sinfo_release_content(struct station_info *sinfo) +{ + kfree(sinfo->pertid); +} + /** * cfg80211_new_sta - notify userspace about station * -- cgit v1.2.3 From dac09149d992995adbef0f472093fbb6940a8653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 18 May 2018 14:00:21 +0200 Subject: xsk: clean up SPDX headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up SPDX-License-Identifier and removing licensing leftovers. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 13 ++----------- include/uapi/linux/if_xdp.h | 13 ++----------- 2 files changed, 4 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 185f4928fbda..7a647c56ec15 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -1,15 +1,6 @@ -/* SPDX-License-Identifier: GPL-2.0 - * AF_XDP internal functions +/* SPDX-License-Identifier: GPL-2.0 */ +/* AF_XDP internal functions * Copyright(c) 2018 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_XDP_SOCK_H diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 77b88c4efe98..56db977221d2 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -1,17 +1,8 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note - * +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* * if_xdp: XDP socket user-space interface * Copyright(c) 2018 Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * * Author(s): Björn Töpel * Magnus Karlsson */ -- cgit v1.2.3 From cf0dd203728c20e76976b69cdf42d065c1e5cab3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2018 14:47:24 -0700 Subject: tcp: use __sock_put() instead of sock_put() in tcp_clear_xmit_timers() Socket can not disappear under us. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6deb540297cc..511bd0fde1dc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -559,7 +559,7 @@ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) - sock_put(sk); + __sock_put(sk); inet_csk_clear_xmit_timers(sk); } -- cgit v1.2.3 From 5d9f4262b7ea41ca9981cc790e37cca6e37c789e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2018 14:47:26 -0700 Subject: tcp: add SACK compression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When TCP receives an out-of-order packet, it immediately sends a SACK packet, generating network load but also forcing the receiver to send 1-MSS pathological packets, increasing its RTX queue length/depth, and thus processing time. Wifi networks suffer from this aggressive behavior, but generally speaking, all these SACK packets add fuel to the fire when networks are under congestion. This patch adds a high resolution timer and tp->compressed_ack counter. Instead of sending a SACK, we program this timer with a small delay, based on RTT and capped to 1 ms : delay = min ( 5 % of RTT, 1 ms) If subsequent SACKs need to be sent while the timer has not yet expired, we simply increment tp->compressed_ack. When timer expires, a SACK is sent with the latest information. Whenever an ACK is sent (if data is sent, or if in-order data is received) timer is canceled. Note that tcp_sack_new_ofo_skb() is able to force a SACK to be sent if the sack blocks need to be shuffled, even if the timer has not expired. A new SNMP counter is added in the following patch. Two other patches add sysctls to allow changing the 1,000,000 and 44 values that this commit hard-coded. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 ++ include/net/tcp.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 807776928cb8..72705eaf4b84 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -218,6 +218,7 @@ struct tcp_sock { reord:1; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ + u8 compressed_ack; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ @@ -297,6 +298,7 @@ struct tcp_sock { u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; + struct hrtimer compressed_ack_timer; /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; diff --git a/include/net/tcp.h b/include/net/tcp.h index 511bd0fde1dc..952d842a604a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -561,6 +561,9 @@ static inline void tcp_clear_xmit_timers(struct sock *sk) if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) __sock_put(sk); + if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) + __sock_put(sk); + inet_csk_clear_xmit_timers(sk); } -- cgit v1.2.3 From 200d95f4575934e49f872109cce18c5e72383eb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2018 14:47:27 -0700 Subject: tcp: add TCPAckCompressed SNMP counter This counter tracks number of ACK packets that the host has not sent, thanks to ACK compression. Sample output : $ nstat -n;sleep 1;nstat|egrep "IpInReceives|IpOutRequests|TcpInSegs|TcpOutSegs|TcpExtTCPAckCompressed" IpInReceives 123250 0.0 IpOutRequests 3684 0.0 TcpInSegs 123251 0.0 TcpOutSegs 3684 0.0 TcpExtTCPAckCompressed 119252 0.0 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index d02e859301ff..750d89120335 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -278,6 +278,7 @@ enum LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ LINUX_MIB_TCPDELIVERED, /* TCPDelivered */ LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */ + LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */ __LINUX_MIB_MAX }; -- cgit v1.2.3 From 6d82aa242092d73c6d2e210cfaf0ebfbe6de1ccf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2018 14:47:28 -0700 Subject: tcp: add tcp_comp_sack_delay_ns sysctl This per netns sysctl allows for TCP SACK compression fine-tuning. Its default value is 1,000,000, or 1 ms to meet TSO autosizing period. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8491bc9c86b1..927318243cfa 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + unsigned long sysctl_tcp_comp_sack_delay_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; -- cgit v1.2.3 From 9c21d2fc41c0c8930600c9dd83eadac2e336fcfa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 May 2018 14:47:29 -0700 Subject: tcp: add tcp_comp_sack_nr sysctl This per netns sysctl allows for TCP SACK compression fine-tuning. This limits number of SACK that can be compressed. Using 0 disables SACK compression. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 927318243cfa..661348f23ea5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; -- cgit v1.2.3 From 303def35f64e37bcd5401d202889f5fbc0241179 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 17 May 2018 14:16:58 -0700 Subject: bpf: allow sk_msg programs to read sock fields Currently sk_msg programs only have access to the raw data. However, it is often useful when building policies to have the policies specific to the socket endpoint. This allows using the socket tuple as input into filters, etc. This patch adds ctx access to the sock fields. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9dbcb9d55921..d358d1815c16 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -517,6 +517,7 @@ struct sk_msg_buff { bool sg_copy[MAX_SKB_FRAGS]; __u32 flags; struct sock *sk_redir; + struct sock *sk; struct sk_buff *skb; struct list_head list; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d94d333a8225..97446bbe2ca5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2176,6 +2176,14 @@ enum sk_action { struct sk_msg_md { void *data; void *data_end; + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; #define BPF_TAG_SIZE 8 -- cgit v1.2.3 From b9ffcbaf56d3040efee64d3818688083c29b2a44 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 18 May 2018 09:29:00 +0200 Subject: devlink: introduce devlink_port_attrs_set Change existing setter for split port information into more generic attrs setter. Alongside with that, allow to set port number and subport number for split ports. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 20 ++++++++++++++------ include/uapi/linux/devlink.h | 3 +++ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 2e4f71e16e95..d6ca92266709 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -35,6 +35,13 @@ struct devlink { char priv[0] __aligned(NETDEV_ALIGN); }; +struct devlink_port_attrs { + bool set; + u32 port_number; /* same value as "split group" */ + bool split; + u32 split_subport_number; +}; + struct devlink_port { struct list_head list; struct devlink *devlink; @@ -43,8 +50,7 @@ struct devlink_port { enum devlink_port_type type; enum devlink_port_type desired_type; void *type_dev; - bool split; - u32 split_group; + struct devlink_port_attrs attrs; }; struct devlink_sb_pool_info { @@ -367,8 +373,9 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port, void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); -void devlink_port_split_set(struct devlink_port *devlink_port, - u32 split_group); +void devlink_port_attrs_set(struct devlink_port *devlink_port, + u32 port_number, bool split, + u32 split_subport_number); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, @@ -466,8 +473,9 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port) { } -static inline void devlink_port_split_set(struct devlink_port *devlink_port, - u32 split_group) +static inline void devlink_port_attrs_set(struct devlink_port *devlink_port, + u32 port_number, bool split, + u32 split_subport_number) { } diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1df65a4c2044..15b031a5ee7a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -224,6 +224,9 @@ enum devlink_attr { DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, /* u64 */ DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */ + DEVLINK_ATTR_PORT_NUMBER, /* u32 */ + DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 5ec1380a21bb6cd2ba89e31c44dfcc150f9ef792 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 18 May 2018 09:29:01 +0200 Subject: devlink: extend attrs_set for setting port flavours Devlink ports can have specific flavour according to the purpose of use. This patch extend attrs_set so the driver can say which flavour port has. Initial flavours are: physical, cpu, dsa User can query this to see right away what is the purpose of each port. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +++ include/uapi/linux/devlink.h | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index d6ca92266709..6eae15cf0f57 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -37,6 +37,7 @@ struct devlink { struct devlink_port_attrs { bool set; + enum devlink_port_flavour flavour; u32 port_number; /* same value as "split group" */ bool split; u32 split_subport_number; @@ -374,6 +375,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, u32 port_number, bool split, u32 split_subport_number); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, @@ -474,6 +476,7 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port) } static inline void devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, u32 port_number, bool split, u32 split_subport_number) { diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 15b031a5ee7a..75cb5450c851 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -132,6 +132,16 @@ enum devlink_eswitch_encap_mode { DEVLINK_ESWITCH_ENCAP_MODE_BASIC, }; +enum devlink_port_flavour { + DEVLINK_PORT_FLAVOUR_PHYSICAL, /* Any kind of a port physically + * facing the user. + */ + DEVLINK_PORT_FLAVOUR_CPU, /* CPU port */ + DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture + * interconnect port. + */ +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -224,6 +234,7 @@ enum devlink_attr { DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, /* u64 */ DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */ + DEVLINK_ATTR_PORT_FLAVOUR, /* u16 */ DEVLINK_ATTR_PORT_NUMBER, /* u32 */ DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, /* u32 */ -- cgit v1.2.3 From 08474c1a9df0cefcc3d197bd1d770695a34b9d60 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 18 May 2018 09:29:02 +0200 Subject: devlink: introduce a helper to generate physical port names Each driver implements physical port name generation by itself. However as devlink has all needed info, it can easily do the job for all its users. So implement this helper in devlink. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 6eae15cf0f57..9686a1aa4ec9 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -378,6 +378,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour, u32 port_number, bool split, u32 split_subport_number); +int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, + char *name, size_t len); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, @@ -482,6 +484,13 @@ static inline void devlink_port_attrs_set(struct devlink_port *devlink_port, { } +static inline int +devlink_port_get_phys_port_name(struct devlink_port *devlink_port, + char *name, size_t len) +{ + return -EOPNOTSUPP; +} + static inline int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, -- cgit v1.2.3 From d48f1958ab7d9fb896d86f0241f0dc83dc3a63de Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 18 May 2018 19:41:01 -0700 Subject: erspan: set bso bit based on mirrored packet's len Before the patch, the erspan BSO bit (Bad/Short/Oversized) is not handled. BSO has 4 possible values: 00 --> Good frame with no error, or unknown integrity 11 --> Payload is a Bad Frame with CRC or Alignment Error 01 --> Payload is a Short Frame 10 --> Payload is an Oversized Frame Based the short/oversized definitions in RFC1757, the patch sets the bso bit based on the mirrored packet's size. Reported-by: Xiaoyan Jin Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/erspan.h b/include/net/erspan.h index d044aa60cc76..b39643ef4c95 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -219,6 +219,33 @@ static inline __be32 erspan_get_timestamp(void) return htonl((u32)h_usecs); } +/* ERSPAN BSO (Bad/Short/Oversized), see RFC1757 + * 00b --> Good frame with no error, or unknown integrity + * 01b --> Payload is a Short Frame + * 10b --> Payload is an Oversized Frame + * 11b --> Payload is a Bad Frame with CRC or Alignment Error + */ +enum erspan_bso { + BSO_NOERROR = 0x0, + BSO_SHORT = 0x1, + BSO_OVERSIZED = 0x2, + BSO_BAD = 0x3, +}; + +static inline u8 erspan_detect_bso(struct sk_buff *skb) +{ + /* BSO_BAD is not handled because the frame CRC + * or alignment error information is in FCS. + */ + if (skb->len < ETH_ZLEN) + return BSO_SHORT; + + if (skb->len > ETH_FRAME_LEN) + return BSO_OVERSIZED; + + return BSO_NOERROR; +} + static inline void erspan_build_header_v2(struct sk_buff *skb, u32 id, u8 direction, u16 hwid, bool truncate, bool is_ipv4) @@ -248,6 +275,7 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, vlan_tci = ntohs(qp->tci); } + bso = erspan_detect_bso(skb); skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); -- cgit v1.2.3 From 877b7cb0b6f283593a663134ee52703f12c895cc Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 19 May 2018 22:31:34 +0200 Subject: net: dsa: mv88e6xxx: Add minimal platform_data support Not all the world uses device tree. Some parts of the world still use platform devices and platform data. Add basic support for probing a Marvell switch via platform data. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/platform_data/mv88e6xxx.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/platform_data/mv88e6xxx.h (limited to 'include') diff --git a/include/linux/platform_data/mv88e6xxx.h b/include/linux/platform_data/mv88e6xxx.h new file mode 100644 index 000000000000..88e91e05f48f --- /dev/null +++ b/include/linux/platform_data/mv88e6xxx.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __DSA_MV88E6XXX_H +#define __DSA_MV88E6XXX_H + +#include + +struct dsa_mv88e6xxx_pdata { + /* Must be first, such that dsa_register_switch() can access this + * without gory pointer manipulations + */ + struct dsa_chip_data cd; + const char *compatible; + unsigned int enabled_ports; + struct net_device *netdev; +}; + +#endif -- cgit v1.2.3 From 00baabe5286c41d21ed4a78f479b021eba1f0d51 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 19 May 2018 22:31:35 +0200 Subject: net: dsa: mv88e6xxx: Add support for EEPROM via platform data Add the size of the EEPROM to the platform data, so it can also be instantiated by a platform device. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/platform_data/mv88e6xxx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/mv88e6xxx.h b/include/linux/platform_data/mv88e6xxx.h index 88e91e05f48f..f63af2955ea0 100644 --- a/include/linux/platform_data/mv88e6xxx.h +++ b/include/linux/platform_data/mv88e6xxx.h @@ -12,6 +12,7 @@ struct dsa_mv88e6xxx_pdata { const char *compatible; unsigned int enabled_ports; struct net_device *netdev; + u32 eeprom_len; }; #endif -- cgit v1.2.3 From bf3c592b9d78ab2768150c680de22ef99e6e3b32 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sun, 20 May 2018 08:56:30 -0700 Subject: net: dsa: b53: Extend platform data to include DSA ports The b53 driver already defines and internally uses platform data to let the glue drivers specify parameters such as the chip id. What we were missing was a way to tell the core DSA layer about the ports and their type. Place a dsa_chip_data structure at the beginning of b53_platform_data for dsa_register_switch() to access it. This does not require modifications to b53_common.c which will pass platform_data trough. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/platform_data/b53.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/b53.h b/include/linux/platform_data/b53.h index 69d279c0da96..8eaef2f2b691 100644 --- a/include/linux/platform_data/b53.h +++ b/include/linux/platform_data/b53.h @@ -20,8 +20,12 @@ #define __B53_H #include +#include struct b53_platform_data { + /* Must be first such that dsa_register_switch() can access it */ + struct dsa_chip_data cd; + u32 chip_id; u16 enabled_ports; -- cgit v1.2.3 From ad75646c68a6e344a3609f40825855a0e742d04d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 May 2018 09:34:57 +0200 Subject: xsk: fill hole in struct sockaddr_xdp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the sxdp_flags up, avoiding a hole in the uapi structure. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 56db977221d2..c46609a00168 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -17,10 +17,10 @@ struct sockaddr_xdp { __u16 sxdp_family; + __u16 sxdp_flags; __u32 sxdp_ifindex; __u32 sxdp_queue_id; __u32 sxdp_shared_umem_fd; - __u16 sxdp_flags; }; /* XDP socket options */ -- cgit v1.2.3 From b3a9e0be436960072ddf327d9d82f50ee2f620e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 22 May 2018 09:34:59 +0200 Subject: xsk: remove explicit ring structure from uapi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit we remove the explicit ring structure from the the uapi. It is tricky for an uapi to depend on a certain L1 cache line size, since it can differ for variants of the same architecture. Now, we let the user application determine the offsets of the producer, consumer and descriptors by asking the socket via getsockopt. A typical flow would be (Rx ring): struct xdp_mmap_offsets off; struct xdp_desc *ring; u32 *prod, *cons; void *map; ... getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); map = mmap(NULL, off.rx.desc + NUM_DESCS * sizeof(struct xdp_desc), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, sfd, XDP_PGOFF_RX_RING); prod = map + off.rx.producer; cons = map + off.rx.consumer; ring = map + off.rx.desc; Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index c46609a00168..4737cfe222f5 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -23,13 +23,27 @@ struct sockaddr_xdp { __u32 sxdp_shared_umem_fd; }; +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + /* XDP socket options */ -#define XDP_RX_RING 1 -#define XDP_TX_RING 2 -#define XDP_UMEM_REG 3 -#define XDP_UMEM_FILL_RING 4 -#define XDP_UMEM_COMPLETION_RING 5 -#define XDP_STATISTICS 6 +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -50,6 +64,7 @@ struct xdp_statistics { #define XDP_UMEM_PGOFF_FILL_RING 0x100000000 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +/* Rx/Tx descriptor */ struct xdp_desc { __u32 idx; __u32 len; @@ -58,21 +73,6 @@ struct xdp_desc { __u8 padding[5]; }; -struct xdp_ring { - __u32 producer __attribute__((aligned(64))); - __u32 consumer __attribute__((aligned(64))); -}; - -/* Used for the RX and TX queues for packets */ -struct xdp_rxtx_ring { - struct xdp_ring ptrs; - struct xdp_desc desc[0] __attribute__((aligned(64))); -}; - -/* Used for the fill and completion queues for buffers */ -struct xdp_umem_ring { - struct xdp_ring ptrs; - __u32 desc[0] __attribute__((aligned(64))); -}; +/* UMEM descriptor is __u32 */ #endif /* _LINUX_IF_XDP_H */ -- cgit v1.2.3 From 50d889b1789458d1f7d7f40ff4f628b670047773 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 09:08:13 -0700 Subject: net/ipv4: Add helper to return path MTU based on fib result Determine path MTU from a FIB lookup result. Logic is a distillation of ip_dst_mtu_maybe_forward. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/net/ip_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 81d0f2107ff1..69c91d1934c1 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -449,4 +449,6 @@ static inline void fib_proc_exit(struct net *net) } #endif +u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); + #endif /* _NET_FIB_H */ -- cgit v1.2.3 From 901731b882d77dc53897aec45015ced42d56fe4c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 09:08:14 -0700 Subject: net/ipv6: Add helper to return path MTU based on fib result Determine path MTU from a FIB lookup result. Logic is based on ip6_dst_mtu_forward plus lookup of nexthop exception. Add ip6_dst_mtu_forward to ipv6_stubs to handle access by core bpf code. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/net/addrconf.h | 2 ++ include/net/ip6_fib.h | 6 ++++++ include/net/ip6_route.h | 3 +++ 3 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index ff766ab207e0..c07d4dd09361 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -236,6 +236,8 @@ struct ipv6_stub { struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict); + u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index cc70f6da8462..7897efe80727 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -412,6 +412,12 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) return f6i->fib6_nh.nh_dev; } +static inline +struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) +{ + return f6i->fib6_nh.nh_lwtstate; +} + void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 4cf1ef935ed9..7b9c82de11cc 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -300,6 +300,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); + struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); -- cgit v1.2.3 From f3a7ca64587f58686d4e2e894e9abbfbc9dffb25 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 22 May 2018 11:31:59 +0200 Subject: cfg80211: add missing kernel-doc Add the kernel-doc missed earlier. Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace") Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 11a218445448..df33c2766d22 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3856,6 +3856,10 @@ struct wiphy_iftype_ext_capab { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * + * @txq_limit: configuration of internal TX queue frame limit + * @txq_memory_limit: configuration internal TX queue memory limit + * @txq_quantum: configuration of internal TX queue scheduler quantum */ struct wiphy { /* assign these fields before you register the wiphy */ -- cgit v1.2.3 From f34436a430920bdfdba575b509d994c619f5101d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 21 May 2018 10:26:53 -0700 Subject: net/ipv6: Simplify route replace and appending into multipath route Bring consistency to ipv6 route replace and append semantics. Remove rt6_qualify_for_ecmp which is just guess work. It fails in 2 cases: 1. can not replace a route with a reject route. Existing code appends a new route instead of replacing the existing one. 2. can not have a multipath route where a leg uses a dev only nexthop Existing use cases affected by this change: 1. adding a route with existing prefix and metric using NLM_F_CREATE without NLM_F_APPEND or NLM_F_EXCL (ie., what iproute2 calls 'prepend'). Existing code auto-determines that the new nexthop can be appended to an existing route to create a multipath route. This change breaks that by requiring the APPEND flag for the new route to be added to an existing one. Instead the prepend just adds another route entry. 2. route replace. Existing code replaces first matching multipath route if new route is multipath capable and fallback to first matching non-ECMP route (reject or dev only route) in case one isn't available. New behavior replaces first matching route. (Thanks to Ido for spotting this one) Note: Newer iproute2 is needed to display multipath routes with a dev-only nexthop. This is due to a bug in iproute2 and parsing nexthops. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 4cf1ef935ed9..9e4d0f0aeb6d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } -static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) -{ - return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, -- cgit v1.2.3 From 2528c389936efbbece25088426fe7c3c91ff355f Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Tue, 22 May 2018 00:28:38 -0700 Subject: qed: Add support for tlv request processing. The patch adds driver support for processing TLV requests/repsonses from the mfw and upper driver layers respectively. The implementation reads the requested TLVs from the shared memory, requests the values from upper layer drivers, populates this info (TLVs) shared memory and notifies MFW about the TLV values. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 907976fd56f7..8e4fad427ac2 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -182,6 +182,43 @@ enum qed_led_mode { QED_LED_MODE_RESTORE }; +struct qed_mfw_tlv_eth { + u16 lso_maxoff_size; + bool lso_maxoff_size_set; + u16 lso_minseg_size; + bool lso_minseg_size_set; + u8 prom_mode; + bool prom_mode_set; + u16 tx_descr_size; + bool tx_descr_size_set; + u16 rx_descr_size; + bool rx_descr_size_set; + u16 netq_count; + bool netq_count_set; + u32 tcp4_offloads; + bool tcp4_offloads_set; + u32 tcp6_offloads; + bool tcp6_offloads_set; + u16 tx_descr_qdepth; + bool tx_descr_qdepth_set; + u16 rx_descr_qdepth; + bool rx_descr_qdepth_set; + u8 iov_offload; +#define QED_MFW_TLV_IOV_OFFLOAD_NONE (0) +#define QED_MFW_TLV_IOV_OFFLOAD_MULTIQUEUE (1) +#define QED_MFW_TLV_IOV_OFFLOAD_VEB (2) +#define QED_MFW_TLV_IOV_OFFLOAD_VEPA (3) + bool iov_offload_set; + u8 txqs_empty; + bool txqs_empty_set; + u8 rxqs_empty; + bool rxqs_empty_set; + u8 num_txqs_full; + bool num_txqs_full_set; + u8 num_rxqs_full; + bool num_rxqs_full_set; +}; + #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ (void __iomem *)(reg_addr)) -- cgit v1.2.3 From f240b6882211aae7155a9839dff1426e2853fe30 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Tue, 22 May 2018 00:28:39 -0700 Subject: qed: Add support for processing fcoe tlv request. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 193 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 8e4fad427ac2..74c2b9ad8afa 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -219,6 +219,199 @@ struct qed_mfw_tlv_eth { bool num_rxqs_full_set; }; +#define QED_MFW_TLV_TIME_SIZE 14 +struct qed_mfw_tlv_time { + bool b_set; + u8 month; + u8 day; + u8 hour; + u8 min; + u16 msec; + u16 usec; +}; + +struct qed_mfw_tlv_fcoe { + u8 scsi_timeout; + bool scsi_timeout_set; + u32 rt_tov; + bool rt_tov_set; + u32 ra_tov; + bool ra_tov_set; + u32 ed_tov; + bool ed_tov_set; + u32 cr_tov; + bool cr_tov_set; + u8 boot_type; + bool boot_type_set; + u8 npiv_state; + bool npiv_state_set; + u32 num_npiv_ids; + bool num_npiv_ids_set; + u8 switch_name[8]; + bool switch_name_set; + u16 switch_portnum; + bool switch_portnum_set; + u8 switch_portid[3]; + bool switch_portid_set; + u8 vendor_name[8]; + bool vendor_name_set; + u8 switch_model[8]; + bool switch_model_set; + u8 switch_fw_version[8]; + bool switch_fw_version_set; + u8 qos_pri; + bool qos_pri_set; + u8 port_alias[3]; + bool port_alias_set; + u8 port_state; +#define QED_MFW_TLV_PORT_STATE_OFFLINE (0) +#define QED_MFW_TLV_PORT_STATE_LOOP (1) +#define QED_MFW_TLV_PORT_STATE_P2P (2) +#define QED_MFW_TLV_PORT_STATE_FABRIC (3) + bool port_state_set; + u16 fip_tx_descr_size; + bool fip_tx_descr_size_set; + u16 fip_rx_descr_size; + bool fip_rx_descr_size_set; + u16 link_failures; + bool link_failures_set; + u8 fcoe_boot_progress; + bool fcoe_boot_progress_set; + u64 rx_bcast; + bool rx_bcast_set; + u64 tx_bcast; + bool tx_bcast_set; + u16 fcoe_txq_depth; + bool fcoe_txq_depth_set; + u16 fcoe_rxq_depth; + bool fcoe_rxq_depth_set; + u64 fcoe_rx_frames; + bool fcoe_rx_frames_set; + u64 fcoe_rx_bytes; + bool fcoe_rx_bytes_set; + u64 fcoe_tx_frames; + bool fcoe_tx_frames_set; + u64 fcoe_tx_bytes; + bool fcoe_tx_bytes_set; + u16 crc_count; + bool crc_count_set; + u32 crc_err_src_fcid[5]; + bool crc_err_src_fcid_set[5]; + struct qed_mfw_tlv_time crc_err[5]; + u16 losync_err; + bool losync_err_set; + u16 losig_err; + bool losig_err_set; + u16 primtive_err; + bool primtive_err_set; + u16 disparity_err; + bool disparity_err_set; + u16 code_violation_err; + bool code_violation_err_set; + u32 flogi_param[4]; + bool flogi_param_set[4]; + struct qed_mfw_tlv_time flogi_tstamp; + u32 flogi_acc_param[4]; + bool flogi_acc_param_set[4]; + struct qed_mfw_tlv_time flogi_acc_tstamp; + u32 flogi_rjt; + bool flogi_rjt_set; + struct qed_mfw_tlv_time flogi_rjt_tstamp; + u32 fdiscs; + bool fdiscs_set; + u8 fdisc_acc; + bool fdisc_acc_set; + u8 fdisc_rjt; + bool fdisc_rjt_set; + u8 plogi; + bool plogi_set; + u8 plogi_acc; + bool plogi_acc_set; + u8 plogi_rjt; + bool plogi_rjt_set; + u32 plogi_dst_fcid[5]; + bool plogi_dst_fcid_set[5]; + struct qed_mfw_tlv_time plogi_tstamp[5]; + u32 plogi_acc_src_fcid[5]; + bool plogi_acc_src_fcid_set[5]; + struct qed_mfw_tlv_time plogi_acc_tstamp[5]; + u8 tx_plogos; + bool tx_plogos_set; + u8 plogo_acc; + bool plogo_acc_set; + u8 plogo_rjt; + bool plogo_rjt_set; + u32 plogo_src_fcid[5]; + bool plogo_src_fcid_set[5]; + struct qed_mfw_tlv_time plogo_tstamp[5]; + u8 rx_logos; + bool rx_logos_set; + u8 tx_accs; + bool tx_accs_set; + u8 tx_prlis; + bool tx_prlis_set; + u8 rx_accs; + bool rx_accs_set; + u8 tx_abts; + bool tx_abts_set; + u8 rx_abts_acc; + bool rx_abts_acc_set; + u8 rx_abts_rjt; + bool rx_abts_rjt_set; + u32 abts_dst_fcid[5]; + bool abts_dst_fcid_set[5]; + struct qed_mfw_tlv_time abts_tstamp[5]; + u8 rx_rscn; + bool rx_rscn_set; + u32 rx_rscn_nport[4]; + bool rx_rscn_nport_set[4]; + u8 tx_lun_rst; + bool tx_lun_rst_set; + u8 abort_task_sets; + bool abort_task_sets_set; + u8 tx_tprlos; + bool tx_tprlos_set; + u8 tx_nos; + bool tx_nos_set; + u8 rx_nos; + bool rx_nos_set; + u8 ols; + bool ols_set; + u8 lr; + bool lr_set; + u8 lrr; + bool lrr_set; + u8 tx_lip; + bool tx_lip_set; + u8 rx_lip; + bool rx_lip_set; + u8 eofa; + bool eofa_set; + u8 eofni; + bool eofni_set; + u8 scsi_chks; + bool scsi_chks_set; + u8 scsi_cond_met; + bool scsi_cond_met_set; + u8 scsi_busy; + bool scsi_busy_set; + u8 scsi_inter; + bool scsi_inter_set; + u8 scsi_inter_cond_met; + bool scsi_inter_cond_met_set; + u8 scsi_rsv_conflicts; + bool scsi_rsv_conflicts_set; + u8 scsi_tsk_full; + bool scsi_tsk_full_set; + u8 scsi_aca_active; + bool scsi_aca_active_set; + u8 scsi_tsk_abort; + bool scsi_tsk_abort_set; + u32 scsi_rx_chk[5]; + bool scsi_rx_chk_set[5]; + struct qed_mfw_tlv_time scsi_chk_tstamp[5]; +}; + #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ (void __iomem *)(reg_addr)) -- cgit v1.2.3 From 77a509e4f6d14c2e09acbab5a89a769740bda62c Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Tue, 22 May 2018 00:28:40 -0700 Subject: qed: Add support for processing iscsi tlv request. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 74c2b9ad8afa..92b5352287df 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -412,6 +412,42 @@ struct qed_mfw_tlv_fcoe { struct qed_mfw_tlv_time scsi_chk_tstamp[5]; }; +struct qed_mfw_tlv_iscsi { + u8 target_llmnr; + bool target_llmnr_set; + u8 header_digest; + bool header_digest_set; + u8 data_digest; + bool data_digest_set; + u8 auth_method; +#define QED_MFW_TLV_AUTH_METHOD_NONE (1) +#define QED_MFW_TLV_AUTH_METHOD_CHAP (2) +#define QED_MFW_TLV_AUTH_METHOD_MUTUAL_CHAP (3) + bool auth_method_set; + u16 boot_taget_portal; + bool boot_taget_portal_set; + u16 frame_size; + bool frame_size_set; + u16 tx_desc_size; + bool tx_desc_size_set; + u16 rx_desc_size; + bool rx_desc_size_set; + u8 boot_progress; + bool boot_progress_set; + u16 tx_desc_qdepth; + bool tx_desc_qdepth_set; + u16 rx_desc_qdepth; + bool rx_desc_qdepth_set; + u64 rx_frames; + bool rx_frames_set; + u64 rx_bytes; + bool rx_bytes_set; + u64 tx_frames; + bool tx_frames_set; + u64 tx_bytes; + bool tx_bytes_set; +}; + #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ (void __iomem *)(reg_addr)) -- cgit v1.2.3 From 59ccf86fe69a6a77afebe706913d6b551d84d5bc Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Tue, 22 May 2018 00:28:41 -0700 Subject: qed: Add driver infrastucture for handling mfw requests. MFW requests the TLVs in interrupt context. Extracting of the required data from upper layers and populating of the TLVs require process context. The patch adds work-queues for processing the tlv requests. It also adds the implementation for requesting the tlv values from appropriate protocol driver. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 92b5352287df..44af652e7407 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -751,6 +751,14 @@ struct qed_int_info { u8 used_cnt; }; +struct qed_generic_tlvs { +#define QED_TLV_IP_CSUM BIT(0) +#define QED_TLV_LSO BIT(1) + u16 feat_flags; +#define QED_TLV_MAC_COUNT 3 + u8 mac[QED_TLV_MAC_COUNT][ETH_ALEN]; +}; + #define QED_NVM_SIGNATURE 0x12435687 enum qed_nvm_flash_cmd { @@ -765,6 +773,8 @@ struct qed_common_cb_ops { void (*link_update)(void *dev, struct qed_link_output *link); void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); + void (*get_generic_tlv_data)(void *dev, struct qed_generic_tlvs *data); + void (*get_protocol_tlv_data)(void *dev, void *data); }; struct qed_selftest_ops { -- cgit v1.2.3 From 1f55236bd8dde69d1860a30c50793fb28d8405ae Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 May 2018 23:46:53 +0200 Subject: netfilter: nf_nat: move common nat code to nat core Copy-pasted, both l3 helpers almost use same code here. Split out the common part into an 'inet' helper. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_core.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h index 235bd0e9a5aa..0d84dd29108d 100644 --- a/include/net/netfilter/nf_nat_core.h +++ b/include/net/netfilter/nf_nat_core.h @@ -11,6 +11,13 @@ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb); +unsigned int +nf_nat_inet_fn(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state, + unsigned int (*do_chain)(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state)); + int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family); static inline int nf_nat_initialized(struct nf_conn *ct, -- cgit v1.2.3 From 4e25ceb80b585891c5e2a6edfa481bc4709e9544 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 May 2018 23:46:55 +0200 Subject: netfilter: nf_tables: allow chain type to override hook register Will be used in followup patch when nat types no longer use nf_register_net_hook() but will instead register with the nat core. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 435c9e3b9181..a94fd0c730d6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -880,8 +880,8 @@ enum nft_chain_types { * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions - * @init: chain initialization function - * @free: chain release function + * @ops_register: base chain register function + * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; @@ -890,8 +890,8 @@ struct nft_chain_type { struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NF_MAX_HOOKS]; - int (*init)(struct nft_ctx *ctx); - void (*free)(struct nft_ctx *ctx); + int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); + void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, -- cgit v1.2.3 From 1cd472bf036ca038e783ef5f058f54e45b7e8180 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 May 2018 23:46:57 +0200 Subject: netfilter: nf_nat: add nat hook register functions to nf_nat This adds the infrastructure to register nat hooks with the nat core instead of the netfilter core. nat hooks are used to configure nat bindings. Such hooks are registered from ip(6)table_nat or by the nftables core when a nat chain is added. After next patch, nat hooks will be registered with nf_nat instead of netfilter core. This allows to use many nat lookup functions at the same time while doing the real packet rewrite (nat transformation) in one place. This change doesn't convert the intended users yet to ease review. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index da3d601cadee..a17eb2f8d40e 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -75,4 +75,8 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum, #endif } +int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, + const struct nf_hook_ops *nat_ops, unsigned int ops_count); +void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, + unsigned int ops_count); #endif -- cgit v1.2.3 From 9971a514ed2697e542f3984a6162eac54bb1da98 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 May 2018 23:46:58 +0200 Subject: netfilter: nf_nat: add nat type hooks to nat core Currently the packet rewrite and instantiation of nat NULL bindings happens from the protocol specific nat backend. Invocation occurs either via ip(6)table_nat or the nf_tables nat chain type. Invocation looks like this (simplified): NF_HOOK() | `---iptable_nat | `---> nf_nat_l3proto_ipv4 -> nf_nat_packet | new packet? pass skb though iptables nat chain | `---> iptable_nat: ipt_do_table In nft case, this looks the same (nft_chain_nat_ipv4 instead of iptable_nat). This is a problem for two reasons: 1. Can't use iptables nat and nf_tables nat at the same time, as the first user adds a nat binding (nf_nat_l3proto_ipv4 adds a NULL binding if do_table() did not find a matching nat rule so we can detect post-nat tuple collisions). 2. If you use e.g. nft_masq, snat, redir, etc. uses must also register an empty base chain so that the nat core gets called fro NF_HOOK() to do the reverse translation, which is neither obvious nor user friendly. After this change, the base hook gets registered not from iptable_nat or nftables nat hooks, but from the l3 nat core. iptables/nft nat base hooks get registered with the nat core instead: NF_HOOK() | `---> nf_nat_l3proto_ipv4 -> nf_nat_packet | new packet? pass skb through iptables/nftables nat chains | +-> iptables_nat: ipt_do_table +-> nft nat chain x `-> nft nat chain y The nat core deals with null bindings and reverse translation. When no mapping exists, it calls the registered nat lookup hooks until one creates a new mapping. If both iptables and nftables nat hooks exist, the first matching one is used (i.e., higher priority wins). Also, nft users do not need to create empty nat hooks anymore, nat core always registers the base hooks that take care of reverse/reply translation. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_core.h | 5 +--- include/net/netfilter/nf_nat_l3proto.h | 52 +++------------------------------- 2 files changed, 5 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h index 0d84dd29108d..c78e9be14b3d 100644 --- a/include/net/netfilter/nf_nat_core.h +++ b/include/net/netfilter/nf_nat_core.h @@ -13,10 +13,7 @@ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); + const struct nf_hook_state *state); int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family); diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h index 8bad2560576f..d300b8f03972 100644 --- a/include/net/netfilter/nf_nat_l3proto.h +++ b/include/net/netfilter/nf_nat_l3proto.h @@ -44,58 +44,14 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum); -unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); - -unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); - -unsigned int nf_nat_ipv4_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); - -unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); - int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); -unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); - -unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); - -unsigned int nf_nat_ipv6_local_fn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); +int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops); +void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops); -unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - unsigned int (*do_chain)(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state)); +int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops); +void nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops); #endif /* _NF_NAT_L3PROTO_H */ -- cgit v1.2.3 From a37061a678cab6d164f2989dd6f3b65f730289c7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 May 2018 23:46:59 +0200 Subject: netfilter: lift one-nat-hook-only restriction This reverts commit f92b40a8b2645 ("netfilter: core: only allow one nat hook per hook point"), this limitation is no longer needed. The nat core now invokes these functions and makes sure that hook evaluation stops after a mapping is created and a null binding is created otherwise. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 85a1a0b32c66..72f5871b9a0a 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -67,7 +67,6 @@ struct nf_hook_ops { struct net_device *dev; void *priv; u_int8_t pf; - bool nat_hook; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; -- cgit v1.2.3 From 1f4b24397d3e164dde7026b91056e67304724fb6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 23 May 2018 09:17:12 +0200 Subject: netfilter: add struct nf_ct_hook and use it Move the nf_ct_destroy indirection to the struct nf_ct_hook. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 72f5871b9a0a..75ded6f6eebe 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -373,13 +373,18 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); -extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif struct nf_conn; enum ip_conntrack_info; + +struct nf_ct_hook { + void (*destroy)(struct nf_conntrack *); +}; +extern struct nf_ct_hook __rcu *nf_ct_hook; + struct nlattr; struct nfnl_ct_hook { -- cgit v1.2.3 From 2c205dd3981f79cef097207ba9c61c2260812f39 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 23 May 2018 09:17:19 +0200 Subject: netfilter: add struct nf_nat_hook and use it Move decode_session() and parse_nat_setup_hook() indirections to struct nf_nat_hook structure. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 21 ++++++++++++++++----- include/net/netfilter/nf_nat_core.h | 7 ------- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 75ded6f6eebe..e8d09dc028f6 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -320,18 +320,29 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry); #include -extern void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); + +struct nf_conn; +enum nf_nat_manip_type; +struct nlattr; + +struct nf_nat_hook { + int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, + const struct nlattr *attr); + void (*decode_session)(struct sk_buff *skb, struct flowi *fl); +}; + +extern struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #ifdef CONFIG_NF_NAT_NEEDED - void (*decodefn)(struct sk_buff *, struct flowi *); + struct nf_nat_hook *nat_hook; rcu_read_lock(); - decodefn = rcu_dereference(nf_nat_decode_session_hook); - if (decodefn) - decodefn(skb, fl); + nat_hook = rcu_dereference(nf_nat_hook); + if (nat_hook->decode_session) + nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif } diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h index c78e9be14b3d..dc7cd0440229 100644 --- a/include/net/netfilter/nf_nat_core.h +++ b/include/net/netfilter/nf_nat_core.h @@ -26,11 +26,4 @@ static inline int nf_nat_initialized(struct nf_conn *ct, return ct->status & IPS_DST_NAT_DONE; } -struct nlattr; - -extern int -(*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, - enum nf_nat_manip_type manip, - const struct nlattr *attr); - #endif /* _NF_NAT_CORE_H */ -- cgit v1.2.3 From 368982cd7d1bd41cd39049c794990aca3770db44 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 23 May 2018 09:17:24 +0200 Subject: netfilter: nfnetlink_queue: resolve clash for unconfirmed conntracks In nfqueue, two consecutive skbuffs may race to create the conntrack entry. Hence, the one that loses the race gets dropped due to clash in the insertion into the hashes from the nf_conntrack_confirm() path. This patch adds a new nf_conntrack_update() function which searches for possible clashes and resolve them. NAT mangling for the packet losing race is corrected by using the conntrack information that won race. In order to avoid direct module dependencies with conntrack and NAT, the nf_ct_hook and nf_nat_hook structures are used for this purpose. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e8d09dc028f6..04551af2ff23 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -324,11 +324,15 @@ int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry); struct nf_conn; enum nf_nat_manip_type; struct nlattr; +enum ip_conntrack_dir; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); + unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, + enum nf_nat_manip_type mtype, + enum ip_conntrack_dir dir); }; extern struct nf_nat_hook __rcu *nf_nat_hook; @@ -392,6 +396,7 @@ struct nf_conn; enum ip_conntrack_info; struct nf_ct_hook { + int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); }; extern struct nf_ct_hook __rcu *nf_ct_hook; -- cgit v1.2.3 From d4e36e5554eb92f3ec7fedad3efb602570584df4 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Fri, 20 Apr 2018 13:49:25 +0300 Subject: mac80211: Support adding duration for prepare_tx() callback There are specific cases, such as SAE authentication exchange, that might require long duration to complete. For such cases, add support for indicating to the driver the required duration of the prepare_tx() operation, so the driver would still be able to complete the frame exchange. Currently, indicate the duration only for SAE authentication exchange, as SAE authentication can take up to 2000 msec (as defined in IEEE P802.11-REVmd D1.0 p. 3504). As the patch modified the prepare_tx() callback API, also modify the relevant code in iwlwifi. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 604d738a2128..851a5e19ae32 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3378,6 +3378,8 @@ enum ieee80211_reconfig_type { * frame in case that no beacon was heard from the AP/P2P GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. + * If duration is greater than zero, mac80211 hints to the driver the + * duration for which the operation is requested. * The callback is optional and can (should!) sleep. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending @@ -3697,7 +3699,8 @@ struct ieee80211_ops { u32 sset, u8 *data); void (*mgd_prepare_tx)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + u16 duration); void (*mgd_protect_tdls_discover)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); -- cgit v1.2.3 From 76804d28c32ec14e1fdb3981623e5b7a4bc1c739 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 22 May 2018 10:19:06 +0200 Subject: cfg80211: use separate struct for FILS parameters Put FILS related parameters into their own struct definition so it can be reused for roam events in subsequent change. Reviewed-by: Jithu Jance Reviewed-by: Eylon Pedinovsky Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index df33c2766d22..0e4a2a04d55d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5420,6 +5420,30 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) #define CFG80211_TESTMODE_DUMP(cmd) #endif +/** + * struct cfg80211_fils_resp_params - FILS connection response params + * @kek: KEK derived from a successful FILS connection (may be %NULL) + * @kek_len: Length of @fils_kek in octets + * @update_erp_next_seq_num: Boolean value to specify whether the value in + * @erp_next_seq_num is valid. + * @erp_next_seq_num: The next sequence number to use in ERP message in + * FILS Authentication. This value should be specified irrespective of the + * status for a FILS connection. + * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL). + * @pmk_len: Length of @pmk in octets + * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID + * used for this FILS connection (may be %NULL). + */ +struct cfg80211_fils_resp_params { + const u8 *kek; + size_t kek_len; + bool update_erp_next_seq_num; + u16 erp_next_seq_num; + const u8 *pmk; + size_t pmk_len; + const u8 *pmkid; +}; + /** * struct cfg80211_connect_resp_params - Connection response params * @status: Status code, %WLAN_STATUS_SUCCESS for successful connection, use @@ -5438,17 +5462,7 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) * @req_ie_len: Association request IEs length * @resp_ie: Association response IEs (may be %NULL) * @resp_ie_len: Association response IEs length - * @fils_kek: KEK derived from a successful FILS connection (may be %NULL) - * @fils_kek_len: Length of @fils_kek in octets - * @update_erp_next_seq_num: Boolean value to specify whether the value in - * @fils_erp_next_seq_num is valid. - * @fils_erp_next_seq_num: The next sequence number to use in ERP message in - * FILS Authentication. This value should be specified irrespective of the - * status for a FILS connection. - * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL). - * @pmk_len: Length of @pmk in octets - * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID - * used for this FILS connection (may be %NULL). + * @fils: FILS connection response parameters. * @timeout_reason: Reason for connection timeout. This is used when the * connection fails due to a timeout instead of an explicit rejection from * the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is @@ -5464,13 +5478,7 @@ struct cfg80211_connect_resp_params { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; - const u8 *fils_kek; - size_t fils_kek_len; - bool update_erp_next_seq_num; - u16 fils_erp_next_seq_num; - const u8 *pmk; - size_t pmk_len; - const u8 *pmkid; + struct cfg80211_fils_resp_params fils; enum nl80211_timeout_reason timeout_reason; }; -- cgit v1.2.3 From e841b7b11ebd0b359e07bb3d7caf15dca1a80b72 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 22 May 2018 10:19:07 +0200 Subject: nl80211: add FILS related parameters to ROAM event In case of FILS shared key offload the parameters can change upon roaming of which user-space needs to be notified. Reviewed-by: Jithu Jance Reviewed-by: Eylon Pedinovsky Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0e4a2a04d55d..1e7c0df4fe33 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5624,6 +5624,7 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length + * @fils: FILS related roaming information. */ struct cfg80211_roam_info { struct ieee80211_channel *channel; @@ -5633,6 +5634,7 @@ struct cfg80211_roam_info { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; + struct cfg80211_fils_resp_params fils; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 83ed1dd757e6..0a412335d56b 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -215,7 +215,8 @@ * as specified in IETF RFC 6696. * * When FILS shared key authentication is completed, driver needs to provide the - * below additional parameters to userspace. + * below additional parameters to userspace, which can be either after setting + * up a connection or after roaming. * %NL80211_ATTR_FILS_KEK - used for key renewal * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used in further EAP-RP exchanges * %NL80211_ATTR_PMKID - used to identify the PMKSA used/generated -- cgit v1.2.3 From 7f9a3e150ec7d3596386449c15aefb59904a1266 Mon Sep 17 00:00:00 2001 From: Vidyullatha Kanchanapally Date: Tue, 22 May 2018 10:19:08 +0200 Subject: nl80211: Update ERP info using NL80211_CMD_UPDATE_CONNECT_PARAMS Use NL80211_CMD_UPDATE_CONNECT_PARAMS to update new ERP information, Association IEs and the Authentication type to driver / firmware which will be used in subsequent roamings. Signed-off-by: Vidyullatha Kanchanapally [arend: extended fils-sk kernel doc and added check in wiphy_register()] Reviewed-by: Jithu Jance Reviewed-by: Eylon Pedinovsky Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ include/uapi/linux/nl80211.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1e7c0df4fe33..5fbfe61f41c6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2225,9 +2225,14 @@ struct cfg80211_connect_params { * have to be updated as part of update_connect_params() call. * * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated + * @UPDATE_FILS_ERP_INFO: Indicates that FILS connection parameters (realm, + * username, erp sequence number and rrk) are updated + * @UPDATE_AUTH_TYPE: Indicates that authentication type is updated */ enum cfg80211_connect_params_changed { UPDATE_ASSOC_IES = BIT(0), + UPDATE_FILS_ERP_INFO = BIT(1), + UPDATE_AUTH_TYPE = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0a412335d56b..06f9af23156b 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -204,7 +204,8 @@ * FILS shared key authentication offload should be able to construct the * authentication and association frames for FILS shared key authentication and * eventually do a key derivation as per IEEE 802.11ai. The below additional - * parameters should be given to driver in %NL80211_CMD_CONNECT. + * parameters should be given to driver in %NL80211_CMD_CONNECT and/or in + * %NL80211_CMD_UPDATE_CONNECT_PARAMS. * %NL80211_ATTR_FILS_ERP_USERNAME - used to construct keyname_nai * %NL80211_ATTR_FILS_ERP_REALM - used to construct keyname_nai * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used to construct erp message -- cgit v1.2.3 From dcab51f19b291d5ee23724c51b0a3a597c16451a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 15:03:31 -0700 Subject: bpf: Expose check_uarg_tail_zero() This patch exposes check_uarg_tail_zero() which will be reused by a later BTF patch. Its name is changed to bpf_check_uarg_tail_zero(). Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ed0122b45b63..f6fe3c719ca8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -463,6 +463,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); +int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, + size_t actual_size); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. -- cgit v1.2.3 From f80442a4cd1864154beaa060bb483a2c9f7d811f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:18 -0700 Subject: bpf: btf: Change how section is supported in btf_header There are currently unused section descriptions in the btf_header. Those sections are here to support future BTF use cases. For example, the func section (func_off) is to support function signature (e.g. the BPF prog function signature). Instead of spelling out all potential sections up-front in the btf_header. This patch makes changes to btf_header such that extending it (e.g. adding a section) is possible later. The unused ones can be removed for now and they can be added back later. This patch: 1. adds a hdr_len to the btf_header. It will allow adding sections (and other info like parent_label and parent_name) later. The check is similar to the existing bpf_attr. If a user passes in a longer hdr_len, the kernel ensures the extra tailing bytes are 0. 2. allows the section order in the BTF object to be different from its sec_off order in btf_header. 3. each sec_off is followed by a sec_len. It must not have gap or overlapping among sections. The string section is ensured to be at the end due to the 4 bytes alignment requirement of the type section. The above changes will allow enough flexibility to add new sections (and other info) to the btf_header later. This patch also removes an unnecessary !err check at the end of btf_parse(). Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index bcb56ee47014..4fa479741a02 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -12,15 +12,11 @@ struct btf_header { __u16 magic; __u8 version; __u8 flags; - - __u32 parent_label; - __u32 parent_name; + __u32 hdr_len; /* All offsets are in bytes relative to the end of this header */ - __u32 label_off; /* offset of label section */ - __u32 object_off; /* offset of data object section*/ - __u32 func_off; /* offset of function section */ __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ __u32 str_off; /* offset of string section */ __u32 str_len; /* length of string section */ }; -- cgit v1.2.3 From aea2f7b8911617d7a8c23fb73d69e78764f91b58 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:20 -0700 Subject: bpf: btf: Remove unused bits from uapi/linux/btf.h This patch does the followings: 1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k. We can raise it later. 2. Remove the BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID. They are currently encoded at the highest bit of a u32. It is because the current use case does not require supporting parent type (i.e type_id referring to a type in another BTF file). It also does not support referring to a string in ELF. The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are defined in btf.c instead of uapi/linux/btf.h. 3. Limit the BTF_INFO_KIND from 5 bits to 4 bits which is enough. There is unused bits headroom if we ever needed it later. 4. The root bit in BTF_INFO is also removed because it is not used in the current use case. 5. Remove BTF_INT_VARARGS since func type is not supported now. The BTF_INT_ENCODING is limited to 4 bits instead of 8 bits. The above can be added back later because the verifier ensures the unused bits are zeros. Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 4fa479741a02..0b5ddbe135a4 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -22,28 +22,19 @@ struct btf_header { }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x7fffffff +#define BTF_MAX_TYPE 0x0000ffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x7fffffff +#define BTF_MAX_NAME_OFFSET 0x0000ffff /* Max # of struct/union/enum members or func args */ #define BTF_MAX_VLEN 0xffff -/* The type id is referring to a parent BTF */ -#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1) -#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE) - -/* String is in the ELF string section */ -#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1) -#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET) - struct btf_type { __u32 name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-28: kind (e.g. int, ptr, array...etc) - * bits 29-30: unused - * bits 31: root + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-31: unused */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -58,8 +49,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) -#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80)) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_KIND_UNKN 0 /* Unknown */ @@ -84,15 +74,14 @@ struct btf_type { /* BTF_KIND_INT is followed by a u32 and the following * is the 32 bits arrangement: */ -#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24) +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) #define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff) /* Attributes stored in the BTF_INT_ENCODING */ -#define BTF_INT_SIGNED 0x1 -#define BTF_INT_CHAR 0x2 -#define BTF_INT_BOOL 0x4 -#define BTF_INT_VARARGS 0x8 +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". * The exact number of btf_enum is stored in the vlen (of the -- cgit v1.2.3 From 9b2cf328b2eccf761537a06bef914d2a0700fba7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:21 -0700 Subject: bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id" could cause confusion because the "id" of "btf_id" means the BPF obj id given to the BTF object while "btf_key_id" and "btf_value_id" means the BTF type id within that BTF object. To make it clear, btf_key_id and btf_value_id are renamed to btf_key_type_id and btf_value_type_id. Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f6fe3c719ca8..1795eeee846c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -69,8 +69,8 @@ struct bpf_map { u32 pages; u32 id; int numa_node; - u32 btf_key_id; - u32 btf_value_id; + u32 btf_key_type_id; + u32 btf_value_type_id; struct btf *btf; bool unpriv_array; /* 55 bytes hole */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97446bbe2ca5..c3e502d06bc3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -284,8 +284,8 @@ union bpf_attr { char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ __u32 btf_fd; /* fd pointing to a BTF type data */ - __u32 btf_key_id; /* BTF type_id of the key */ - __u32 btf_value_id; /* BTF type_id of the value */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -2219,8 +2219,8 @@ struct bpf_map_info { __u64 netns_dev; __u64 netns_ino; __u32 btf_id; - __u32 btf_key_id; - __u32 btf_value_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; } __attribute__((aligned(8))); struct bpf_btf_info { -- cgit v1.2.3 From 0c6bca747111dee19aa48c8f73d77fc85fcb8dd0 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 15 May 2018 21:23:31 +0900 Subject: netfilter: nf_tables: remove nft_af_info. The struct nft_af_info was removed. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netns/nftables.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 48134353411d..29c3851b486a 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -4,8 +4,6 @@ #include -struct nft_af_info; - struct netns_nftables { struct list_head tables; struct list_head commit_list; -- cgit v1.2.3 From 449325b52b7a6208f65ed67d3484fd7b7184477b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 21 May 2018 19:22:29 -0700 Subject: umh: introduce fork_usermode_blob() helper Introduce helper: int fork_usermode_blob(void *data, size_t len, struct umh_info *info); struct umh_info { struct file *pipe_to_umh; struct file *pipe_from_umh; pid_t pid; }; that GPLed kernel modules (signed or unsigned) can use it to execute part of its own data as swappable user mode process. The kernel will do: - allocate a unique file in tmpfs - populate that file with [data, data + len] bytes - user-mode-helper code will do_execve that file and, before the process starts, the kernel will create two unix pipes for bidirectional communication between kernel module and umh - close tmpfs file, effectively deleting it - the fork_usermode_blob will return zero on success and populate 'struct umh_info' with two unix pipes and the pid of the user process As the first step in the development of the bpfilter project the fork_usermode_blob() helper is introduced to allow user mode code to be invoked from a kernel module. The idea is that user mode code plus normal kernel module code are built as part of the kernel build and installed as traditional kernel module into distro specified location, such that from a distribution point of view, there is no difference between regular kernel modules and kernel modules + umh code. Such modules can be signed, modprobed, rmmod, etc. The use of this new helper by a kernel module doesn't make it any special from kernel and user space tooling point of view. Such approach enables kernel to delegate functionality traditionally done by the kernel modules into the user space processes (either root or !root) and reduces security attack surface of the new code. The buggy umh code would crash the user process, but not the kernel. Another advantage is that umh code of the kernel module can be debugged and tested out of user space (e.g. opening the possibility to run clang sanitizers, fuzzers or user space test suites on the umh code). In case of the bpfilter project such architecture allows complex control plane to be done in the user space while bpf based data plane stays in the kernel. Since umh can crash, can be oom-ed by the kernel, killed by the admin, the kernel module that uses them (like bpfilter) needs to manage life time of umh on its own via two unix pipes and the pid of umh. The exit code of such kernel module should kill the umh it started, so that rmmod of the kernel module will cleanup the corresponding umh. Just like if the kernel module does kmalloc() it should kfree() it in the exit code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/binfmts.h | 1 + include/linux/umh.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 4955e0863b83..c05f24fac4f6 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -150,5 +150,6 @@ extern int do_execveat(int, struct filename *, const char __user * const __user *, const char __user * const __user *, int); +int do_execve_file(struct file *file, void *__argv, void *__envp); #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/umh.h b/include/linux/umh.h index 244aff638220..5c812acbb80a 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -22,8 +22,10 @@ struct subprocess_info { const char *path; char **argv; char **envp; + struct file *file; int wait; int retval; + pid_t pid; int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -38,6 +40,16 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); +struct subprocess_info *call_usermodehelper_setup_file(struct file *file, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data); +struct umh_info { + struct file *pipe_to_umh; + struct file *pipe_from_umh; + pid_t pid; +}; +int fork_usermode_blob(void *data, size_t len, struct umh_info *info); + extern int call_usermodehelper_exec(struct subprocess_info *info, int wait); -- cgit v1.2.3 From d2ba09c17a0647f899d6c20a11bab9e6d3382f07 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 21 May 2018 19:22:30 -0700 Subject: net: add skeleton of bpfilter kernel module bpfilter.ko consists of bpfilter_kern.c (normal kernel module code) and user mode helper code that is embedded into bpfilter.ko The steps to build bpfilter.ko are the following: - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file is converted into bpfilter_umh.o object file with _binary_net_bpfilter_bpfilter_umh_start and _end symbols Example: $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o 0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end 0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size 0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko bpfilter_kern.c is a normal kernel module code that calls the fork_usermode_blob() helper to execute part of its own data as a user mode process. Notice that _binary_net_bpfilter_bpfilter_umh_start - end is placed into .init.rodata section, so it's freed as soon as __init function of bpfilter.ko is finished. As part of __init the bpfilter.ko does first request/reply action via two unix pipe provided by fork_usermode_blob() helper to make sure that umh is healthy. If not it will kill it via pid. Later bpfilter_process_sockopt() will be called from bpfilter hooks in get/setsockopt() to pass iptable commands into umh via bpfilter.ko If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will kill umh as well. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 15 +++++++++++++++ include/uapi/linux/bpfilter.h | 21 +++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 include/linux/bpfilter.h create mode 100644 include/uapi/linux/bpfilter.h (limited to 'include') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h new file mode 100644 index 000000000000..687b1760bb9f --- /dev/null +++ b/include/linux/bpfilter.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BPFILTER_H +#define _LINUX_BPFILTER_H + +#include + +struct sock; +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval, + unsigned int optlen); +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval, + int *optlen); +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, + char __user *optval, + unsigned int optlen, bool is_set); +#endif diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h new file mode 100644 index 000000000000..2ec3cc99ea4c --- /dev/null +++ b/include/uapi/linux/bpfilter.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _UAPI_LINUX_BPFILTER_H +#define _UAPI_LINUX_BPFILTER_H + +#include + +enum { + BPFILTER_IPT_SO_SET_REPLACE = 64, + BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65, + BPFILTER_IPT_SET_MAX, +}; + +enum { + BPFILTER_IPT_SO_GET_INFO = 64, + BPFILTER_IPT_SO_GET_ENTRIES = 65, + BPFILTER_IPT_SO_GET_REVISION_MATCH = 66, + BPFILTER_IPT_SO_GET_REVISION_TARGET = 67, + BPFILTER_IPT_GET_MAX, +}; + +#endif /* _UAPI_LINUX_BPFILTER_H */ -- cgit v1.2.3 From 8eea1ca82be90a7e7a4624ab9cb323574a5f71df Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 22 May 2018 11:34:40 -0400 Subject: gso: limit udp gso to egress-only virtual devices Until the udp receive stack supports large packets (UDP GRO), GSO packets must not loop from the egress to the ingress path. Revert the change that added NETIF_F_GSO_UDP_L4 to various virtual devices through NETIF_F_GSO_ENCAP_ALL as this included devices that may loop packets, such as veth and macvlan. Instead add it to specific devices that forward to another device's egress path, bonding and team. Fixes: 83aa025f535f ("udp: add gso support to virtual devices") CC: Alexander Duyck Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index c87c3a3453c1..623bb8ced060 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -220,7 +220,6 @@ enum { NETIF_F_GSO_GRE_CSUM | \ NETIF_F_GSO_IPXIP4 | \ NETIF_F_GSO_IPXIP6 | \ - NETIF_F_GSO_UDP_L4 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) -- cgit v1.2.3 From 404eb77ea766260c45cb05c4a8043b13bd7142d5 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 22 May 2018 14:03:27 -0700 Subject: ipv4: support sport, dport and ip_proto in RTM_GETROUTE This is a followup to fib rules sport, dport and ipproto match support. Only supports tcp, udp and icmp for ipproto. Used by fib rule self tests. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip.h | 3 +++ include/uapi/linux/rtnetlink.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index bada1f1f871e..0d2281b4b27a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -664,4 +664,7 @@ extern int sysctl_icmp_msgs_burst; int ip_misc_proc_init(void); #endif +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, + struct netlink_ext_ack *extack); + #endif /* _IP_H */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 9b15005955fa..cabb210c93af 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -327,6 +327,9 @@ enum rtattr_type_t { RTA_PAD, RTA_UID, RTA_TTL_PROPAGATE, + RTA_IP_PROTO, + RTA_SPORT, + RTA_DPORT, __RTA_MAX }; -- cgit v1.2.3 From dbecd7388476aedeb66389febea84d5450d28773 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:48 +0530 Subject: bpf: get kernel symbol addresses via syscall This adds new two new fields to struct bpf_prog_info. For multi-function programs, these fields can be used to pass a list of kernel symbol addresses for all functions in a given program to userspace using the bpf system call with the BPF_OBJ_GET_INFO_BY_FD command. When bpf_jit_kallsyms is enabled, we can get the address of the corresponding kernel symbol for a callee function and resolve the symbol's name. The address is determined by adding the value of the call instruction's imm field to __bpf_call_base. This offset gets assigned to the imm field by the verifier. For some architectures, such as powerpc64, the imm field is not large enough to hold this offset. We resolve this by: [1] Assigning the subprog id to the imm field of a call instruction in the verifier instead of the offset of the callee's symbol's address from __bpf_call_base. [2] Determining the address of a callee's corresponding symbol by using the imm field as an index for the list of kernel symbol addresses now available from the program info. Suggested-by: Daniel Borkmann Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c3e502d06bc3..0be90965867d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2205,6 +2205,8 @@ struct bpf_prog_info { __u32 gpl_compatible:1; __u64 netns_dev; __u64 netns_ino; + __u32 nr_jited_ksyms; + __aligned_u64 jited_ksyms; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From 815581c11cc29f74af252b6306ea1ec94160231a Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 24 May 2018 12:26:52 +0530 Subject: bpf: get JITed image lengths of functions via syscall This adds new two new fields to struct bpf_prog_info. For multi-function programs, these fields can be used to pass a list of the JITed image lengths of each function for a given program to userspace using the bpf system call with the BPF_OBJ_GET_INFO_BY_FD command. This can be used by userspace applications like bpftool to split up the contiguous JITed dump, also obtained via the system call, into more relatable chunks corresponding to each function. Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0be90965867d..344d2ddcef49 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2206,7 +2206,9 @@ struct bpf_prog_info { __u64 netns_dev; __u64 netns_ino; __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From 63526e1c805b5a8992d25386d315009fcabac8e2 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:12 +0100 Subject: ipv6: sr: make seg6.h includable without IPv6 include/net/seg6.h cannot be included in a source file if CONFIG_IPV6 is not enabled: include/net/seg6.h: In function 'seg6_pernet': >> include/net/seg6.h:52:14: error: 'struct net' has no member named 'ipv6'; did you mean 'ipv4'? return net->ipv6.seg6_data; ^~~~ ipv4 This commit makes seg6_pernet return NULL if IPv6 is not compiled, hence allowing seg6.h to be included regardless of the configuration. Signed-off-by: Mathieu Xhonneux Signed-off-by: Daniel Borkmann --- include/net/seg6.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/seg6.h b/include/net/seg6.h index 099bad59dc90..70b4cfac52d7 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -49,7 +49,11 @@ struct seg6_pernet_data { static inline struct seg6_pernet_data *seg6_pernet(struct net *net) { +#if IS_ENABLED(CONFIG_IPV6) return net->ipv6.seg6_data; +#else + return NULL; +#endif } extern int seg6_init(void); -- cgit v1.2.3 From 1c1e761ef1a08a878a040e67979711015cfc163e Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:13 +0100 Subject: ipv6: sr: export function lookup_nexthop The function lookup_nexthop is essential to implement most of the seg6local actions. As we want to provide a BPF helper allowing to apply some of these actions on the packet being processed, the helper should be able to call this function, hence the need to make it public. Moreover, if one argument is incorrect or if the next hop can not be found, an error should be returned by the BPF helper so the BPF program can adapt its processing of the packet (return an error, properly force the drop, ...). This patch hence makes this function return dst->error to indicate a possible error. Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/net/seg6.h | 3 ++- include/net/seg6_local.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 include/net/seg6_local.h (limited to 'include') diff --git a/include/net/seg6.h b/include/net/seg6.h index 70b4cfac52d7..e029e301faa5 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -67,5 +67,6 @@ extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); - +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); #endif diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h new file mode 100644 index 000000000000..57498b23085d --- /dev/null +++ b/include/net/seg6_local.h @@ -0,0 +1,24 @@ +/* + * SR-IPv6 implementation + * + * Authors: + * David Lebrun + * eBPF support: Mathieu Xhonneux + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_LOCAL_H +#define _NET_SEG6_LOCAL_H + +#include +#include + +extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, + u32 tbl_id); + +#endif -- cgit v1.2.3 From fe94cc290f535709d3c5ebd1e472dfd0aec7ee79 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:14 +0100 Subject: bpf: Add IPv6 Segment Routing helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF seg6local hook should be powerful enough to enable users to implement most of the use-cases one could think of. After some thinking, we figured out that the following actions should be possible on a SRv6 packet, requiring 3 specific helpers : - bpf_lwt_seg6_store_bytes: Modify non-sensitive fields of the SRH - bpf_lwt_seg6_adjust_srh: Allow to grow or shrink a SRH (to add/delete TLVs) - bpf_lwt_seg6_action: Apply some SRv6 network programming actions (specifically End.X, End.T, End.B6 and End.B6.Encap) The specifications of these helpers are provided in the patch (see include/uapi/linux/bpf.h). The non-sensitive fields of the SRH are the following : flags, tag and TLVs. The other fields can not be modified, to maintain the SRH integrity. Flags, tag and TLVs can easily be modified as their validity can be checked afterwards via seg6_validate_srh. It is not allowed to modify the segments directly. If one wants to add segments on the path, he should stack a new SRH using the End.B6 action via bpf_lwt_seg6_action. Growing, shrinking or editing TLVs via the helpers will flag the SRH as invalid, and it will have to be re-validated before re-entering the IPv6 layer. This flag is stored in a per-CPU buffer, along with the current header length in bytes. Storing the SRH len in bytes in the control block is mandatory when using bpf_lwt_seg6_adjust_srh. The Header Ext. Length field contains the SRH len rounded to 8 bytes (a padding TLV can be inserted to ensure the 8-bytes boundary). When adding/deleting TLVs within the BPF program, the SRH may temporary be in an invalid state where its length cannot be rounded to 8 bytes without remainder, hence the need to store the length in bytes separately. The caller of the BPF program can then ensure that the SRH's final length is valid using this value. Again, a final SRH modified by a BPF program which doesn’t respect the 8-bytes boundary will be discarded as it will be considered as invalid. Finally, a fourth helper is provided, bpf_lwt_push_encap, which is available from the LWT BPF IN hook, but not from the seg6local BPF one. This helper allows to encapsulate a Segment Routing Header (either with a new outer IPv6 header, or by inlining it directly in the existing IPv6 header) into a non-SRv6 packet. This helper is required if we want to offer the possibility to dynamically encapsulate a SRH for non-SRv6 packet, as the BPF seg6local hook only works on traffic already containing a SRH. This is the BPF equivalent of the seg6 LWT infrastructure, which achieves the same purpose but with a static SRH per route. These helpers require CONFIG_IPV6=y (and not =m). Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/net/seg6_local.h | 8 ++++ include/uapi/linux/bpf.h | 96 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 57498b23085d..661fd5b4d3e0 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -15,10 +15,18 @@ #ifndef _NET_SEG6_LOCAL_H #define _NET_SEG6_LOCAL_H +#include #include #include extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); +struct seg6_bpf_srh_state { + bool valid; + u16 hdrlen; +}; + +DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); + #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 344d2ddcef49..fdaf6a0bfa5b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1902,6 +1902,90 @@ union bpf_attr { * egress otherwise). This is the only flag supported for now. * Return * **SK_PASS** on success, or **SK_DROP** on error. + * + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of param: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of param: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlaying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1976,7 +2060,11 @@ union bpf_attr { FN(fib_lookup), \ FN(sock_hash_update), \ FN(msg_redirect_hash), \ - FN(sk_redirect_hash), + FN(sk_redirect_hash), \ + FN(lwt_push_encap), \ + FN(lwt_seg6_store_bytes), \ + FN(lwt_seg6_adjust_srh), \ + FN(lwt_seg6_action), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2043,6 +2131,12 @@ enum bpf_hdr_start_off { BPF_HDR_START_NET, }; +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ -- cgit v1.2.3 From cd3092c7f8db8c320ea7f1aa7c1adeac2450f43a Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:15 +0100 Subject: bpf: Split lwt inout verifier structures The new bpf_lwt_push_encap helper should only be accessible within the LWT BPF IN hook, and not the OUT one, as this may lead to a skb under panic. At the moment, both LWT BPF IN and OUT share the same list of helpers, whose calls are authorized by the verifier. This patch separates the verifier ops for the IN and OUT hooks, and allows the IN hook to call the bpf_lwt_push_encap helper. This patch is also the occasion to put all lwt_*_func_proto functions together for clarity. At the moment, socks_op_func_proto is in the middle of lwt_inout_func_proto and lwt_xmit_func_proto. Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b67f8793de0d..aa5c8b878474 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -9,8 +9,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) -- cgit v1.2.3 From 004d4b274e2a1a895a0e5dc66158b90a7d463d44 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:16 +0100 Subject: ipv6: sr: Add seg6local action End.BPF This patch adds the End.BPF action to the LWT seg6local infrastructure. This action works like any other seg6local End action, meaning that an IPv6 header with SRH is needed, whose DA has to be equal to the SID of the action. It will also advance the SRH to the next segment, the BPF program does not have to take care of this. Since the BPF program may not be a source of instability in the kernel, it is important to ensure that the integrity of the packet is maintained before yielding it back to the IPv6 layer. The hook hence keeps track if the SRH has been altered through the helpers, and re-validates its content if needed with seg6_validate_srh. The state kept for validation is stored in a per-CPU buffer. The BPF program is not allowed to directly write into the packet, and only some fields of the SRH can be altered through the helper bpf_lwt_seg6_store_bytes. Performances profiling has shown that the SRH re-validation does not induce a significant overhead. If the altered SRH is deemed as invalid, the packet is dropped. This validation is also done before executing any action through bpf_lwt_seg6_action, and will not be performed again if the SRH is not modified after calling the action. The BPF program may return 3 types of return codes: - BPF_OK: the End.BPF action will look up the next destination through seg6_lookup_nexthop. - BPF_REDIRECT: if an action has been executed through the bpf_lwt_seg6_action helper, the BPF program should return this value, as the skb's destination is already set and the default lookup should not be performed. - BPF_DROP : the packet will be dropped. Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/seg6_local.h | 12 ++++++++++++ 3 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index aa5c8b878474..b161e506dcfc 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -12,6 +12,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fdaf6a0bfa5b..e95fec90c2c1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -141,6 +141,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, }; enum bpf_attach_type { diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index ef2d8c3e76c1..edc138bdc56d 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -25,6 +25,7 @@ enum { SEG6_LOCAL_NH6, SEG6_LOCAL_IIF, SEG6_LOCAL_OIF, + SEG6_LOCAL_BPF, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -59,10 +60,21 @@ enum { SEG6_LOCAL_ACTION_END_AS = 13, /* forward to SR-unaware VNF with masquerading */ SEG6_LOCAL_ACTION_END_AM = 14, + /* custom BPF action */ + SEG6_LOCAL_ACTION_END_BPF = 15, __SEG6_LOCAL_ACTION_MAX, }; #define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) +enum { + SEG6_LOCAL_BPF_PROG_UNSPEC, + SEG6_LOCAL_BPF_PROG, + SEG6_LOCAL_BPF_PROG_NAME, + __SEG6_LOCAL_BPF_PROG_MAX, +}; + +#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1) + #endif -- cgit v1.2.3 From 87e5808d52b65fc5b0bfda209ba8864cc2f933e5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 23 May 2018 08:05:20 +0200 Subject: net: phy: replace bool members in struct phy_device with bit-fields In struct phy_device we have a number of flags being defined as type bool. Similar to e.g. struct pci_dev we can save some space by using bit-fields. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 073235e70442..6cd09098427c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -406,13 +406,17 @@ struct phy_device { u32 phy_id; struct phy_c45_device_ids c45_ids; - bool is_c45; - bool is_internal; - bool is_pseudo_fixed_link; - bool has_fixups; - bool suspended; - bool sysfs_links; - bool loopback_enabled; + unsigned is_c45:1; + unsigned is_internal:1; + unsigned is_pseudo_fixed_link:1; + unsigned has_fixups:1; + unsigned suspended:1; + unsigned sysfs_links:1; + unsigned loopback_enabled:1; + + unsigned autoneg:1; + /* The most recently read link state */ + unsigned link:1; enum phy_state state; @@ -429,9 +433,6 @@ struct phy_device { int pause; int asym_pause; - /* The most recently read link state */ - int link; - /* Enabled Interrupts */ u32 interrupts; @@ -444,8 +445,6 @@ struct phy_device { /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; - int autoneg; - int link_timeout; #ifdef CONFIG_LED_TRIGGER_PHY -- cgit v1.2.3 From e549f6f9c098067a99e9de8ac84f5cc2c07ae5c6 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 22 Feb 2018 11:57:10 -0600 Subject: net/dcb: Add dcbnl buffer attribute In this patch, we add dcbnl buffer attribute to allow user change the NIC's buffer configuration such as priority to buffer mapping and buffer size of individual buffer. This attribute combined with pfc attribute allows advanced user to fine tune the qos setting for specific priority queue. For example, user can give dedicated buffer for one or more priorities or user can give large buffer to certain priorities. The dcb buffer configuration will be controlled by lldptool. lldptool -T -i eth2 -V BUFFER prio 0,2,5,7,1,2,3,6 maps priorities 0,1,2,3,4,5,6,7 to receive buffer 0,2,5,7,1,2,3,6 lldptool -T -i eth2 -V BUFFER size 87296,87296,0,87296,0,0,0,0 sets receive buffer size for buffer 0,1,2,3,4,5,6,7 respectively After discussion on mailing list with Jakub, Jiri, Ido and John, we agreed to choose dcbnl over devlink interface since this feature is intended to set port attributes which are governed by the netdev instance of that port, where devlink API is more suitable for global ASIC configurations. We present an use case scenario where dcbnl buffer attribute configured by advance user helps reduce the latency of messages of different sizes. Scenarios description: On ConnectX-5, we run latency sensitive traffic with small/medium message sizes ranging from 64B to 256KB and bandwidth sensitive traffic with large messages sizes 512KB and 1MB. We group small, medium, and large message sizes to their own pfc enables priorities as follow. Priorities 1 & 2 (64B, 256B and 1KB) Priorities 3 & 4 (4KB, 8KB, 16KB, 64KB, 128KB and 256KB) Priorities 5 & 6 (512KB and 1MB) By default, ConnectX-5 maps all pfc enabled priorities to a single lossless fixed buffer size of 50% of total available buffer space. The other 50% is assigned to lossy buffer. Using dcbnl buffer attribute, we create three equal size lossless buffers. Each buffer has 25% of total available buffer space. Thus, the lossy buffer size reduces to 25%. Priority to lossless buffer mappings are set as follow. Priorities 1 & 2 on lossless buffer #1 Priorities 3 & 4 on lossless buffer #2 Priorities 5 & 6 on lossless buffer #3 We observe improvements in latency for small and medium message sizes as follows. Please note that the large message sizes bandwidth performance is reduced but the total bandwidth remains the same. 256B message size (42 % latency reduction) 4K message size (21% latency reduction) 64K message size (16% latency reduction) CC: Ido Schimmel CC: Jakub Kicinski CC: Jiri Pirko CC: Or Gerlitz CC: Parav Pandit CC: Aron Silverton Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/net/dcbnl.h | 4 ++++ include/uapi/linux/dcbnl.h | 11 +++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 207d9ba1f92c..0e5e91be2d30 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -101,6 +101,10 @@ struct dcbnl_rtnl_ops { /* CEE peer */ int (*cee_peer_getpg) (struct net_device *, struct cee_pg *); int (*cee_peer_getpfc) (struct net_device *, struct cee_pfc *); + + /* buffer settings */ + int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *); + int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *); }; #endif /* __NET_DCBNL_H__ */ diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index 2c0c6453c3f4..60aa2e446698 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -163,6 +163,16 @@ struct ieee_pfc { __u64 indications[IEEE_8021QAZ_MAX_TCS]; }; +#define IEEE_8021Q_MAX_PRIORITIES 8 +#define DCBX_MAX_BUFFERS 8 +struct dcbnl_buffer { + /* priority to buffer mapping */ + __u8 prio2buffer[IEEE_8021Q_MAX_PRIORITIES]; + /* buffer size in Bytes */ + __u32 buffer_size[DCBX_MAX_BUFFERS]; + __u32 total_size; +}; + /* CEE DCBX std supported values */ #define CEE_DCBX_MAX_PGS 8 #define CEE_DCBX_MAX_PRIO 8 @@ -406,6 +416,7 @@ enum ieee_attrs { DCB_ATTR_IEEE_MAXRATE, DCB_ATTR_IEEE_QCN, DCB_ATTR_IEEE_QCN_STATS, + DCB_ATTR_DCB_BUFFER, __DCB_ATTR_IEEE_MAX }; #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1) -- cgit v1.2.3 From df5f1361cc080877013f7838b61d31ad31307c2b Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 28 Feb 2018 14:16:47 -0600 Subject: net/mlx5: Add pbmc and pptb in the port_access_reg_cap_mask Add pbmc and pptb in the port_access_reg_cap_mask. These two bits determine if device supports receive buffer configuration. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 3 +++ include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2bc27f8c5b87..db0332a6d23c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1152,6 +1152,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \ MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld) +#define MLX5_CAP_PCAM_REG(mdev, reg) \ + MLX5_GET(pcam_reg, (mdev)->caps.pcam, port_access_reg_cap_mask.regs_5000_to_507f.reg) + #define MLX5_CAP_MCAM_REG(mdev, reg) \ MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_access_reg_cap_mask.access_regs.reg) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b4ea8a9914c4..f687989d336b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8003,6 +8003,17 @@ struct mlx5_ifc_pcam_enhanced_features_bits { u8 ppcnt_statistical_group[0x1]; }; +struct mlx5_ifc_pcam_regs_5000_to_507f_bits { + u8 port_access_reg_cap_mask_127_to_96[0x20]; + u8 port_access_reg_cap_mask_95_to_64[0x20]; + u8 port_access_reg_cap_mask_63_to_32[0x20]; + + u8 port_access_reg_cap_mask_31_to_13[0x13]; + u8 pbmc[0x1]; + u8 pptb[0x1]; + u8 port_access_reg_cap_mask_10_to_0[0xb]; +}; + struct mlx5_ifc_pcam_reg_bits { u8 reserved_at_0[0x8]; u8 feature_group[0x8]; @@ -8012,6 +8023,7 @@ struct mlx5_ifc_pcam_reg_bits { u8 reserved_at_20[0x20]; union { + struct mlx5_ifc_pcam_regs_5000_to_507f_bits regs_5000_to_507f; u8 reserved_at_0[0x80]; } port_access_reg_cap_mask; -- cgit v1.2.3 From 50b4a3c23646254c7345f3663ff1e0a6cbcd9abb Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 2 Mar 2018 15:47:01 -0600 Subject: net/mlx5: PPTB and PBMC register firmware command support Add firmware command interface to read and write PPTB and PBMC registers. PPTB register enables mappings priority to a specific receive buffer. PBMC registers enables changing the receive buffer's configuration such as buffer size, xon/xoff thresholds, buffer's lossy property and buffer's shared property. Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d703774982ca..92d292454351 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -124,6 +124,8 @@ enum { MLX5_REG_PAOS = 0x5006, MLX5_REG_PFCC = 0x5007, MLX5_REG_PPCNT = 0x5008, + MLX5_REG_PPTB = 0x500b, + MLX5_REG_PBMC = 0x500c, MLX5_REG_PMAOS = 0x5012, MLX5_REG_PUDE = 0x5009, MLX5_REG_PMPE = 0x5010, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f687989d336b..edbddeaacc88 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8788,6 +8788,41 @@ struct mlx5_ifc_qpts_reg_bits { u8 trust_state[0x3]; }; +struct mlx5_ifc_pptb_reg_bits { + u8 reserved_at_0[0x2]; + u8 mm[0x2]; + u8 reserved_at_4[0x4]; + u8 local_port[0x8]; + u8 reserved_at_10[0x6]; + u8 cm[0x1]; + u8 um[0x1]; + u8 pm[0x8]; + + u8 prio_x_buff[0x20]; + + u8 pm_msb[0x8]; + u8 reserved_at_48[0x10]; + u8 ctrl_buff[0x4]; + u8 untagged_buff[0x4]; +}; + +struct mlx5_ifc_pbmc_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 xoff_timer_value[0x10]; + u8 xoff_refresh[0x10]; + + u8 reserved_at_40[0x9]; + u8 fullness_threshold[0x7]; + u8 port_buffer_size[0x10]; + + struct mlx5_ifc_bufferx_reg_bits buffer[10]; + + u8 reserved_at_2e0[0x40]; +}; + struct mlx5_ifc_qtct_reg_bits { u8 reserved_at_0[0x8]; u8 port_number[0x8]; -- cgit v1.2.3 From f8d959a5b188dc81e57a6bac34a1b2986f61e2fd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 May 2018 11:21:08 -0700 Subject: perf/core: add perf_get_event() to return perf_event given a struct file A new extern function, perf_get_event(), is added to return a perf event given a struct file. This function will be used in later patches. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e71e99eb9a4e..eec302b61cfe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); +extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline const struct perf_event *perf_get_event(struct file *file) +{ + return ERR_PTR(-EINVAL); +} static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 41bdc4b40ed6fb26c6acc655ed9a243a348709c9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 May 2018 11:21:09 -0700 Subject: bpf: introduce bpf subcommand BPF_TASK_FD_QUERY Currently, suppose a userspace application has loaded a bpf program and attached it to a tracepoint/kprobe/uprobe, and a bpf introspection tool, e.g., bpftool, wants to show which bpf program is attached to which tracepoint/kprobe/uprobe. Such attachment information will be really useful to understand the overall bpf deployment in the system. There is a name field (16 bytes) for each program, which could be used to encode the attachment point. There are some drawbacks for this approaches. First, bpftool user (e.g., an admin) may not really understand the association between the name and the attachment point. Second, if one program is attached to multiple places, encoding a proper name which can imply all these attachments becomes difficult. This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY. Given a pid and fd, if the is associated with a tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return . prog_id . tracepoint name, or . k[ret]probe funcname + offset or kernel addr, or . u[ret]probe filename + offset to the userspace. The user can use "bpftool prog" to find more information about bpf program itself with prog_id. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/trace_events.h | 17 +++++++++++++++++ include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2bde3eff564c..d34144a3c5b5 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name { return NULL; } +static inline int bpf_get_perf_event_info(const struct perf_event *event, + u32 *prog_id, u32 *fd_type, + const char **buf, u64 *probe_offset, + u64 *probe_addr) +{ + return -EOPNOTSUPP; +} #endif enum { @@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); +extern int bpf_get_kprobe_info(const struct perf_event *event, + u32 *fd_type, const char **symbol, + u64 *probe_offset, u64 *probe_addr, + bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); +extern int bpf_get_uprobe_info(const struct perf_event *event, + u32 *fd_type, const char **filename, + u64 *probe_offset, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e95fec90c2c1..9b8c6e310e9a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_cmd { BPF_RAW_TRACEPOINT_OPEN, BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, }; enum bpf_map_type { @@ -380,6 +381,22 @@ union bpf_attr { __u32 btf_log_size; __u32 btf_log_level; }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -2557,4 +2574,13 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 67f29e07e131ffa13ea158c259a513f474c7df27 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:46 +0200 Subject: bpf: devmap introduce dev_map_enqueue Functionality is the same, but the ndo_xdp_xmit call is now simply invoked from inside the devmap.c code. V2: Fix compile issue reported by kbuild test robot V5: Cleanups requested by Daniel - Newlines before func definition - Use BUILD_BUG_ON checks - Remove unnecessary use return value store in dev_map_enqueue Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++++++--- include/trace/events/xdp.h | 9 ++++++++- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1795eeee846c..23a809da452d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -487,14 +487,16 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct xdp_buff; + +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); -struct xdp_buff; int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -573,6 +575,15 @@ static inline void __dev_map_flush(struct bpf_map *map) { } +struct xdp_buff; +struct bpf_dtab_netdev; + +static inline +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +{ + return 0; +} + static inline struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -587,7 +598,6 @@ static inline void __cpu_map_flush(struct bpf_map *map) { } -struct xdp_buff; static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 8989a92c571a..96104610d40e 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, __entry->map_id, __entry->map_index) ); +#ifndef __DEVMAP_OBJ_TYPE +#define __DEVMAP_OBJ_TYPE +struct _bpf_dtab_netdev { + struct net_device *dev; +}; +#endif /* __DEVMAP_OBJ_TYPE */ + #define devmap_ifindex(fwd, map) \ (!fwd ? 0 : \ (!map ? 0 : \ ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct net_device *)fwd)->ifindex : 0))) + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ -- cgit v1.2.3 From 38edddb81172e8b8decb057c0cd23271583a5fa0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:57 +0200 Subject: xdp: add tracepoint for devmap like cpumap have Notice how this allow us get XDP statistic without affecting the XDP performance, as tracepoint is no-longer activated on a per packet basis. V5: Spotted by John Fastabend. Fix 'sent' also counted 'drops' in this patch, a later patch corrected this, but it was a mistake in this intermediate step. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++-- include/trace/events/xdp.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 23a809da452d..bbe297436e5d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -492,7 +492,8 @@ struct xdp_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); @@ -579,7 +580,8 @@ struct xdp_buff; struct bpf_dtab_netdev; static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) { return 0; } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 96104610d40e..2e9ef0650144 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue, __entry->to_cpu) ); +TRACE_EVENT(xdp_devmap_xmit, + + TP_PROTO(const struct bpf_map *map, u32 map_index, + int sent, int drops, + const struct net_device *from_dev, + const struct net_device *to_dev), + + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(u32, map_index) + __field(int, drops) + __field(int, sent) + __field(int, from_ifindex) + __field(int, to_ifindex) + ), + + TP_fast_assign( + __entry->map_id = map->id; + __entry->act = XDP_REDIRECT; + __entry->map_index = map_index; + __entry->drops = drops; + __entry->sent = sent; + __entry->from_ifindex = from_dev->ifindex; + __entry->to_ifindex = to_dev->ifindex; + ), + + TP_printk("ndo_xdp_xmit" + " map_id=%d map_index=%d action=%s" + " sent=%d drops=%d" + " from_ifindex=%d to_ifindex=%d", + __entry->map_id, __entry->map_index, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->sent, __entry->drops, + __entry->from_ifindex, __entry->to_ifindex) +); + #endif /* _TRACE_XDP_H */ #include -- cgit v1.2.3 From 389ab7f01af988c2a1ec5617eb0c7e220df1ef1c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:07 +0200 Subject: xdp: introduce xdp_return_frame_rx_napi When sending an xdp_frame through xdp_do_redirect call, then error cases can happen where the xdp_frame needs to be dropped, and returning an -errno code isn't sufficient/possible any-longer (e.g. for cpumap case). This is already fully supported, by simply calling xdp_return_frame. This patch is an optimization, which provides xdp_return_frame_rx_napi, which is a faster variant for these error cases. It take advantage of the protection provided by XDP RX running under NAPI protection. This change is mostly relevant for drivers using the page_pool allocator as it can take advantage of this. (Tested with mlx5). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/net/page_pool.h | 5 +++-- include/net/xdp.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index c79087153148..694d055e01ef 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool); void __page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct); -static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, false); + __page_pool_put_page(pool, page, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 0b689cf561c7..7ad779237ae8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, -- cgit v1.2.3 From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:12 +0200 Subject: xdp: change ndo_xdp_xmit API to support bulking This patch change the API for ndo_xdp_xmit to support bulking xdp_frames. When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown. Most of the slowdown is caused by DMA API indirect function calls, but also the net_device->ndo_xdp_xmit() call. Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed performance improved: for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps With frames avail as a bulk inside the driver ndo_xdp_xmit call, further optimizations are possible, like bulk DMA-mapping for TX. Testing without CONFIG_RETPOLINE show the same performance for physical NIC drivers. The virtual NIC driver tun sees a huge performance boost, as it can avoid doing per frame producer locking, but instead amortize the locking cost over the bulk. V2: Fix compile errors reported by kbuild test robot V4: Isolated ndo, driver changes and callers. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03ed492c4e14..debdb6286170 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1185,9 +1185,13 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); - * This function is used to submit a XDP packet for transmit on a - * netdevice. + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * This function is used to submit @n XDP packets for transmit on a + * netdevice. Returns number of frames successfully transmitted, frames + * that got dropped are freed/returned via xdp_return_frame(). + * Returns negative number, means general error invoking ndo, meaning + * no frames were xmit'ed and core-caller will free all frames. + * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. @@ -1375,8 +1379,8 @@ struct net_device_ops { int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); - int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_frame *xdp); + int (*ndo_xdp_xmit)(struct net_device *dev, int n, + struct xdp_frame **xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; -- cgit v1.2.3 From e74de52e55c092e7113f839e74400ce9dbe12ceb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:17 +0200 Subject: xdp/trace: extend tracepoint in devmap with an err Extending tracepoint xdp:xdp_devmap_xmit in devmap with an err code allow people to easier identify the reason behind the ndo_xdp_xmit call to a given driver is failing. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/trace/events/xdp.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 2e9ef0650144..1ecf4c67fcf7 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -234,9 +234,9 @@ TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct bpf_map *map, u32 map_index, int sent, int drops, const struct net_device *from_dev, - const struct net_device *to_dev), + const struct net_device *to_dev, int err), - TP_ARGS(map, map_index, sent, drops, from_dev, to_dev), + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), TP_STRUCT__entry( __field(int, map_id) @@ -246,6 +246,7 @@ TRACE_EVENT(xdp_devmap_xmit, __field(int, sent) __field(int, from_ifindex) __field(int, to_ifindex) + __field(int, err) ), TP_fast_assign( @@ -256,16 +257,17 @@ TRACE_EVENT(xdp_devmap_xmit, __entry->sent = sent; __entry->from_ifindex = from_dev->ifindex; __entry->to_ifindex = to_dev->ifindex; + __entry->err = err; ), TP_printk("ndo_xdp_xmit" " map_id=%d map_index=%d action=%s" " sent=%d drops=%d" - " from_ifindex=%d to_ifindex=%d", + " from_ifindex=%d to_ifindex=%d err=%d", __entry->map_id, __entry->map_index, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, - __entry->from_ifindex, __entry->to_ifindex) + __entry->from_ifindex, __entry->to_ifindex, __entry->err) ); #endif /* _TRACE_XDP_H */ -- cgit v1.2.3 From aaa908ffbee18a65529b716efb346a626e81559a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 23 May 2018 15:26:53 -0700 Subject: net_sched: switch to rcu_work Commit 05f0fe6b74db ("RCU, workqueue: Implement rcu_work") introduces new API's for dispatching work in a RCU callback. Now we can just switch to the new API's for tc filters. This could get rid of a lot of code. Cc: Tejun Heo Cc: "Paul E. McKenney" Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0005f0b40fe9..f3ec43725724 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -33,7 +33,7 @@ struct tcf_block_ext_info { }; struct tcf_block_cb; -bool tcf_queue_work(struct work_struct *work); +bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, -- cgit v1.2.3 From 9f323973c915d402378cb3e1336dd6ed4c45144b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 23 May 2018 17:08:47 -0700 Subject: net/ipv4: Udate fib_table_lookup tracepoint Commit 4a2d73a4fb36 ("ipv4: fib_rules: support match on sport, dport and ip proto") added support for protocol and ports to FIB rules. Update the FIB lookup tracepoint to dump the parameters. In addition, make the IPv4 tracepoint similar to the IPv6 one where the lookup parameters and result are dumped in 1 event. It is much easier to use and understand the outcome of the lookup. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/trace/events/fib.h | 72 +++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 81b7e985bb45..f5a1d4c518d8 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -12,12 +12,14 @@ TRACE_EVENT(fib_table_lookup, - TP_PROTO(u32 tb_id, const struct flowi4 *flp), + TP_PROTO(u32 tb_id, const struct flowi4 *flp, + const struct fib_nh *nh, int err), - TP_ARGS(tb_id, flp), + TP_ARGS(tb_id, flp, nh, err), TP_STRUCT__entry( __field( u32, tb_id ) + __field( int, err ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) @@ -25,12 +27,19 @@ TRACE_EVENT(fib_table_lookup, __field( __u8, flags ) __array( __u8, src, 4 ) __array( __u8, dst, 4 ) + __array( __u8, gw, 4 ) + __array( __u8, saddr, 4 ) + __field( u16, sport ) + __field( u16, dport ) + __field( u8, proto ) + __dynamic_array(char, name, IFNAMSIZ ) ), TP_fast_assign( __be32 *p32; __entry->tb_id = tb_id; + __entry->err = err; __entry->oif = flp->flowi4_oif; __entry->iif = flp->flowi4_iif; __entry->tos = flp->flowi4_tos; @@ -42,36 +51,41 @@ TRACE_EVENT(fib_table_lookup, p32 = (__be32 *) __entry->dst; *p32 = flp->daddr; - ), - - TP_printk("table %u oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x", - __entry->tb_id, __entry->oif, __entry->iif, - __entry->src, __entry->dst, __entry->tos, __entry->scope, - __entry->flags) -); - -TRACE_EVENT(fib_table_lookup_nh, - - TP_PROTO(const struct fib_nh *nh), - - TP_ARGS(nh), - - TP_STRUCT__entry( - __string( name, nh->nh_dev->name) - __field( int, oif ) - __array( __u8, src, 4 ) - ), - - TP_fast_assign( - __be32 *p32 = (__be32 *) __entry->src; - __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "not set"); - __entry->oif = nh->nh_oif; - *p32 = nh->nh_saddr; + __entry->proto = flp->flowi4_proto; + if (__entry->proto == IPPROTO_TCP || + __entry->proto == IPPROTO_UDP) { + __entry->sport = ntohs(flp->fl4_sport); + __entry->dport = ntohs(flp->fl4_dport); + } else { + __entry->sport = 0; + __entry->dport = 0; + } + + if (nh) { + p32 = (__be32 *) __entry->saddr; + *p32 = nh->nh_saddr; + + p32 = (__be32 *) __entry->gw; + *p32 = nh->nh_gw; + + __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "-"); + } else { + p32 = (__be32 *) __entry->saddr; + *p32 = 0; + + p32 = (__be32 *) __entry->gw; + *p32 = 0; + + __assign_str(name, "-"); + } ), - TP_printk("nexthop dev %s oif %d src %pI4", - __get_str(name), __entry->oif, __entry->src) + TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4 src %pI4 err %d", + __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, + __entry->src, __entry->sport, __entry->dst, __entry->dport, + __entry->tos, __entry->scope, __entry->flags, + __get_str(name), __entry->gw, __entry->saddr, __entry->err) ); TRACE_EVENT(fib_validate_source, -- cgit v1.2.3 From 30d444d30049490398178ca4337ab49156571886 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 23 May 2018 17:08:48 -0700 Subject: net/ipv6: Udate fib6_table_lookup tracepoint Commit bb0ad1987e96 ("ipv6: fib6_rules: support for match on sport, dport and ip proto") added support for protocol and ports to FIB rules. Update the FIB lookup tracepoint to dump the parameters. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/trace/events/fib6.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 1b8d951e3c12..b088b54d699c 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -19,7 +19,7 @@ TRACE_EVENT(fib6_table_lookup, TP_STRUCT__entry( __field( u32, tb_id ) - + __field( int, err ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) @@ -27,7 +27,10 @@ TRACE_EVENT(fib6_table_lookup, __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) - + __field( u16, sport ) + __field( u16, dport ) + __field( u8, proto ) + __field( u8, rt_type ) __dynamic_array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), @@ -36,6 +39,7 @@ TRACE_EVENT(fib6_table_lookup, struct in6_addr *in6; __entry->tb_id = table->tb6_id; + __entry->err = ip6_rt_type_to_error(f6i->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); @@ -48,10 +52,20 @@ TRACE_EVENT(fib6_table_lookup, in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; + __entry->proto = flp->flowi6_proto; + if (__entry->proto == IPPROTO_TCP || + __entry->proto == IPPROTO_UDP) { + __entry->sport = ntohs(flp->fl6_sport); + __entry->dport = ntohs(flp->fl6_dport); + } else { + __entry->sport = 0; + __entry->dport = 0; + } + if (f6i->fib6_nh.nh_dev) { __assign_str(name, f6i->fib6_nh.nh_dev); } else { - __assign_str(name, ""); + __assign_str(name, "-"); } if (f6i == net->ipv6.fib6_null_entry) { struct in6_addr in6_zero = {}; @@ -65,10 +79,11 @@ TRACE_EVENT(fib6_table_lookup, } ), - TP_printk("table %3u oif %d iif %d src %pI6c dst %pI6c tos %d scope %d flags %x ==> dev %s gw %pI6c", - __entry->tb_id, __entry->oif, __entry->iif, - __entry->src, __entry->dst, __entry->tos, __entry->scope, - __entry->flags, __get_str(name), __entry->gw) + TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", + __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, + __entry->src, __entry->sport, __entry->dst, __entry->dport, + __entry->tos, __entry->scope, __entry->flags, + __get_str(name), __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ -- cgit v1.2.3 From c949cbbbe5d6ff3dcacf82e8fd26f627285e8a12 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 23 May 2018 17:08:49 -0700 Subject: net/ipv4: Remove tracepoint in fib_validate_source Tracepoint does not add value and the call to fib_lookup follows it which shows the same information and the fib lookup result. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/trace/events/fib.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index f5a1d4c518d8..9763cddd0594 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -87,41 +87,6 @@ TRACE_EVENT(fib_table_lookup, __entry->tos, __entry->scope, __entry->flags, __get_str(name), __entry->gw, __entry->saddr, __entry->err) ); - -TRACE_EVENT(fib_validate_source, - - TP_PROTO(const struct net_device *dev, const struct flowi4 *flp), - - TP_ARGS(dev, flp), - - TP_STRUCT__entry( - __string( name, dev->name ) - __field( int, oif ) - __field( int, iif ) - __field( __u8, tos ) - __array( __u8, src, 4 ) - __array( __u8, dst, 4 ) - ), - - TP_fast_assign( - __be32 *p32; - - __assign_str(name, dev ? dev->name : "not set"); - __entry->oif = flp->flowi4_oif; - __entry->iif = flp->flowi4_iif; - __entry->tos = flp->flowi4_tos; - - p32 = (__be32 *) __entry->src; - *p32 = flp->saddr; - - p32 = (__be32 *) __entry->dst; - *p32 = flp->daddr; - ), - - TP_printk("dev %s oif %d iif %d tos %d src %pI4 dst %pI4", - __get_str(name), __entry->oif, __entry->iif, __entry->tos, - __entry->src, __entry->dst) -); #endif /* _TRACE_FIB_H */ /* This part must be outside protection */ -- cgit v1.2.3 From f44aa9ef7950a56daa3a5b41f069761f945f1a1f Mon Sep 17 00:00:00 2001 From: John Hurley Date: Wed, 23 May 2018 19:22:52 -0700 Subject: net: include hash policy in LAG changeupper info LAG upper event notifiers contain the tx type used by the LAG device. Extend this to also include the hash policy used for tx types that utilize hashing. Signed-off-by: John Hurley Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index debdb6286170..8452f72087ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2332,8 +2332,19 @@ enum netdev_lag_tx_type { NETDEV_LAG_TX_TYPE_HASH, }; +enum netdev_lag_hash { + NETDEV_LAG_HASH_NONE, + NETDEV_LAG_HASH_L2, + NETDEV_LAG_HASH_L34, + NETDEV_LAG_HASH_L23, + NETDEV_LAG_HASH_E23, + NETDEV_LAG_HASH_E34, + NETDEV_LAG_HASH_UNKNOWN, +}; + struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; + enum netdev_lag_hash hash_type; }; struct netdev_lag_lower_state_info { -- cgit v1.2.3 From 7d850abd5f4edb1b1ca4b4141a4453305736f564 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 24 May 2018 11:56:48 +0300 Subject: net: bridge: add support for port isolation This patch adds support for a new port flag - BR_ISOLATED. If it is set then isolated ports cannot communicate between each other, but they can still communicate with non-isolated ports. The same can be achieved via ACLs but they can't scale with large number of ports and also the complexity of the rules grows. This feature can be used to achieve isolated vlan functionality (similar to pvlan) as well, though currently it will be port-wide (for all vlans on the port). The new test in should_deliver uses data that is already cache hot and the new boolean is used to avoid an additional source port test in should_deliver. Signed-off-by: Nikolay Aleksandrov Reviewed-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 585d27182425..7843b98e1c6e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -50,6 +50,7 @@ struct br_ip_list { #define BR_VLAN_TUNNEL BIT(13) #define BR_BCAST_FLOOD BIT(14) #define BR_NEIGH_SUPPRESS BIT(15) +#define BR_ISOLATED BIT(16) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b85266420bfb..cf01b6824244 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -333,6 +333,7 @@ enum { IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 3893fc62b1769db3ef160f7f1e36d3db754497ee Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 24 May 2018 09:54:51 -0700 Subject: qed*: Support other classification modes. Currently, driver supports flow classification to PF receive queues based on TCP/UDP 4 tuples [src_ip, dst_ip, src_port, dst_port] only. This patch enables to configure different flow profiles [For example - only UDP dest port or src_ip based] on the adapter so that classification can be done according to just those fields as well. Although, at a time just one type of flow configuration is supported due to limited number of flow profiles available on the device. For example - ethtool -N enp7s0f0 flow-type udp4 dst-port 45762 action 2 ethtool -N enp7s0f0 flow-type tcp4 src-ip 192.16.4.10 action 1 ethtool -N enp7s0f0 flow-type udp6 dst-port 45762 action 3 Signed-off-by: Manish Chopra Signed-off-by: Shahed Shaikh Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_eth_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 7f9756fe9180..557e86e1c7d9 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -66,6 +66,7 @@ enum qed_filter_config_mode { QED_FILTER_CONFIG_MODE_5_TUPLE, QED_FILTER_CONFIG_MODE_L4_PORT, QED_FILTER_CONFIG_MODE_IP_DEST, + QED_FILTER_CONFIG_MODE_IP_SRC, }; struct qed_ntuple_filter_params { -- cgit v1.2.3 From 608e00d0a2eb53079c55dc9c14d8711bbb3a4390 Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Thu, 24 May 2018 09:54:53 -0700 Subject: qed*: Support drop action classification With this patch, User can configure for the supported flows to be dropped. Added a stat "gft_filter_drop" as well to be populated in ethtool for the dropped flows. For example - ethtool -N p5p1 flow-type udp4 dst-port 8000 action -1 ethtool -N p5p1 flow-type tcp4 scr-ip 192.168.8.1 action -1 Signed-off-by: Manish Chopra Signed-off-by: Shahed Shaikh Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_eth_if.h | 3 +++ include/linux/qed/qed_if.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 557e86e1c7d9..2978fa4add42 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -89,6 +89,9 @@ struct qed_ntuple_filter_params { /* true iff this filter is to be added. Else to be removed */ bool b_is_add; + + /* If flow needs to be dropped */ + bool b_is_drop; }; struct qed_dev_eth_info { diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 44af652e7407..ac991a3b8f03 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -1129,6 +1129,7 @@ struct qed_eth_stats_common { u64 rx_bcast_pkts; u64 mftag_filter_discards; u64 mac_filter_discards; + u64 gft_filter_drop; u64 tx_ucast_bytes; u64 tx_mcast_bytes; u64 tx_bcast_bytes; -- cgit v1.2.3 From 5972be6b2495c6bffbf444497517fd1c070eef78 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Thu, 24 May 2018 17:56:42 -0700 Subject: openvswitch: Add conntrack limit netlink definition Define netlink messages and attributes to support user kernel communication that uses the conntrack limit feature. Signed-off-by: Yi-Hung Wei Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 713e56ce681f..863aabaa5cc9 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -937,4 +937,32 @@ enum ovs_meter_band_type { #define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1) +/* Conntrack limit */ +#define OVS_CT_LIMIT_FAMILY "ovs_ct_limit" +#define OVS_CT_LIMIT_MCGROUP "ovs_ct_limit" +#define OVS_CT_LIMIT_VERSION 0x1 + +enum ovs_ct_limit_cmd { + OVS_CT_LIMIT_CMD_UNSPEC, + OVS_CT_LIMIT_CMD_SET, /* Add or modify ct limit. */ + OVS_CT_LIMIT_CMD_DEL, /* Delete ct limit. */ + OVS_CT_LIMIT_CMD_GET /* Get ct limit. */ +}; + +enum ovs_ct_limit_attr { + OVS_CT_LIMIT_ATTR_UNSPEC, + OVS_CT_LIMIT_ATTR_ZONE_LIMIT, /* Nested struct ovs_zone_limit. */ + __OVS_CT_LIMIT_ATTR_MAX +}; + +#define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1) + +#define OVS_ZONE_LIMIT_DEFAULT_ZONE -1 + +struct ovs_zone_limit { + int zone_id; + __u32 limit; + __u32 count; +}; + #endif /* _LINUX_OPENVSWITCH_H */ -- cgit v1.2.3 From b45630021b52c203c2547a95a5c811b57aed4ead Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 24 Apr 2018 11:21:46 +0900 Subject: net/mlx5: Add cap bits for flow table destination in FDB table If set, the FDB table supports the forward action with a destination list that includes a flow table. Signed-off-by: Chris Mi Reviewed-by: Paul Blakey Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index edbddeaacc88..05b480fae27d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -524,7 +524,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits { }; struct mlx5_ifc_flow_table_eswitch_cap_bits { - u8 reserved_at_0[0x200]; + u8 reserved_at_0[0x1c]; + u8 fdb_multi_path_to_table[0x1]; + u8 reserved_at_1d[0x1e3]; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; -- cgit v1.2.3 From 3a2f70331226c140e5aa27ee6bbe2a5c618acb4c Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 4 Apr 2018 12:54:23 +0300 Subject: net/mlx5: Use order-0 allocations for all WQ types Complete the transition of all WQ types to use fragmented order-0 coherent memory instead of high-order allocations. CQ-WQ already uses order-0. Here we do the same for cyclic and linked-list WQs. This allows the driver to load cleanly on systems with a highly fragmented coherent memory. Performance tests: ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz Packet rate of 64B packets, single transmit ring, size 8K. No degradation is sensed. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 92d292454351..80cbb7fdce4a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -983,16 +983,24 @@ static inline u32 mlx5_base_mkey(const u32 key) return key & 0xffffff00u; } -static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, - void *cqc) +static inline void mlx5_fill_fbc(u8 log_stride, u8 log_sz, + struct mlx5_frag_buf_ctrl *fbc) { - fbc->log_stride = 6 + MLX5_GET(cqc, cqc, cqe_sz); - fbc->log_sz = MLX5_GET(cqc, cqc, log_cq_size); + fbc->log_stride = log_stride; + fbc->log_sz = log_sz; fbc->sz_m1 = (1 << fbc->log_sz) - 1; fbc->log_frag_strides = PAGE_SHIFT - fbc->log_stride; fbc->frag_sz_m1 = (1 << fbc->log_frag_strides) - 1; } +static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, + void *cqc) +{ + mlx5_fill_fbc(6 + MLX5_GET(cqc, cqc, cqe_sz), + MLX5_GET(cqc, cqc, log_cq_size), + fbc); +} + static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc, u32 ix) { -- cgit v1.2.3 From 13193b0f392f5a65d0d54185cb95ed5e99c0a5bf Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:22 -0700 Subject: bpf: Define cgroup_bpf_enabled for CONFIG_CGROUP_BPF=n Static key is used to enable/disable cgroup-bpf related code paths at run time. Though it's not defined when cgroup-bpf is disabled at compile time, i.e. CONFIG_CGROUP_BPF=n, and if some code wants to use it, it has to do this: #ifdef CONFIG_CGROUP_BPF if (cgroup_bpf_enabled) { /* ... some work ... */ } #endif This code can be simplified by setting cgroup_bpf_enabled to 0 for CONFIG_CGROUP_BPF=n case: if (cgroup_bpf_enabled) { /* ... some work ... */ } And it aligns well with existing BPF_CGROUP_RUN_PROG_* macros that defined for both states of CONFIG_CGROUP_BPF. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 30d15e64b993..de8e89a3758b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -185,6 +185,7 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) -- cgit v1.2.3 From 1cedee13d25ab118d325f95588c1a084e9317229 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:23 -0700 Subject: bpf: Hooks for sys_sendmsg In addition to already existing BPF hooks for sys_bind and sys_connect, the patch provides new hooks for sys_sendmsg. It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that provides access to socket itlself (properties like family, type, protocol) and user-passed `struct sockaddr *` so that BPF program can override destination IP and port for system calls such as sendto(2) or sendmsg(2) and/or assign source IP to the socket. The hooks are implemented as two new attach types: `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and UDPv6 correspondingly. UDPv4 and UDPv6 separate attach types for same reason as sys_bind and sys_connect hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. The difference with already existing hooks is sys_sendmsg are implemented only for unconnected UDP. For TCP it doesn't make sense to change user-provided `struct sockaddr *` at sendto(2)/sendmsg(2) time since socket either was already connected and has source/destination set or wasn't connected and call to sendto(2)/sendmsg(2) would lead to ENOTCONN anyway. Connected UDP is already handled by sys_connect hooks that can override source/destination at connect time and use fast-path later, i.e. these hooks don't affect UDP fast-path. Rewriting source IP is implemented differently than that in sys_connect hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to just bind socket to desired local IP address since source IP can be set on per-packet basis by using ancillary data (cmsg(3)). So no matter if socket is bound or not, source IP has to be rewritten on every call to sys_sendmsg. To do so two new fields are added to UAPI `struct bpf_sock_addr`; * `msg_src_ip4` to set source IPv4 for UDPv4; * `msg_src_ip6` to set source IPv6 for UDPv6. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 23 +++++++++++++++++------ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index de8e89a3758b..975fb4cf1bb7 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type); + enum bpf_attach_type type, + void *t_ctx); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, @@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + NULL); \ __ret; \ }) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + t_ctx); \ release_sock(sk); \ } \ __ret; \ @@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ @@ -198,6 +207,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/filter.h b/include/linux/filter.h index d358d1815c16..d90abdaea94b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern { * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; + void *t_ctx; /* Attach type specific context. */ }; struct bpf_sock_ops_kern { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9b8c6e310e9a..cc68787f2d97 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -160,6 +160,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_CONNECT, BPF_CGROUP_INET4_POST_BIND, BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, __MAX_BPF_ATTACH_TYPE }; @@ -2363,6 +2365,12 @@ struct bpf_sock_addr { __u32 family; /* Allows 4-byte read, but no write */ __u32 type; /* Allows 4-byte read, but no write */ __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ }; /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.2.3 From e5a10bb2acf246c13dc164fd37d4c77d9acaf88f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= Date: Wed, 23 May 2018 16:26:35 +0200 Subject: netfilter: add includes to nf_socket.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These have to be included always when nf_socket.h is included. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_socket.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h index 8230fefff9f5..29b6313f0557 100644 --- a/include/net/netfilter/nf_socket.h +++ b/include/net/netfilter/nf_socket.h @@ -2,10 +2,8 @@ #ifndef _NF_SOCK_H_ #define _NF_SOCK_H_ -struct net_device; -struct sk_buff; -struct sock; -struct net; +#include +#include static inline bool nf_sk_is_transparent(struct sock *sk) { -- cgit v1.2.3 From 30c8bd5aa8b2c78546c3e52337101b9c85879320 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Thu, 24 May 2018 09:55:13 -0700 Subject: net: Introduce generic failover module The failover module provides a generic interface for paravirtual drivers to register a netdev and a set of ops with a failover instance. The ops are used as event handlers that get called to handle netdev register/ unregister/link change/name change events on slave pci ethernet devices with the same mac address as the failover netdev. This enables paravirtual drivers to use a VF as an accelerated low latency datapath. It also allows migration of VMs with direct attached VFs by failing over to the paravirtual datapath when the VF is unplugged. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 ++++++++++++++++ include/net/failover.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 include/net/failover.h (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8452f72087ef..f45b1a4e37ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1425,6 +1425,8 @@ struct net_device_ops { * entity (i.e. the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook + * @IFF_FAILOVER: device is a failover master device + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1454,6 +1456,8 @@ enum netdev_priv_flags { IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, IFF_NO_RX_HANDLER = 1<<26, + IFF_FAILOVER = 1<<27, + IFF_FAILOVER_SLAVE = 1<<28, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1482,6 +1486,8 @@ enum netdev_priv_flags { #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED #define IFF_MACSEC IFF_MACSEC #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER +#define IFF_FAILOVER IFF_FAILOVER +#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE /** * struct net_device - The DEVICE structure. @@ -4336,6 +4342,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev) return dev->priv_flags & IFF_RXFH_CONFIGURED; } +static inline bool netif_is_failover(const struct net_device *dev) +{ + return dev->priv_flags & IFF_FAILOVER; +} + +static inline bool netif_is_failover_slave(const struct net_device *dev) +{ + return dev->priv_flags & IFF_FAILOVER_SLAVE; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { diff --git a/include/net/failover.h b/include/net/failover.h new file mode 100644 index 000000000000..bb15438f39c7 --- /dev/null +++ b/include/net/failover.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _FAILOVER_H +#define _FAILOVER_H + +#include + +struct failover_ops { + int (*slave_pre_register)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_register)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_pre_unregister)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_unregister)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_link_change)(struct net_device *slave_dev, + struct net_device *failover_dev); + int (*slave_name_change)(struct net_device *slave_dev, + struct net_device *failover_dev); + rx_handler_result_t (*slave_handle_frame)(struct sk_buff **pskb); +}; + +struct failover { + struct list_head list; + struct net_device __rcu *failover_dev; + struct failover_ops __rcu *ops; +}; + +struct failover *failover_register(struct net_device *dev, + struct failover_ops *ops); +void failover_unregister(struct failover *failover); +int failover_slave_unregister(struct net_device *slave_dev); + +#endif /* _FAILOVER_H */ -- cgit v1.2.3 From cfc80d9a11635404a40199a1c9471c96890f3f74 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Thu, 24 May 2018 09:55:15 -0700 Subject: net: Introduce net_failover driver The net_failover driver provides an automated failover mechanism via APIs to create and destroy a failover master netdev and manages a primary and standby slave netdevs that get registered via the generic failover infrastructure. The failover netdev acts a master device and controls 2 slave devices. The original paravirtual interface gets registered as 'standby' slave netdev and a passthru/vf device with the same MAC gets registered as 'primary' slave netdev. Both 'standby' and 'failover' netdevs are associated with the same 'pci' device. The user accesses the network interface via 'failover' netdev. The 'failover' netdev chooses 'primary' netdev as default for transmits when it is available with link up and running. This can be used by paravirtual drivers to enable an alternate low latency datapath. It also enables hypervisor controlled live migration of a VM with direct attached VF by failing over to the paravirtual datapath when the VF is unplugged. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/net_failover.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 include/net/net_failover.h (limited to 'include') diff --git a/include/net/net_failover.h b/include/net/net_failover.h new file mode 100644 index 000000000000..b12a1c469d1c --- /dev/null +++ b/include/net/net_failover.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _NET_FAILOVER_H +#define _NET_FAILOVER_H + +#include + +/* failover state */ +struct net_failover_info { + /* primary netdev with same MAC */ + struct net_device __rcu *primary_dev; + + /* standby netdev */ + struct net_device __rcu *standby_dev; + + /* primary netdev stats */ + struct rtnl_link_stats64 primary_stats; + + /* standby netdev stats */ + struct rtnl_link_stats64 standby_stats; + + /* aggregated stats */ + struct rtnl_link_stats64 failover_stats; + + /* spinlock while updating stats */ + spinlock_t stats_lock; +}; + +struct failover *net_failover_create(struct net_device *standby_dev); +void net_failover_destroy(struct failover *failover); + +#define FAILOVER_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \ + NETIF_F_HIGHDMA | NETIF_F_LRO) + +#define FAILOVER_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_RXCSUM | NETIF_F_ALL_TSO) + +#endif /* _NET_FAILOVER_H */ -- cgit v1.2.3 From 9805069d14c1b0b66b1600ea60cfc08f94841bd8 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Thu, 24 May 2018 09:55:16 -0700 Subject: virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit This feature bit can be used by hypervisor to indicate virtio_net device to act as a standby for another device with the same MAC address. VIRTIO_NET_F_STANDBY is defined as bit 62 as it is a device feature bit. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/uapi/linux/virtio_net.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 5de6ed37695b..a3715a3224c1 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,9 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device + * with the same MAC. + */ #define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */ #ifndef VIRTIO_NET_NO_LEGACY -- cgit v1.2.3 From 6c50c1ed72aeb5cd1ae8cd267d5a79904c82a04f Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Fri, 25 May 2018 12:40:35 +0800 Subject: ptp_qoriq: move some definitions to header file This patch is to move some definitions in ptp_qoriq.c to the header file. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/fsl/ptp_qoriq.h | 141 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 include/linux/fsl/ptp_qoriq.h (limited to 'include') diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h new file mode 100644 index 000000000000..b462d9ea8007 --- /dev/null +++ b/include/linux/fsl/ptp_qoriq.h @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 OMICRON electronics GmbH + * Copyright 2018 NXP + */ +#ifndef __PTP_QORIQ_H__ +#define __PTP_QORIQ_H__ + +#include +#include + +/* + * qoriq ptp registers + * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010 + */ +struct qoriq_ptp_registers { + u32 tmr_ctrl; /* Timer control register */ + u32 tmr_tevent; /* Timestamp event register */ + u32 tmr_temask; /* Timer event mask register */ + u32 tmr_pevent; /* Timestamp event register */ + u32 tmr_pemask; /* Timer event mask register */ + u32 tmr_stat; /* Timestamp status register */ + u32 tmr_cnt_h; /* Timer counter high register */ + u32 tmr_cnt_l; /* Timer counter low register */ + u32 tmr_add; /* Timer drift compensation addend register */ + u32 tmr_acc; /* Timer accumulator register */ + u32 tmr_prsc; /* Timer prescale */ + u8 res1[4]; + u32 tmroff_h; /* Timer offset high */ + u32 tmroff_l; /* Timer offset low */ + u8 res2[8]; + u32 tmr_alarm1_h; /* Timer alarm 1 high register */ + u32 tmr_alarm1_l; /* Timer alarm 1 high register */ + u32 tmr_alarm2_h; /* Timer alarm 2 high register */ + u32 tmr_alarm2_l; /* Timer alarm 2 high register */ + u8 res3[48]; + u32 tmr_fiper1; /* Timer fixed period interval */ + u32 tmr_fiper2; /* Timer fixed period interval */ + u32 tmr_fiper3; /* Timer fixed period interval */ + u8 res4[20]; + u32 tmr_etts1_h; /* Timestamp of general purpose external trigger */ + u32 tmr_etts1_l; /* Timestamp of general purpose external trigger */ + u32 tmr_etts2_h; /* Timestamp of general purpose external trigger */ + u32 tmr_etts2_l; /* Timestamp of general purpose external trigger */ +}; + +/* Bit definitions for the TMR_CTRL register */ +#define ALM1P (1<<31) /* Alarm1 output polarity */ +#define ALM2P (1<<30) /* Alarm2 output polarity */ +#define FIPERST (1<<28) /* FIPER start indication */ +#define PP1L (1<<27) /* Fiper1 pulse loopback mode enabled. */ +#define PP2L (1<<26) /* Fiper2 pulse loopback mode enabled. */ +#define TCLK_PERIOD_SHIFT (16) /* 1588 timer reference clock period. */ +#define TCLK_PERIOD_MASK (0x3ff) +#define RTPE (1<<15) /* Record Tx Timestamp to PAL Enable. */ +#define FRD (1<<14) /* FIPER Realignment Disable */ +#define ESFDP (1<<11) /* External Tx/Rx SFD Polarity. */ +#define ESFDE (1<<10) /* External Tx/Rx SFD Enable. */ +#define ETEP2 (1<<9) /* External trigger 2 edge polarity */ +#define ETEP1 (1<<8) /* External trigger 1 edge polarity */ +#define COPH (1<<7) /* Generated clock output phase. */ +#define CIPH (1<<6) /* External oscillator input clock phase */ +#define TMSR (1<<5) /* Timer soft reset. */ +#define BYP (1<<3) /* Bypass drift compensated clock */ +#define TE (1<<2) /* 1588 timer enable. */ +#define CKSEL_SHIFT (0) /* 1588 Timer reference clock source */ +#define CKSEL_MASK (0x3) + +/* Bit definitions for the TMR_TEVENT register */ +#define ETS2 (1<<25) /* External trigger 2 timestamp sampled */ +#define ETS1 (1<<24) /* External trigger 1 timestamp sampled */ +#define ALM2 (1<<17) /* Current time = alarm time register 2 */ +#define ALM1 (1<<16) /* Current time = alarm time register 1 */ +#define PP1 (1<<7) /* periodic pulse generated on FIPER1 */ +#define PP2 (1<<6) /* periodic pulse generated on FIPER2 */ +#define PP3 (1<<5) /* periodic pulse generated on FIPER3 */ + +/* Bit definitions for the TMR_TEMASK register */ +#define ETS2EN (1<<25) /* External trigger 2 timestamp enable */ +#define ETS1EN (1<<24) /* External trigger 1 timestamp enable */ +#define ALM2EN (1<<17) /* Timer ALM2 event enable */ +#define ALM1EN (1<<16) /* Timer ALM1 event enable */ +#define PP1EN (1<<7) /* Periodic pulse event 1 enable */ +#define PP2EN (1<<6) /* Periodic pulse event 2 enable */ + +/* Bit definitions for the TMR_PEVENT register */ +#define TXP2 (1<<9) /* PTP transmitted timestamp im TXTS2 */ +#define TXP1 (1<<8) /* PTP transmitted timestamp in TXTS1 */ +#define RXP (1<<0) /* PTP frame has been received */ + +/* Bit definitions for the TMR_PEMASK register */ +#define TXP2EN (1<<9) /* Transmit PTP packet event 2 enable */ +#define TXP1EN (1<<8) /* Transmit PTP packet event 1 enable */ +#define RXPEN (1<<0) /* Receive PTP packet event enable */ + +/* Bit definitions for the TMR_STAT register */ +#define STAT_VEC_SHIFT (0) /* Timer general purpose status vector */ +#define STAT_VEC_MASK (0x3f) + +/* Bit definitions for the TMR_PRSC register */ +#define PRSC_OCK_SHIFT (0) /* Output clock division/prescale factor. */ +#define PRSC_OCK_MASK (0xffff) + + +#define DRIVER "ptp_qoriq" +#define DEFAULT_CKSEL 1 +#define N_EXT_TS 2 +#define REG_SIZE sizeof(struct qoriq_ptp_registers) + +struct qoriq_ptp { + struct qoriq_ptp_registers __iomem *regs; + spinlock_t lock; /* protects regs */ + struct ptp_clock *clock; + struct ptp_clock_info caps; + struct resource *rsrc; + int irq; + int phc_index; + u64 alarm_interval; /* for periodic alarm */ + u64 alarm_value; + u32 tclk_period; /* nanoseconds */ + u32 tmr_prsc; + u32 tmr_add; + u32 cksel; + u32 tmr_fiper1; + u32 tmr_fiper2; +}; + +static inline u32 qoriq_read(unsigned __iomem *addr) +{ + u32 val; + + val = ioread32be(addr); + return val; +} + +static inline void qoriq_write(unsigned __iomem *addr, u32 val) +{ + iowrite32be(val, addr); +} + +#endif -- cgit v1.2.3 From e9be0e993d95adbe5efe0e0f03b2a3e71f5bb2b6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 25 May 2018 16:28:44 +0200 Subject: net: sched: shrink struct Qdisc The struct Qdisc has a lot of holes, especially after commit a53851e2c321 ("net: sched: explicit locking in gso_cpu fallback"), which as a side effect, moved the fields just after 'busylock' on a new cacheline. Since both 'padded' and 'refcnt' are not updated frequently, and there is a hole before 'gso_skb', we can move such fields there, saving a cacheline without any performance side effect. Before this commit: pahole -C Qdisc net/sche/sch_generic.o # ... /* size: 384, cachelines: 6, members: 25 */ /* sum members: 236, holes: 3, sum holes: 92 */ /* padding: 56 */ After this commit: pahole -C Qdisc net/sche/sch_generic.o # ... /* size: 320, cachelines: 5, members: 25 */ /* sum members: 236, holes: 2, sum holes: 28 */ /* padding: 56 */ Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 98c10a28cd01..827a3711dc68 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -85,6 +85,8 @@ struct Qdisc { struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; + int padded; + refcount_t refcnt; /* * For performance sake on SMP, we put highly modified fields at the end @@ -97,8 +99,6 @@ struct Qdisc { unsigned long state; struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; - int padded; - refcount_t refcnt; spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; -- cgit v1.2.3 From 0cbc06b3faba756113d4ac748b089529f813eda4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 25 May 2018 00:25:48 +0200 Subject: netfilter: nf_tables: remove synchronize_rcu in commit phase synchronize_rcu() is expensive. The commit phase currently enforces an unconditional synchronize_rcu() after incrementing the generation counter. This is to make sure that a packet always sees a consistent chain, either nft_do_chain is still using old generation (it will skip the newly added rules), or the new one (it will skip old ones that might still be linked into the list). We could just remove the synchronize_rcu(), it would not cause a crash but it could cause us to evaluate a rule that was removed and new rule for the same packet, instead of either-or. To resolve this, add rule pointer array holding two generations, the current one and the future generation. In commit phase, allocate the rule blob and populate it with the rules that will be active in the new generation. Then, make this rule blob public, replacing the old generation pointer. Then the generation counter can be incremented. nft_do_chain() will either continue to use the current generation (in case loop was invoked right before increment), or the new one. Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 603b51401deb..6b9ddb99f3a8 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -858,6 +858,8 @@ enum nft_chain_flags { * @name: name of the chain */ struct nft_chain { + struct nft_rule *__rcu *rules_gen_0; + struct nft_rule *__rcu *rules_gen_1; struct list_head rules; struct list_head list; struct nft_table *table; @@ -867,6 +869,9 @@ struct nft_chain { u8 flags:6, genmask:2; char *name; + + /* Only used during control plane commit phase: */ + struct nft_rule **rules_next; }; enum nft_chain_types { -- cgit v1.2.3 From 6172abc1e2ea12743f368875576952ddf4ebe88c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 25 May 2018 21:53:30 -0700 Subject: net: sched: add qstats.qlen to qlen AFAICT struct gnet_stats_queue.qlen is not used in Qdiscs. It may, however, be useful for offloads to report HW queue length there. Add that value to the result of qdisc_qlen_sum(). Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 827a3711dc68..6488daa32f82 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -350,14 +350,14 @@ static inline int qdisc_qlen(const struct Qdisc *q) static inline int qdisc_qlen_sum(const struct Qdisc *q) { - __u32 qlen = 0; + __u32 qlen = q->qstats.qlen; int i; if (q->flags & TCQ_F_NOLOCK) { for_each_possible_cpu(i) qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; } else { - qlen = q->q.qlen; + qlen += q->q.qlen; } return qlen; -- cgit v1.2.3 From f971b132300fb0df63a8de631947adc74a7b3db1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 25 May 2018 21:53:35 -0700 Subject: net: sched: mq: add simple offload notification mq offload is trivial, we just need to let the device know that the root qdisc is mq. Alternative approach would be to export qdisc_lookup() and make drivers check the root type themselves, but notification via ndo_setup_tc is more in line with other qdiscs. Note that mq doesn't hold any stats on it's own, it just adds up stats of its children. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f45b1a4e37ab..6b863ed3174a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -791,6 +791,7 @@ enum tc_setup_type { TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, + TC_SETUP_QDISC_MQ, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f3ec43725724..942f839dbca4 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -778,6 +778,16 @@ struct tc_qopt_offload_stats { struct gnet_stats_queue *qstats; }; +enum tc_mq_command { + TC_MQ_CREATE, + TC_MQ_DESTROY, +}; + +struct tc_mq_qopt_offload { + enum tc_mq_command command; + u32 handle; +}; + enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, -- cgit v1.2.3 From 47c669a406d8621c69b1c199ce099b54b17b9902 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 25 May 2018 21:53:37 -0700 Subject: net: sched: mq: request stats from offloads MQ doesn't hold any statistics on its own, however, statistic from offloads are requested starting from the root, hence MQ will read the old values for its sums. Call into the drivers, because of the additive nature of the stats drivers are aware of how much "pending updates" they have to children of the MQ. Since MQ reset its stats on every dump we can simply offset the stats, predicting how stats of offloaded children will change. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 942f839dbca4..a3c1a2c47cd4 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -781,11 +781,13 @@ struct tc_qopt_offload_stats { enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, + TC_MQ_STATS, }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; + struct tc_qopt_offload_stats stats; }; enum tc_red_command { -- cgit v1.2.3 From e6464b8c6361962f5ff99dc95d010b64432c27b5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 27 May 2018 08:09:53 -0700 Subject: net/ipv6: Convert ipv6_add_addr to struct ifa6_config Move config parameters for adding an ipv6 address to a struct. struct names stem from inet6_rtm_newaddr which is the modern handler for adding an address. Start the conversion to ifa6_config with ipv6_add_addr. This is an argument move only; no functional change intended. Mapping of variable changes: addr --> cfg->pfx peer_addr --> cfg->peer_pfx pfxlen --> cfg->plen flags --> cfg->ifa_flags scope, valid_lft, prefered_lft have the same names within cfg (with corrected spelling). Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index c07d4dd09361..f766af2cd1a4 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -59,6 +59,18 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; +struct ifa6_config { + const struct in6_addr *pfx; + unsigned int plen; + + const struct in6_addr *peer_pfx; + + u32 ifa_flags; + u32 preferred_lft; + u32 valid_lft; + u16 scope; +}; + int addrconf_init(void); void addrconf_cleanup(void); -- cgit v1.2.3 From 620dee9415fae09003d012cc7c23b011e5ebb43d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 27 May 2018 08:09:56 -0700 Subject: net: Add IFA_RT_PRIORITY address attribute Currently, if two interfaces have addresses in the same connected route, then the order of the prefix route entries is determined by the order in which the addresses are assigned or the links brought up. Add IFA_RT_PRIORITY to allow user to specify the metric of the prefix route associated with an address giving interface managers better control of the order of prefix routes. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/if_addr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 2ef053d265de..ebaf5701c9db 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -33,6 +33,7 @@ enum { IFA_CACHEINFO, IFA_MULTICAST, IFA_FLAGS, + IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ __IFA_MAX, }; -- cgit v1.2.3 From af4d768ad28cbf6542ba70dba10b49127b31b762 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 27 May 2018 08:09:57 -0700 Subject: net/ipv4: Add support for specifying metric of connected routes Add support for IFA_RT_PRIORITY to ipv4 addresses. If the metric is changed on an existing address then the new route is inserted before removing the old one. Since the metric is one of the route keys, the prefix route can not be replaced. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 1 + include/net/route.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index e16fe7d44a71..27650f1bff3d 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -139,6 +139,7 @@ struct in_ifaddr { __be32 ifa_local; __be32 ifa_address; __be32 ifa_mask; + __u32 ifa_rt_priority; __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; diff --git a/include/net/route.h b/include/net/route.h index dbb032d5921b..bb53cdba38dc 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -225,6 +225,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); +void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric); void rt_add_uncached_list(struct rtable *rt); void rt_del_uncached_list(struct rtable *rt); -- cgit v1.2.3 From 8308f3ff1753d001f7a73f9bb0f02292b5400557 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 27 May 2018 08:09:58 -0700 Subject: net/ipv6: Add support for specifying metric of connected routes Add support for IFA_RT_PRIORITY to ipv6 addresses. If the metric is changed on an existing address then the new route is inserted before removing the old one. Since the metric is one of the route keys, the prefix route can not be atomically replaced. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 1 + include/net/if_inet6.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f766af2cd1a4..5f43f7a70fe6 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -65,6 +65,7 @@ struct ifa6_config { const struct in6_addr *peer_pfx; + u32 rt_priority; u32 ifa_flags; u32 preferred_lft; u32 valid_lft; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index db389253dc2a..d7578cf49c3a 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -42,6 +42,7 @@ enum { struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; + __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; -- cgit v1.2.3 From 2d68c0745a0e8349ff71b04af5ee020d40f78d90 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 25 May 2018 18:14:05 +0800 Subject: tcp: use data length instead of skb->len in tcp_probe skb->len is meaningless to user. data length could be more helpful, with which we can easily filter out the packet without payload. Signed-off-by: Yafang Shao Acked-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index c1a5284713b8..703abb6e11fa 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -236,7 +236,7 @@ TRACE_EVENT(tcp_probe, __field(__u16, sport) __field(__u16, dport) __field(__u32, mark) - __field(__u16, length) + __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) @@ -261,7 +261,7 @@ TRACE_EVENT(tcp_probe, __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; - __entry->length = skb->len; + __entry->data_len = skb->len - tcp_hdrlen(skb); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tp->snd_cwnd; @@ -272,9 +272,9 @@ TRACE_EVENT(tcp_probe, __entry->sock_cookie = sock_gen_cookie(sk); ), - TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", + TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", __entry->saddr, __entry->daddr, __entry->mark, - __entry->length, __entry->snd_nxt, __entry->snd_una, + __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); -- cgit v1.2.3 From 7a279e93330bd4f0074973293f1cd1aacf7c5fa6 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 29 May 2018 12:27:44 +0100 Subject: bpf: clean up eBPF helpers documentation These are minor edits for the eBPF helpers documentation in include/uapi/linux/bpf.h. The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends with a non-escaped underscore that gets interpreted by rst2man and produces the following message in the resulting manual page: DOCUTILS SYSTEM MESSAGES System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514) Unknown target name: "bpf_fib_lookup". Other edits consist in: - Improving formatting for flag values for "bpf_fib_lookup()" helper. - Emphasising a parameter name in description of the return value for "bpf_get_stack()" helper. - Removing unnecessary blank lines between "Description" and "Return" sections for the few helpers that would use it, for consistency. Signed-off-by: Quentin Monnet Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cc68787f2d97..3f556b35ac8d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1010,7 +1010,6 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return * The positive or null stack id on success, or a negative error * in case of failure. @@ -1821,10 +1820,9 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return - * a non-negative value equal to or less than size on success, or - * a negative error in case of failure. + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -1845,7 +1843,6 @@ union bpf_attr { * in socket filters where *skb*\ **->data** does not always point * to the start of the mac header and where "direct packet access" * is not available. - * * Return * 0 on success, or a negative error in case of failure. * @@ -1861,16 +1858,18 @@ union bpf_attr { * rt_metric is set to metric from route. * * *plen* argument is the size of the passed in struct. - * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * *flags* argument can be a combination of one or more of the + * following values: * - * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs - * full lookup using FIB rules - * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress - * perspective (default is ingress) + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. - * * Return * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case -- cgit v1.2.3 From fa898d769b264ede3c10cc30a537316c6a946956 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 May 2018 10:58:07 -0700 Subject: bpf: Drop mpls from bpf_fib_lookup MPLS support will not be submitted this dev cycle, but in working on it I do see a few changes are needed to the API. For now, drop mpls from the API. Since the fields in question are unions, the mpls fields can be added back later without affecting the uapi. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3f556b35ac8d..33f37eb0b6bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1852,10 +1852,10 @@ union bpf_attr { * If lookup is successful and result shows packet is to be * forwarded, the neighbor tables are searched for the nexthop. * If successful (ie., FIB lookup shows forwarding and nexthop - * is resolved), the nexthop address is returned in ipv4_dst, - * ipv6_dst or mpls_out based on family, smac is set to mac - * address of egress device, dmac is set to nexthop mac address, - * rt_metric is set to metric from route. + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only). * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -2537,8 +2537,10 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_OUTPUT BIT(1) struct bpf_fib_lookup { - /* input */ - __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; /* set if lookup is to consider L4 data - e.g., FIB rules */ __u8 l4_protocol; @@ -2554,22 +2556,20 @@ struct bpf_fib_lookup { __u8 tos; /* AF_INET */ __be32 flowlabel; /* AF_INET6 */ - /* output: metric of fib result */ - __u32 rt_metric; + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; }; union { - __be32 mpls_in; __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ }; - /* input to bpf_fib_lookup, *dst is destination address. - * output: bpf_fib_lookup sets to gateway address + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route */ union { - /* return for MPLS lookups */ - __be32 mpls_out[4]; /* support up to 4 labels */ __be32 ipv4_dst; __u32 ipv6_dst[4]; /* in6_addr; network order */ }; -- cgit v1.2.3 From f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 27 May 2018 12:24:09 +0100 Subject: media: rc: introduce BPF_PROG_LIRC_MODE2 Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report that the last key should be repeated. The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall; the target_fd must be the /dev/lircN device. Acked-by: Yonghong Song Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- include/linux/bpf_lirc.h | 29 ++++++++++++++++++++++++++ include/linux/bpf_types.h | 3 +++ include/uapi/linux/bpf.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_lirc.h (limited to 'include') diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h new file mode 100644 index 000000000000..5f8a4283092d --- /dev/null +++ b/include/linux/bpf_lirc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_LIRC_H +#define _BPF_LIRC_H + +#include + +#ifdef CONFIG_BPF_LIRC_MODE2 +int lirc_prog_attach(const union bpf_attr *attr); +int lirc_prog_detach(const union bpf_attr *attr); +int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +#else +static inline int lirc_prog_attach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_detach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} +#endif + +#endif /* _BPF_LIRC_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b161e506dcfc..c5700c2d5549 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -26,6 +26,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #endif +#ifdef CONFIG_BPF_LIRC_MODE2 +BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 33f37eb0b6bf..64ac0f7a689e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,6 +143,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, }; enum bpf_attach_type { @@ -162,6 +163,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_POST_BIND, BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, __MAX_BPF_ATTACH_TYPE }; @@ -2005,6 +2007,53 @@ union bpf_attr { * direct packet access. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown** () again with the same values, or calling + * **bpf_rc_repeat** (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 + * + * int bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2083,7 +2132,9 @@ union bpf_attr { FN(lwt_push_encap), \ FN(lwt_seg6_store_bytes), \ FN(lwt_seg6_adjust_srh), \ - FN(lwt_seg6_action), + FN(lwt_seg6_action), \ + FN(rc_repeat), \ + FN(rc_keydown), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 3d97d88e8091f3501e016f6b4ce45a32c4b8f2f6 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 29 May 2018 23:27:31 +0800 Subject: tcp: minor optimization around tcp_hdr() usage in receive path This is additional to the commit ea1627c20c34 ("tcp: minor optimizations around tcp_hdr() usage"). At this point, skb->data is same with tcp_hdr() as tcp header has not been pulled yet. So use the less expensive one to get the tcp header. Remove the third parameter of tcp_rcv_established() and put it into the function body. Furthermore, the local variables are listed as a reverse christmas tree :) Cc: Eric Dumazet Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- include/trace/events/tcp.h | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 952d842a604a..029a51b91742 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -334,8 +334,7 @@ void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th); +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 703abb6e11fa..ac55b328d61b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -248,8 +248,9 @@ TRACE_EVENT(tcp_probe, ), TP_fast_assign( - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); @@ -261,7 +262,7 @@ TRACE_EVENT(tcp_probe, __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; - __entry->data_len = skb->len - tcp_hdrlen(skb); + __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tp->snd_cwnd; -- cgit v1.2.3 From 32d26a685c1802a0e485bd674e7dd038e88019f7 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Tue, 29 May 2018 02:31:24 -0700 Subject: qed*: Add link change count value to ethtool statistics display. This patch adds driver changes for capturing the link change count in ethtool statistics display. Please consider applying this to "net-next". Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index ac991a3b8f03..b4040023cbfb 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -1180,6 +1180,7 @@ struct qed_eth_stats_common { u64 tx_mac_mc_packets; u64 tx_mac_bc_packets; u64 tx_mac_ctrl_frames; + u64 link_change_count; }; struct qed_eth_stats_bb { -- cgit v1.2.3 From 35aada99b5bfa7a9360f9653377688556f71edcb Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Wed, 30 May 2018 08:27:32 -0400 Subject: rtnetlink: Add more well known protocol values FRRouting installs routes into the kernel associated with the originating protocol. Add these values to the well known values in rtnetlink.h. Signed-off-by: Donald Sharp Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index cabb210c93af..7d8502313c99 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -254,6 +254,11 @@ enum { #define RTPROT_DHCP 16 /* DHCP client */ #define RTPROT_MROUTED 17 /* Multicast daemon */ #define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_BGP 186 /* BGP Routes */ +#define RTPROT_ISIS 187 /* ISIS Routes */ +#define RTPROT_OSPF 188 /* OSPF Routes */ +#define RTPROT_RIP 189 /* RIP Routes */ +#define RTPROT_EIGRP 192 /* EIGRP Routes */ /* rtm_scope -- cgit v1.2.3 From 1865ea9adbfaf341c5cd5d8f7d384f19948b2fe9 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Wed, 30 May 2018 10:59:49 -0700 Subject: net/mlx5: Add temperature warning event to log Temperature warning event is sent by FW to indicate high temperature as detected by one of the sensors on the board. Add handling of this event by writing the numbers of the alert sensors to the kernel log. Signed-off-by: Ilan Tayari Signed-off-by: Adi Nissim Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 7 +++++++ include/linux/mlx5/mlx5_ifc.h | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index db0332a6d23c..c1095e08f0c8 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -314,6 +314,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_PORT_CHANGE = 0x09, MLX5_EVENT_TYPE_GPIO_EVENT = 0x15, MLX5_EVENT_TYPE_PORT_MODULE_EVENT = 0x16, + MLX5_EVENT_TYPE_TEMP_WARN_EVENT = 0x17, MLX5_EVENT_TYPE_REMOTE_CONFIG = 0x19, MLX5_EVENT_TYPE_GENERAL_EVENT = 0x22, MLX5_EVENT_TYPE_PPS_EVENT = 0x25, @@ -626,6 +627,11 @@ struct mlx5_eqe_dct { __be32 dctn; }; +struct mlx5_eqe_temp_warning { + __be64 sensor_warning_msb; + __be64 sensor_warning_lsb; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -642,6 +648,7 @@ union ev_data { struct mlx5_eqe_port_module port_module; struct mlx5_eqe_pps pps; struct mlx5_eqe_dct dct; + struct mlx5_eqe_temp_warning temp_warning; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 05b480fae27d..ca6c0dfb5ffe 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -912,7 +912,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_msg[0x5]; u8 reserved_at_1c8[0x4]; u8 max_tc[0x4]; - u8 reserved_at_1d0[0x1]; + u8 temp_warn_event[0x1]; u8 dcbx[0x1]; u8 general_notification_event[0x1]; u8 reserved_at_1d3[0x2]; -- cgit v1.2.3 From 1f0cf89b09305743fca5898660a7be83aab38a74 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Wed, 30 May 2018 10:59:50 -0700 Subject: net/mlx5: Add FPGA QP error event The FPGA queue pair (QP) event fires whenever a QP on the FPGA transitions to the error state. At this stage, this event is unrecoverable, it may become recoverable in the future. Signed-off-by: Ilan Tayari Signed-off-by: Adi Nissim Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/mlx5_ifc.h | 1 + include/linux/mlx5/mlx5_ifc_fpga.h | 16 ++++++++++++++++ 3 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index c1095e08f0c8..0f006cf8343d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -331,6 +331,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_DCT_DRAINED = 0x1c, MLX5_EVENT_TYPE_FPGA_ERROR = 0x20, + MLX5_EVENT_TYPE_FPGA_QP_ERROR = 0x21, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ca6c0dfb5ffe..8e0b8865f91e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -60,6 +60,7 @@ enum { MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION = 0xa, MLX5_EVENT_TYPE_CODING_PAGE_REQUEST = 0xb, MLX5_EVENT_TYPE_CODING_FPGA_ERROR = 0x20, + MLX5_EVENT_TYPE_CODING_FPGA_QP_ERROR = 0x21 }; enum { diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 193091537cb6..64d0f40d4cc3 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -470,6 +470,22 @@ struct mlx5_ifc_ipsec_counters_bits { u8 dropped_cmd[0x40]; }; +enum { + MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RETRY_COUNTER_EXPIRED = 0x1, + MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RNR_EXPIRED = 0x2, +}; + +struct mlx5_ifc_fpga_qp_error_event_bits { + u8 reserved_at_0[0x40]; + + u8 reserved_at_40[0x18]; + u8 syndrome[0x8]; + + u8 reserved_at_60[0x60]; + + u8 reserved_at_c0[0x8]; + u8 fpga_qpn[0x18]; +}; enum mlx5_ifc_fpga_ipsec_response_syndrome { MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0, MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1, -- cgit v1.2.3 From ccf8dbcd062a930e64741c939ca784d15316aa0c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 30 May 2018 15:20:52 -0700 Subject: rtnetlink: Remove VLA usage In the quest to remove all stack VLA usage from the kernel[1], this allocates the maximum size expected for all possible types and adds sanity-checks at both registration and usage to make sure nothing gets out of sync. This matches the proposed VLA solution for nfnetlink[2]. The values chosen here were based on finding assignments for .maxtype and .slave_maxtype and manually counting the enums: slave_maxtype (max 33): IFLA_BRPORT_MAX 33 IFLA_BOND_SLAVE_MAX 9 maxtype (max 45): IFLA_BOND_MAX 28 IFLA_BR_MAX 45 __IFLA_CAIF_HSI_MAX 8 IFLA_CAIF_MAX 4 IFLA_CAN_MAX 16 IFLA_GENEVE_MAX 12 IFLA_GRE_MAX 25 IFLA_GTP_MAX 5 IFLA_HSR_MAX 7 IFLA_IPOIB_MAX 4 IFLA_IPTUN_MAX 21 IFLA_IPVLAN_MAX 3 IFLA_MACSEC_MAX 15 IFLA_MACVLAN_MAX 7 IFLA_PPP_MAX 2 __IFLA_RMNET_MAX 4 IFLA_VLAN_MAX 6 IFLA_VRF_MAX 2 IFLA_VTI_MAX 7 IFLA_VXLAN_MAX 28 VETH_INFO_MAX 2 VXCAN_INFO_MAX 2 This additionally changes maxtype and slave_maxtype fields to unsigned, since they're only ever using positive values. [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com [2] https://patchwork.kernel.org/patch/10439647/ Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 14b6b3af8918..0bbaa5488423 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -64,7 +64,7 @@ struct rtnl_link_ops { size_t priv_size; void (*setup)(struct net_device *dev); - int maxtype; + unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], struct nlattr *data[], @@ -92,7 +92,7 @@ struct rtnl_link_ops { unsigned int (*get_num_tx_queues)(void); unsigned int (*get_num_rx_queues)(void); - int slave_maxtype; + unsigned int slave_maxtype; const struct nla_policy *slave_policy; int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, -- cgit v1.2.3 From 554ced0a6e2946562c20d9fffdbaf2aa7da36b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= Date: Mon, 28 May 2018 09:15:33 +0200 Subject: netfilter: nf_tables: add support for native socket matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now it can only match the transparent flag of an ip/ipv6 socket. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 9c71f024f9cc..3d46c82a5ebd 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -904,6 +904,31 @@ enum nft_rt_attributes { }; #define NFTA_RT_MAX (__NFTA_RT_MAX - 1) +/** + * enum nft_socket_attributes - nf_tables socket expression netlink attributes + * + * @NFTA_SOCKET_KEY: socket key to match + * @NFTA_SOCKET_DREG: destination register + */ +enum nft_socket_attributes { + NFTA_SOCKET_UNSPEC, + NFTA_SOCKET_KEY, + NFTA_SOCKET_DREG, + __NFTA_SOCKET_MAX +}; +#define NFTA_SOCKET_MAX (__NFTA_SOCKET_MAX - 1) + +/* + * enum nft_socket_keys - nf_tables socket expression keys + * + * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option_ + */ +enum nft_socket_keys { + NFT_SOCKET_TRANSPARENT, + __NFT_SOCKET_MAX +}; +#define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) + /** * enum nft_ct_keys - nf_tables ct expression keys * -- cgit v1.2.3 From 1a893b44de4528887e7dabcdce7151ca2a8ee238 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 30 May 2018 11:06:22 +0200 Subject: netfilter: nf_tables: Add audit support to log statement This extends log statement to support the behaviour achieved with AUDIT target in iptables. Audit logging is enabled via a pseudo log level 8. In this case any other settings like log prefix are ignored since audit log format is fixed. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3d46c82a5ebd..5c7eb9b9f6d6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1080,6 +1080,11 @@ enum nft_log_attributes { }; #define NFTA_LOG_MAX (__NFTA_LOG_MAX - 1) +/** + * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging + */ +#define LOGLEVEL_AUDIT 8 + /** * enum nft_queue_attributes - nf_tables queue expression netlink attributes * -- cgit v1.2.3 From a654de8fdc1815676ab750e70cab231fc814c29f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 30 May 2018 20:18:57 +0200 Subject: netfilter: nf_tables: fix chain dependency validation The following ruleset: add table ip filter add chain ip filter input { type filter hook input priority 4; } add chain ip filter ap add rule ip filter input jump ap add rule ip filter ap masquerade results in a panic, because the masquerade extension should be rejected from the filter chain. The existing validation is missing a chain dependency check when the rule is added to the non-base chain. This patch fixes the problem by walking down the rules from the basechains, searching for either immediate or lookup expressions, then jumping to non-base chains and again walking down the rules to perform the expression validation, so we make sure the full ruleset graph is validated. This is done only once from the commit phase, in case of problem, we abort the transaction and perform fine grain validation for error reporting. This patch requires 003087911af2 ("netfilter: nfnetlink: allow commit to fail") to achieve this behaviour. This patch also adds a cleanup callback to nfnl batch interface to reset the validate state from the exit path. As a result of this patch, nf_tables_check_loops() doesn't use ->validate to check for loops, instead it just checks for immediate expressions. Reported-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 1 + include/net/netfilter/nf_tables.h | 2 ++ include/net/netfilter/nf_tables_core.h | 8 ++++++++ include/net/netns/nftables.h | 1 + 4 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 34551f8aaf9d..3ecc3050be0e 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -31,6 +31,7 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb); + void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6b9ddb99f3a8..435c32d8a995 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -874,6 +874,8 @@ struct nft_chain { struct nft_rule **rules_next; }; +int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); + enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index cd6915b6c054..e0c0c2558ec4 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -2,6 +2,8 @@ #ifndef _NET_NF_TABLES_CORE_H #define _NET_NF_TABLES_CORE_H +#include + extern struct nft_expr_type nft_imm_type; extern struct nft_expr_type nft_cmp_type; extern struct nft_expr_type nft_lookup_type; @@ -23,6 +25,12 @@ struct nft_cmp_fast_expr { u8 len; }; +struct nft_immediate_expr { + struct nft_data data; + enum nft_registers dreg:8; + u8 dlen; +}; + /* Calculate the mask for the nft_cmp_fast expression. On big endian the * mask needs to include the *upper* bytes when interpreting that data as * something smaller than the full u32, therefore a cpu_to_le32 is done. diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 29c3851b486a..94767ea3a490 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -9,6 +9,7 @@ struct netns_nftables { struct list_head commit_list; unsigned int base_seq; u8 gencursor; + u8 validate_state; }; #endif -- cgit v1.2.3 From d32de98ea70fe7cf606f3809f0970b31c115764b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 31 May 2018 01:58:00 +0200 Subject: netfilter: nft_fwd_netdev: allow to forward packets via neighbour layer This allows us to forward packets from the netdev family via neighbour layer, so you don't need an explicit link-layer destination when using this expression from rules. The ttl/hop_limit field is decremented. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 5c7eb9b9f6d6..a089af092a29 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1260,10 +1260,14 @@ enum nft_dup_attributes { * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes * * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register) + * @NFTA_FWD_SREG_ADDR: source register of destination address (NLA_U32: nft_register) + * @NFTA_FWD_NFPROTO: layer 3 family of source register address (NLA_U32: enum nfproto) */ enum nft_fwd_attributes { NFTA_FWD_UNSPEC, NFTA_FWD_SREG_DEV, + NFTA_FWD_SREG_ADDR, + NFTA_FWD_NFPROTO, __NFTA_FWD_MAX }; #define NFTA_FWD_MAX (__NFTA_FWD_MAX - 1) -- cgit v1.2.3 From d12e12299a6915fc10131602cca41170e46ae755 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 25 May 2018 22:06:25 +0300 Subject: ipvs: add ipv6 support to ftp Add support for FTP commands with extended format (RFC 2428): - FTP EPRT: IPv4 and IPv6, active mode, similar to PORT - FTP EPSV: IPv4 and IPv6, passive mode, similar to PASV. EPSV response usually contains only port but we allow real server to provide different address We restrict control and data connection to be from same address family. Allow the "(" and ")" to be optional in PASV response. Also, add ipvsh argument to the pkt_in/pkt_out handlers to better access the payload after transport header. Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0ac795b41ab8..03f567eb9536 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -763,14 +763,14 @@ struct ip_vs_app { * 2=Mangled but checksum was not updated */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, - struct sk_buff *, int *diff); + struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, - struct sk_buff *, int *diff); + struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); @@ -1328,8 +1328,10 @@ int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 int ip_vs_app_inc_get(struct ip_vs_app *inc); void ip_vs_app_inc_put(struct ip_vs_app *inc); -int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb); -int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb); +int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh); +int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); -- cgit v1.2.3 From 8d6e555773690e6fdefd99723fcd0a7e432c0c90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= Date: Fri, 1 Jun 2018 14:54:07 +0200 Subject: netfilter: Decrease code duplication regarding transparent socket option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a function in include/net/netfilter/nf_socket.h to decide if a socket has IP(V6)_TRANSPARENT socket option set or not. However this does the same as inet_sk_transparent() in include/net/tcp.h include/net/tcp.h:1733 /* This helper checks if socket has IP_TRANSPARENT set */ static inline bool inet_sk_transparent(const struct sock *sk) { switch (sk->sk_state) { case TCP_TIME_WAIT: return inet_twsk(sk)->tw_transparent; case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } return inet_sk(sk)->transparent; } tproxy_sk_is_transparent has also been refactored to use this function instead of reimplementing it. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_socket.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h index 29b6313f0557..f9d7bee9bd4e 100644 --- a/include/net/netfilter/nf_socket.h +++ b/include/net/netfilter/nf_socket.h @@ -3,19 +3,6 @@ #define _NF_SOCK_H_ #include -#include - -static inline bool nf_sk_is_transparent(struct sock *sk) -{ - switch (sk->sk_state) { - case TCP_TIME_WAIT: - return inet_twsk(sk)->tw_transparent; - case TCP_NEW_SYN_RECV: - return inet_rsk(inet_reqsk(sk))->no_srccheck; - default: - return inet_sk(sk)->transparent; - } -} struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, const struct net_device *indev); -- cgit v1.2.3 From 45ca4e0cf2734f8cc14b43e47c23618215abf1b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= Date: Fri, 1 Jun 2018 20:44:56 +0200 Subject: netfilter: Libify xt_TPROXY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extracted functions will likely be usefull to implement tproxy support in nf_tables. Extrancted functions: - nf_tproxy_sk_is_transparent - nf_tproxy_laddr4 - nf_tproxy_handle_time_wait4 - nf_tproxy_get_sock_v4 - nf_tproxy_laddr6 - nf_tproxy_handle_time_wait6 - nf_tproxy_get_sock_v6 (nf_)tproxy_handle_time_wait6 also needed some refactor as its current implementation was xtables-specific. Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy.h | 113 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 include/net/netfilter/nf_tproxy.h (limited to 'include') diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h new file mode 100644 index 000000000000..9754a50ecde9 --- /dev/null +++ b/include/net/netfilter/nf_tproxy.h @@ -0,0 +1,113 @@ +#ifndef _NF_TPROXY_H_ +#define _NF_TPROXY_H_ + +#include + +enum nf_tproxy_lookup_t { + NF_TPROXY_LOOKUP_LISTENER, + NF_TPROXY_LOOKUP_ESTABLISHED, +}; + +static inline bool nf_tproxy_sk_is_transparent(struct sock *sk) +{ + if (inet_sk_transparent(sk)) + return true; + + sock_gen_put(sk); + return false; +} + +__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); + +/** + * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @laddr: IPv4 address to redirect to or zero. + * @lport: TCP port to redirect to or zero. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * nf_tproxy_handle_time_wait4() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +struct sock * +nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, + __be32 laddr, __be16 lport, struct sock *sk); + +/* + * This is used when the user wants to intercept a connection matching + * an explicit iptables rule. In this case the sockets are assumed + * matching in preference order: + * + * - match: if there's a fully established connection matching the + * _packet_ tuple, it is returned, assuming the redirection + * already took place and we process a packet belonging to an + * established connection + * + * - match: if there's a listening socket matching the redirection + * (e.g. on-port & on-ip of the connection), it is returned, + * regardless if it was bound to 0.0.0.0 or an explicit + * address. The reasoning is that if there's an explicit rule, it + * does not really matter if the listener is bound to an interface + * or to 0. The user already stated that he wants redirection + * (since he added the rule). + * + * Please note that there's an overlap between what a TPROXY target + * and a socket match will match. Normally if you have both rules the + * "socket" match will be the first one, effectively all packets + * belonging to established connections going through that one. + */ +struct sock * +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, + const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type); + +const struct in6_addr * +nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, + const struct in6_addr *daddr); + +/** + * nf_tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @tproto: Transport protocol. + * @thoff: Transport protocol header offset. + * @net: Network namespace. + * @laddr: IPv6 address to redirect to. + * @lport: TCP port to redirect to or zero. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * nf_tproxy_handle_time_wait6() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +struct sock * +nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + struct net *net, + const struct in6_addr *laddr, + const __be16 lport, + struct sock *sk); + +struct sock * +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, + const u8 protocol, + const struct in6_addr *saddr, const struct in6_addr *daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, + const enum nf_tproxy_lookup_t lookup_type); + +#endif /* _NF_TPROXY_H_ */ -- cgit v1.2.3 From 00bfb3205e62970ff14ac8756e1a72b753a990ab Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 2 Jun 2018 23:38:46 +0200 Subject: netfilter: nf_tables: pass context to object destroy indirection The new connlimit object needs this to properly deal with conntrack dependencies. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 435c32d8a995..81ec070582b6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1070,7 +1070,8 @@ struct nft_object_ops { int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); - void (*destroy)(struct nft_object *obj); + void (*destroy)(const struct nft_ctx *ctx, + struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); -- cgit v1.2.3 From 5e5cbc7b23eaf13e18652c03efbad5be6995de6a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 2 Jun 2018 23:38:47 +0200 Subject: netfilter: nf_conncount: expose connection list interface This patch provides an interface to maintain the list of connections and the lookup function to obtain the number of connections in the list. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index e61184fbfb71..1910b6572430 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -13,4 +13,15 @@ unsigned int nf_conncount_count(struct net *net, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); + +unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit); + +bool nf_conncount_add(struct hlist_head *head, + const struct nf_conntrack_tuple *tuple); + +void nf_conncount_cache_free(struct hlist_head *hhead); + #endif -- cgit v1.2.3 From 3453c92731884bad7c4c3a0667228b964747f3d5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 2 Jun 2018 23:38:48 +0200 Subject: netfilter: nf_tables: pass ctx to nf_tables_expr_destroy() nft_set_elem_destroy() can be called from call_rcu context. Annotate netns and table in set object so we can populate the context object. Moreover, pass context object to nf_tables_set_elem_destroy() from the commit phase, since it is already available from there. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 81ec070582b6..e3d1bac9b0d5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -370,6 +370,8 @@ void nft_unregister_set(struct nft_set_type *type); * * @list: table set list node * @bindings: list of set bindings + * @table: table this set belongs to + * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) @@ -393,6 +395,8 @@ void nft_unregister_set(struct nft_set_type *type); struct nft_set { struct list_head list; struct list_head bindings; + struct nft_table *table; + possible_net_t net; char *name; u64 handle; u32 ktype; -- cgit v1.2.3 From 79b174ade16d90302aef6e14f5eefd0b723c1602 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 2 Jun 2018 23:38:49 +0200 Subject: netfilter: nf_tables: garbage collection for stateful expressions Use garbage collector to schedule removal of elements based of feedback from expression that this element comes with. Therefore, the garbage collector is not guided by timeout expirations in this new mode. The new connlimit expression sets on the NFT_EXPR_GC flag to enable this behaviour, the dynset expression needs to explicitly enable the garbage collector via set->ops->gc_init call. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e3d1bac9b0d5..871cb3b012e9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -342,6 +342,7 @@ struct nft_set_ops { const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_set *set); + void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; @@ -712,6 +713,7 @@ struct nft_expr_type { }; #define NFT_EXPR_STATEFUL 0x1 +#define NFT_EXPR_GC 0x2 /** * struct nft_expr_ops - nf_tables expression operations @@ -748,6 +750,8 @@ struct nft_expr_ops { int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); + bool (*gc)(struct net *net, + const struct nft_expr *expr); const struct nft_expr_type *type; void *data; }; -- cgit v1.2.3 From 371ebcbb9ee62fb46a0a27f358941588f7048678 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 2 Jun 2018 23:38:50 +0200 Subject: netfilter: nf_tables: add destroy_clone expression Before this patch, cloned expressions are released via ->destroy. This is a problem for the new connlimit expression since the ->destroy path drop a reference on the conntrack modules and it unregisters hooks. The new ->destroy_clone provides context that this expression is being released from the packet path, so it is mirroring ->clone(), where neither module reference is dropped nor hooks need to be unregistered - because this done from the control plane path from the ->init() path. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 871cb3b012e9..83e7b83ecf3e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -745,6 +745,8 @@ struct nft_expr_ops { const struct nft_expr *expr); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); + void (*destroy_clone)(const struct nft_ctx *ctx, + const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr); int (*validate)(const struct nft_ctx *ctx, -- cgit v1.2.3 From 290180e2448c02d6b391455937098882a73a9494 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 2 Jun 2018 21:38:51 +0200 Subject: netfilter: nf_tables: add connlimit support This features which allows you to limit the maximum number of connections per arbitrary key. The connlimit expression is stateful, therefore it can be used from meters to dynamically populate a set, this provides a mapping to the iptables' connlimit match. This patch also comes that allows you define static connlimit policies. This extension depends on the nf_conncount infrastructure. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a089af092a29..ae00a3c49b8a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1043,6 +1043,24 @@ enum nft_limit_attributes { }; #define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) +enum nft_connlimit_flags { + NFT_CONNLIMIT_F_INV = (1 << 0), +}; + +/** + * enum nft_connlimit_attributes - nf_tables connlimit expression netlink attributes + * + * @NFTA_CONNLIMIT_COUNT: number of connections (NLA_U32) + * @NFTA_CONNLIMIT_FLAGS: flags (NLA_U32: enum nft_connlimit_flags) + */ +enum nft_connlimit_attributes { + NFTA_CONNLIMIT_UNSPEC, + NFTA_CONNLIMIT_COUNT, + NFTA_CONNLIMIT_FLAGS, + __NFTA_CONNLIMIT_MAX +}; +#define NFTA_CONNLIMIT_MAX (__NFTA_CONNLIMIT_MAX - 1) + /** * enum nft_counter_attributes - nf_tables counter expression netlink attributes * @@ -1357,7 +1375,8 @@ enum nft_ct_helper_attributes { #define NFT_OBJECT_QUOTA 2 #define NFT_OBJECT_CT_HELPER 3 #define NFT_OBJECT_LIMIT 4 -#define __NFT_OBJECT_MAX 5 +#define NFT_OBJECT_CONNLIMIT 5 +#define __NFT_OBJECT_MAX 6 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** -- cgit v1.2.3 From 1b2470e59fb1e983a3655feba30cdfc03e609d51 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 2 Jun 2018 23:41:06 +0200 Subject: netfilter: nf_tables: handle chain name lookups via rhltable If there is a significant amount of chains list search is too slow, so add an rhlist table for this. This speeds up ruleset loading: for every new rule we have to check if the name already exists in current generation. We need to be able to cope with duplicate chain names in case a transaction drops the nfnl mutex (for request_module) and the abort of this old transaction is still pending. The list is kept -- we need a way to iterate chains even if hash resize is in progress without missing an entry. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 83e7b83ecf3e..08c005ce56e9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -860,6 +861,7 @@ enum nft_chain_flags { * * @rules: list of rules in the chain * @list: used internally + * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain @@ -872,6 +874,7 @@ struct nft_chain { struct nft_rule *__rcu *rules_gen_1; struct list_head rules; struct list_head list; + struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; @@ -965,7 +968,8 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * struct nft_table - nf_tables table * * @list: used internally - * @chains: chains in the table + * @chains_ht: chains in the table + * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table @@ -979,6 +983,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); */ struct nft_table { struct list_head list; + struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; -- cgit v1.2.3 From 06be0864c77ae6861632a678a6378511a4828d6e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:31 +0200 Subject: bpf: test case for map pointer poison with calls/branches Add several test cases where the same or different map pointers originate from different paths in the program and execute a map lookup or tail call at a common location. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d90abdaea94b..6fd375fe7079 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -289,6 +289,16 @@ struct xdp_buff; .off = OFF, \ .imm = 0 }) +/* Relative call */ + +#define BPF_CALL_REL(TGT) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = BPF_PSEUDO_CALL, \ + .off = 0, \ + .imm = TGT }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ -- cgit v1.2.3 From 09772d92cd5ad998b0d5f6f46cd1658f8cb698cf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:35 +0200 Subject: bpf: avoid retpoline for lookup/update/delete calls on maps While some of the BPF map lookup helpers provide a ->map_gen_lookup() callback for inlining the map lookup altogether it is not available for every map, so the remaining ones have to call bpf_map_lookup_elem() helper which does a dispatch to map->ops->map_lookup_elem(). In times of retpolines, this will control and trap speculative execution rather than letting it do its work for the indirect call and will therefore cause a slowdown. Likewise, bpf_map_update_elem() and bpf_map_delete_elem() do not have an inlined version and need to call into their map->ops->map_update_elem() resp. map->ops->map_delete_elem() handlers. Before: # bpftool prog dump xlated id 1 0: (bf) r2 = r10 1: (07) r2 += -8 2: (7a) *(u64 *)(r2 +0) = 0 3: (18) r1 = map[id:1] 5: (85) call __htab_map_lookup_elem#232656 6: (15) if r0 == 0x0 goto pc+4 7: (71) r1 = *(u8 *)(r0 +35) 8: (55) if r1 != 0x0 goto pc+1 9: (72) *(u8 *)(r0 +35) = 1 10: (07) r0 += 56 11: (15) if r0 == 0x0 goto pc+4 12: (bf) r2 = r0 13: (18) r1 = map[id:1] 15: (85) call bpf_map_delete_elem#215008 <-- indirect call via 16: (95) exit helper After: # bpftool prog dump xlated id 1 0: (bf) r2 = r10 1: (07) r2 += -8 2: (7a) *(u64 *)(r2 +0) = 0 3: (18) r1 = map[id:1] 5: (85) call __htab_map_lookup_elem#233328 6: (15) if r0 == 0x0 goto pc+4 7: (71) r1 = *(u8 *)(r0 +35) 8: (55) if r1 != 0x0 goto pc+1 9: (72) *(u8 *)(r0 +35) = 1 10: (07) r0 += 56 11: (15) if r0 == 0x0 goto pc+4 12: (bf) r2 = r0 13: (18) r1 = map[id:1] 15: (85) call htab_lru_map_delete_elem#238240 <-- direct call 16: (95) exit In all three lookup/update/delete cases however we can use the actual address of the map callback directly if we find that there's only a single path with a map pointer leading to the helper call, meaning when the map pointer has not been poisoned from verifier side. Example code can be seen above for the delete case. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6fd375fe7079..8e60f1e9a702 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -301,6 +301,9 @@ struct xdp_buff; /* Function call */ +#define BPF_CAST_CALL(x) \ + ((u64 (*)(u64, u64, u64, u64, u64))(x)) + #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ -- cgit v1.2.3 From cb20b08ead401fd17627a36f035c0bf5bfee5567 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:36 +0200 Subject: bpf: add bpf_skb_cgroup_id helper Add a new bpf_skb_cgroup_id() helper that allows to retrieve the cgroup id from the skb's socket. This is useful in particular to enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in cgroup v2 by allowing ID based matching on egress. This can in particular be used in combination with applying policy e.g. from map lookups, and also complements the older bpf_skb_under_cgroup() interface. In user space the cgroup id for a given path can be retrieved through the f_handle as demonstrated in [0] recently. [0] https://lkml.org/lkml/2018/5/22/1190 Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 64ac0f7a689e..661318141f72 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2054,6 +2054,22 @@ union bpf_attr { * * Return * 0 + * + * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2134,7 +2150,8 @@ union bpf_attr { FN(lwt_seg6_adjust_srh), \ FN(lwt_seg6_action), \ FN(rc_repeat), \ - FN(rc_keydown), + FN(rc_keydown), \ + FN(skb_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 1fbc2e0cfcf9677683aea2b1f9ea76fbdc6fb6d1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:37 +0200 Subject: bpf: make sure to clear unused fields in tunnel/xfrm state fetch Since the remaining bits are not filled in struct bpf_tunnel_key resp. struct bpf_xfrm_state and originate from uninitialized stack space, we should make sure to clear them before handing control back to the program. Also add a padding element to struct bpf_xfrm_state for future use similar as we have in struct bpf_tunnel_key and clear it as well. struct bpf_xfrm_state { __u32 reqid; /* 0 4 */ __u32 spi; /* 4 4 */ __u16 family; /* 8 2 */ /* XXX 2 bytes hole, try to pack */ union { __u32 remote_ipv4; /* 4 */ __u32 remote_ipv6[4]; /* 16 */ }; /* 12 16 */ /* size: 28, cachelines: 1, members: 4 */ /* sum members: 26, holes: 1, sum holes: 2 */ /* last cacheline: 28 bytes */ }; Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 661318141f72..f0b6608b1f1c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2268,7 +2268,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; + __u16 tunnel_ext; /* Padding, future use. */ __u32 tunnel_label; }; @@ -2279,6 +2279,7 @@ struct bpf_xfrm_state { __u32 reqid; __u32 spi; /* Stored in network byte order */ __u16 family; + __u16 ext; /* Padding, future use. */ union { __u32 remote_ipv4; /* Stored in network byte order */ __u32 remote_ipv6[4]; /* Stored in network byte order */ -- cgit v1.2.3 From bc23105ca0abdeed98366af01c700c2c3aff5cd5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:39 +0200 Subject: bpf: fix context access in tracing progs on 32 bit archs Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT program type in test_verifier report the following errors on x86_32: 172/p unpriv: spill/fill of different pointers ldx FAIL Unexpected error message! 0: (bf) r6 = r10 1: (07) r6 += -8 2: (15) if r1 == 0x0 goto pc+3 R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1 3: (bf) r2 = r10 4: (07) r2 += -76 5: (7b) *(u64 *)(r6 +0) = r2 6: (55) if r1 != 0x0 goto pc+1 R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp 7: (7b) *(u64 *)(r6 +0) = r1 8: (79) r1 = *(u64 *)(r6 +0) 9: (79) r1 = *(u64 *)(r1 +68) invalid bpf_context access off=68 size=8 378/p check bpf_perf_event_data->sample_period byte load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (71) r0 = *(u8 *)(r1 +68) invalid bpf_context access off=68 size=1 379/p check bpf_perf_event_data->sample_period half load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (69) r0 = *(u16 *)(r1 +68) invalid bpf_context access off=68 size=2 380/p check bpf_perf_event_data->sample_period word load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (61) r0 = *(u32 *)(r1 +68) invalid bpf_context access off=68 size=4 381/p check bpf_perf_event_data->sample_period dword load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (79) r0 = *(u64 *)(r1 +68) invalid bpf_context access off=68 size=8 Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte boundary due to its size of 68 bytes. Therefore, bpf_ctx_narrow_access_ok() will then bail out saying that off & (size_default - 1) which is 68 & 7 doesn't cleanly align in the case of sample_period access from struct bpf_perf_event_data, hence verifier wrongly thinks we might be doing an unaligned access here though underlying arch can handle it just fine. Therefore adjust this down to machine size and check and rewrite the offset for narrow access on that basis. We also need to fix corresponding pe_prog_is_valid_access(), since we hit the check for off % size != 0 (e.g. 68 % 8 -> 4) in the first and last test. With that in place, progs for tracing work on x86_32. Reported-by: Wang YanQing Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Tested-by: Wang YanQing Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 8e60f1e9a702..45fc0f5000d8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -639,16 +639,34 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) return prog->type == BPF_PROG_TYPE_UNSPEC; } -static inline bool -bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default) +static inline u32 bpf_ctx_off_adjust_machine(u32 size) +{ + const u32 size_machine = sizeof(unsigned long); + + if (size > size_machine && size % size_machine == 0) + size = size_machine; + + return size; +} + +static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access, + u32 size_default) { - bool off_ok; + size_default = bpf_ctx_off_adjust_machine(size_default); + size_access = bpf_ctx_off_adjust_machine(size_access); + #ifdef __LITTLE_ENDIAN - off_ok = (off & (size_default - 1)) == 0; + return (off & (size_default - 1)) == 0; #else - off_ok = (off & (size_default - 1)) + size == size_default; + return (off & (size_default - 1)) + size_access == size_default; #endif - return off_ok && size <= size_default && (size & (size - 1)) == 0; +} + +static inline bool +bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) +{ + return bpf_ctx_narrow_align_ok(off, size, size_default) && + size <= size_default && (size & (size - 1)) == 0; } #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) -- cgit v1.2.3 From 8051ac7643e8d0f1dd7272042d66c0518e718347 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 31 May 2018 09:20:20 -0300 Subject: vlan: use non-archaic spelling of failes Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 78a5a90b4267..83ea4df6ab81 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -331,7 +331,7 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features, * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len - * Returns error if skb_cow_head failes. + * Returns error if skb_cow_head fails. * * Does not change skb->protocol so this function can be used during receive. */ @@ -379,7 +379,7 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload - * Returns error if skb_cow_head failes. + * Returns error if skb_cow_head fails. * * Does not change skb->protocol so this function can be used during receive. */ -- cgit v1.2.3 From 42b33468987bac0dd95c30f14820c7abac04a153 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 10:59:47 +0200 Subject: xdp: add flags argument to ndo_xdp_xmit API This patch only change the API and reject any use of flags. This is an intermediate step that allows us to implement the flush flag operation later, for each individual driver in a separate patch. The plan is to implement flush operation via XDP_XMIT_FLUSH flag and then remove XDP_XMIT_FLAGS_NONE when done. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 7 ++++--- include/net/xdp.h | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8452f72087ef..7f17785a59d7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1185,13 +1185,13 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, + * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. - * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. @@ -1380,7 +1380,8 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, int n, - struct xdp_frame **xdp); + struct xdp_frame **xdp, + u32 flags); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/include/net/xdp.h b/include/net/xdp.h index 7ad779237ae8..0c45f0f943ed 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,11 @@ enum xdp_mem_type { MEM_TYPE_MAX, }; +/* XDP flags for ndo_xdp_xmit */ +#define XDP_XMIT_FLAGS_NONE 0U +#define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ +#define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH + struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; -- cgit v1.2.3 From 73de5717eb30ceedef6abf9da4bc2194f25d92ca Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 11:00:13 +0200 Subject: xdp: done implementing ndo_xdp_xmit flush flag for all drivers Removing XDP_XMIT_FLAGS_NONE as all driver now implement a flush operation in their ndo_xdp_xmit call. The compiler will catch if any users of XDP_XMIT_FLAGS_NONE remains. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 0c45f0f943ed..a3b71a4dd71d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,7 +41,6 @@ enum xdp_mem_type { }; /* XDP flags for ndo_xdp_xmit */ -#define XDP_XMIT_FLAGS_NONE 0U #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH -- cgit v1.2.3 From bf6fa2c893c5237b48569a13fa3c673041430b6c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 3 Jun 2018 15:59:41 -0700 Subject: bpf: implement bpf_get_current_cgroup_id() helper bpf has been used extensively for tracing. For example, bcc contains an almost full set of bpf-based tools to trace kernel and user functions/events. Most tracing tools are currently either filtered based on pid or system-wide. Containers have been used quite extensively in industry and cgroup is often used together to provide resource isolation and protection. Several processes may run inside the same container. It is often desirable to get container-level tracing results as well, e.g. syscall count, function count, I/O activity, etc. This patch implements a new helper, bpf_get_current_cgroup_id(), which will return cgroup id based on the cgroup within which the current task is running. The later patch will provide an example to show that userspace can get the same cgroup id so it could configure a filter or policy in the bpf program based on task cgroup id. The helper is currently implemented for tracing. It can be added to other program types as well when needed. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bbe297436e5d..995c3b1e59bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; +extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f0b6608b1f1c..18712b0dbfe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2070,6 +2070,11 @@ union bpf_attr { * **CONFIG_SOCK_CGROUP_DATA** configuration option. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2151,7 +2156,8 @@ union bpf_attr { FN(lwt_seg6_action), \ FN(rc_repeat), \ FN(rc_keydown), \ - FN(skb_cgroup_id), + FN(skb_cgroup_id), \ + FN(get_current_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From bd3a08aaa9a383ffbbd5b788b797ae6e64eaa7a1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 3 Jun 2018 08:15:19 -0700 Subject: bpf: flowlabel in bpf_fib_lookup should be flowinfo As Michal noted the flow struct takes both the flow label and priority. Update the bpf_fib_lookup API to note that it is flowinfo and not just the flow label. Cc: Michal Kubecek Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18712b0dbfe7..eeb6237be5c2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2629,7 +2629,7 @@ struct bpf_fib_lookup { union { /* inputs to lookup */ __u8 tos; /* AF_INET */ - __be32 flowlabel; /* AF_INET6 */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ /* output: metric of fib result (IPv4/IPv6 only) */ __u32 rt_metric; -- cgit v1.2.3 From bbff2f321a864ee07c9d3d1245af498023146951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 13:57:13 +0200 Subject: xsk: new descriptor addressing scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, AF_XDP only supports a fixed frame-size memory scheme where each frame is referenced via an index (idx). A user passes the frame index to the kernel, and the kernel acts upon the data. Some NICs, however, do not have a fixed frame-size model, instead they have a model where a memory window is passed to the hardware and multiple frames are filled into that window (referred to as the "type-writer" model). By changing the descriptor format from the current frame index addressing scheme, AF_XDP can in the future be extended to support these kinds of NICs. In the index-based model, an idx refers to a frame of size frame_size. Addressing a frame in the UMEM is done by offseting the UMEM starting address by a global offset, idx * frame_size + offset. Communicating via the fill- and completion-rings are done by means of idx. In this commit, the idx is removed in favor of an address (addr), which is a relative address ranging over the UMEM. To convert an idx-based address to the new addr is simply: addr = idx * frame_size + offset. We also stop referring to the UMEM "frame" as a frame. Instead it is simply called a chunk. To transfer ownership of a chunk to the kernel, the addr of the chunk is passed in the fill-ring. Note, that the kernel will mask addr to make it chunk aligned, so there is no need for userspace to do that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or 3000 to the fill-ring will refer to the same chunk. On the completion-ring, the addr will match that of the Tx descriptor, passed to the kernel. Changing the descriptor format to use chunks/addr will allow for future changes to move to a type-writer based model, where multiple frames can reside in one chunk. In this model passing one single chunk into the fill-ring, would potentially result in multiple Rx descriptors. This commit changes the uapi of AF_XDP sockets, and updates the documentation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 4737cfe222f5..e411d6f9ac65 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -48,8 +48,8 @@ struct xdp_mmap_offsets { struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ - __u32 frame_size; /* Frame size */ - __u32 frame_headroom; /* Frame head room */ + __u32 chunk_size; + __u32 headroom; }; struct xdp_statistics { @@ -66,13 +66,11 @@ struct xdp_statistics { /* Rx/Tx descriptor */ struct xdp_desc { - __u32 idx; + __u64 addr; __u32 len; - __u16 offset; - __u8 flags; - __u8 padding[5]; + __u32 options; }; -/* UMEM descriptor is __u32 */ +/* UMEM descriptor is __u64 */ #endif /* _LINUX_IF_XDP_H */ -- cgit v1.2.3 From 87ae68c8b4944d142447b88875c9c412c714434f Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sat, 2 Jun 2018 09:40:34 +0200 Subject: ipv6: omit traffic class when calculating flow hash Some of the code paths calculating flow hash for IPv6 use flowlabel member of struct flowi6 which, despite its name, encodes both flow label and traffic class. If traffic class changes within a TCP connection (as e.g. ssh does), ECMP route can switch between path. It's also incosistent with other code paths where ip6_flowlabel() (returning only flow label) is used to feed the key. Use only flow label everywhere, including one place where hash key is set using ip6_flowinfo(). Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)") Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures") Signed-off-by: Michal Kubecek Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 798558fd1681..6dcc473fbe51 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -907,6 +907,11 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } +static inline u32 flowi6_get_flowlabel(const struct flowi6 *fl6) +{ + return (__force u32)(fl6->flowlabel & IPV6_FLOWLABEL_MASK); +} + /* * Prototypes exported by ipv6 */ -- cgit v1.2.3 From a925ab48dacbc9ad470a9ca4c761601ee07e476c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Jun 2018 13:20:38 -0400 Subject: Revert "ipv6: omit traffic class when calculating flow hash" This reverts commit 87ae68c8b4944d142447b88875c9c412c714434f. Applied the wrong version of this fix, correct version coming up. Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6dcc473fbe51..798558fd1681 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -907,11 +907,6 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } -static inline u32 flowi6_get_flowlabel(const struct flowi6 *fl6) -{ - return (__force u32)(fl6->flowlabel & IPV6_FLOWLABEL_MASK); -} - /* * Prototypes exported by ipv6 */ -- cgit v1.2.3 From fa1be7e01ea863e911349e30456706749518eeab Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Mon, 4 Jun 2018 11:36:05 +0200 Subject: ipv6: omit traffic class when calculating flow hash Some of the code paths calculating flow hash for IPv6 use flowlabel member of struct flowi6 which, despite its name, encodes both flow label and traffic class. If traffic class changes within a TCP connection (as e.g. ssh does), ECMP route can switch between path. It's also inconsistent with other code paths where ip6_flowlabel() (returning only flow label) is used to feed the key. Use only flow label everywhere, including one place where hash key is set using ip6_flowinfo(). Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)") Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures") Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 798558fd1681..16475c269749 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -907,6 +907,11 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } +static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) +{ + return fl6->flowlabel & IPV6_FLOWLABEL_MASK; +} + /* * Prototypes exported by ipv6 */ -- cgit v1.2.3 From 1a025028d400b23477341aa7ec2ce55f8b39b554 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 3 Jun 2018 02:17:39 +0100 Subject: rxrpc: Fix handling of call quietly cancelled out on server Sometimes an in-progress call will stop responding on the fileserver when the fileserver quietly cancels the call with an internally marked abort (RX_CALL_DEAD), without sending an ABORT to the client. This causes the client's call to eventually expire from lack of incoming packets directed its way, which currently leads to it being cancelled locally with ETIME. Note that it's not currently clear as to why this happens as it's really hard to reproduce. The rotation policy implement by kAFS, however, doesn't differentiate between ETIME meaning we didn't get any response from the server and ETIME meaning the call got cancelled mid-flow. The latter leads to an oops when fetching data as the rotation partially resets the afs_read descriptor, which can result in a cleared page pointer being dereferenced because that page has already been filled. Handle this by the following means: (1) Set a flag on a call when we receive a packet for it. (2) Store the highest packet serial number so far received for a call (bearing in mind this may wrap). (3) If, when the "not received anything recently" timeout expires on a call, we've received at least one packet for a call and the connection as a whole has received packets more recently than that call, then cancel the call locally with ECONNRESET rather than ETIME. This indicates that the call was definitely in progress on the server. (4) In kAFS, if the rotation algorithm sees ECONNRESET rather than ETIME, don't try the next server, but rather abort the call. This avoids the oops as we don't try to reuse the afs_read struct. Rather, as-yet ungotten pages will be reread at a later data. Also: (5) Add an rxrpc tracepoint to log detection of the call being reset. Without this, I occasionally see an oops like the following: general protection fault: 0000 [#1] SMP PTI ... RIP: 0010:_copy_to_iter+0x204/0x310 RSP: 0018:ffff8800cae0f828 EFLAGS: 00010206 RAX: 0000000000000560 RBX: 0000000000000560 RCX: 0000000000000560 RDX: ffff8800cae0f968 RSI: ffff8800d58b3312 RDI: 0005080000000000 RBP: ffff8800cae0f968 R08: 0000000000000560 R09: ffff8800ca00f400 R10: ffff8800c36f28d4 R11: 00000000000008c4 R12: ffff8800cae0f958 R13: 0000000000000560 R14: ffff8800d58b3312 R15: 0000000000000560 FS: 00007fdaef108080(0000) GS:ffff8800ca680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb28a8fa000 CR3: 00000000d2a76002 CR4: 00000000001606e0 Call Trace: skb_copy_datagram_iter+0x14e/0x289 rxrpc_recvmsg_data.isra.0+0x6f3/0xf68 ? trace_buffer_unlock_commit_regs+0x4f/0x89 rxrpc_kernel_recv_data+0x149/0x421 afs_extract_data+0x1e0/0x798 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_deliver_fs_fetch_data+0x33a/0x5ab afs_deliver_to_call+0x1ee/0x5e0 ? afs_wait_for_call_to_complete+0xc9/0x52e afs_wait_for_call_to_complete+0x12b/0x52e ? wake_up_q+0x54/0x54 afs_make_call+0x287/0x462 ? afs_fs_fetch_data+0x3e6/0x3ed ? rcu_read_lock_sched_held+0x5d/0x63 afs_fs_fetch_data+0x3e6/0x3ed afs_fetch_data+0xbb/0x14a afs_readpages+0x317/0x40d __do_page_cache_readahead+0x203/0x2ba ? ondemand_readahead+0x3a7/0x3c1 ondemand_readahead+0x3a7/0x3c1 generic_file_buffered_read+0x18b/0x62f __vfs_read+0xdb/0xfe vfs_read+0xb2/0x137 ksys_read+0x50/0x8c do_syscall_64+0x7d/0x1a0 entry_SYSCALL_64_after_hwframe+0x49/0xbe Note the weird value in RDI which is a result of trying to kmap() a NULL page pointer. Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 077e664ac9a2..4fff00e9da8a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1459,6 +1459,38 @@ TRACE_EVENT(rxrpc_tx_fail, __print_symbolic(__entry->what, rxrpc_tx_fail_traces)) ); +TRACE_EVENT(rxrpc_call_reset, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, debug_id ) + __field(u32, cid ) + __field(u32, call_id ) + __field(rxrpc_serial_t, call_serial ) + __field(rxrpc_serial_t, conn_serial ) + __field(rxrpc_seq_t, tx_seq ) + __field(rxrpc_seq_t, rx_seq ) + ), + + TP_fast_assign( + __entry->debug_id = call->debug_id; + __entry->cid = call->cid; + __entry->call_id = call->call_id; + __entry->call_serial = call->rx_serial; + __entry->conn_serial = call->conn->hi_serial; + __entry->tx_seq = call->tx_hard_ack; + __entry->rx_seq = call->ackr_seen; + ), + + TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x", + __entry->debug_id, + __entry->cid, __entry->call_id, + __entry->call_serial, __entry->conn_serial, + __entry->tx_seq, __entry->rx_seq) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 1f4c74132140c184b8cbcf0a7298beda1d3681de Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 2 Jun 2018 21:40:19 -0700 Subject: net: skbuff.h: drop unneeded does not use nor need , so drop this header file from skbuff.h. is currently #included in around 1200 C source and header files, making it the 31st most-used header file. Build tested [allmodconfig] on 20 arch-es. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 693564a9a979..164cdedf6012 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -854,8 +854,6 @@ struct sk_buff { /* * Handling routines are only of interest to the kernel */ -#include - #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 -- cgit v1.2.3 From 39dbc646fd2c67ee9b71450ce172cbd714d4e7fb Mon Sep 17 00:00:00 2001 From: Yuval Bason Date: Sun, 3 Jun 2018 19:13:07 +0300 Subject: qed: Add srq core support for RoCE and iWARP This patch adds support for configuring SRQ and provides the necessary APIs for rdma upper layer driver (qedr) to enable the SRQ feature. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: Yuval Bason Signed-off-by: David S. Miller --- include/linux/qed/qed_rdma_if.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index 4dd72ba210f5..e05e320d28b7 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -485,7 +485,9 @@ enum qed_iwarp_event_type { QED_IWARP_EVENT_ACTIVE_MPA_REPLY, QED_IWARP_EVENT_LOCAL_ACCESS_ERROR, QED_IWARP_EVENT_REMOTE_OPERATION_ERROR, - QED_IWARP_EVENT_TERMINATE_RECEIVED + QED_IWARP_EVENT_TERMINATE_RECEIVED, + QED_IWARP_EVENT_SRQ_LIMIT, + QED_IWARP_EVENT_SRQ_EMPTY, }; enum qed_tcp_ip_version { @@ -646,6 +648,14 @@ struct qed_rdma_ops { int (*rdma_alloc_tid)(void *rdma_cxt, u32 *itid); void (*rdma_free_tid)(void *rdma_cxt, u32 itid); + int (*rdma_create_srq)(void *rdma_cxt, + struct qed_rdma_create_srq_in_params *iparams, + struct qed_rdma_create_srq_out_params *oparams); + int (*rdma_destroy_srq)(void *rdma_cxt, + struct qed_rdma_destroy_srq_in_params *iparams); + int (*rdma_modify_srq)(void *rdma_cxt, + struct qed_rdma_modify_srq_in_params *iparams); + int (*ll2_acquire_connection)(void *rdma_cxt, struct qed_ll2_acquire_data *data); -- cgit v1.2.3 From 189454e868c45ce3476bfac03ace921dec34208a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 Jun 2018 13:55:50 +0200 Subject: net: remove net_device operation ndo_xdp_flush All drivers are cleaned up and no references to ndo_xdp_flush are left in drivers, it is time to remove the net_device_ops operation ndo_xdp_flush. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f17785a59d7..42c6ea35a6f2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1192,9 +1192,6 @@ struct dev_ifalias { * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. - * void (*ndo_xdp_flush)(struct net_device *dev); - * This function is used to inform the driver to flush a particular - * xdp tx queue. Must be called on same CPU as xdp_xmit. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1382,7 +1379,6 @@ struct net_device_ops { int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); - void (*ndo_xdp_flush)(struct net_device *dev); }; /** -- cgit v1.2.3 From 9107c05e2e55c1c7abd02ab38f7694e0da08b643 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 2 Jun 2018 22:37:24 +0200 Subject: net: phy: remove PM ops from MDIO bus Current implementation of MDIO bus PM ops doesn't actually implement bus-specific PM ops but just calls PM ops defined on a device level what doesn't seem to be fully in line with the core PM model. When looking e.g. at __device_suspend() the PM core looks for PM ops of a device in a specific order: 1. device PM domain 2. device type 3. device class 4. device bus I think it has good reason that there's no PM ops on device level. Now that a device type representation of PHY's as special type of MDIO devices was added (only user of MDIO bus PM ops), the MDIO bus PM ops can be removed including member pm of struct mdio_device. If for some other type of MDIO device PM ops are needed, it should be modeled as struct device_type as well. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/mdio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 2cfffe586885..bfa7114167d7 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -29,7 +29,6 @@ enum mdio_mutex_lock_class { struct mdio_device { struct device dev; - const struct dev_pm_ops *pm_ops; struct mii_bus *bus; char modalias[MDIO_NAME_SIZE]; -- cgit v1.2.3 From 69e2ecccd0bee6b186d0d42f8093109bfca7e990 Mon Sep 17 00:00:00 2001 From: Kun Yi Date: Mon, 4 Jun 2018 13:17:04 -0700 Subject: net: phy: broadcom: Enable 125 MHz clock on LED4 pin for BCM54612E by default. BCM54612E have 4 multi-functional LED pins that can be configured through register setting; the LED4 pin can be configured to a 125MHz reference clock output by setting the spare register. Since the dedicated CLK125 reference clock pin is not brought out on the 48-Pin MLP, the LED4 pin is the only pin to provide such function in this package, and therefore it is beneficial to just enable the reference clock by default. Signed-off-by: Kun Yi Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index b324e01ccf2d..daa9234a9baf 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -85,6 +85,7 @@ #define MII_BCM54XX_EXP_SEL 0x17 /* Expansion register select */ #define MII_BCM54XX_EXP_SEL_SSD 0x0e00 /* Secondary SerDes select */ #define MII_BCM54XX_EXP_SEL_ER 0x0f00 /* Expansion register select */ +#define MII_BCM54XX_EXP_SEL_ETC 0x0d00 /* Expansion register spare + 2k mem */ #define MII_BCM54XX_AUX_CTL 0x18 /* Auxiliary control register */ #define MII_BCM54XX_ISR 0x1a /* BCM54xx interrupt status register */ @@ -219,6 +220,9 @@ #define BCM54810_SHD_CLK_CTL 0x3 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN (1 << 9) +/* BCM54612E Registers */ +#define BCM54612E_EXP_SPARE0 (MII_BCM54XX_EXP_SEL_ETC + 0x34) +#define BCM54612E_LED4_CLK125OUT_EN (1 << 1) /*****************************************************************************/ /* Fast Ethernet Transceiver definitions. */ -- cgit v1.2.3 From e61e62b9e2cc14b336f330f37f517f9d373ff31e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:51 +0200 Subject: xsk: moved struct xdp_umem definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moved struct xdp_umem to xdp_sock.h, in order to prepare for zero-copy support. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 7a647c56ec15..3a6cd88f179d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -6,12 +6,34 @@ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H +#include +#include #include +#include #include struct net_device; struct xsk_queue; -struct xdp_umem; + +struct xdp_umem_props { + u64 chunk_mask; + u64 size; +}; + +struct xdp_umem { + struct xsk_queue *fq; + struct xsk_queue *cq; + struct page **pgs; + struct xdp_umem_props props; + u32 headroom; + u32 chunk_size_nohr; + struct user_struct *user; + struct pid *pid; + unsigned long address; + refcount_t users; + struct work_struct work; + u32 npgs; +}; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ -- cgit v1.2.3 From 8aef7340ae9695912a411886452ae9773206e845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:52 +0200 Subject: xsk: introduce xdp_umem_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_umem_page holds the address for a page. Trade memory for faster lookup. Later, we'll add DMA address here as well. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 3a6cd88f179d..caf343a7e224 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -20,10 +20,14 @@ struct xdp_umem_props { u64 size; }; +struct xdp_umem_page { + void *addr; +}; + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct page **pgs; + struct xdp_umem_page *pages; struct xdp_umem_props props; u32 headroom; u32 chunk_size_nohr; @@ -32,6 +36,7 @@ struct xdp_umem { unsigned long address; refcount_t users; struct work_struct work; + struct page **pgs; u32 npgs; }; -- cgit v1.2.3 From 74515c5750f30244a901c3c0c82a2fe534b3c9c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:53 +0200 Subject: net: xdp: added bpf_netdev_command XDP_{QUERY, SETUP}_XSK_UMEM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend ndo_bpf with two new commands used for query zero-copy support and register an UMEM to a queue_id of a netdev. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 42c6ea35a6f2..cc4ea7ab6d24 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -817,10 +817,13 @@ enum bpf_netdev_command { BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, + XDP_QUERY_XSK_UMEM, + XDP_SETUP_XSK_UMEM, }; struct bpf_prog_offload_ops; struct netlink_ext_ack; +struct xdp_umem; struct netdev_bpf { enum bpf_netdev_command command; @@ -851,6 +854,11 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; + /* XDP_SETUP_XSK_UMEM */ + struct { + struct xdp_umem *umem; + u16 queue_id; + } xsk; }; }; -- cgit v1.2.3 From 02b55e5657c3a569fc681ba851e464cfa6b90d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:54 +0200 Subject: xdp: add MEM_TYPE_ZERO_COPY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here, a new type of allocator support is added to the XDP return API. A zero-copy allocated xdp_buff cannot be converted to an xdp_frame. Instead is the buff has to be copied. This is not supported at all in this commit. Also, an opaque "handle" is added to xdp_buff. This can be used as a context for the zero-copy allocator implementation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index a3b71a4dd71d..2deea7166a34 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -37,6 +37,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, + MEM_TYPE_ZERO_COPY, MEM_TYPE_MAX, }; @@ -51,6 +52,10 @@ struct xdp_mem_info { struct page_pool; +struct zero_copy_allocator { + void (*free)(struct zero_copy_allocator *zca, unsigned long handle); +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -63,6 +68,7 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; + unsigned long handle; struct xdp_rxq_info *rxq; }; @@ -86,6 +92,10 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; + /* TODO: implement clone, copy, use "native" MEM_TYPE */ + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + return NULL; + /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; -- cgit v1.2.3 From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:55 +0200 Subject: xsk: add zero-copy support for Rx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and wireup ndo_bpf call in bind. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 6 ++++++ include/uapi/linux/if_xdp.h | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index caf343a7e224..d93d3aac3fc9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -22,6 +22,7 @@ struct xdp_umem_props { struct xdp_umem_page { void *addr; + dma_addr_t dma; }; struct xdp_umem { @@ -38,6 +39,9 @@ struct xdp_umem { struct work_struct work; struct page **pgs; u32 npgs; + struct net_device *dev; + u16 queue_id; + bool zc; }; struct xdp_sock { @@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_discard_addr(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e411d6f9ac65..1fa0e977ea8d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -13,7 +13,9 @@ #include /* Options for the sxdp_flags field */ -#define XDP_SHARED_UMEM 1 +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ struct sockaddr_xdp { __u16 sxdp_family; -- cgit v1.2.3 From e3760c7e50ac6cdf1188fec44938dd7e6e6eef61 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 Jun 2018 14:05:56 +0200 Subject: net: added netdevice operation for Tx Added ndo_xsk_async_xmit. This ndo "kicks" the netdev to start to pull userland AF_XDP Tx frames from a NAPI context. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc4ea7ab6d24..03ffeadf8a41 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1387,6 +1387,8 @@ struct net_device_ops { int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); + int (*ndo_xsk_async_xmit)(struct net_device *dev, + u32 queue_id); }; /** -- cgit v1.2.3 From ac98d8aab61baf785eb8f099b36daf34fc76a70e Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 Jun 2018 14:05:57 +0200 Subject: xsk: wire upp Tx zero-copy functions Here we add the functionality required to support zero-copy Tx, and also exposes various zero-copy related functions for the netdevs. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d93d3aac3fc9..9fe472f2ac95 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ struct xdp_umem { struct net_device *dev; u16 queue_id; bool zc; + spinlock_t xsk_list_lock; + struct list_head xsk_list; }; struct xdp_sock { @@ -53,6 +56,8 @@ struct xdp_sock { struct list_head flush_node; u16 queue_id; struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; + bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; @@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +/* Used from netdev driver */ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { -- cgit v1.2.3 From 95358a9553fbec6c47ad7bd1aec20df663295088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 5 Jun 2018 03:07:23 -0700 Subject: net-tcp: remove useless tw_timeout field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tested: 'git grep tw_timeout' comes up empty and it builds :-) Signed-off-by: Maciej Żenczykowski Cc: Eric Dumazet Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 659d8ed5a3bc..78775038f011 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -61,7 +61,6 @@ struct inet_timewait_sock { #define tw_cookie __tw_common.skc_cookie #define tw_dr __tw_common.skc_tw_dr - int tw_timeout; __u32 tw_mark; volatile unsigned char tw_substate; unsigned char tw_rcv_wscale; -- cgit v1.2.3 From d52c89f120de849575f6b2e5948038f2be12ce6f Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 5 Jun 2018 13:11:16 +0300 Subject: qed*: Utilize FW 8.37.2.0 This FW contains several fixes and features. RDMA - Several modifications and fixes for Memory Windows - drop vlan and tcp timestamp from mss calculation in driver for this FW - Fix SQ completion flow when local ack timeout is infinite - Modifications in t10dif support ETH - Fix aRFS for tunneled traffic without inner IP. - Fix chip configuration which may fail under heavy traffic conditions. - Support receiving any-VNI in VXLAN and GENEVE RX classification. iSCSI / FcoE - Fix iSCSI recovery flow - Drop vlan and tcp timestamp from mss calc for fw 8.37.2.0 Misc - Several registers (split registers) won't read correctly with ethtool -d Signed-off-by: Ariel Elior Signed-off-by: Manish Rangankar Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 4 ++-- include/linux/qed/iscsi_common.h | 8 ++++---- include/linux/qed/qed_rdma_if.h | 4 +--- include/linux/qed/roce_common.h | 1 + 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 13c8ab171437..0081fa6d1268 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -109,8 +109,8 @@ #define MAX_NUM_LL2_TX_STATS_COUNTERS 48 #define FW_MAJOR_VERSION 8 -#define FW_MINOR_VERSION 33 -#define FW_REVISION_VERSION 11 +#define FW_MINOR_VERSION 37 +#define FW_REVISION_VERSION 2 #define FW_ENGINEERING_VERSION 0 /***********************/ diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h index 938df614cb6a..b34c573f2b30 100644 --- a/include/linux/qed/iscsi_common.h +++ b/include/linux/qed/iscsi_common.h @@ -799,8 +799,8 @@ struct e4_mstorm_iscsi_task_ag_ctx { #define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 #define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 #define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1 +#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5 #define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 #define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 #define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK 0x1 @@ -849,8 +849,8 @@ struct e4_ustorm_iscsi_task_ag_ctx { #define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 #define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 #define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 -#define E4_USTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1 +#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5 #define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_MASK 0x3 #define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT 6 u8 flags1; diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index e05e320d28b7..df4d13f7e191 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -65,8 +65,7 @@ enum qed_roce_qp_state { enum qed_rdma_tid_type { QED_RDMA_TID_REGISTERED_MR, QED_RDMA_TID_FMR, - QED_RDMA_TID_MW_TYPE1, - QED_RDMA_TID_MW_TYPE2A + QED_RDMA_TID_MW }; struct qed_rdma_events { @@ -280,7 +279,6 @@ struct qed_rdma_register_tid_in_params { bool dif_enabled; u64 dif_error_addr; - u64 dif_runt_addr; }; struct qed_rdma_create_cq_in_params { diff --git a/include/linux/qed/roce_common.h b/include/linux/qed/roce_common.h index 193bcef302e1..473fba76aa77 100644 --- a/include/linux/qed/roce_common.h +++ b/include/linux/qed/roce_common.h @@ -43,6 +43,7 @@ #define ROCE_MAX_QPS (32 * 1024) #define ROCE_DCQCN_NP_MAX_QPS (64) #define ROCE_DCQCN_RP_MAX_QPS (64) +#define ROCE_LKEY_MW_DIF_EN_BIT (28) /* Affiliated asynchronous events / errors enumeration */ enum roce_async_events_type { -- cgit v1.2.3 From e783bb00ad86d9d1f01d9d3a750713070036358e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 5 Jun 2018 15:02:00 +0200 Subject: ipmr: fix error path when ipmr_new_table fails commit 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table") refactored ipmr_new_table, so that it now returns NULL when mr_table_alloc fails. Unfortunately, all callers of ipmr_new_table expect an ERR_PTR. This can result in NULL deref, for example when ipmr_rules_exit calls ipmr_free_table with NULL net->ipv4.mrt in the !CONFIG_IP_MROUTE_MULTIPLE_TABLES version. This patch makes mr_table_alloc return errors, and changes ip6mr_new_table and its callers to return/expect error pointers as well. It also removes the version of mr_table_alloc defined under !CONFIG_IP_MROUTE_COMMON, since it is never used. Fixes: 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index d617fe45543e..d633f737b3c6 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -307,16 +307,6 @@ static inline void vif_device_init(struct vif_device *v, { } -static inline void * -mr_table_alloc(struct net *net, u32 id, - struct mr_table_ops *ops, - void (*expire_func)(struct timer_list *t), - void (*table_set)(struct mr_table *mrt, - struct net *net)) -{ - return NULL; -} - static inline void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) { -- cgit v1.2.3 From ac0fc8a1bbcbe03ee67278afded105c05eb3535e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 5 Jun 2018 08:14:09 -0700 Subject: devlink: Add extack to reload and port_{un, }split operations Add extack argument to reload, port_split and port_unsplit operations. Signed-off-by: David Ahern Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9686a1aa4ec9..e336ea9c73df 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -296,12 +296,13 @@ struct devlink_resource { #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 struct devlink_ops { - int (*reload)(struct devlink *devlink); + int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, - unsigned int count); - int (*port_unsplit)(struct devlink *devlink, unsigned int port_index); + unsigned int count, struct netlink_ext_ack *extack); + int (*port_unsplit)(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack); int (*sb_pool_get)(struct devlink *devlink, unsigned int sb_index, u16 pool_index, struct devlink_sb_pool_info *pool_info); -- cgit v1.2.3 From 7170e6045a6a8b33f4fa5753589dc77b16198e2d Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Wed, 6 Jun 2018 09:33:28 -0700 Subject: strparser: Add __strp_unpause and use it in ktls. strp_unpause queues strp_work in order to parse any messages that arrived while the strparser was paused. However, the process invoking strp_unpause could eagerly parse a buffered message itself if it held the sock lock. __strp_unpause is an alternative to strp_pause that avoids the scheduling overhead that results when a receiving thread unpauses the strparser and waits for the next message to be delivered by the workqueue thread. This patch more than doubled the IOPS achieved in a benchmark of NBD traffic encrypted using ktls. Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- include/net/strparser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index d96b59f45eba..f177c87ce38b 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -90,6 +90,8 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); +/* Must be called with process lock held (lock_sock) */ +void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) -- cgit v1.2.3