From bcc7f554cfa7e0ac77c7adc4027c16f4a2f99c6f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 23 Jun 2020 16:39:35 +0100 Subject: bpf: Fix formatting in documentation for BPF helpers When producing the bpf-helpers.7 man page from the documentation from the BPF user space header file, rst2man complains: :2636: (ERROR/3) Unexpected indentation. :2640: (WARNING/2) Block quote ends without a blank line; unexpected unindent. Let's fix formatting for the relevant chunk (item list in bpf_ringbuf_query()'s description), and for a couple other functions. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200623153935.6215-1-quentin@isovalent.com --- include/uapi/linux/bpf.h | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974a71342aea..8bd33050b7bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3171,13 +3171,12 @@ union bpf_attr { * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3189,20 +3188,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3210,16 +3209,18 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. * * int bpf_csum_level(struct sk_buff *skb, u64 level) * Description -- cgit v1.2.3 From 0c1a7f13c9ec1ceb18d97ef4b1dd20ec71ffba31 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Wed, 17 Jun 2020 17:01:32 +0530 Subject: ieee80211: Add missing and new AKM suite selector definitions Add the definitions for missing AKM selectors defined in IEEE P802.11-REVmd/D3.0, table 9-151. These definitions will be used by various drivers that support these new AKM suites. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200617113132.13477-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fe15f831841b..9f732499ea88 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3333,13 +3333,17 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) #define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) #define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_AP_PEER_KEY SUITE(0x000FAC, 10) #define WLAN_AKM_SUITE_8021X_SUITE_B SUITE(0x000FAC, 11) #define WLAN_AKM_SUITE_8021X_SUITE_B_192 SUITE(0x000FAC, 12) +#define WLAN_AKM_SUITE_FT_8021X_SHA384 SUITE(0x000FAC, 13) #define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) #define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) #define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) #define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_AKM_SUITE_OWE SUITE(0x000FAC, 18) +#define WLAN_AKM_SUITE_FT_PSK_SHA384 SUITE(0x000FAC, 19) +#define WLAN_AKM_SUITE_PSK_SHA384 SUITE(0x000FAC, 20) #define WLAN_MAX_KEY_LEN 32 -- cgit v1.2.3 From a9b59159d338d414acaa8e2f569d129d51c76452 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 24 Jun 2020 15:20:39 -0700 Subject: bpf: Do not allow btf_ctx_access with __int128 types To ensure btf_ctx_access() is safe the verifier checks that the BTF arg type is an int, enum, or pointer. When the function does the BTF arg lookup it uses the calculation 'arg = off / 8' using the fact that registers are 8B. This requires that the first arg is in the first reg, the second in the second, and so on. However, for __int128 the arg will consume two registers by default LLVM implementation. So this will cause the arg layout assumed by the 'arg = off / 8' calculation to be incorrect. Because __int128 is uncommon this patch applies the easiest fix and will force int types to be sizeof(u64) or smaller so that they will fit in a single register. v2: remove unneeded parens per Andrii's feedback Fixes: 9e15db66136a1 ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159303723962.11287.13309537171132420717.stgit@john-Precision-5820-Tower --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 5c1ea99b480f..8b81fbb4497c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,6 +82,11 @@ static inline bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static inline bool btf_type_is_small_int(const struct btf_type *t) +{ + return btf_type_is_int(t) && t->size <= sizeof(u64); +} + static inline bool btf_type_is_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; -- cgit v1.2.3 From bf64ff4c2aac65d680dc639a511c781cf6b6ec08 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 27 Jun 2020 00:12:24 -0700 Subject: genetlink: get rid of family->attrbuf genl_family_rcv_msg_attrs_parse() reuses the global family->attrbuf when family->parallel_ops is false. However, family->attrbuf is not protected by any lock on the genl_family_rcv_msg_doit() code path. This leads to several different consequences, one of them is UAF, like the following: genl_family_rcv_msg_doit(): genl_start(): genl_family_rcv_msg_attrs_parse() attrbuf = family->attrbuf __nlmsg_parse(attrbuf); genl_family_rcv_msg_attrs_parse() attrbuf = family->attrbuf __nlmsg_parse(attrbuf); info->attrs = attrs; cb->data = info; netlink_unicast_kernel(): consume_skb() genl_lock_dumpit(): genl_dumpit_info(cb)->attrs Note family->attrbuf is an array of pointers to the skb data, once the skb is freed, any dereference of family->attrbuf will be a UAF. Maybe we could serialize the family->attrbuf with genl_mutex too, but that would make the locking more complicated. Instead, we can just get rid of family->attrbuf and always allocate attrbuf from heap like the family->parallel_ops==true code path. This may add some performance overhead but comparing with taking the global genl_mutex, it still looks better. Fixes: 75cdbdd08900 ("net: ieee802154: have genetlink code to parse the attrs during dumpit") Fixes: 057af7071344 ("net: tipc: have genetlink code to parse the attrs during dumpit") Reported-and-tested-by: syzbot+3039ddf6d7b13daf3787@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+80cad1e3cb4c41cde6ff@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+736bcbcb11b60d0c0792@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+520f8704db2b68091d44@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+c96e4dfb32f8987fdeed@syzkaller.appspotmail.com Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 74950663bb00..ad71ed4f55ff 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -41,7 +41,6 @@ struct genl_info; * Note that unbind() will not be called symmetrically if the * generic netlink family is removed while there are still open * sockets. - * @attrbuf: buffer to store parsed attributes (private) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -66,7 +65,6 @@ struct genl_family { struct genl_info *info); int (*mcast_bind)(struct net *net, int group); void (*mcast_unbind)(struct net *net, int group); - struct nlattr ** attrbuf; /* private */ const struct genl_ops * ops; const struct genl_multicast_group *mcgrps; unsigned int n_ops; -- cgit v1.2.3 From 3aa91625007807bfca4155df1867a5c924a08662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:56 +0200 Subject: dma-mapping: Add a new dma_need_sync API Add a new API to check if calls to dma_sync_single_for_{device,cpu} are required for a given DMA streaming mapping. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-2-hch@lst.de --- include/linux/dma-direct.h | 1 + include/linux/dma-mapping.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 136f984df0d9..8b006730687b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -87,4 +87,5 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 78f677cf45ab..a33ed3954ed4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -461,6 +461,7 @@ int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, @@ -571,6 +572,10 @@ static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return false; +} static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; -- cgit v1.2.3 From 91d5b70273267bbae6f5d1fb4cf3510bd31ef9ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:57 +0200 Subject: xsk: Replace the cheap_dma flag with a dma_need_sync flag Invert the polarity and better name the flag so that the use case is properly documented. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-3-hch@lst.de --- include/net/xsk_buff_pool.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index a4ff226505c9..6842990e2712 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -40,7 +40,7 @@ struct xsk_buff_pool { u32 headroom; u32 chunk_size; u32 frame_len; - bool cheap_dma; + bool dma_need_sync; bool unaligned; void *addrs; struct device *dev; @@ -80,7 +80,7 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - if (xskb->pool->cheap_dma) + if (!xskb->pool->dma_need_sync) return; xp_dma_sync_for_cpu_slow(xskb); @@ -91,7 +91,7 @@ void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - if (pool->cheap_dma) + if (!pool->dma_need_sync) return; xp_dma_sync_for_device_slow(pool, dma, size); -- cgit v1.2.3 From 3b7016996c4c44db5d499d98759b82fb714bb912 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:54 +0200 Subject: flow_dissector: Pull BPF program assignment up to bpf-netns Prepare for using bpf_prog_array to store attached programs by moving out code that updates the attached program out of flow dissector. Managing bpf_prog_array is more involved than updating a single bpf_prog pointer. This will let us do it all from one place, bpf/net_namespace.c, in the subsequent patch. No functional change intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-2-jakub@cloudflare.com --- include/net/flow_dissector.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index a7eba43fe4e4..4b6e36288ddd 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -372,7 +372,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, } #ifdef CONFIG_BPF_SYSCALL -int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog); +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif -- cgit v1.2.3 From 695c12147a40181fe9221d321c3f2de33c9574ed Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:55 +0200 Subject: bpf, netns: Keep attached programs in bpf_prog_array Prepare for having multi-prog attachments for new netns attach types by storing programs to run in a bpf_prog_array, which is well suited for iterating over programs and running them in sequence. After this change bpf(PROG_QUERY) may block to allocate memory in bpf_prog_array_copy_to_user() for collected program IDs. This forces a change in how we protect access to the attached program in the query callback. Because bpf_prog_array_copy_to_user() can sleep, we switch from an RCU read lock to holding a mutex that serializes updaters. Because we allow only one BPF flow_dissector program to be attached to netns at all times, the bpf_prog_array pointed by net->bpf.run_array is always either detached (null) or one element long. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200625141357.910330-3-jakub@cloudflare.com --- include/net/netns/bpf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a8dce2a380c8..a5015bda9979 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -9,9 +9,12 @@ #include struct bpf_prog; +struct bpf_prog_array; struct netns_bpf { - struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; + /* Array of programs to run compiled from progs or links */ + struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; + struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE]; struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; }; -- cgit v1.2.3 From ab53cad90eb10c9991f501ba08904680a074ef3d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:56 +0200 Subject: bpf, netns: Keep a list of attached bpf_link's To support multi-prog link-based attachments for new netns attach types, we need to keep track of more than one bpf_link per attach type. Hence, convert net->bpf.links into a list, that currently can be either empty or have just one item. Instead of reusing bpf_prog_list from bpf-cgroup, we link together bpf_netns_link's themselves. This makes list management simpler as we don't have to allocate, initialize, and later release list elements. We can do this because multi-prog attachment will be available only for bpf_link, and we don't need to build a list of programs attached directly and indirectly via links. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-4-jakub@cloudflare.com --- include/net/netns/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a5015bda9979..0ca6a1b87185 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -15,7 +15,7 @@ struct netns_bpf { /* Array of programs to run compiled from progs or links */ struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE]; - struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; + struct list_head links[MAX_NETNS_BPF_ATTACH_TYPE]; }; #endif /* __NETNS_BPF_H__ */ -- cgit v1.2.3 From 4ac2add65974e4efafb8d4ccd8fc5660417ea312 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:26 +0100 Subject: bpf: flow_dissector: Check value of unused flags to BPF_PROG_DETACH Using BPF_PROG_DETACH on a flow dissector program supports neither attach_flags nor attach_bpf_fd. Yet no value is enforced for them. Enforce that attach_flags are zero, and require the current program to be passed via attach_bpf_fd. This allows us to remove the check for CAP_SYS_ADMIN, since userspace can now no longer remove arbitrary flow dissector programs. Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-3-lmb@cloudflare.com --- include/linux/bpf-netns.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 4052d649f36d..47d5b0c708c9 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -33,7 +33,7 @@ int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); -int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog); #else @@ -49,7 +49,8 @@ static inline int netns_bpf_prog_attach(const union bpf_attr *attr, return -EOPNOTSUPP; } -static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +static inline int netns_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) { return -EOPNOTSUPP; } -- cgit v1.2.3 From bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: bpf: sockmap: Require attach_bpf_fd when detaching a program The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com --- include/linux/bpf.h | 13 +++++++++++-- include/linux/skmsg.h | 13 +++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07052d44bca1..9750a1902ee5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1543,13 +1543,16 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, u32 which) + struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { return -EOPNOTSUPP; } @@ -1559,6 +1562,12 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 08674cd14d5a..1e9ed840b9fc 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -430,6 +430,19 @@ static inline void psock_set_prog(struct bpf_prog **pprog, bpf_prog_put(prog); } +static inline int psock_replace_prog(struct bpf_prog **pprog, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + if (cmpxchg(pprog, old, prog) != old) + return -ENOENT; + + if (old) + bpf_prog_put(old); + + return 0; +} + static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); -- cgit v1.2.3 From 2606aff916854b61234bf85001be9777bab2d5f8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:18 -0600 Subject: net: ip_tunnel: add header_ops for layer 3 devices Some devices that take straight up layer 3 packets benefit from having a shared header_ops so that AF_PACKET sockets can inject packets that are recognized. This shared infrastructure will be used by other drivers that currently can't inject packets using AF_PACKET. It also exposes the parser function, as it is useful in standalone form too. Signed-off-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 076e5d7db7d3..36025dea7612 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -290,6 +290,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); +extern const struct header_ops ip_tunnel_header_ops; +__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); + struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, -- cgit v1.2.3 From 1e82a62fec613844da9e558f3493540a5b7a7b67 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Tue, 30 Jun 2020 11:50:17 -0600 Subject: genetlink: remove genl_bind A potential deadlock can occur during registering or unregistering a new generic netlink family between the main nl_table_lock and the cb_lock where each thread wants the lock held by the other, as demonstrated below. 1) Thread 1 is performing a netlink_bind() operation on a socket. As part of this call, it will call netlink_lock_table(), incrementing the nl_table_users count to 1. 2) Thread 2 is registering (or unregistering) a genl_family via the genl_(un)register_family() API. The cb_lock semaphore will be taken for writing. 3) Thread 1 will call genl_bind() as part of the bind operation to handle subscribing to GENL multicast groups at the request of the user. It will attempt to take the cb_lock semaphore for reading, but it will fail and be scheduled away, waiting for Thread 2 to finish the write. 4) Thread 2 will call netlink_table_grab() during the (un)registration call. However, as Thread 1 has incremented nl_table_users, it will not be able to proceed, and both threads will be stuck waiting for the other. genl_bind() is a noop, unless a genl_family implements the mcast_bind() function to handle setting up family-specific multicast operations. Since no one in-tree uses this functionality as Cong pointed out, simply removing the genl_bind() function will remove the possibility for deadlock, as there is no attempt by Thread 1 above to take the cb_lock semaphore. Fixes: c380d9a7afff ("genetlink: pass multicast bind/unbind to families") Suggested-by: Cong Wang Acked-by: Johannes Berg Reported-by: kernel test robot Signed-off-by: Sean Tranchetti Signed-off-by: David S. Miller --- include/net/genetlink.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ad71ed4f55ff..6e5f1e1aa822 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -35,12 +35,6 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks - * @mcast_bind: a socket bound to the given multicast group (which - * is given as the offset into the groups array) - * @mcast_unbind: a socket was unbound from the given multicast group. - * Note that unbind() will not be called symmetrically if the - * generic netlink family is removed while there are still open - * sockets. * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -63,8 +57,6 @@ struct genl_family { void (*post_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); - int (*mcast_bind)(struct net *net, int group); - void (*mcast_unbind)(struct net *net, int group); const struct genl_ops * ops; const struct genl_multicast_group *mcgrps; unsigned int n_ops; -- cgit v1.2.3 From d7bf2ebebc2bd61ab95e2a8e33541ef282f303d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 3 Jul 2020 22:26:43 +0200 Subject: sched: consistently handle layer3 header accesses in the presence of VLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple of places in net/sched/ that check skb->protocol and act on the value there. However, in the presence of VLAN tags, the value stored in skb->protocol can be inconsistent based on whether VLAN acceleration is enabled. The commit quoted in the Fixes tag below fixed the users of skb->protocol to use a helper that will always see the VLAN ethertype. However, most of the callers don't actually handle the VLAN ethertype, but expect to find the IP header type in the protocol field. This means that things like changing the ECN field, or parsing diffserv values, stops working if there's a VLAN tag, or if there are multiple nested VLAN tags (QinQ). To fix this, change the helper to take an argument that indicates whether the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we make sure to skip all of them, so behaviour is consistent even in QinQ mode. To make the helper usable from the ECN code, move it to if_vlan.h instead of pkt_sched.h. v3: - Remove empty lines - Move vlan variable definitions inside loop in skb_protocol() - Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and bpf_skb_ecn_set_ce() v2: - Use eth_type_vlan() helper in skb_protocol() - Also fix code that reads skb->protocol directly - Change a couple of 'if/else if' statements to switch constructs to avoid calling the helper twice Reported-by: Ilya Ponetayev Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 28 ++++++++++++++++++++++++++++ include/net/inet_ecn.h | 25 +++++++++++++++++-------- include/net/pkt_sched.h | 11 ----------- 3 files changed, 45 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index b05e855f1ddd..427a5b8597c2 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -308,6 +308,34 @@ static inline bool eth_type_vlan(__be16 ethertype) } } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); + __be16 proto = skb->protocol; + + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; + + while (eth_type_vlan(proto)) { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); + if (!vh) + break; + + proto = vh->h_vlan_encapsulated_proto; + offset += sizeof(vhdr); + } + + return proto; +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 0f0d1efe06dd..e1eaf1780288 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -172,7 +173,7 @@ static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) static inline int INET_ECN_set_ce(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -191,7 +192,7 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb) static inline int INET_ECN_set_ect1(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -272,12 +273,16 @@ static inline int IP_ECN_decapsulate(const struct iphdr *oiph, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, oiph->tos, inner); } @@ -287,12 +292,16 @@ static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9092e697059e..ac8c890a2657 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -136,17 +136,6 @@ static inline void qdisc_run(struct Qdisc *q) } } -static inline __be16 tc_skb_protocol(const struct sk_buff *skb) -{ - /* We need to take extra care in case the skb came via - * vlan accelerated path. In that case, use skb->vlan_proto - * as the original vlan header was already stripped. - */ - if (skb_vlan_tag_present(skb)) - return skb->vlan_proto; - return skb->protocol; -} - /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ -- cgit v1.2.3 From ad0f75e5f57ccbceec13274e1e242f2b5a6397ed Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Jul 2020 11:52:56 -0700 Subject: cgroup: fix cgroup_sk_alloc() for sk_clone_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled. sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable. The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that. This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas Reported-by: Peter Geis Reported-by: Lu Fengqi Reported-by: Daniël Sonck Reported-by: Zhang Qiang Tested-by: Cameron Berkenpas Tested-by: Peter Geis Tested-by: Thomas Lamprecht Cc: Daniel Borkmann Cc: Zefan Li Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 6 ++++-- include/linux/cgroup.h | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 52661155f85f..4f1cd0edc57d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -790,7 +790,8 @@ struct sock_cgroup_data { union { #ifdef __LITTLE_ENDIAN struct { - u8 is_data; + u8 is_data : 1; + u8 no_refcnt : 1; u8 padding; u16 prioidx; u32 classid; @@ -800,7 +801,8 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; - u8 is_data; + u8 no_refcnt : 1; + u8 is_data : 1; } __packed; #endif u64 val; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4598e4da6b1b..618838c48313 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -822,6 +822,7 @@ extern spinlock_t cgroup_sk_update_lock; void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) @@ -835,7 +836,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) */ v = READ_ONCE(skcd->val); - if (v & 1) + if (v & 3) return &cgrp_dfl_root.cgrp; return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; @@ -847,6 +848,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -- cgit v1.2.3 From 394de110a73395de2ca4516b0de435e91b11b604 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Sun, 5 Jul 2020 14:23:49 +0530 Subject: net: Added pointer check for dst->ops->neigh_lookup in dst_neigh_lookup_skb The packets from tunnel devices (eg bareudp) may have only metadata in the dst pointer of skb. Hence a pointer check of neigh_lookup is needed in dst_neigh_lookup_skb Kernel crashes when packets from bareudp device is processed in the kernel neighbour subsytem. [ 133.384484] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 133.385240] #PF: supervisor instruction fetch in kernel mode [ 133.385828] #PF: error_code(0x0010) - not-present page [ 133.386603] PGD 0 P4D 0 [ 133.386875] Oops: 0010 [#1] SMP PTI [ 133.387275] CPU: 0 PID: 5045 Comm: ping Tainted: G W 5.8.0-rc2+ #15 [ 133.388052] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 133.391076] RIP: 0010:0x0 [ 133.392401] Code: Bad RIP value. [ 133.394029] RSP: 0018:ffffb79980003d50 EFLAGS: 00010246 [ 133.396656] RAX: 0000000080000102 RBX: ffff9de2fe0d6600 RCX: ffff9de2fe5e9d00 [ 133.399018] RDX: 0000000000000000 RSI: ffff9de2fe5e9d00 RDI: ffff9de2fc21b400 [ 133.399685] RBP: ffff9de2fe5e9d00 R08: 0000000000000000 R09: 0000000000000000 [ 133.400350] R10: ffff9de2fbc6be22 R11: ffff9de2fe0d6600 R12: ffff9de2fc21b400 [ 133.401010] R13: ffff9de2fe0d6628 R14: 0000000000000001 R15: 0000000000000003 [ 133.401667] FS: 00007fe014918740(0000) GS:ffff9de2fec00000(0000) knlGS:0000000000000000 [ 133.402412] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 133.402948] CR2: ffffffffffffffd6 CR3: 000000003bb72000 CR4: 00000000000006f0 [ 133.403611] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 133.404270] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 133.404933] Call Trace: [ 133.405169] [ 133.405367] __neigh_update+0x5a4/0x8f0 [ 133.405734] arp_process+0x294/0x820 [ 133.406076] ? __netif_receive_skb_core+0x866/0xe70 [ 133.406557] arp_rcv+0x129/0x1c0 [ 133.406882] __netif_receive_skb_one_core+0x95/0xb0 [ 133.407340] process_backlog+0xa7/0x150 [ 133.407705] net_rx_action+0x2af/0x420 [ 133.408457] __do_softirq+0xda/0x2a8 [ 133.408813] asm_call_on_stack+0x12/0x20 [ 133.409290] [ 133.409519] do_softirq_own_stack+0x39/0x50 [ 133.410036] do_softirq+0x50/0x60 [ 133.410401] __local_bh_enable_ip+0x50/0x60 [ 133.410871] ip_finish_output2+0x195/0x530 [ 133.411288] ip_output+0x72/0xf0 [ 133.411673] ? __ip_finish_output+0x1f0/0x1f0 [ 133.412122] ip_send_skb+0x15/0x40 [ 133.412471] raw_sendmsg+0x853/0xab0 [ 133.412855] ? insert_pfn+0xfe/0x270 [ 133.413827] ? vvar_fault+0xec/0x190 [ 133.414772] sock_sendmsg+0x57/0x80 [ 133.415685] __sys_sendto+0xdc/0x160 [ 133.416605] ? syscall_trace_enter+0x1d4/0x2b0 [ 133.417679] ? __audit_syscall_exit+0x1d9/0x280 [ 133.418753] ? __prepare_exit_to_usermode+0x5d/0x1a0 [ 133.419819] __x64_sys_sendto+0x24/0x30 [ 133.420848] do_syscall_64+0x4d/0x90 [ 133.421768] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 133.422833] RIP: 0033:0x7fe013689c03 [ 133.423749] Code: Bad RIP value. [ 133.424624] RSP: 002b:00007ffc7288f418 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 133.425940] RAX: ffffffffffffffda RBX: 000056151fc63720 RCX: 00007fe013689c03 [ 133.427225] RDX: 0000000000000040 RSI: 000056151fc63720 RDI: 0000000000000003 [ 133.428481] RBP: 00007ffc72890b30 R08: 000056151fc60500 R09: 0000000000000010 [ 133.429757] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 [ 133.431041] R13: 000056151fc636e0 R14: 000056151fc616bc R15: 0000000000000080 [ 133.432481] Modules linked in: mpls_iptunnel act_mirred act_tunnel_key cls_flower sch_ingress veth mpls_router ip_tunnel bareudp ip6_udp_tunnel udp_tunnel macsec udp_diag inet_diag unix_diag af_packet_diag netlink_diag binfmt_misc xt_MASQUERADE iptable_nat xt_addrtype xt_conntrack nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter bridge stp llc ebtable_filter ebtables overlay ip6table_filter ip6_tables iptable_filter sunrpc ext4 mbcache jbd2 pcspkr i2c_piix4 virtio_balloon joydev ip_tables xfs libcrc32c ata_generic qxl pata_acpi drm_ttm_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ata_piix libata virtio_net net_failover virtio_console failover virtio_blk i2c_core virtio_pci virtio_ring serio_raw floppy virtio dm_mirror dm_region_hash dm_log dm_mod [ 133.444045] CR2: 0000000000000000 [ 133.445082] ---[ end trace f4aeee1958fd1638 ]--- [ 133.446236] RIP: 0010:0x0 [ 133.447180] Code: Bad RIP value. [ 133.448152] RSP: 0018:ffffb79980003d50 EFLAGS: 00010246 [ 133.449363] RAX: 0000000080000102 RBX: ffff9de2fe0d6600 RCX: ffff9de2fe5e9d00 [ 133.450835] RDX: 0000000000000000 RSI: ffff9de2fe5e9d00 RDI: ffff9de2fc21b400 [ 133.452237] RBP: ffff9de2fe5e9d00 R08: 0000000000000000 R09: 0000000000000000 [ 133.453722] R10: ffff9de2fbc6be22 R11: ffff9de2fe0d6600 R12: ffff9de2fc21b400 [ 133.455149] R13: ffff9de2fe0d6628 R14: 0000000000000001 R15: 0000000000000003 [ 133.456520] FS: 00007fe014918740(0000) GS:ffff9de2fec00000(0000) knlGS:0000000000000000 [ 133.458046] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 133.459342] CR2: ffffffffffffffd6 CR3: 000000003bb72000 CR4: 00000000000006f0 [ 133.460782] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 133.462240] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 133.463697] Kernel panic - not syncing: Fatal exception in interrupt [ 133.465226] Kernel Offset: 0xfa00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 133.467025] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: aaa0c23cb901 ("Fix dst_neigh_lookup/dst_neigh_lookup_skb return value handling bug") Signed-off-by: Martin Varghese Signed-off-by: David S. Miller --- include/net/dst.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 07adfacd8088..852d8fb36ab7 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -400,7 +400,15 @@ static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, co static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { - struct neighbour *n = dst->ops->neigh_lookup(dst, skb, NULL); + struct neighbour *n = NULL; + + /* The packets from tunnel devices (eg bareudp) may have only + * metadata in the dst pointer of skb. Hence a pointer check of + * neigh_lookup is needed. + */ + if (dst->ops->neigh_lookup) + n = dst->ops->neigh_lookup(dst, skb, NULL); + return IS_ERR(n) ? NULL : n; } -- cgit v1.2.3 From 469aceddfa3ed16e17ee30533fae45e90f62efd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 7 Jul 2020 13:03:25 +0200 Subject: vlan: consolidate VLAN parsing code and limit max parsing depth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toshiaki pointed out that we now have two very similar functions to extract the L3 protocol number in the presence of VLAN tags. And Daniel pointed out that the unbounded parsing loop makes it possible for maliciously crafted packets to loop through potentially hundreds of tags. Fix both of these issues by consolidating the two parsing functions and limiting the VLAN tag parsing to a max depth of 8 tags. As part of this, switch over __vlan_get_protocol() to use skb_header_pointer() instead of pskb_may_pull(), to avoid the possible side effects of the latter and keep the skb pointer 'const' through all the parsing functions. v2: - Use limit of 8 tags instead of 32 (matching XMIT_RECURSION_LIMIT) Reported-by: Toshiaki Makita Reported-by: Daniel Borkmann Fixes: d7bf2ebebc2b ("sched: consistently handle layer3 header accesses in the presence of VLANs") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 57 +++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 427a5b8597c2..41a518336673 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -25,6 +25,8 @@ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ +#define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ + /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID @@ -308,34 +310,6 @@ static inline bool eth_type_vlan(__be16 ethertype) } } -/* A getter for the SKB protocol field which will handle VLAN tags consistently - * whether VLAN acceleration is enabled or not. - */ -static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) -{ - unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); - __be16 proto = skb->protocol; - - if (!skip_vlan) - /* VLAN acceleration strips the VLAN header from the skb and - * moves it to skb->vlan_proto - */ - return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; - - while (eth_type_vlan(proto)) { - struct vlan_hdr vhdr, *vh; - - vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); - if (!vh) - break; - - proto = vh->h_vlan_encapsulated_proto; - offset += sizeof(vhdr); - } - - return proto; -} - static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -605,10 +579,10 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { - unsigned int vlan_depth = skb->mac_len; + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at @@ -623,13 +597,12 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vlan_depth = ETH_HLEN; } do { - struct vlan_hdr *vh; + struct vlan_hdr vhdr, *vh; - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) + vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); @@ -648,11 +621,25 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 vlan_get_protocol(struct sk_buff *skb) +static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; + + return vlan_get_protocol(skb); +} + static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { -- cgit v1.2.3 From c9a368f1c0fbe2e3a21ebf231caeae58b18b2681 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jul 2020 23:11:10 -0700 Subject: bpf: net: Avoid incorrect bpf_sk_reuseport_detach call bpf_sk_reuseport_detach is currently called when sk->sk_user_data is not NULL. It is incorrect because sk->sk_user_data may not be managed by the bpf's reuseport_array. It has been reported in [1] that, the bpf_sk_reuseport_detach() which is called from udp_lib_unhash() has corrupted the sk_user_data managed by l2tp. This patch solves it by using another bit (defined as SK_USER_DATA_BPF) of the sk_user_data pointer value. It marks that a sk_user_data is managed/owned by BPF. The patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). [ Note: sk->sk_user_data is used by bpf's reuseport_array only when a sk is added to the bpf's reuseport_array. i.e. doing setsockopt(SO_REUSEPORT) and having "sk->sk_reuseport == 1" alone will not stop sk->sk_user_data being used by other means. ] [1]: https://lore.kernel.org/netdev/20200706121259.GA20199@katalix.com/ Fixes: 5dc4c4b7d4e8 ("bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY") Reported-by: James Chapman Reported-by: syzbot+9f092552ba9a5efca5df@syzkaller.appspotmail.com Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: James Chapman Acked-by: James Chapman Link: https://lore.kernel.org/bpf/20200709061110.4019316-1-kafai@fb.com --- include/net/sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 3428619faae4..1183507df95b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -533,7 +533,8 @@ enum sk_pacing { * be copied. */ #define SK_USER_DATA_NOCOPY 1UL -#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY) +#define SK_USER_DATA_BPF 2UL /* Managed by BPF */ +#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied -- cgit v1.2.3 From 14b032b8f8fce03a546dcf365454bec8c4a58d7d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 9 Jul 2020 16:28:44 -0700 Subject: cgroup: Fix sock_cgroup_data on big-endian. In order for no_refcnt and is_data to be the lowest order two bits in the 'val' we have to pad out the bitfield of the u8. Fixes: ad0f75e5f57c ("cgroup: fix cgroup_sk_alloc() for sk_clone_lock()") Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4f1cd0edc57d..fee0b5547cd0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -792,6 +792,7 @@ struct sock_cgroup_data { struct { u8 is_data : 1; u8 no_refcnt : 1; + u8 unused : 6; u8 padding; u16 prioidx; u32 classid; @@ -801,6 +802,7 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; + u8 unused : 6; u8 no_refcnt : 1; u8 is_data : 1; } __packed; -- cgit v1.2.3 From 88b3d5c90e9685be54dd5bc441970044020eca76 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 22 Jun 2020 09:03:31 +0300 Subject: net/mlx5e: Fix port buffers cell size value Device unit for port buffers size, xoff_threshold and xon_threshold is cells. Fix a bug in driver where cell unit size was hard-coded to 128 bytes. This hard-coded value is buggy, as it is wrong for some hardware versions. Driver to read cell size from SBCAM register and translate bytes to cell units accordingly. In order to fix the bug, this patch exposes SBCAM (Shared buffer capabilities mask) layout and defines. If SBCAM.cap_cell_size is valid, use it for all bytes to cells calculations. If not valid, fallback to 128. Cell size do not change on the fly per device. Instead of issuing SBCAM access reg command every time such translation is needed, cache it in mlx5e_dcbx as part of mlx5e_dcbnl_initialize(). Pass dcbx.port_buff_cell_sz as a param to every function that needs bytes to cells translation. While fixing the bug, move MLX5E_BUFFER_CELL_SHIFT macro to en_dcbnl.c, as it is only used by that file. Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") Signed-off-by: Eran Ben Elisha Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 13c0e4556eda..1e6ca716635a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -147,6 +147,7 @@ enum { MLX5_REG_MCDA = 0x9063, MLX5_REG_MCAM = 0x907f, MLX5_REG_MIRC = 0x9162, + MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ca1887dd0423..073b79eacc99 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9960,6 +9960,34 @@ struct mlx5_ifc_pptb_reg_bits { u8 untagged_buff[0x4]; }; +struct mlx5_ifc_sbcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + + u8 reserved_at_20[0x20]; + + u8 sb_access_reg_cap_mask[4][0x20]; + + u8 reserved_at_c0[0x80]; + + u8 sb_feature_cap_mask[4][0x20]; + + u8 reserved_at_1c0[0x40]; + + u8 cap_total_buffer_size[0x20]; + + u8 cap_cell_size[0x10]; + u8 cap_max_pg_buffers[0x8]; + u8 cap_num_pool_supported[0x8]; + + u8 reserved_at_240[0x8]; + u8 cap_sbsr_stat_size[0x8]; + u8 cap_max_tclass_data[0x8]; + u8 cap_max_cpu_ingress_tclass_sb[0x8]; +}; + struct mlx5_ifc_pbmc_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.2.3