From 4e15507fea70c0c312d79610efa46b6853ccf8e0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 20 Jun 2020 20:11:59 -0700 Subject: libbpf: Forward-declare bpf_stats_type for systems with outdated UAPI headers Systems that doesn't yet have the very latest linux/bpf.h header, enum bpf_stats_type will be undefined, causing compilation warnings. Prevents this by forward-declaring enum. Fixes: 0bee106716cf ("libbpf: Add support for command BPF_ENABLE_STATS") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200621031159.2279101-1-andriin@fb.com --- tools/lib/bpf/bpf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 1b6015b21ba8..dbef24ebcfcb 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -233,6 +233,8 @@ LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr); + +enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); #ifdef __cplusplus -- cgit v1.2.3 From 9c82a63cf3701b78cd092c69c3e75ff806837194 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 19 Jun 2020 16:04:22 -0700 Subject: libbpf: Fix CO-RE relocs against .text section bpf_object__find_program_by_title(), used by CO-RE relocation code, doesn't return .text "BPF program", if it is a function storage for sub-programs. Because of that, any CO-RE relocation in helper non-inlined functions will fail. Fix this by searching for .text-corresponding BPF program manually. Adjust one of bpf_iter selftest to exhibit this pattern. Fixes: ddc7c3042614 ("libbpf: implement BPF CO-RE offset relocation algorithm") Reported-by: Yonghong Song Signed-off-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200619230423.691274-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 8 +++++++- tools/testing/selftests/bpf/progs/bpf_iter_netlink.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 477c679ed945..f17151d866e6 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4818,7 +4818,13 @@ bpf_core_reloc_fields(struct bpf_object *obj, const char *targ_btf_path) err = -EINVAL; goto out; } - prog = bpf_object__find_program_by_title(obj, sec_name); + prog = NULL; + for (i = 0; i < obj->nr_programs; i++) { + if (!strcmp(obj->programs[i].section_name, sec_name)) { + prog = &obj->programs[i]; + break; + } + } if (!prog) { pr_warn("failed to find program '%s' for CO-RE offset relocation\n", sec_name); diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c index e7b8753eac0b..75ecf956a2df 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c @@ -25,7 +25,7 @@ struct bpf_iter__netlink { struct netlink_sock *sk; } __attribute__((preserve_access_index)); -static inline struct inode *SOCK_INODE(struct socket *socket) +static __attribute__((noinline)) struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } -- cgit v1.2.3 From c4c0bdc0d2d084ed847c7066bdf59fe2cd25aa17 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 17:10:54 -0700 Subject: bpf: Set the number of exception entries properly for subprograms Currently, if a bpf program has more than one subprograms, each program will be jitted separately. For programs with bpf-to-bpf calls the prog->aux->num_exentries is not setup properly. For example, with bpf_iter_netlink.c modified to force one function to be not inlined and with CONFIG_BPF_JIT_ALWAYS_ON the following error is seen: $ ./test_progs -n 3/3 ... libbpf: failed to load program 'iter/netlink' libbpf: failed to load object 'bpf_iter_netlink' libbpf: failed to load BPF skeleton 'bpf_iter_netlink': -4007 test_netlink:FAIL:bpf_iter_netlink__open_and_load skeleton open_and_load failed #3/3 netlink:FAIL The dmesg shows the following errors: ex gen bug which is triggered by the following code in arch/x86/net/bpf_jit_comp.c: if (excnt >= bpf_prog->aux->num_exentries) { pr_err("ex gen bug\n"); return -EFAULT; } This patch fixes the issue by computing proper num_exentries for each subprogram before calling JIT. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34cde841ab68..8911d0576399 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9801,7 +9801,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) int i, j, subprog_start, subprog_end = 0, len, subprog; struct bpf_insn *insn; void *old_bpf_func; - int err; + int err, num_exentries; if (env->subprog_cnt <= 1) return 0; @@ -9876,6 +9876,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; + num_exentries = 0; + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + BPF_MODE(insn->code) == BPF_PROBE_MEM) + num_exentries++; + } + func[i]->aux->num_exentries = num_exentries; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; -- cgit v1.2.3 From b338cb921e6739ff59ce32f43342779fe5ffa732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Sat, 20 Jun 2020 14:26:16 -0700 Subject: bpf: Restore behaviour of CAP_SYS_ADMIN allowing the loading of networking bpf programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for a regression in commit 2c78ee898d8f ("bpf: Implement CAP_BPF"). Before the above commit it was possible to load network bpf programs with just the CAP_SYS_ADMIN privilege. The Android bpfloader happens to run in such a configuration (it has SYS_ADMIN but not NET_ADMIN) and creates maps and loads bpf programs for later use by Android's netd (which has NET_ADMIN but not SYS_ADMIN). Fixes: 2c78ee898d8f ("bpf: Implement CAP_BPF") Reported-by: John Stultz Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov Tested-by: John Stultz Link: https://lore.kernel.org/bpf/20200620212616.93894-1-zenczykowski@gmail.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..7d946435587d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2121,7 +2121,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) !bpf_capable()) return -EPERM; - if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; -- cgit v1.2.3 From bcc7f554cfa7e0ac77c7adc4027c16f4a2f99c6f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 23 Jun 2020 16:39:35 +0100 Subject: bpf: Fix formatting in documentation for BPF helpers When producing the bpf-helpers.7 man page from the documentation from the BPF user space header file, rst2man complains: :2636: (ERROR/3) Unexpected indentation. :2640: (WARNING/2) Block quote ends without a blank line; unexpected unindent. Let's fix formatting for the relevant chunk (item list in bpf_ringbuf_query()'s description), and for a couple other functions. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200623153935.6215-1-quentin@isovalent.com --- include/uapi/linux/bpf.h | 41 +++++++++++++++++++++-------------------- tools/include/uapi/linux/bpf.h | 41 +++++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974a71342aea..8bd33050b7bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3171,13 +3171,12 @@ union bpf_attr { * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3189,20 +3188,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3210,16 +3209,18 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. * * int bpf_csum_level(struct sk_buff *skb, u64 level) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974a71342aea..8bd33050b7bb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3171,13 +3171,12 @@ union bpf_attr { * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3189,20 +3188,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3210,16 +3209,18 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. * * int bpf_csum_level(struct sk_buff *skb, u64 level) * Description -- cgit v1.2.3 From 0c1a7f13c9ec1ceb18d97ef4b1dd20ec71ffba31 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Wed, 17 Jun 2020 17:01:32 +0530 Subject: ieee80211: Add missing and new AKM suite selector definitions Add the definitions for missing AKM selectors defined in IEEE P802.11-REVmd/D3.0, table 9-151. These definitions will be used by various drivers that support these new AKM suites. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200617113132.13477-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fe15f831841b..9f732499ea88 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3333,13 +3333,17 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) #define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) #define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_AP_PEER_KEY SUITE(0x000FAC, 10) #define WLAN_AKM_SUITE_8021X_SUITE_B SUITE(0x000FAC, 11) #define WLAN_AKM_SUITE_8021X_SUITE_B_192 SUITE(0x000FAC, 12) +#define WLAN_AKM_SUITE_FT_8021X_SHA384 SUITE(0x000FAC, 13) #define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) #define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) #define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) #define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_AKM_SUITE_OWE SUITE(0x000FAC, 18) +#define WLAN_AKM_SUITE_FT_PSK_SHA384 SUITE(0x000FAC, 19) +#define WLAN_AKM_SUITE_PSK_SHA384 SUITE(0x000FAC, 20) #define WLAN_MAX_KEY_LEN 32 -- cgit v1.2.3 From 86a1b9d7c275a3dba69e7ab099f8d5f71f69f6a8 Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Mon, 22 Jun 2020 14:35:42 +0200 Subject: mac80211: fix control port tx status check The initial control port tx status patch assumed, that we have IEEE 802.11 frames, but actually ethernet frames are stored in the ack skb. Fix this by checking for the correct ethertype and skb protocol 802.3. Also allow tx status reports for ETH_P_PREAUTH, as preauth frames can also be send over the nl80211 control port. Fixes: a7528198add8 ("mac80211: support control port TX status reporting") Reported-by: Jouni Malinen Signed-off-by: Markus Theil Reported-by: kernel test robot Link: https://lore.kernel.org/r/20200622123542.173695-1-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- net/mac80211/status.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 7b1bacac39c6..cbc40b358ba2 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -639,11 +639,23 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; + __be16 ethertype = 0; + + if (skb->len >= ETH_HLEN && skb->protocol == cpu_to_be16(ETH_P_802_3)) + skb_copy_bits(skb, 2 * ETH_ALEN, ðertype, ETH_TLEN); rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { - if (ieee80211_is_any_nullfunc(hdr->frame_control)) + if (ethertype == sdata->control_port_protocol || + ethertype == cpu_to_be16(ETH_P_PREAUTH)) + cfg80211_control_port_tx_status(&sdata->wdev, + cookie, + skb->data, + skb->len, + acked, + GFP_ATOMIC); + else if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, @@ -654,12 +666,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, skb->data, skb->len, acked, GFP_ATOMIC); else - cfg80211_control_port_tx_status(&sdata->wdev, - cookie, - skb->data, - skb->len, - acked, - GFP_ATOMIC); + pr_warn("Unknown status report in ack skb\n"); + } rcu_read_unlock(); -- cgit v1.2.3 From 01da2e059dc326d02091a62b81a795a393e3719f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 4 Jun 2020 23:41:57 +0200 Subject: mac80211: simplify mesh code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doing mod_timer() conditionaly is easier than conditionally unlocking and jumping around... Signed-off-by: Pavel Machek (CIP) Acked-by: Linus Lüssing Link: https://lore.kernel.org/r/20200604214157.GA9737@amd Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index aa5150929996..02cde0fd08fe 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1105,11 +1105,8 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) ttl, lifetime, 0, ifmsh->preq_id++, sdata); spin_lock_bh(&mpath->state_lock); - if (mpath->flags & MESH_PATH_DELETED) { - spin_unlock_bh(&mpath->state_lock); - goto enddiscovery; - } - mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout); + if (!(mpath->flags & MESH_PATH_DELETED)) + mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout); spin_unlock_bh(&mpath->state_lock); enddiscovery: -- cgit v1.2.3 From 78fb5b541b7ae57ac39187ccb3097e606004cf9b Mon Sep 17 00:00:00 2001 From: Seevalamuthu Mariappan Date: Tue, 9 Jun 2020 15:45:54 +0530 Subject: mac80211: Fix dropping broadcast packets in 802.11 encap Broadcast pkts like arp are getting dropped in 'ieee80211_8023_xmit'. Fix this by replacing is_valid_ether_addr api with is_zero_ether_addr. Fixes: 50ff477a8639 ("mac80211: add 802.11 encapsulation offloading support") Signed-off-by: Seevalamuthu Mariappan Link: https://lore.kernel.org/r/1591697754-4975-1-git-send-email-seevalam@codeaurora.org Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e9ce658141f5..3374df016c58 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4206,7 +4206,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER))) ra = sdata->u.mgd.bssid; - if (!is_valid_ether_addr(ra)) + if (is_zero_ether_addr(ra)) goto out_free; multicast = is_multicast_ether_addr(ra); -- cgit v1.2.3 From 5af7fef39d7952c0f5551afa7b821ee7b6c9dd3d Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Wed, 17 Jun 2020 10:26:36 +0200 Subject: mac80211: skip mpath lookup also for control port tx When using 802.1X over mesh networks, at first an ordinary mesh peering is established, then the 802.1X EAPOL dialog happens, afterwards an authenticated mesh peering exchange (AMPE) happens, finally the peering is complete and we can set the STA authorized flag. As 802.1X is an intermediate step here and key material is not yet exchanged for stations we have to skip mesh path lookup for these EAPOL frames. Otherwise the already configure mesh group encryption key would be used to send a mesh path request which no one can decipher, because we didn't already establish key material on both peers, like with SAE and directly using AMPE. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200617082637.22670-2-markus.theil@tu-ilmenau.de [remove pointless braces, remove unnecessary local variable, the list can only process one such frame (or its fragments)] Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3374df016c58..1a2941e5244f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3996,6 +3996,9 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb_list_walk_safe(skb, skb, next) { skb_mark_not_on_list(skb); + if (skb->protocol == sdata->control_port_protocol) + ctrl_flags |= IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP; + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, ctrl_flags, cookie); if (IS_ERR(skb)) { @@ -5371,7 +5374,8 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, return -EINVAL; if (proto == sdata->control_port_protocol) - ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; + ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO | + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP; if (unencrypted) flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; -- cgit v1.2.3 From 0b467b63870d9c05c81456aa9bfee894ab2db3b6 Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Thu, 25 Jun 2020 12:42:14 +0200 Subject: mac80211: allow rx of mesh eapol frames with default rx key Without this patch, eapol frames cannot be received in mesh mode, when 802.1X should be used. Initially only a MGTK is defined, which is found and set as rx->key, when there are no other keys set. ieee80211_drop_unencrypted would then drop these eapol frames, as they are data frames without encryption and there exists some rx->key. Fix this by differentiating between mesh eapol frames and other data frames with existing rx->key. Allow mesh mesh eapol frames only if they are for our vif address. With this patch in-place, ieee80211_rx_h_mesh_fwding continues after the ieee80211_drop_unencrypted check and notices, that these eapol frames have to be delivered locally, as they should. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200625104214.50319-1-markus.theil@tu-ilmenau.de [small code cleanups] Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a88ab6fb16f2..5c5af4b5fc08 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2396,6 +2396,7 @@ static int ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx) static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) { + struct ieee80211_hdr *hdr = (void *)rx->skb->data; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); @@ -2406,6 +2407,31 @@ static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) if (status->flag & RX_FLAG_DECRYPTED) return 0; + /* check mesh EAPOL frames first */ + if (unlikely(rx->sta && ieee80211_vif_is_mesh(&rx->sdata->vif) && + ieee80211_is_data(fc))) { + struct ieee80211s_hdr *mesh_hdr; + u16 hdr_len = ieee80211_hdrlen(fc); + u16 ethertype_offset; + __be16 ethertype; + + if (!ether_addr_equal(hdr->addr1, rx->sdata->vif.addr)) + goto drop_check; + + /* make sure fixed part of mesh header is there, also checks skb len */ + if (!pskb_may_pull(rx->skb, hdr_len + 6)) + goto drop_check; + + mesh_hdr = (struct ieee80211s_hdr *)(skb->data + hdr_len); + ethertype_offset = hdr_len + ieee80211_get_mesh_hdrlen(mesh_hdr) + + sizeof(rfc1042_header); + + if (skb_copy_bits(rx->skb, ethertype_offset, ðertype, 2) == 0 && + ethertype == rx->sdata->control_port_protocol) + return 0; + } + +drop_check: /* Drop unencrypted frames if key is set. */ if (unlikely(!ieee80211_has_protected(fc) && !ieee80211_is_any_nullfunc(fc) && -- cgit v1.2.3 From a9b59159d338d414acaa8e2f569d129d51c76452 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 24 Jun 2020 15:20:39 -0700 Subject: bpf: Do not allow btf_ctx_access with __int128 types To ensure btf_ctx_access() is safe the verifier checks that the BTF arg type is an int, enum, or pointer. When the function does the BTF arg lookup it uses the calculation 'arg = off / 8' using the fact that registers are 8B. This requires that the first arg is in the first reg, the second in the second, and so on. However, for __int128 the arg will consume two registers by default LLVM implementation. So this will cause the arg layout assumed by the 'arg = off / 8' calculation to be incorrect. Because __int128 is uncommon this patch applies the easiest fix and will force int types to be sizeof(u64) or smaller so that they will fit in a single register. v2: remove unneeded parens per Andrii's feedback Fixes: 9e15db66136a1 ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159303723962.11287.13309537171132420717.stgit@john-Precision-5820-Tower --- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 5c1ea99b480f..8b81fbb4497c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,6 +82,11 @@ static inline bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static inline bool btf_type_is_small_int(const struct btf_type *t) +{ + return btf_type_is_int(t) && t->size <= sizeof(u64); +} + static inline bool btf_type_is_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..9a1a98dd9e97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3746,7 +3746,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!btf_type_is_int(t)) { + if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3768,7 +3768,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { -- cgit v1.2.3 From 7a64135f3229a808067e4bd29be15fe6856a9ae6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 25 Jun 2020 16:26:58 +0200 Subject: libbpf: Adjust SEC short cut for expected attach type BPF_XDP_DEVMAP Adjust the SEC("xdp_devmap/") prog type prefix to contain a slash "/" for expected attach type BPF_XDP_DEVMAP. This is consistent with other prog types like tracing. Fixes: 2778797037a6 ("libbpf: Add SEC name for xdp programs attached to device map") Suggested-by: Andrii Nakryiko Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159309521882.821855.6873145686353617509.stgit@firesoul --- tools/lib/bpf/libbpf.c | 2 +- tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f17151d866e6..11e4725b8b1c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6659,7 +6659,7 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_ITER, .is_attach_btf = true, .attach_fn = attach_iter), - BPF_EAPROG_SEC("xdp_devmap", BPF_PROG_TYPE_XDP, + BPF_EAPROG_SEC("xdp_devmap/", BPF_PROG_TYPE_XDP, BPF_XDP_DEVMAP), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c index 330811260123..0ac086497722 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -27,7 +27,7 @@ int xdp_dummy_prog(struct xdp_md *ctx) /* valid program on DEVMAP entry via SEC name; * has access to egress and ingress ifindex */ -SEC("xdp_devmap") +SEC("xdp_devmap/map_prog") int xdp_dummy_dm(struct xdp_md *ctx) { char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n"; -- cgit v1.2.3 From bc7a39b4272b9672d806d422b6850e8c1a09914c Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 26 Jun 2020 12:49:39 +0300 Subject: nl80211: don't return err unconditionally in nl80211_start_ap() When a memory leak was fixed, a return err was changed to goto err, but, accidentally, the if (err) was removed, so now we always exit at this point. Fix it by adding if (err) back. Fixes: 9951ebfcdf2b ("nl80211: fix potential leak in AP start") Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200626124931.871ba5b31eee.I97340172d92164ee92f3c803fe20a8a6e97714e1@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 263ae395ad44..f31698fd4a7e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5016,7 +5016,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_he_obss_pd( info->attrs[NL80211_ATTR_HE_OBSS_PD], ¶ms.he_obss_pd); - goto out; + if (err) + goto out; } if (info->attrs[NL80211_ATTR_HE_BSS_COLOR]) { -- cgit v1.2.3 From 60a0121f8fa64b0f4297aa6fef8207500483a874 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 26 Jun 2020 12:49:40 +0300 Subject: nl80211: fix memory leak when parsing NL80211_ATTR_HE_BSS_COLOR If there is an error when parsing the NL80211_ATTR_HE_BSS_COLOR attribute, we return immediately without freeing param.acl. Fit it by using goto out instead of returning immediately. Fixes: 5c5e52d1bb96 ("nl80211: add handling for BSS color") Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20200626124931.7ad2a3eb894f.I60905fb70bd20389a3b170db515a07275e31845e@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f31698fd4a7e..0e07fb8585fb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5025,7 +5025,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) info->attrs[NL80211_ATTR_HE_BSS_COLOR], ¶ms.he_bss_color); if (err) - return err; + goto out; } nl80211_calculate_ap_params(¶ms); -- cgit v1.2.3 From fa48494cce5f6360b0f8683cdf258fb45c666287 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 25 Jun 2020 22:58:37 -0700 Subject: ionic: update the queue count on open Let the network stack know the real number of queues that we are using. v2: added error checking Fixes: 49d3b493673a ("ionic: disable the queues on link down") Signed-off-by: Shannon Nelson Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index aaa00edd9d5b..3c9dde31f3fa 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1673,6 +1673,14 @@ int ionic_open(struct net_device *netdev) if (err) goto err_out; + err = netif_set_real_num_tx_queues(netdev, lif->nxqs); + if (err) + goto err_txrx_deinit; + + err = netif_set_real_num_rx_queues(netdev, lif->nxqs); + if (err) + goto err_txrx_deinit; + /* don't start the queues until we have link */ if (netif_carrier_ok(netdev)) { err = ionic_start_queues(lif); -- cgit v1.2.3 From 0574e2000fc3103cbc69ba82ec1175ce171fdf5e Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Fri, 26 Jun 2020 19:17:29 +0300 Subject: enetc: Fix tx rings bitmap iteration range, irq handling The rings bitmap of an interrupt vector encodes which of the device's rings were assigned to that interrupt vector. Hence the iteration range of the tx rings bitmap (for_each_set_bit()) should be the total number of Tx rings of that netdevice instead of the number of rings assigned to the interrupt vector. Since there are 2 cores, and one interrupt vector for each core, the number of rings asigned to an interrupt vector is half the number of available rings. The impact of this error is that the upper half of the tx rings could still generate interrupts during napi polling. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Signed-off-by: Claudiu Manoil Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 96831f49925c..22105d09bc89 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -266,7 +266,7 @@ static irqreturn_t enetc_msix(int irq, void *data) /* disable interrupts */ enetc_wr_reg(v->rbier, 0); - for_each_set_bit(i, &v->tx_rings_map, v->count_tx_rings) + for_each_set_bit(i, &v->tx_rings_map, ENETC_MAX_NUM_TXQS) enetc_wr_reg(v->tbier_base + ENETC_BDR_OFF(i), 0); napi_schedule_irqoff(&v->napi); @@ -302,7 +302,7 @@ static int enetc_poll(struct napi_struct *napi, int budget) /* enable interrupts */ enetc_wr_reg(v->rbier, ENETC_RBIER_RXTIE); - for_each_set_bit(i, &v->tx_rings_map, v->count_tx_rings) + for_each_set_bit(i, &v->tx_rings_map, ENETC_MAX_NUM_TXQS) enetc_wr_reg(v->tbier_base + ENETC_BDR_OFF(i), ENETC_TBIER_TXTIE); -- cgit v1.2.3 From 2ce578ca9444bb44da66b9a494f56e7ec12e6466 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 27 Jun 2020 15:47:51 +0800 Subject: net: ipv4: Fix wrong type conversion from hint to rt in ip_route_use_hint() We can't cast sk_buff to rtable by (struct rtable *)hint. Use skb_rtable(). Fixes: 02b24941619f ("ipv4: use dst hint for ipv4 list receive") Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1d7076b78e63..a01efa062f6b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2027,7 +2027,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, const struct sk_buff *hint) { struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rt = (struct rtable *)hint; + struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); int err = -EINVAL; u32 tag = 0; -- cgit v1.2.3 From 93dd5f185916b05e931cffae636596f21f98546e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 25 Jun 2020 16:12:59 -0700 Subject: bpf, sockmap: RCU splat with redirect and strparser error or TLS There are two paths to generate the below RCU splat the first and most obvious is the result of the BPF verdict program issuing a redirect on a TLS socket (This is the splat shown below). Unlike the non-TLS case the caller of the *strp_read() hooks does not wrap the call in a rcu_read_lock/unlock. Then if the BPF program issues a redirect action we hit the RCU splat. However, in the non-TLS socket case the splat appears to be relatively rare, because the skmsg caller into the strp_data_ready() is wrapped in a rcu_read_lock/unlock. Shown here, static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { psock->parser.saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->parser.strp); write_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); } If the above was the only way to run the verdict program we would be safe. But, there is a case where the strparser may throw an ENOMEM error while parsing the skb. This is a result of a failed skb_clone, or alloc_skb_for_msg while building a new merged skb when the msg length needed spans multiple skbs. This will in turn put the skb on the strp_wrk workqueue in the strparser code. The skb will later be dequeued and verdict programs run, but now from a different context without the rcu_read_lock()/unlock() critical section in sk_psock_strp_data_ready() shown above. In practice I have not seen this yet, because as far as I know most users of the verdict programs are also only working on single skbs. In this case no merge happens which could trigger the above ENOMEM errors. In addition the system would need to be under memory pressure. For example, we can't hit the above case in selftests because we missed having tests to merge skbs. (Added in later patch) To fix the below splat extend the rcu_read_lock/unnlock block to include the call to sk_psock_tls_verdict_apply(). This will fix both TLS redirect case and non-TLS redirect+error case. Also remove psock from the sk_psock_tls_verdict_apply() function signature its not used there. [ 1095.937597] WARNING: suspicious RCU usage [ 1095.940964] 5.7.0-rc7-02911-g463bac5f1ca79 #1 Tainted: G W [ 1095.944363] ----------------------------- [ 1095.947384] include/linux/skmsg.h:284 suspicious rcu_dereference_check() usage! [ 1095.950866] [ 1095.950866] other info that might help us debug this: [ 1095.950866] [ 1095.957146] [ 1095.957146] rcu_scheduler_active = 2, debug_locks = 1 [ 1095.961482] 1 lock held by test_sockmap/15970: [ 1095.964501] #0: ffff9ea6b25de660 (sk_lock-AF_INET){+.+.}-{0:0}, at: tls_sw_recvmsg+0x13a/0x840 [tls] [ 1095.968568] [ 1095.968568] stack backtrace: [ 1095.975001] CPU: 1 PID: 15970 Comm: test_sockmap Tainted: G W 5.7.0-rc7-02911-g463bac5f1ca79 #1 [ 1095.977883] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 1095.980519] Call Trace: [ 1095.982191] dump_stack+0x8f/0xd0 [ 1095.984040] sk_psock_skb_redirect+0xa6/0xf0 [ 1095.986073] sk_psock_tls_strp_read+0x1d8/0x250 [ 1095.988095] tls_sw_recvmsg+0x714/0x840 [tls] v2: Improve commit message to identify non-TLS redirect plus error case condition as well as more common TLS case. In the process I decided doing the rcu_read_unlock followed by the lock/unlock inside branches was unnecessarily complex. We can just extend the current rcu block and get the same effeective without the shuffling and branching. Thanks Martin! Fixes: e91de6afa81c1 ("bpf: Fix running sk_skb program types with ktls") Reported-by: Jakub Sitnicki Reported-by: kernel test robot Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/159312677907.18340.11064813152758406626.stgit@john-XPS-13-9370 --- net/core/skmsg.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 351afbf6bfba..c41ab6906b21 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -683,7 +683,7 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) +static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -715,12 +715,11 @@ static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) } } -static void sk_psock_tls_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) { switch (verdict) { case __SK_REDIRECT: - sk_psock_skb_redirect(psock, skb); + sk_psock_skb_redirect(skb); break; case __SK_PASS: case __SK_DROP: @@ -741,8 +740,8 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); } + sk_psock_tls_verdict_apply(skb, ret); rcu_read_unlock(); - sk_psock_tls_verdict_apply(psock, skb, ret); return ret; } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); @@ -770,7 +769,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_psock_skb_redirect(psock, skb); + sk_psock_skb_redirect(skb); break; case __SK_DROP: /* fall-through */ @@ -794,8 +793,8 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); } - rcu_read_unlock(); sk_psock_verdict_apply(psock, skb, ret); + rcu_read_unlock(); } static int sk_psock_strp_read_done(struct strparser *strp, int err) -- cgit v1.2.3 From 8025751d4d55a2f32be6bdf825b6a80c299875f5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 25 Jun 2020 16:13:18 -0700 Subject: bpf, sockmap: RCU dereferenced psock may be used outside RCU block If an ingress verdict program specifies message sizes greater than skb->len and there is an ENOMEM error due to memory pressure we may call the rcv_msg handler outside the strp_data_ready() caller context. This is because on an ENOMEM error the strparser will retry from a workqueue. The caller currently protects the use of psock by calling the strp_data_ready() inside a rcu_read_lock/unlock block. But, in above workqueue error case the psock is accessed outside the read_lock/unlock block of the caller. So instead of using psock directly we must do a look up against the sk again to ensure the psock is available. There is an an ugly piece here where we must handle the case where we paused the strp and removed the psock. On psock removal we first pause the strparser and then remove the psock. If the strparser is paused while an skb is scheduled on the workqueue the skb will be dropped on the flow and kfree_skb() is called. If the workqueue manages to get called before we pause the strparser but runs the rcvmsg callback after the psock is removed we will hit the unlikely case where we run the sockmap rcvmsg handler but do not have a psock. For now we will follow strparser logic and drop the skb on the floor with skb_kfree(). This is ugly because the data is dropped. To date this has not caused problems in practice because either the application controlling the sockmap is coordinating with the datapath so that skbs are "flushed" before removal or we simply wait for the sock to be closed before removing it. This patch fixes the describe RCU bug and dropping the skb doesn't make things worse. Future patches will improve this by allowing the normal case where skbs are not merged to skip the strparser altogether. In practice many (most?) use cases have no need to merge skbs so its both a code complexity hit as seen above and a performance issue. For example, in the Cilium case we always set the strparser up to return sbks 1:1 without any merging and have avoided above issues. Fixes: e91de6afa81c1 ("bpf: Fix running sk_skb program types with ktls") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159312679888.18340.15248924071966273998.stgit@john-XPS-13-9370 --- net/core/skmsg.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c41ab6906b21..6a32a1fd34f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -781,11 +781,18 @@ out_free: static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; + struct sock *sk; rcu_read_lock(); + sk = strp->sk; + psock = sk_psock(sk); + if (unlikely(!psock)) { + kfree_skb(skb); + goto out; + } prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_orphan(skb); @@ -794,6 +801,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); +out: rcu_read_unlock(); } -- cgit v1.2.3 From 53792fa45b1b17f78f18bcd0bd167674341297e8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 25 Jun 2020 16:13:38 -0700 Subject: bpf, sockmap: Add ingres skb tests that utilize merge skbs Add a test to check strparser merging skbs is working. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159312681884.18340.4922800172600252370.stgit@john-XPS-13-9370 --- tools/testing/selftests/bpf/progs/test_sockmap_kern.h | 8 +++++++- tools/testing/selftests/bpf/test_sockmap.c | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index 057036ca1111..3dca4c2e2418 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -79,7 +79,7 @@ struct { struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 2); + __uint(max_entries, 3); __type(key, int); __type(value, int); } sock_skb_opts SEC(".maps"); @@ -94,6 +94,12 @@ struct { SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { + int *f, two = 2; + + f = bpf_map_lookup_elem(&sock_skb_opts, &two); + if (f && *f) { + return *f; + } return skb->len; } diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 37695fc8096a..78789b27e573 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -85,6 +85,7 @@ int txmsg_ktls_skb_drop; int txmsg_ktls_skb_redir; int ktls; int peek_flag; +int skb_use_parser; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -174,6 +175,7 @@ static void test_reset(void) txmsg_apply = txmsg_cork = 0; txmsg_ingress = txmsg_redir_skb = 0; txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; + skb_use_parser = 0; } static int test_start_subtest(const struct _test *t, struct sockmap_options *o) @@ -1211,6 +1213,11 @@ run: } } + if (skb_use_parser) { + i = 2; + err = bpf_map_update_elem(map_fd[7], &i, &skb_use_parser, BPF_ANY); + } + if (txmsg_drop) options->drop_expected = true; @@ -1650,6 +1657,16 @@ static void test_txmsg_cork(int cgrp, struct sockmap_options *opt) test_send(opt, cgrp); } +static void test_txmsg_ingress_parser(int cgrp, struct sockmap_options *opt) +{ + txmsg_pass = 1; + skb_use_parser = 512; + opt->iov_length = 256; + opt->iov_count = 1; + opt->rate = 2; + test_exec(cgrp, opt); +} + char *map_names[] = { "sock_map", "sock_map_txmsg", @@ -1748,6 +1765,7 @@ struct _test test[] = { {"txmsg test pull-data", test_txmsg_pull}, {"txmsg test pop-data", test_txmsg_pop}, {"txmsg test push/pop data", test_txmsg_push_pop}, + {"txmsg text ingress parser", test_txmsg_ingress_parser}, }; static int check_whitelist(struct _test *t, struct sockmap_options *opt) -- cgit v1.2.3 From 9b14d1f8a76682124c5e465196685a9833ff526e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Sun, 28 Jun 2020 15:45:16 +0200 Subject: bridge: mrp: Fix endian conversion and some other warnings The following sparse warnings are fixed: net/bridge/br_mrp.c:106:18: warning: incorrect type in assignment (different base types) net/bridge/br_mrp.c:106:18: expected unsigned short [usertype] net/bridge/br_mrp.c:106:18: got restricted __be16 [usertype] net/bridge/br_mrp.c:281:23: warning: incorrect type in argument 1 (different modifiers) net/bridge/br_mrp.c:281:23: expected struct list_head *entry net/bridge/br_mrp.c:281:23: got struct list_head [noderef] * net/bridge/br_mrp.c:332:28: warning: incorrect type in argument 1 (different modifiers) net/bridge/br_mrp.c:332:28: expected struct list_head *new net/bridge/br_mrp.c:332:28: got struct list_head [noderef] * net/bridge/br_mrp.c:332:40: warning: incorrect type in argument 2 (different modifiers) net/bridge/br_mrp.c:332:40: expected struct list_head *head net/bridge/br_mrp.c:332:40: got struct list_head [noderef] * net/bridge/br_mrp.c:682:29: warning: incorrect type in argument 1 (different modifiers) net/bridge/br_mrp.c:682:29: expected struct list_head const *head net/bridge/br_mrp.c:682:29: got struct list_head [noderef] * Reported-by: kernel test robot Fixes: 2f1a11ae11d222 ("bridge: mrp: Add MRP interface.") Fixes: 4b8d7d4c599182 ("bridge: mrp: Extend bridge interface") Fixes: 9a9f26e8f7ea30 ("bridge: mrp: Connect MRP API with the switchdev API") Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mrp.c | 2 +- net/bridge/br_private.h | 2 +- net/bridge/br_private_mrp.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 779e1eb75443..90592af9db61 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -86,7 +86,7 @@ static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p, { struct ethhdr *eth_hdr; struct sk_buff *skb; - u16 *version; + __be16 *version; skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH); if (!skb) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2130fe0194e6..e0ea6dbbc97e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -430,7 +430,7 @@ struct net_bridge { struct hlist_head fdb_list; #if IS_ENABLED(CONFIG_BRIDGE_MRP) - struct list_head __rcu mrp_list; + struct list_head mrp_list; #endif }; diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h index 33b255e38ffe..315eb37d89f0 100644 --- a/net/bridge/br_private_mrp.h +++ b/net/bridge/br_private_mrp.h @@ -8,7 +8,7 @@ struct br_mrp { /* list of mrp instances */ - struct list_head __rcu list; + struct list_head list; struct net_bridge_port __rcu *p_port; struct net_bridge_port __rcu *s_port; -- cgit v1.2.3 From 7dea927f702df030c02bd0c9e6e320a8315e3efa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Jun 2020 17:49:35 +0300 Subject: lib: packing: add documentation for pbuflen argument Fixes sparse warning: Function parameter or member 'pbuflen' not described in 'packing' Fixes: 554aae35007e ("lib: Add support for generic packing operations") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- lib/packing.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/packing.c b/lib/packing.c index 50d1e9f2f5a7..6ed72dccfdb5 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -73,6 +73,7 @@ static void adjust_for_msb_right_quirk(u64 *to_write, int *box_start_bit, * @endbit: The index (in logical notation, compensated for quirks) where * the packed value ends within pbuf. Must be smaller than, or equal * to, startbit. + * @pbuflen: The length in bytes of the packed buffer pointed to by @pbuf. * @op: If PACK, then uval will be treated as const pointer and copied (packed) * into pbuf, between startbit and endbit. * If UNPACK, then pbuf will be treated as const pointer and the logical -- cgit v1.2.3 From be74294ffa24f5fbc0d6643842e3e095447e17a2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 26 Jun 2020 11:24:22 -0700 Subject: net: get rid of lockdep_set_class_and_subclass() lockdep_set_class_and_subclass() is meant to reduce the _nested() annotations by assigning a default subclass. For addr_list_lock, we have to compute the subclass at run-time as the netdevice topology changes after creation. So, we should just get rid of these lockdep_set_class_and_subclass() and stick with our _nested() annotations. Fixes: 845e0ebb4408 ("net: change addr_list_lock back to static key") Suggested-by: Taehee Yoo Cc: Dmitry Vyukov Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/macsec.c | 5 ++--- drivers/net/macvlan.c | 5 ++--- net/8021q/vlan_dev.c | 9 ++++----- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index e56547bfdac9..9159846b8b93 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4052,9 +4052,8 @@ static int macsec_newlink(struct net *net, struct net_device *dev, return err; netdev_lockdep_set_classes(dev); - lockdep_set_class_and_subclass(&dev->addr_list_lock, - &macsec_netdev_addr_lock_key, - dev->lower_level); + lockdep_set_class(&dev->addr_list_lock, + &macsec_netdev_addr_lock_key); err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 6a6cc9f75307..4942f6112e51 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -880,9 +880,8 @@ static struct lock_class_key macvlan_netdev_addr_lock_key; static void macvlan_set_lockdep_class(struct net_device *dev) { netdev_lockdep_set_classes(dev); - lockdep_set_class_and_subclass(&dev->addr_list_lock, - &macvlan_netdev_addr_lock_key, - dev->lower_level); + lockdep_set_class(&dev->addr_list_lock, + &macvlan_netdev_addr_lock_key); } static int macvlan_init(struct net_device *dev) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index c8d6a07e23c5..3dd7c972677b 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -503,11 +503,10 @@ static void vlan_dev_set_lockdep_one(struct net_device *dev, lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); } -static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) +static void vlan_dev_set_lockdep_class(struct net_device *dev) { - lockdep_set_class_and_subclass(&dev->addr_list_lock, - &vlan_netdev_addr_lock_key, - subclass); + lockdep_set_class(&dev->addr_list_lock, + &vlan_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); } @@ -601,7 +600,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev, dev->lower_level); + vlan_dev_set_lockdep_class(dev); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) -- cgit v1.2.3 From e8280338c778a3f81477624267c9fa47f931477b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 26 Jun 2020 11:25:27 -0700 Subject: net: explain the lockdep annotations for dev_uc_unsync() The lockdep annotations for dev_uc_unsync() and dev_mc_unsync() are not easy to understand, so add some comments to explain why they are correct. Similar for the rest netif_addr_lock_bh() cases, they don't need nested version. Cc: Taehee Yoo Cc: Dmitry Vyukov Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6393ba930097..54cd568e7c2f 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -690,6 +690,15 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return; + /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two + * reasons: + * 1) This is always called without any addr_list_lock, so as the + * outermost one here, it must be 0. + * 2) This is called by some callers after unlinking the upper device, + * so the dev->lower_level becomes 1 again. + * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or + * larger. + */ netif_addr_lock_bh(from); netif_addr_lock_nested(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); @@ -911,6 +920,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return; + /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); netif_addr_lock_nested(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); -- cgit v1.2.3 From a9b1110162357689a34992d5c925852948e5b9fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Jun 2020 13:31:50 -0700 Subject: llc: make sure applications use ARPHRD_ETHER syzbot was to trigger a bug by tricking AF_LLC with non sensible addr->sllc_arphrd It seems clear LLC requires an Ethernet device. Back in commit abf9d537fea2 ("llc: add support for SO_BINDTODEVICE") Octavian Purdila added possibility for application to use a zero value for sllc_arphrd, convert it to ARPHRD_ETHER to not cause regressions on existing applications. BUG: KASAN: use-after-free in __read_once_size include/linux/compiler.h:199 [inline] BUG: KASAN: use-after-free in list_empty include/linux/list.h:268 [inline] BUG: KASAN: use-after-free in waitqueue_active include/linux/wait.h:126 [inline] BUG: KASAN: use-after-free in wq_has_sleeper include/linux/wait.h:160 [inline] BUG: KASAN: use-after-free in skwq_has_sleeper include/net/sock.h:2092 [inline] BUG: KASAN: use-after-free in sock_def_write_space+0x642/0x670 net/core/sock.c:2813 Read of size 8 at addr ffff88801e0b4078 by task ksoftirqd/3/27 CPU: 3 PID: 27 Comm: ksoftirqd/3 Not tainted 5.5.0-rc1-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x41 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:639 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:135 __read_once_size include/linux/compiler.h:199 [inline] list_empty include/linux/list.h:268 [inline] waitqueue_active include/linux/wait.h:126 [inline] wq_has_sleeper include/linux/wait.h:160 [inline] skwq_has_sleeper include/net/sock.h:2092 [inline] sock_def_write_space+0x642/0x670 net/core/sock.c:2813 sock_wfree+0x1e1/0x260 net/core/sock.c:1958 skb_release_head_state+0xeb/0x260 net/core/skbuff.c:652 skb_release_all+0x16/0x60 net/core/skbuff.c:663 __kfree_skb net/core/skbuff.c:679 [inline] consume_skb net/core/skbuff.c:838 [inline] consume_skb+0xfb/0x410 net/core/skbuff.c:832 __dev_kfree_skb_any+0xa4/0xd0 net/core/dev.c:2967 dev_kfree_skb_any include/linux/netdevice.h:3650 [inline] e1000_unmap_and_free_tx_resource.isra.0+0x21b/0x3a0 drivers/net/ethernet/intel/e1000/e1000_main.c:1963 e1000_clean_tx_irq drivers/net/ethernet/intel/e1000/e1000_main.c:3854 [inline] e1000_clean+0x4cc/0x1d10 drivers/net/ethernet/intel/e1000/e1000_main.c:3796 napi_poll net/core/dev.c:6532 [inline] net_rx_action+0x508/0x1120 net/core/dev.c:6600 __do_softirq+0x262/0x98c kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:603 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:595 smpboot_thread_fn+0x6a3/0xa40 kernel/smpboot.c:165 kthread+0x361/0x430 kernel/kthread.c:255 ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352 Allocated by task 8247: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] __kasan_kmalloc mm/kasan/common.c:513 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:486 kasan_slab_alloc+0xf/0x20 mm/kasan/common.c:521 slab_post_alloc_hook mm/slab.h:584 [inline] slab_alloc mm/slab.c:3320 [inline] kmem_cache_alloc+0x121/0x710 mm/slab.c:3484 sock_alloc_inode+0x1c/0x1d0 net/socket.c:240 alloc_inode+0x68/0x1e0 fs/inode.c:230 new_inode_pseudo+0x19/0xf0 fs/inode.c:919 sock_alloc+0x41/0x270 net/socket.c:560 __sock_create+0xc2/0x730 net/socket.c:1384 sock_create net/socket.c:1471 [inline] __sys_socket+0x103/0x220 net/socket.c:1513 __do_sys_socket net/socket.c:1522 [inline] __se_sys_socket net/socket.c:1520 [inline] __ia32_sys_socket+0x73/0xb0 net/socket.c:1520 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 Freed by task 17: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] kasan_set_free_info mm/kasan/common.c:335 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:474 kasan_slab_free+0xe/0x10 mm/kasan/common.c:483 __cache_free mm/slab.c:3426 [inline] kmem_cache_free+0x86/0x320 mm/slab.c:3694 sock_free_inode+0x20/0x30 net/socket.c:261 i_callback+0x44/0x80 fs/inode.c:219 __rcu_reclaim kernel/rcu/rcu.h:222 [inline] rcu_do_batch kernel/rcu/tree.c:2183 [inline] rcu_core+0x570/0x1540 kernel/rcu/tree.c:2408 rcu_core_si+0x9/0x10 kernel/rcu/tree.c:2417 __do_softirq+0x262/0x98c kernel/softirq.c:292 The buggy address belongs to the object at ffff88801e0b4000 which belongs to the cache sock_inode_cache of size 1152 The buggy address is located 120 bytes inside of 1152-byte region [ffff88801e0b4000, ffff88801e0b4480) The buggy address belongs to the page: page:ffffea0000782d00 refcount:1 mapcount:0 mapping:ffff88807aa59c40 index:0xffff88801e0b4ffd raw: 00fffe0000000200 ffffea00008e6c88 ffffea0000782d48 ffff88807aa59c40 raw: ffff88801e0b4ffd ffff88801e0b4000 0000000100000003 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88801e0b3f00: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff88801e0b3f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88801e0b4000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88801e0b4080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801e0b4100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: abf9d537fea2 ("llc: add support for SO_BINDTODEVICE") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/llc/af_llc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 54fb8d452a7b..6e53e43c1907 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -273,6 +273,10 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) if (!sock_flag(sk, SOCK_ZAPPED)) goto out; + if (!addr->sllc_arphrd) + addr->sllc_arphrd = ARPHRD_ETHER; + if (addr->sllc_arphrd != ARPHRD_ETHER) + goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); @@ -328,7 +332,9 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; - if (unlikely(addr->sllc_family != AF_LLC)) + if (!addr->sllc_arphrd) + addr->sllc_arphrd = ARPHRD_ETHER; + if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER)) goto out; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); rc = -ENODEV; @@ -336,8 +342,6 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) if (sk->sk_bound_dev_if) { llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (llc->dev) { - if (!addr->sllc_arphrd) - addr->sllc_arphrd = llc->dev->type; if (is_zero_ether_addr(addr->sllc_mac)) memcpy(addr->sllc_mac, llc->dev->dev_addr, IFHWADDRLEN); -- cgit v1.2.3 From bf64ff4c2aac65d680dc639a511c781cf6b6ec08 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 27 Jun 2020 00:12:24 -0700 Subject: genetlink: get rid of family->attrbuf genl_family_rcv_msg_attrs_parse() reuses the global family->attrbuf when family->parallel_ops is false. However, family->attrbuf is not protected by any lock on the genl_family_rcv_msg_doit() code path. This leads to several different consequences, one of them is UAF, like the following: genl_family_rcv_msg_doit(): genl_start(): genl_family_rcv_msg_attrs_parse() attrbuf = family->attrbuf __nlmsg_parse(attrbuf); genl_family_rcv_msg_attrs_parse() attrbuf = family->attrbuf __nlmsg_parse(attrbuf); info->attrs = attrs; cb->data = info; netlink_unicast_kernel(): consume_skb() genl_lock_dumpit(): genl_dumpit_info(cb)->attrs Note family->attrbuf is an array of pointers to the skb data, once the skb is freed, any dereference of family->attrbuf will be a UAF. Maybe we could serialize the family->attrbuf with genl_mutex too, but that would make the locking more complicated. Instead, we can just get rid of family->attrbuf and always allocate attrbuf from heap like the family->parallel_ops==true code path. This may add some performance overhead but comparing with taking the global genl_mutex, it still looks better. Fixes: 75cdbdd08900 ("net: ieee802154: have genetlink code to parse the attrs during dumpit") Fixes: 057af7071344 ("net: tipc: have genetlink code to parse the attrs during dumpit") Reported-and-tested-by: syzbot+3039ddf6d7b13daf3787@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+80cad1e3cb4c41cde6ff@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+736bcbcb11b60d0c0792@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+520f8704db2b68091d44@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+c96e4dfb32f8987fdeed@syzkaller.appspotmail.com Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 -- net/netlink/genetlink.c | 48 +++++++++++++----------------------------------- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 74950663bb00..ad71ed4f55ff 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -41,7 +41,6 @@ struct genl_info; * Note that unbind() will not be called symmetrically if the * generic netlink family is removed while there are still open * sockets. - * @attrbuf: buffer to store parsed attributes (private) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -66,7 +65,6 @@ struct genl_family { struct genl_info *info); int (*mcast_bind)(struct net *net, int group); void (*mcast_unbind)(struct net *net, int group); - struct nlattr ** attrbuf; /* private */ const struct genl_ops * ops; const struct genl_multicast_group *mcgrps; unsigned int n_ops; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 55ee680e9db1..a914b9365a46 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -351,22 +351,11 @@ int genl_register_family(struct genl_family *family) start = end = GENL_ID_VFS_DQUOT; } - if (family->maxattr && !family->parallel_ops) { - family->attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), - GFP_KERNEL); - if (family->attrbuf == NULL) { - err = -ENOMEM; - goto errout_locked; - } - } else - family->attrbuf = NULL; - family->id = idr_alloc_cyclic(&genl_fam_idr, family, start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; - goto errout_free; + goto errout_locked; } err = genl_validate_assign_mc_groups(family); @@ -385,8 +374,6 @@ int genl_register_family(struct genl_family *family) errout_remove: idr_remove(&genl_fam_idr, family->id); -errout_free: - kfree(family->attrbuf); errout_locked: genl_unlock_all(); return err; @@ -419,8 +406,6 @@ int genl_unregister_family(const struct genl_family *family) atomic_read(&genl_sk_destructing_cnt) == 0); genl_unlock(); - kfree(family->attrbuf); - genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); return 0; @@ -485,30 +470,23 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, if (!family->maxattr) return NULL; - if (family->parallel_ops) { - attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), GFP_KERNEL); - if (!attrbuf) - return ERR_PTR(-ENOMEM); - } else { - attrbuf = family->attrbuf; - } + attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) + return ERR_PTR(-ENOMEM); err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, family->policy, validate, extack); if (err) { - if (family->parallel_ops) - kfree(attrbuf); + kfree(attrbuf); return ERR_PTR(err); } return attrbuf; } -static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, - struct nlattr **attrbuf) +static void genl_family_rcv_msg_attrs_free(struct nlattr **attrbuf) { - if (family->parallel_ops) - kfree(attrbuf); + kfree(attrbuf); } struct genl_start_context { @@ -542,7 +520,7 @@ static int genl_start(struct netlink_callback *cb) no_attrs: info = genl_dumpit_info_alloc(); if (!info) { - genl_family_rcv_msg_attrs_free(ctx->family, attrs); + genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } info->family = ctx->family; @@ -559,7 +537,7 @@ no_attrs: } if (rc) { - genl_family_rcv_msg_attrs_free(info->family, info->attrs); + genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); cb->data = NULL; } @@ -588,7 +566,7 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } - genl_family_rcv_msg_attrs_free(info->family, info->attrs); + genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); return rc; } @@ -601,7 +579,7 @@ static int genl_parallel_done(struct netlink_callback *cb) if (ops->done) rc = ops->done(cb); - genl_family_rcv_msg_attrs_free(info->family, info->attrs); + genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); return rc; } @@ -694,7 +672,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - genl_family_rcv_msg_attrs_free(family, attrbuf); + genl_family_rcv_msg_attrs_free(attrbuf); return err; } -- cgit v1.2.3 From 3aa91625007807bfca4155df1867a5c924a08662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:56 +0200 Subject: dma-mapping: Add a new dma_need_sync API Add a new API to check if calls to dma_sync_single_for_{device,cpu} are required for a given DMA streaming mapping. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-2-hch@lst.de --- Documentation/core-api/dma-api.rst | 8 ++++++++ include/linux/dma-direct.h | 1 + include/linux/dma-mapping.h | 5 +++++ kernel/dma/direct.c | 6 ++++++ kernel/dma/mapping.c | 10 ++++++++++ 5 files changed, 30 insertions(+) diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 2d8d2fed7317..f41620439ef3 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -204,6 +204,14 @@ Returns the maximum size of a mapping for the device. The size parameter of the mapping functions like dma_map_single(), dma_map_page() and others should not be larger than the returned value. +:: + + bool + dma_need_sync(struct device *dev, dma_addr_t dma_addr); + +Returns %true if dma_sync_single_for_{device,cpu} calls are required to +transfer memory ownership. Returns %false if those calls can be skipped. + :: unsigned long diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 136f984df0d9..8b006730687b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -87,4 +87,5 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 78f677cf45ab..a33ed3954ed4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -461,6 +461,7 @@ int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, @@ -571,6 +572,10 @@ static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return false; +} static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0a4881e59aa7..ecb922a0bfa0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -530,3 +530,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..a8c18c9a796f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -397,6 +397,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.2.3 From 91d5b70273267bbae6f5d1fb4cf3510bd31ef9ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:57 +0200 Subject: xsk: Replace the cheap_dma flag with a dma_need_sync flag Invert the polarity and better name the flag so that the use case is properly documented. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-3-hch@lst.de --- include/net/xsk_buff_pool.h | 6 +++--- net/xdp/xsk_buff_pool.c | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index a4ff226505c9..6842990e2712 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -40,7 +40,7 @@ struct xsk_buff_pool { u32 headroom; u32 chunk_size; u32 frame_len; - bool cheap_dma; + bool dma_need_sync; bool unaligned; void *addrs; struct device *dev; @@ -80,7 +80,7 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - if (xskb->pool->cheap_dma) + if (!xskb->pool->dma_need_sync) return; xp_dma_sync_for_cpu_slow(xskb); @@ -91,7 +91,7 @@ void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - if (pool->cheap_dma) + if (!pool->dma_need_sync) return; xp_dma_sync_for_device_slow(pool, dma, size); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 540ed75e4482..9fe84c797a70 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -55,7 +55,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, pool->free_heads_cnt = chunks; pool->headroom = headroom; pool->chunk_size = chunk_size; - pool->cheap_dma = true; pool->unaligned = unaligned; pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; INIT_LIST_HEAD(&pool->free_list); @@ -195,7 +194,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, xp_check_dma_contiguity(pool); pool->dev = dev; - pool->cheap_dma = xp_check_cheap_dma(pool); + pool->dma_need_sync = !xp_check_cheap_dma(pool); return 0; } EXPORT_SYMBOL(xp_dma_map); @@ -280,7 +279,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; xskb->xdp.data_meta = xskb->xdp.data; - if (!pool->cheap_dma) { + if (pool->dma_need_sync) { dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, pool->frame_len, DMA_BIDIRECTIONAL); -- cgit v1.2.3 From 53937ff7bc776aac647d0b3004d7cd21861b0f78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:58 +0200 Subject: xsk: Remove a double pool->dev assignment in xp_dma_map ->dev is already assigned at the top of the function, remove the duplicate one at the end. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-4-hch@lst.de --- net/xdp/xsk_buff_pool.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 9fe84c797a70..6733e2c59e48 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -193,7 +193,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (pool->unaligned) xp_check_dma_contiguity(pool); - pool->dev = dev; pool->dma_need_sync = !xp_check_cheap_dma(pool); return 0; } -- cgit v1.2.3 From 7e0245753f1794f17de472dcf4694fa5ed527384 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:59 +0200 Subject: xsk: Use dma_need_sync instead of reimplenting it Use the dma_need_sync helper instead of (not always entirely correctly) poking into the dma-mapping internals. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-5-hch@lst.de --- net/xdp/xsk_buff_pool.c | 50 +++---------------------------------------------- 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 6733e2c59e48..08b80669f649 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -2,9 +2,6 @@ #include #include -#include -#include -#include #include "xsk_queue.h" @@ -124,48 +121,6 @@ static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) } } -static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) -{ -#if defined(CONFIG_SWIOTLB) - phys_addr_t paddr; - u32 i; - - for (i = 0; i < pool->dma_pages_cnt; i++) { - paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); - if (is_swiotlb_buffer(paddr)) - return false; - } -#endif - return true; -} - -static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) -{ -#if defined(CONFIG_HAS_DMA) - const struct dma_map_ops *ops = get_dma_ops(pool->dev); - - if (ops) { - return !ops->sync_single_for_cpu && - !ops->sync_single_for_device; - } - - if (!dma_is_direct(ops)) - return false; - - if (!xp_check_swiotlb_dma(pool)) - return false; - - if (!dev_is_dma_coherent(pool->dev)) { -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) - return false; -#endif - } -#endif - return true; -} - int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages) { @@ -179,6 +134,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, pool->dev = dev; pool->dma_pages_cnt = nr_pages; + pool->dma_need_sync = false; for (i = 0; i < pool->dma_pages_cnt; i++) { dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, @@ -187,13 +143,13 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, xp_dma_unmap(pool, attrs); return -ENOMEM; } + if (dma_need_sync(dev, dma)) + pool->dma_need_sync = true; pool->dma_pages[i] = dma; } if (pool->unaligned) xp_check_dma_contiguity(pool); - - pool->dma_need_sync = !xp_check_cheap_dma(pool); return 0; } EXPORT_SYMBOL(xp_dma_map); -- cgit v1.2.3 From 517bbe1994a3cee29a35c730662277bb5daff582 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jun 2020 23:15:00 -0700 Subject: bpf: Enforce BPF ringbuf size to be the power of 2 BPF ringbuf assumes the size to be a multiple of page size and the power of 2 value. The latter is important to avoid division while calculating position inside the ring buffer and using (N-1) mask instead. This patch fixes omission to enforce power-of-2 size rule. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200630061500.1804799-1-andriin@fb.com --- kernel/bpf/ringbuf.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 180414bb0d3e..0af88bbc1c15 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -132,15 +132,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; - if (!data_sz || !PAGE_ALIGNED(data_sz)) - return ERR_PTR(-EINVAL); - -#ifdef CONFIG_64BIT - /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ - if (data_sz > RINGBUF_MAX_DATA_SZ) - return ERR_PTR(-E2BIG); -#endif - rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) return ERR_PTR(-ENOMEM); @@ -166,9 +157,16 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-EINVAL); if (attr->key_size || attr->value_size || - attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries)) + !is_power_of_2(attr->max_entries) || + !PAGE_ALIGNED(attr->max_entries)) return ERR_PTR(-EINVAL); +#ifdef CONFIG_64BIT + /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ + if (attr->max_entries > RINGBUF_MAX_DATA_SZ) + return ERR_PTR(-E2BIG); +#endif + rb_map = kzalloc(sizeof(*rb_map), GFP_USER); if (!rb_map) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From c4e8fa9074ad94f80e5c0dcaa16b313e50e958c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Jun 2020 17:04:17 -0700 Subject: netfilter: ipset: call ip_set_free() instead of kfree() Whenever ip_set_alloc() is used, allocated memory can either use kmalloc() or vmalloc(). We should call kvfree() or ip_set_free() invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 21935 Comm: syz-executor.3 Not tainted 5.8.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__phys_addr+0xa7/0x110 arch/x86/mm/physaddr.c:28 Code: 1d 7a 09 4c 89 e3 31 ff 48 d3 eb 48 89 de e8 d0 58 3f 00 48 85 db 75 0d e8 26 5c 3f 00 4c 89 e0 5b 5d 41 5c c3 e8 19 5c 3f 00 <0f> 0b e8 12 5c 3f 00 48 c7 c0 10 10 a8 89 48 ba 00 00 00 00 00 fc RSP: 0000:ffffc900018572c0 EFLAGS: 00010046 RAX: 0000000000040000 RBX: 0000000000000001 RCX: ffffc9000fac3000 RDX: 0000000000040000 RSI: ffffffff8133f437 RDI: 0000000000000007 RBP: ffffc90098aff000 R08: 0000000000000000 R09: ffff8880ae636cdb R10: 0000000000000000 R11: 0000000000000000 R12: 0000408018aff000 R13: 0000000000080000 R14: 000000000000001d R15: ffffc900018573d8 FS: 00007fc540c66700(0000) GS:ffff8880ae600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc9dcd67200 CR3: 0000000059411000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: virt_to_head_page include/linux/mm.h:841 [inline] virt_to_cache mm/slab.h:474 [inline] kfree+0x77/0x2c0 mm/slab.c:3749 hash_net_create+0xbb2/0xd70 net/netfilter/ipset/ip_set_hash_gen.h:1536 ip_set_create+0x6a2/0x13c0 net/netfilter/ipset/ip_set_core.c:1128 nfnetlink_rcv_msg+0xbe8/0xea0 net/netfilter/nfnetlink.c:230 netlink_rcv_skb+0x15a/0x430 net/netlink/af_netlink.c:2469 nfnetlink_rcv+0x1ac/0x420 net/netfilter/nfnetlink.c:564 netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1329 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1918 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2352 ___sys_sendmsg+0xf3/0x170 net/socket.c:2406 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2439 do_syscall_64+0x60/0xe0 arch/x86/entry/common.c:359 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45cb19 Code: Bad RIP value. RSP: 002b:00007fc540c65c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004fed80 RCX: 000000000045cb19 RDX: 0000000000000000 RSI: 0000000020001080 RDI: 0000000000000003 RBP: 000000000078bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 000000000000095e R14: 00000000004cc295 R15: 00007fc540c666d4 Fixes: f66ee0410b1c ("netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports") Fixes: 03c8b234e61a ("netfilter: ipset: Generalize extensions support") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_bitmap_ip.c | 2 +- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 2 +- net/netfilter/ipset/ip_set_bitmap_port.c | 2 +- net/netfilter/ipset/ip_set_hash_gen.h | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 486959f70cf3..a8ce04a4bb72 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -326,7 +326,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[], set->variant = &bitmap_ip; if (!init_map_ip(set, map, first_ip, last_ip, elements, hosts, netmask)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 2310a316e0af..2c625e0f49ec 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -363,7 +363,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index e56ced66f202..7138e080def4 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -274,7 +274,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[], map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_port; if (!init_map_port(set, map, first_port, last_port)) { - kfree(map); + ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 1ee43752d6d3..521e970be402 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -682,7 +682,7 @@ retry: } t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); if (!t->hregion) { - kfree(t); + ip_set_free(t); ret = -ENOMEM; goto out; } @@ -1533,7 +1533,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, } t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); if (!t->hregion) { - kfree(t); + ip_set_free(t); kfree(h); return -ENOMEM; } -- cgit v1.2.3 From 3b7016996c4c44db5d499d98759b82fb714bb912 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:54 +0200 Subject: flow_dissector: Pull BPF program assignment up to bpf-netns Prepare for using bpf_prog_array to store attached programs by moving out code that updates the attached program out of flow dissector. Managing bpf_prog_array is more involved than updating a single bpf_prog pointer. This will let us do it all from one place, bpf/net_namespace.c, in the subsequent patch. No functional change intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-2-jakub@cloudflare.com --- include/net/flow_dissector.h | 3 ++- kernel/bpf/net_namespace.c | 20 ++++++++++++++++++-- net/core/flow_dissector.c | 13 ++----------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index a7eba43fe4e4..4b6e36288ddd 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -372,7 +372,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, } #ifdef CONFIG_BPF_SYSCALL -int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog); +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 78cf061f8179..b951dab2687f 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -189,6 +189,7 @@ int netns_bpf_prog_query(const union bpf_attr *attr, int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { enum netns_bpf_attach_type type; + struct bpf_prog *attached; struct net *net; int ret; @@ -207,12 +208,26 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - ret = flow_dissector_bpf_prog_attach(net, prog); + ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } + if (ret) + goto out_unlock; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out_unlock; + } + rcu_assign_pointer(net->bpf.progs[type], prog); + if (attached) + bpf_prog_put(attached); + out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -277,7 +292,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - err = flow_dissector_bpf_prog_attach(net, link->prog); + err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; default: err = -EINVAL; @@ -286,6 +301,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; + rcu_assign_pointer(net->bpf.progs[type], link->prog); net->bpf.links[type] = link; out_unlock: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d02df0b6d0d9..b57fb1359395 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -70,10 +70,10 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL -int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; - struct bpf_prog *attached; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -97,15 +97,6 @@ int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) return -EEXIST; } - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (attached == prog) - /* The same program cannot be attached twice */ - return -EINVAL; - - rcu_assign_pointer(net->bpf.progs[type], prog); - if (attached) - bpf_prog_put(attached); return 0; } #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 695c12147a40181fe9221d321c3f2de33c9574ed Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:55 +0200 Subject: bpf, netns: Keep attached programs in bpf_prog_array Prepare for having multi-prog attachments for new netns attach types by storing programs to run in a bpf_prog_array, which is well suited for iterating over programs and running them in sequence. After this change bpf(PROG_QUERY) may block to allocate memory in bpf_prog_array_copy_to_user() for collected program IDs. This forces a change in how we protect access to the attached program in the query callback. Because bpf_prog_array_copy_to_user() can sleep, we switch from an RCU read lock to holding a mutex that serializes updaters. Because we allow only one BPF flow_dissector program to be attached to netns at all times, the bpf_prog_array pointed by net->bpf.run_array is always either detached (null) or one element long. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200625141357.910330-3-jakub@cloudflare.com --- include/net/netns/bpf.h | 5 +- kernel/bpf/net_namespace.c | 120 +++++++++++++++++++++++++++++++-------------- net/core/flow_dissector.c | 19 +++---- 3 files changed, 96 insertions(+), 48 deletions(-) diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a8dce2a380c8..a5015bda9979 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -9,9 +9,12 @@ #include struct bpf_prog; +struct bpf_prog_array; struct netns_bpf { - struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; + /* Array of programs to run compiled from progs or links */ + struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; + struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE]; struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; }; diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b951dab2687f..0dba97202357 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -33,6 +33,17 @@ static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) net_link->net = NULL; } +/* Must be called with netns_bpf_mutex held. */ +static void netns_bpf_run_array_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog_array *run_array; + + run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); +} + static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -54,8 +65,8 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; + netns_bpf_run_array_detach(net, type); net->bpf.links[type] = NULL; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -76,6 +87,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *run_array; struct net *net; int ret = 0; @@ -93,8 +105,11 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, goto out_unlock; } + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + WRITE_ONCE(run_array->items[0].prog, new_prog); + old_prog = xchg(&link->prog, new_prog); - rcu_assign_pointer(net->bpf.progs[type], new_prog); bpf_prog_put(old_prog); out_unlock: @@ -142,14 +157,38 @@ static const struct bpf_link_ops bpf_netns_link_ops = { .show_fdinfo = bpf_netns_link_show_fdinfo, }; +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct net *net, + enum netns_bpf_attach_type type) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array *run_array; + u32 prog_cnt = 0, flags = 0; + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) + prog_cnt = bpf_prog_array_length(run_array); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + return bpf_prog_array_copy_to_user(run_array, prog_ids, + attr->query.prog_cnt); +} + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; enum netns_bpf_attach_type type; - struct bpf_prog *attached; struct net *net; + int ret; if (attr->query.query_flags) return -EINVAL; @@ -162,32 +201,17 @@ int netns_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(net)) return PTR_ERR(net); - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_query(attr, uattr, net, type); + mutex_unlock(&netns_bpf_mutex); put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; + return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_prog_array *run_array; enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; @@ -217,14 +241,28 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (ret) goto out_unlock; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); + attached = net->bpf.progs[type]; if (attached == prog) { /* The same program cannot be attached twice */ ret = -EINVAL; goto out_unlock; } - rcu_assign_pointer(net->bpf.progs[type], prog); + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) { + WRITE_ONCE(run_array->items[0].prog, prog); + } else { + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + ret = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + } + + net->bpf.progs[type] = prog; if (attached) bpf_prog_put(attached); @@ -244,11 +282,11 @@ static int __netns_bpf_prog_detach(struct net *net, if (net->bpf.links[type]) return -EINVAL; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); + attached = net->bpf.progs[type]; if (!attached) return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); + netns_bpf_run_array_detach(net, type); + net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } @@ -272,7 +310,7 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { - struct bpf_prog *prog; + struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); @@ -283,9 +321,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, goto out_unlock; } /* Links are not compatible with attaching prog directly */ - prog = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (prog) { + if (net->bpf.progs[type]) { err = -EEXIST; goto out_unlock; } @@ -301,7 +337,14 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - rcu_assign_pointer(net->bpf.progs[type], link->prog); + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + err = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = link->prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + net->bpf.links[type] = link; out_unlock: @@ -368,11 +411,12 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + netns_bpf_run_array_detach(net, type); link = net->bpf.links[type]; if (link) bpf_netns_link_auto_detach(link); - else - __netns_bpf_prog_detach(net, type); + else if (net->bpf.progs[type]) + bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b57fb1359395..142a8824f0a8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -86,14 +86,14 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->bpf.progs[type])) + if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.bpf.progs[type])) + if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } @@ -894,7 +894,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; - struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; @@ -951,14 +950,14 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + struct bpf_prog_array *run_array; rcu_read_lock(); - attached = rcu_dereference(init_net.bpf.progs[type]); + run_array = rcu_dereference(init_net.bpf.run_array[type]); + if (!run_array) + run_array = rcu_dereference(net->bpf.run_array[type]); - if (!attached) - attached = rcu_dereference(net->bpf.progs[type]); - - if (attached) { + if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, @@ -966,6 +965,7 @@ bool __skb_flow_dissect(const struct net *net, .data_end = data + hlen, }; __be16 n_proto = proto; + struct bpf_prog *prog; if (skb) { ctx.skb = skb; @@ -976,7 +976,8 @@ bool __skb_flow_dissect(const struct net *net, n_proto = skb->protocol; } - ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, + prog = READ_ONCE(run_array->items[0].prog); + ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); -- cgit v1.2.3 From ab53cad90eb10c9991f501ba08904680a074ef3d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:56 +0200 Subject: bpf, netns: Keep a list of attached bpf_link's To support multi-prog link-based attachments for new netns attach types, we need to keep track of more than one bpf_link per attach type. Hence, convert net->bpf.links into a list, that currently can be either empty or have just one item. Instead of reusing bpf_prog_list from bpf-cgroup, we link together bpf_netns_link's themselves. This makes list management simpler as we don't have to allocate, initialize, and later release list elements. We can do this because multi-prog attachment will be available only for bpf_link, and we don't need to build a list of programs attached directly and indirectly via links. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-4-jakub@cloudflare.com --- include/net/netns/bpf.h | 2 +- kernel/bpf/net_namespace.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a5015bda9979..0ca6a1b87185 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -15,7 +15,7 @@ struct netns_bpf { /* Array of programs to run compiled from progs or links */ struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE]; - struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; + struct list_head links[MAX_NETNS_BPF_ATTACH_TYPE]; }; #endif /* __NETNS_BPF_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 0dba97202357..7a34a8caf954 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -19,20 +19,12 @@ struct bpf_netns_link { * with netns_bpf_mutex held. */ struct net *net; + struct list_head node; /* node in list of links attached to net */ }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); -/* Must be called with netns_bpf_mutex held. */ -static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) -{ - struct bpf_netns_link *net_link = - container_of(link, struct bpf_netns_link, link); - - net_link->net = NULL; -} - /* Must be called with netns_bpf_mutex held. */ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) @@ -66,7 +58,7 @@ static void bpf_netns_link_release(struct bpf_link *link) goto out_unlock; netns_bpf_run_array_detach(net, type); - net->bpf.links[type] = NULL; + list_del(&net_link->node); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -225,7 +217,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } @@ -279,7 +271,7 @@ static int __netns_bpf_prog_detach(struct net *net, struct bpf_prog *attached; /* Progs attached via links cannot be detached */ - if (net->bpf.links[type]) + if (!list_empty(&net->bpf.links[type])) return -EINVAL; attached = net->bpf.progs[type]; @@ -310,13 +302,15 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); /* Allow attaching only one prog or link for now */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { err = -E2BIG; goto out_unlock; } @@ -345,7 +339,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, run_array->items[0].prog = link->prog; rcu_assign_pointer(net->bpf.run_array[type], run_array); - net->bpf.links[type] = link; + list_add_tail(&net_link->node, &net->bpf.links[type]); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -404,24 +398,34 @@ out_put_net: return err; } +static int __net_init netns_bpf_pernet_init(struct net *net) +{ + int type; + + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + INIT_LIST_HEAD(&net->bpf.links[type]); + + return 0; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; - struct bpf_link *link; + struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); - link = net->bpf.links[type]; - if (link) - bpf_netns_link_auto_detach(link); - else if (net->bpf.progs[type]) + list_for_each_entry(net_link, &net->bpf.links[type], node) + net_link->net = NULL; /* auto-detach link */ + if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; -- cgit v1.2.3 From 6ebb85c83aaf6ae75b920ef45d2a9eee42079265 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:57 +0200 Subject: selftests/bpf: Test updating flow_dissector link with same program This case, while not particularly useful, is worth covering because we expect the operation to succeed as opposed when re-attaching the same program directly with PROG_ATTACH. While at it, update the tests summary that fell out of sync when tests extended to cover links. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-5-jakub@cloudflare.com --- .../bpf/prog_tests/flow_dissector_reattach.c | 32 +++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c index 15cb554a66d8..a2db3b0f84db 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Test that the flow_dissector program can be updated with a single - * syscall by attaching a new program that replaces the existing one. - * - * Corner case - the same program cannot be attached twice. + * Tests for attaching, detaching, and replacing flow_dissector BPF program. */ #define _GNU_SOURCE @@ -308,6 +305,31 @@ static void test_link_update_replace_old_prog(int netns, int prog1, int prog2) CHECK_FAIL(prog_is_attached(netns)); } +static void test_link_update_same_prog(int netns, int prog1, int prog2) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); + DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts); + int err, link; + + link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts); + if (CHECK_FAIL(link < 0)) { + perror("bpf_link_create(prog1)"); + return; + } + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + /* Expect success updating the prog with the same one */ + update_opts.flags = 0; + update_opts.old_prog_fd = 0; + err = bpf_link_update(link, prog1, &update_opts); + if (CHECK_FAIL(err)) + perror("bpf_link_update"); + CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); + + close(link); + CHECK_FAIL(prog_is_attached(netns)); +} + static void test_link_update_invalid_opts(int netns, int prog1, int prog2) { DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts); @@ -571,6 +593,8 @@ static void run_tests(int netns) test_link_update_no_old_prog }, { "link update with replace old prog", test_link_update_replace_old_prog }, + { "link update with same prog", + test_link_update_same_prog }, { "link update invalid opts", test_link_update_invalid_opts }, { "link update invalid prog", -- cgit v1.2.3 From 1b514239e85965cc4df085180a73dd91733135f7 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:25 +0100 Subject: bpf: flow_dissector: Check value of unused flags to BPF_PROG_ATTACH Using BPF_PROG_ATTACH on a flow dissector program supports neither target_fd, attach_flags or replace_bpf_fd but accepts any value. Enforce that all of them are zero. This is fine for replace_bpf_fd since its presence is indicated by BPF_F_REPLACE. It's more problematic for target_fd, since zero is a valid fd. Should we want to use the flag later on we'd have to add an exception for fd 0. The alternative is to force a value like -1. This requires more changes to tests. There is also precedent for using 0, since bpf_iter uses this for target_fd as well. Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-2-lmb@cloudflare.com --- kernel/bpf/net_namespace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 7a34a8caf954..03045f45afec 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -209,6 +209,9 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) struct net *net; int ret; + if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; -- cgit v1.2.3 From 4ac2add65974e4efafb8d4ccd8fc5660417ea312 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:26 +0100 Subject: bpf: flow_dissector: Check value of unused flags to BPF_PROG_DETACH Using BPF_PROG_DETACH on a flow dissector program supports neither attach_flags nor attach_bpf_fd. Yet no value is enforced for them. Enforce that attach_flags are zero, and require the current program to be passed via attach_bpf_fd. This allows us to remove the check for CAP_SYS_ADMIN, since userspace can now no longer remove arbitrary flow dissector programs. Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-3-lmb@cloudflare.com --- include/linux/bpf-netns.h | 5 +++-- kernel/bpf/net_namespace.c | 19 +++++++++++++++---- kernel/bpf/syscall.c | 4 +--- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 4052d649f36d..47d5b0c708c9 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -33,7 +33,7 @@ int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); -int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog); #else @@ -49,7 +49,8 @@ static inline int netns_bpf_prog_attach(const union bpf_attr *attr, return -EOPNOTSUPP; } -static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +static inline int netns_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) { return -EOPNOTSUPP; } diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 03045f45afec..3dbc29b6f51d 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -269,7 +269,8 @@ out_unlock: /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) + enum netns_bpf_attach_type type, + struct bpf_prog *old) { struct bpf_prog *attached; @@ -278,7 +279,7 @@ static int __netns_bpf_prog_detach(struct net *net, return -EINVAL; attached = net->bpf.progs[type]; - if (!attached) + if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; @@ -286,19 +287,29 @@ static int __netns_bpf_prog_detach(struct net *net, return 0; } -int netns_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; + struct bpf_prog *prog; int ret; + if (attr->target_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); + bpf_prog_put(prog); + return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d946435587d..28c6ef759037 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2897,9 +2897,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - return netns_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr, ptype); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: -- cgit v1.2.3 From 9b2b09717e1812e450782a43ca0c2790651cf380 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:27 +0100 Subject: bpf: sockmap: Check value of unused args to BPF_PROG_ATTACH Using BPF_PROG_ATTACH on a sockmap program currently understands no flags or replace_bpf_fd, but accepts any value. Return EINVAL instead. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-4-lmb@cloudflare.com --- net/core/sock_map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4059f94e9bb5..58016a5c63ff 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -70,6 +70,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) struct fd f; int ret; + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) -- cgit v1.2.3 From bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: bpf: sockmap: Require attach_bpf_fd when detaching a program The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com --- include/linux/bpf.h | 13 +++++++++++-- include/linux/skmsg.h | 13 +++++++++++++ kernel/bpf/syscall.c | 2 +- net/core/sock_map.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 70 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07052d44bca1..9750a1902ee5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1543,13 +1543,16 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, u32 which) + struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { return -EOPNOTSUPP; } @@ -1559,6 +1562,12 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 08674cd14d5a..1e9ed840b9fc 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -430,6 +430,19 @@ static inline void psock_set_prog(struct bpf_prog **pprog, bpf_prog_put(prog); } +static inline int psock_replace_prog(struct bpf_prog **pprog, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + if (cmpxchg(pprog, old, prog) != old) + return -ENOENT; + + if (old) + bpf_prog_put(old); + + return 0; +} + static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 28c6ef759037..a74fce8ce043 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2893,7 +2893,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, ptype); case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 58016a5c63ff..0971f17e8e54 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -77,7 +77,42 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, attr->attach_type); + ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + fdput(f); + return ret; +} + +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + u32 ufd = attr->target_fd; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + prog = bpf_prog_get(attr->attach_bpf_fd); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto put_map; + } + + if (prog->type != ptype) { + ret = -EINVAL; + goto put_prog; + } + + ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +put_prog: + bpf_prog_put(prog); +put_map: fdput(f); return ret; } @@ -1206,27 +1241,32 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) } int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - u32 which) + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - psock_set_prog(&progs->msg_parser, prog); + pprog = &progs->msg_parser; break; case BPF_SK_SKB_STREAM_PARSER: - psock_set_prog(&progs->skb_parser, prog); + pprog = &progs->skb_parser; break; case BPF_SK_SKB_STREAM_VERDICT: - psock_set_prog(&progs->skb_verdict, prog); + pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + if (old) + return psock_replace_prog(pprog, prog, old); + + psock_set_prog(pprog, prog); return 0; } -- cgit v1.2.3 From 0434296c72486881c2a71cd33876e4e6342001b5 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:29 +0100 Subject: selftests: bpf: Pass program and target_fd in flow_dissector_reattach Pass 0 as target_fd when attaching and detaching flow dissector. Additionally, pass the expected program when detaching. Fixes: 1f043f87bb59 ("selftests/bpf: Add tests for attaching bpf_link to netns") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-6-lmb@cloudflare.com --- .../selftests/bpf/prog_tests/flow_dissector_reattach.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c index a2db3b0f84db..172c586b6996 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c @@ -113,7 +113,7 @@ static void test_prog_attach_prog_attach(int netns, int prog1, int prog2) CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2)); out_detach: - err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); + err = bpf_prog_detach2(prog2, 0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(err)) perror("bpf_prog_detach"); CHECK_FAIL(prog_is_attached(netns)); @@ -149,7 +149,7 @@ static void test_prog_attach_link_create(int netns, int prog1, int prog2) DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); int err, link; - err = bpf_prog_attach(prog1, -1, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(err)) { perror("bpf_prog_attach(prog1)"); return; @@ -165,7 +165,7 @@ static void test_prog_attach_link_create(int netns, int prog1, int prog2) close(link); CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); - err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + err = bpf_prog_detach2(prog1, 0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(err)) perror("bpf_prog_detach"); CHECK_FAIL(prog_is_attached(netns)); @@ -185,7 +185,7 @@ static void test_link_create_prog_attach(int netns, int prog1, int prog2) /* Expect failure attaching prog when link exists */ errno = 0; - err = bpf_prog_attach(prog2, -1, BPF_FLOW_DISSECTOR, 0); + err = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0); if (CHECK_FAIL(!err || errno != EEXIST)) perror("bpf_prog_attach(prog2) expected EEXIST"); CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); @@ -208,7 +208,7 @@ static void test_link_create_prog_detach(int netns, int prog1, int prog2) /* Expect failure detaching prog when link exists */ errno = 0; - err = bpf_prog_detach(-1, BPF_FLOW_DISSECTOR); + err = bpf_prog_detach2(prog1, 0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(!err || errno != EINVAL)) perror("bpf_prog_detach expected EINVAL"); CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); @@ -228,7 +228,7 @@ static void test_prog_attach_detach_query(int netns, int prog1, int prog2) } CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1)); - err = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); + err = bpf_prog_detach2(prog1, 0, BPF_FLOW_DISSECTOR); if (CHECK_FAIL(err)) { perror("bpf_prog_detach"); return; -- cgit v1.2.3 From 1a1ad3c20a6fe0e8a4b570fbf835d7cc6e87a9d8 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:30 +0100 Subject: selftests: bpf: Pass program to bpf_prog_detach in flow_dissector Calling bpf_prog_detach is incorrect, since it takes target_fd as its argument. The intention here is to pass it as attach_bpf_fd, so use bpf_prog_detach2 and pass zero for target_fd. Fixes: 06716e04a043 ("selftests/bpf: Extend test_flow_dissector to cover link creation") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-7-lmb@cloudflare.com --- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index ea14e3ece812..f11f187990e9 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -527,8 +527,8 @@ static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd) run_tests_skb_less(tap_fd, skel->maps.last_dissection); - err = bpf_prog_detach(prog_fd, BPF_FLOW_DISSECTOR); - CHECK(err, "bpf_prog_detach", "err %d errno %d\n", err, errno); + err = bpf_prog_detach2(prog_fd, 0, BPF_FLOW_DISSECTOR); + CHECK(err, "bpf_prog_detach2", "err %d errno %d\n", err, errno); } static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd) -- cgit v1.2.3 From 2576f87066dc08a11cb1c05f11d1eaa02148ef9e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 30 Jun 2020 18:45:41 +0200 Subject: bpf, netns: Fix use-after-free in pernet pre_exit callback Iterating over BPF links attached to network namespace in pre_exit hook is not safe, even if there is just one. Once link gets auto-detached, that is its back-pointer to net object is set to NULL, the link can be released and freed without waiting on netns_bpf_mutex, effectively causing the list element we are operating on to be freed. This leads to use-after-free when trying to access the next element on the list, as reported by KASAN. Bug can be triggered by destroying a network namespace, while also releasing a link attached to this network namespace. | ================================================================== | BUG: KASAN: use-after-free in netns_bpf_pernet_pre_exit+0xd9/0x130 | Read of size 8 at addr ffff888119e0d778 by task kworker/u8:2/177 | | CPU: 3 PID: 177 Comm: kworker/u8:2 Not tainted 5.8.0-rc1-00197-ga0c04c9d1008-dirty #776 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: netns cleanup_net | Call Trace: | dump_stack+0x9e/0xe0 | print_address_description.constprop.0+0x3a/0x60 | ? netns_bpf_pernet_pre_exit+0xd9/0x130 | kasan_report.cold+0x1f/0x40 | ? netns_bpf_pernet_pre_exit+0xd9/0x130 | netns_bpf_pernet_pre_exit+0xd9/0x130 | cleanup_net+0x30b/0x5b0 | ? unregister_pernet_device+0x50/0x50 | ? rcu_read_lock_bh_held+0xb0/0xb0 | ? _raw_spin_unlock_irq+0x24/0x50 | process_one_work+0x4d1/0xa10 | ? lock_release+0x3e0/0x3e0 | ? pwq_dec_nr_in_flight+0x110/0x110 | ? rwlock_bug.part.0+0x60/0x60 | worker_thread+0x7a/0x5c0 | ? process_one_work+0xa10/0xa10 | kthread+0x1e3/0x240 | ? kthread_create_on_node+0xd0/0xd0 | ret_from_fork+0x1f/0x30 | | Allocated by task 280: | save_stack+0x1b/0x40 | __kasan_kmalloc.constprop.0+0xc2/0xd0 | netns_bpf_link_create+0xfe/0x650 | __do_sys_bpf+0x153a/0x2a50 | do_syscall_64+0x59/0x300 | entry_SYSCALL_64_after_hwframe+0x44/0xa9 | | Freed by task 198: | save_stack+0x1b/0x40 | __kasan_slab_free+0x12f/0x180 | kfree+0xed/0x350 | process_one_work+0x4d1/0xa10 | worker_thread+0x7a/0x5c0 | kthread+0x1e3/0x240 | ret_from_fork+0x1f/0x30 | | The buggy address belongs to the object at ffff888119e0d700 | which belongs to the cache kmalloc-192 of size 192 | The buggy address is located 120 bytes inside of | 192-byte region [ffff888119e0d700, ffff888119e0d7c0) | The buggy address belongs to the page: | page:ffffea0004678340 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 | flags: 0x2fffe0000000200(slab) | raw: 02fffe0000000200 ffffea00045ba8c0 0000000600000006 ffff88811a80ea80 | raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 | page dumped because: kasan: bad access detected | | Memory state around the buggy address: | ffff888119e0d600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb | ffff888119e0d680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc | >ffff888119e0d700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb | ^ | ffff888119e0d780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc | ffff888119e0d800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb | ================================================================== Remove the "fast-path" for releasing a link that got auto-detached by a dying network namespace to fix it. This way as long as link is on the list and netns_bpf mutex is held, we have a guarantee that link memory can be accessed. An alternative way to fix this issue would be to safely iterate over the list of links and ensure there is no access to link object after detaching it. But, at the moment, optimizing synchronization overhead on link release without a workload in mind seems like an overkill. Fixes: ab53cad90eb1 ("bpf, netns: Keep a list of attached bpf_link's") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200630164541.1329993-1-jakub@cloudflare.com --- kernel/bpf/net_namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 3dbc29b6f51d..310241ca7991 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -43,15 +43,11 @@ static void bpf_netns_link_release(struct bpf_link *link) enum netns_bpf_attach_type type = net_link->netns_type; struct net *net; - /* Link auto-detached by dying netns. */ - if (!net_link->net) - return; - mutex_lock(&netns_bpf_mutex); - /* Recheck after potential sleep. We can race with cleanup_net - * here, but if we see a non-NULL struct net pointer pre_exit - * has not happened yet and will block on netns_bpf_mutex. + /* We can race with cleanup_net, but if we see a non-NULL + * struct net pointer, pre_exit has not run yet and wait for + * netns_bpf_mutex. */ net = net_link->net; if (!net) -- cgit v1.2.3 From 2606aff916854b61234bf85001be9777bab2d5f8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:18 -0600 Subject: net: ip_tunnel: add header_ops for layer 3 devices Some devices that take straight up layer 3 packets benefit from having a shared header_ops so that AF_PACKET sockets can inject packets that are recognized. This shared infrastructure will be used by other drivers that currently can't inject packets using AF_PACKET. It also exposes the parser function, as it is useful in standalone form too. Signed-off-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 +++ net/ipv4/ip_tunnel_core.c | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 076e5d7db7d3..36025dea7612 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -290,6 +290,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); +extern const struct header_ops ip_tunnel_header_ops; +__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); + struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 181b7a2a0247..f8b419e2475c 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -844,3 +844,21 @@ void ip_tunnel_unneed_metadata(void) static_branch_dec(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); + +/* Returns either the correct skb->protocol value, or 0 if invalid. */ +__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb) +{ + if (skb_network_header(skb) >= skb->head && + (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) && + ip_hdr(skb)->version == 4) + return htons(ETH_P_IP); + if (skb_network_header(skb) >= skb->head && + (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) && + ipv6_hdr(skb)->version == 6) + return htons(ETH_P_IPV6); + return 0; +} +EXPORT_SYMBOL(ip_tunnel_parse_protocol); + +const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol }; +EXPORT_SYMBOL(ip_tunnel_header_ops); -- cgit v1.2.3 From e53ac93220e002fdf26b2874af6a74f393cd3872 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:19 -0600 Subject: net: ipip: implement header_ops->parse_protocol for AF_PACKET Ipip uses skb->protocol to determine packet type, and bails out if it's not set. For AF_PACKET injection, we need to support its call chain of: packet_sendmsg -> packet_snd -> packet_parse_headers -> dev_parse_header_protocol -> parse_protocol Without a valid parse_protocol, this returns zero, and ipip rejects the skb. So, this wires up the ip_tunnel handler for layer 3 packets for that case. Signed-off-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 1 + net/ipv6/ip6_tunnel.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 40fea52c8277..75d35e76bec2 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -361,6 +361,7 @@ static const struct net_device_ops ipip_netdev_ops = { static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 821d96c720b9..a18c378ca5f4 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1846,6 +1846,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; -- cgit v1.2.3 From 01a4967c71c004f8ecad4ab57021348636502fa9 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:20 -0600 Subject: wireguard: implement header_ops->parse_protocol for AF_PACKET WireGuard uses skb->protocol to determine packet type, and bails out if it's not set or set to something it's not expecting. For AF_PACKET injection, we need to support its call chain of: packet_sendmsg -> packet_snd -> packet_parse_headers -> dev_parse_header_protocol -> parse_protocol Without a valid parse_protocol, this returns zero, and wireguard then rejects the skb. So, this wires up the ip_tunnel handler for layer 3 packets for that case. Reported-by: Hans Wippel Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index a8f151b1b5fa..c9f65e96ccb0 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -262,6 +262,7 @@ static void wg_setup(struct net_device *dev) max(sizeof(struct ipv6hdr), sizeof(struct iphdr)); dev->netdev_ops = &netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->hard_header_len = 0; dev->addr_len = 0; dev->needed_headroom = DATA_PACKET_HEAD_ROOM; -- cgit v1.2.3 From 1a574074ae7d1d745c16f7710655f38a53174c27 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:21 -0600 Subject: wireguard: queueing: make use of ip_tunnel_parse_protocol Now that wg_examine_packet_protocol has been added for general consumption as ip_tunnel_parse_protocol, it's possible to remove wg_examine_packet_protocol and simply use the new ip_tunnel_parse_protocol function directly. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/queueing.h | 19 ++----------------- drivers/net/wireguard/receive.c | 2 +- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index c58df439dbbe..dfb674e03076 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -11,6 +11,7 @@ #include #include #include +#include struct wg_device; struct wg_peer; @@ -65,25 +66,9 @@ struct packet_cb { #define PACKET_CB(skb) ((struct packet_cb *)((skb)->cb)) #define PACKET_PEER(skb) (PACKET_CB(skb)->keypair->entry.peer) -/* Returns either the correct skb->protocol value, or 0 if invalid. */ -static inline __be16 wg_examine_packet_protocol(struct sk_buff *skb) -{ - if (skb_network_header(skb) >= skb->head && - (skb_network_header(skb) + sizeof(struct iphdr)) <= - skb_tail_pointer(skb) && - ip_hdr(skb)->version == 4) - return htons(ETH_P_IP); - if (skb_network_header(skb) >= skb->head && - (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= - skb_tail_pointer(skb) && - ipv6_hdr(skb)->version == 6) - return htons(ETH_P_IPV6); - return 0; -} - static inline bool wg_check_packet_protocol(struct sk_buff *skb) { - __be16 real_protocol = wg_examine_packet_protocol(skb); + __be16 real_protocol = ip_tunnel_parse_protocol(skb); return real_protocol && skb->protocol == real_protocol; } diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 9b2ab6fc91cd..2c9551ea6dc7 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -387,7 +387,7 @@ static void wg_packet_consume_data_done(struct wg_peer *peer, */ skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = ~0; /* All levels */ - skb->protocol = wg_examine_packet_protocol(skb); + skb->protocol = ip_tunnel_parse_protocol(skb); if (skb->protocol == htons(ETH_P_IP)) { len = ntohs(ip_hdr(skb)->tot_len); if (unlikely(len < sizeof(struct iphdr))) -- cgit v1.2.3 From b9815eb1d13f0dc088ee8afb6e6d0683ea551098 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:22 -0600 Subject: tun: implement header_ops->parse_protocol for AF_PACKET The tun driver passes up skb->protocol to userspace in the form of PI headers. For AF_PACKET injection, we need to support its call chain of: packet_sendmsg -> packet_snd -> packet_parse_headers -> dev_parse_header_protocol -> parse_protocol Without a valid parse_protocol, this returns zero, and the tun driver then gives userspace bogus values that it can't deal with. Note that this isn't the case with tap, because tap already benefits from the shared infrastructure for ethernet headers. But with tun, there's nothing. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 858b012074bd..7adeb91bd368 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -1351,6 +1352,7 @@ static void tun_net_init(struct net_device *dev) switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; -- cgit v1.2.3 From ab59d2b6982b69a9728296ee3a1f330a72c0383e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:23 -0600 Subject: net: vti: implement header_ops->parse_protocol for AF_PACKET Vti uses skb->protocol to determine packet type, and bails out if it's not set. For AF_PACKET injection, we need to support its call chain of: packet_sendmsg -> packet_snd -> packet_parse_headers -> dev_parse_header_protocol -> parse_protocol Without a valid parse_protocol, this returns zero, and vti rejects the skb. So, this wires up the ip_tunnel handler for layer 3 packets for that case. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 1 + net/ipv6/ip6_vti.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1d9c8cff5ac3..460ca1099e8a 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -441,6 +441,7 @@ static const struct net_device_ops vti_netdev_ops = { static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 1147f647b9a0..0d964160a9dd 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -905,6 +905,7 @@ static const struct net_device_ops vti6_netdev_ops = { static void vti6_dev_setup(struct net_device *dev) { dev->netdev_ops = &vti6_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = vti6_dev_free; -- cgit v1.2.3 From 75ea1f4773c09730bf8a364a367f6e7211484e12 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:24 -0600 Subject: net: sit: implement header_ops->parse_protocol for AF_PACKET Sit uses skb->protocol to determine packet type, and bails out if it's not set. For AF_PACKET injection, we need to support its call chain of: packet_sendmsg -> packet_snd -> packet_parse_headers -> dev_parse_header_protocol -> parse_protocol Without a valid parse_protocol, this returns zero, and sit rejects the skb. So, this wires up the ip_tunnel handler for layer 3 packets for that case. Reported-by: Willem de Bruijn Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/ipv6/sit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1fbb4dfbb191..5e2c34c0ac97 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1421,6 +1421,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) int t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->netdev_ops = &ipip6_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ipip6_dev_free; -- cgit v1.2.3 From 8f9a1fa4308363944ba94a961f69646c4b0ff26b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:25 -0600 Subject: net: xfrmi: implement header_ops->parse_protocol for AF_PACKET The xfrm interface uses skb->protocol to determine packet type, and bails out if it's not set. For AF_PACKET injection, we need to support its call chain of: packet_sendmsg -> packet_snd -> packet_parse_headers -> dev_parse_header_protocol -> parse_protocol Without a valid parse_protocol, this returns zero, and xfrmi rejects the skb. So, this wires up the ip_tunnel handler for layer 3 packets for that case. Reported-by: Willem de Bruijn Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/xfrm/xfrm_interface.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index c407ecbc5d46..b615729812e5 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -581,6 +582,7 @@ static const struct net_device_ops xfrmi_netdev_ops = { static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; -- cgit v1.2.3 From f2ca673d2cd5df9a76247b670e9ffd4d63682b3f Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 30 Jun 2020 11:04:40 +0100 Subject: net: mvneta: fix use of state->speed When support for short preambles was added, it incorrectly keyed its decision off state->speed instead of state->interface. state->speed is not guaranteed to be correct for in-band modes, which can lead to short preambles being unexpectedly disabled. Fix this by keying off the interface mode, which is the only way that mvneta can operate at 2.5Gbps. Fixes: da58a931f248 ("net: mvneta: Add support for 2500Mbps SGMII") Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index c639e3a29302..7d5d9d34f4e4 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3959,7 +3959,7 @@ static void mvneta_mac_config(struct phylink_config *config, unsigned int mode, /* When at 2.5G, the link partner can send frames with shortened * preambles. */ - if (state->speed == SPEED_2500) + if (state->interface == PHY_INTERFACE_MODE_2500BASEX) new_ctrl4 |= MVNETA_GMAC4_SHORT_PREAMBLE_ENABLE; if (pp->phy_interface != state->interface) { -- cgit v1.2.3 From 5468cbcddf47f674829c6ada190283108a63d7b5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Jun 2020 07:44:42 -0500 Subject: net: ipa: always check for stopped channel In gsi_channel_stop(), there's a check to see if the channel might have entered STOPPED state since a previous call, which might have timed out before stopping completed. That check actually belongs in gsi_channel_stop_command(), which is called repeatedly by gsi_channel_stop() for RX channels. Fixes: 650d1603825d ("soc: qcom: ipa: the generic software interface") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/gsi.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 55226b264e3c..ac7e5a04c8ac 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -500,6 +500,13 @@ static int gsi_channel_stop_command(struct gsi_channel *channel) int ret; state = gsi_channel_state(channel); + + /* Channel could have entered STOPPED state since last call + * if it timed out. If so, we're done. + */ + if (state == GSI_CHANNEL_STATE_STOPPED) + return 0; + if (state != GSI_CHANNEL_STATE_STARTED && state != GSI_CHANNEL_STATE_STOP_IN_PROC) return -EINVAL; @@ -789,20 +796,11 @@ int gsi_channel_start(struct gsi *gsi, u32 channel_id) int gsi_channel_stop(struct gsi *gsi, u32 channel_id) { struct gsi_channel *channel = &gsi->channel[channel_id]; - enum gsi_channel_state state; u32 retries; int ret; gsi_channel_freeze(channel); - /* Channel could have entered STOPPED state since last call if the - * STOP command timed out. We won't stop a channel if stopping it - * was successful previously (so we still want the freeze above). - */ - state = gsi_channel_state(channel); - if (state == GSI_CHANNEL_STATE_STOPPED) - return 0; - /* RX channels might require a little time to enter STOPPED state */ retries = channel->toward_ipa ? 0 : GSI_CHANNEL_STOP_RX_RETRIES; -- cgit v1.2.3 From 41af5436e857ec64f302fcc9b6e4a8c526b6b402 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Jun 2020 07:44:43 -0500 Subject: net: ipa: no checksum offload for SDM845 LAN RX The AP LAN RX endpoint should not have download checksum offload enabled. The receive handler does properly accommodate the trailer that's added by the hardware, but we ignore it. Fixes: 1ed7d0c0fdba ("soc: qcom: ipa: configuration data") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_data-sdm845.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ipa/ipa_data-sdm845.c b/drivers/net/ipa/ipa_data-sdm845.c index 52d4b84e0dac..de2768d71ab5 100644 --- a/drivers/net/ipa/ipa_data-sdm845.c +++ b/drivers/net/ipa/ipa_data-sdm845.c @@ -44,7 +44,6 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = { .endpoint = { .seq_type = IPA_SEQ_INVALID, .config = { - .checksum = true, .aggregation = true, .status_enable = true, .rx = { -- cgit v1.2.3 From 6cb63ea6a39eac9640d109f274a237b34350c183 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Jun 2020 07:44:44 -0500 Subject: net: ipa: introduce ipa_cmd_tag_process() Create a new function ipa_cmd_tag_process() that simply allocates a transaction, adds a tag process command to it to clear the hardware pipeline, and commits the transaction. Call it in from ipa_endpoint_suspend(), after suspending the modem endpoints but before suspending the AP command TX and AP LAN RX endpoints (which are used by the tag sequence). Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_cmd.c | 15 +++++++++++++++ drivers/net/ipa/ipa_cmd.h | 8 ++++++++ drivers/net/ipa/ipa_endpoint.c | 2 ++ 3 files changed, 25 insertions(+) diff --git a/drivers/net/ipa/ipa_cmd.c b/drivers/net/ipa/ipa_cmd.c index c9ab865e7290..d92dd3f09b73 100644 --- a/drivers/net/ipa/ipa_cmd.c +++ b/drivers/net/ipa/ipa_cmd.c @@ -586,6 +586,21 @@ u32 ipa_cmd_tag_process_count(void) return 4; } +void ipa_cmd_tag_process(struct ipa *ipa) +{ + u32 count = ipa_cmd_tag_process_count(); + struct gsi_trans *trans; + + trans = ipa_cmd_trans_alloc(ipa, count); + if (trans) { + ipa_cmd_tag_process_add(trans); + gsi_trans_commit_wait(trans); + } else { + dev_err(&ipa->pdev->dev, + "error allocating %u entry tag transaction\n", count); + } +} + static struct ipa_cmd_info * ipa_cmd_info_alloc(struct ipa_endpoint *endpoint, u32 tre_count) { diff --git a/drivers/net/ipa/ipa_cmd.h b/drivers/net/ipa/ipa_cmd.h index e440aa69c8b5..1a646e0264a0 100644 --- a/drivers/net/ipa/ipa_cmd.h +++ b/drivers/net/ipa/ipa_cmd.h @@ -171,6 +171,14 @@ void ipa_cmd_tag_process_add(struct gsi_trans *trans); */ u32 ipa_cmd_tag_process_count(void); +/** + * ipa_cmd_tag_process() - Perform a tag process + * + * @Return: The number of elements to allocate in a transaction + * to hold tag process commands + */ +void ipa_cmd_tag_process(struct ipa *ipa); + /** * ipa_cmd_trans_alloc() - Allocate a transaction for the command TX endpoint * @ipa: IPA pointer diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 9f50d0d11704..9e58e495d373 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1450,6 +1450,8 @@ void ipa_endpoint_suspend(struct ipa *ipa) if (ipa->modem_netdev) ipa_modem_suspend(ipa->modem_netdev); + ipa_cmd_tag_process(ipa); + ipa_endpoint_suspend_one(ipa->name_map[IPA_ENDPOINT_AP_LAN_RX]); ipa_endpoint_suspend_one(ipa->name_map[IPA_ENDPOINT_AP_COMMAND_TX]); } -- cgit v1.2.3 From 01c66c48d4f0825a202d4163800b706a1d2ec7ad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 30 Jun 2020 10:12:40 -0700 Subject: bpf: Fix an incorrect branch elimination by verifier Wenbo reported an issue in [1] where a checking of null pointer is evaluated as always false. In this particular case, the program type is tp_btf and the pointer to compare is a PTR_TO_BTF_ID. The current verifier considers PTR_TO_BTF_ID always reprents a non-null pointer, hence all PTR_TO_BTF_ID compares to 0 will be evaluated as always not-equal, which resulted in the branch elimination. For example, struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { if (arg == 0) test7_result = 1; return 0; } int BPF_PROG(test8, struct bpf_fentry_test_t *arg) { if (arg->a == 0) test8_result = 1; return 0; } In above bpf programs, both branch arg == 0 and arg->a == 0 are removed. This may not be what developer expected. The bug is introduced by Commit cac616db39c2 ("bpf: Verifier track null pointer branch_taken with JNE and JEQ"), where PTR_TO_BTF_ID is considered to be non-null when evaluting pointer vs. scalar comparison. This may be added considering we have PTR_TO_BTF_ID_OR_NULL in the verifier as well. PTR_TO_BTF_ID_OR_NULL is added to explicitly requires a non-NULL testing in selective cases. The current generic pointer tracing framework in verifier always assigns PTR_TO_BTF_ID so users does not need to check NULL pointer at every pointer level like a->b->c->d. We may not want to assign every PTR_TO_BTF_ID as PTR_TO_BTF_ID_OR_NULL as this will require a null test before pointer dereference which may cause inconvenience for developers. But we could avoid branch elimination to preserve original code intention. This patch simply removed PTR_TO_BTD_ID from reg_type_not_null() in verifier, which prevented the above branches from being eliminated. [1]: https://lore.kernel.org/bpf/79dbb7c0-449d-83eb-5f4f-7af0cc269168@fb.com/T/ Fixes: cac616db39c2 ("bpf: Verifier track null pointer branch_taken with JNE and JEQ") Reported-by: Wenbo Zhang Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630171240.2523722-1-yhs@fb.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8911d0576399..94cead5a43e5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -399,8 +399,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_BTF_ID; + type == PTR_TO_SOCK_COMMON; } static bool reg_type_may_be_null(enum bpf_reg_type type) -- cgit v1.2.3 From d923021c2ce12acb50dc7086a1bf66eed82adf6a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 30 Jun 2020 10:12:41 -0700 Subject: bpf: Add tests for PTR_TO_BTF_ID vs. null comparison Add two tests for PTR_TO_BTF_ID vs. null ptr comparison, one for PTR_TO_BTF_ID in the ctx structure and the other for PTR_TO_BTF_ID after one level pointer chasing. In both cases, the test ensures condition is not removed. For example, for this test struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { if (arg == 0) test7_result = 1; return 0; } Before the previous verifier change, we have xlated codes: int test7(long long unsigned int * ctx): ; int BPF_PROG(test7, struct bpf_fentry_test_t *arg) 0: (79) r1 = *(u64 *)(r1 +0) ; int BPF_PROG(test7, struct bpf_fentry_test_t *arg) 1: (b4) w0 = 0 2: (95) exit After the previous verifier change, we have: int test7(long long unsigned int * ctx): ; int BPF_PROG(test7, struct bpf_fentry_test_t *arg) 0: (79) r1 = *(u64 *)(r1 +0) ; if (arg == 0) 1: (55) if r1 != 0x0 goto pc+4 ; test7_result = 1; 2: (18) r1 = map[id:6][0]+48 4: (b7) r2 = 1 5: (7b) *(u64 *)(r1 +0) = r2 ; int BPF_PROG(test7, struct bpf_fentry_test_t *arg) 6: (b4) w0 = 0 7: (95) exit Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630171241.2523875-1-yhs@fb.com --- net/bpf/test_run.c | 19 ++++++++++++++++++- .../selftests/bpf/prog_tests/fentry_fexit.c | 2 +- tools/testing/selftests/bpf/progs/fentry_test.c | 22 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/fexit_test.c | 22 ++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index bfd4ccd80847..b03c469cd01f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -147,6 +147,20 @@ int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) return a + (long)b + c + d + (long)e + f; } +struct bpf_fentry_test_t { + struct bpf_fentry_test_t *a; +}; + +int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) +{ + return (long)arg; +} + +int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) +{ + return (long)arg->a; +} + int noinline bpf_modify_return_test(int a, int *b) { *b += 1; @@ -185,6 +199,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + struct bpf_fentry_test_t arg = {}; u16 side_effect = 0, ret = 0; int b = 2, err = -EFAULT; u32 retval = 0; @@ -197,7 +212,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, bpf_fentry_test3(4, 5, 6) != 15 || bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || - bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) + bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || + bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || + bpf_fentry_test8(&arg) != 0) goto out; break; case BPF_MODIFY_RETURN: diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 83493bd5745c..109d0345a2be 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -36,7 +36,7 @@ void test_fentry_fexit(void) fentry_res = (__u64 *)fentry_skel->bss; fexit_res = (__u64 *)fexit_skel->bss; printf("%lld\n", fentry_skel->bss->test1_result); - for (i = 0; i < 6; i++) { + for (i = 0; i < 8; i++) { CHECK(fentry_res[i] != 1, "result", "fentry_test%d failed err %lld\n", i + 1, fentry_res[i]); CHECK(fexit_res[i] != 1, "result", diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 9365b686f84b..5f645fdaba6f 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -55,3 +55,25 @@ int BPF_PROG(test6, __u64 a, void *b, short c, int d, void * e, __u64 f) e == (void *)20 && f == 21; return 0; } + +struct bpf_fentry_test_t { + struct bpf_fentry_test_t *a; +}; + +__u64 test7_result = 0; +SEC("fentry/bpf_fentry_test7") +int BPF_PROG(test7, struct bpf_fentry_test_t *arg) +{ + if (arg == 0) + test7_result = 1; + return 0; +} + +__u64 test8_result = 0; +SEC("fentry/bpf_fentry_test8") +int BPF_PROG(test8, struct bpf_fentry_test_t *arg) +{ + if (arg->a == 0) + test8_result = 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index bd1e17d8024c..0952affb22a6 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -56,3 +56,25 @@ int BPF_PROG(test6, __u64 a, void *b, short c, int d, void *e, __u64 f, int ret) e == (void *)20 && f == 21 && ret == 111; return 0; } + +struct bpf_fentry_test_t { + struct bpf_fentry_test *a; +}; + +__u64 test7_result = 0; +SEC("fexit/bpf_fentry_test7") +int BPF_PROG(test7, struct bpf_fentry_test_t *arg) +{ + if (arg == 0) + test7_result = 1; + return 0; +} + +__u64 test8_result = 0; +SEC("fexit/bpf_fentry_test8") +int BPF_PROG(test8, struct bpf_fentry_test_t *arg) +{ + if (arg->a == 0) + test8_result = 1; + return 0; +} -- cgit v1.2.3 From 8a259e6b73ad8181b0b2ef338b35043433db1075 Mon Sep 17 00:00:00 2001 From: Li Heng Date: Mon, 29 Jun 2020 18:49:51 +0800 Subject: net: cxgb4: fix return error value in t4_prep_fw t4_prep_fw goto bye tag with positive return value when something bad happened and which can not free resource in adap_init0. so fix it to return negative value. Fixes: 16e47624e76b ("cxgb4: Add new scheme to update T4/T5 firmware") Reported-by: Hulk Robot Signed-off-by: Li Heng Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 1aa6dc10dc0b..ad522f822cc2 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -3493,7 +3493,7 @@ int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info, drv_fw = &fw_info->fw_hdr; /* Read the header of the firmware on the card */ - ret = -t4_read_flash(adap, FLASH_FW_START, + ret = t4_read_flash(adap, FLASH_FW_START, sizeof(*card_fw) / sizeof(uint32_t), (uint32_t *)card_fw, 1); if (ret == 0) { @@ -3522,8 +3522,8 @@ int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info, should_install_fs_fw(adap, card_fw_usable, be32_to_cpu(fs_fw->fw_ver), be32_to_cpu(card_fw->fw_ver))) { - ret = -t4_fw_upgrade(adap, adap->mbox, fw_data, - fw_size, 0); + ret = t4_fw_upgrade(adap, adap->mbox, fw_data, + fw_size, 0); if (ret != 0) { dev_err(adap->pdev_dev, "failed to install firmware: %d\n", ret); @@ -3554,7 +3554,7 @@ int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info, FW_HDR_FW_VER_MICRO_G(c), FW_HDR_FW_VER_BUILD_G(c), FW_HDR_FW_VER_MAJOR_G(k), FW_HDR_FW_VER_MINOR_G(k), FW_HDR_FW_VER_MICRO_G(k), FW_HDR_FW_VER_BUILD_G(k)); - ret = EINVAL; + ret = -EINVAL; goto bye; } -- cgit v1.2.3 From 28541f3d324f6de1e545e2875283b6cef95c5d36 Mon Sep 17 00:00:00 2001 From: Carl Huang Date: Tue, 30 Jun 2020 14:52:51 +0800 Subject: net: qrtr: free flow in __qrtr_node_release The flow is allocated in qrtr_tx_wait, but not freed when qrtr node is released. (*slot) becomes NULL after radix_tree_iter_delete is called in __qrtr_node_release. The fix is to save (*slot) to a vairable and then free it. This memory leak is catched when kmemleak is enabled in kernel, the report looks like below: unreferenced object 0xffffa0de69e08420 (size 32): comm "kworker/u16:3", pid 176, jiffies 4294918275 (age 82858.876s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 28 84 e0 69 de a0 ff ff ........(..i.... 28 84 e0 69 de a0 ff ff 03 00 00 00 00 00 00 00 (..i............ backtrace: [<00000000e252af0a>] qrtr_node_enqueue+0x38e/0x400 [qrtr] [<000000009cea437f>] qrtr_sendmsg+0x1e0/0x2a0 [qrtr] [<000000008bddbba4>] sock_sendmsg+0x5b/0x60 [<0000000003beb43a>] qmi_send_message.isra.3+0xbe/0x110 [qmi_helpers] [<000000009c9ae7de>] qmi_send_request+0x1c/0x20 [qmi_helpers] Signed-off-by: Carl Huang Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 2d8d6131bc5f..059881330788 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -166,6 +166,7 @@ static void __qrtr_node_release(struct kref *kref) { struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); struct radix_tree_iter iter; + struct qrtr_tx_flow *flow; unsigned long flags; void __rcu **slot; @@ -181,8 +182,9 @@ static void __qrtr_node_release(struct kref *kref) /* Free tx flow counters */ radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) { + flow = *slot; radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot); - kfree(*slot); + kfree(flow); } kfree(node); } -- cgit v1.2.3 From 6a2febec338df7e7699a52d00b2e1207dcf65b28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jun 2020 16:41:01 -0700 Subject: tcp: md5: add missing memory barriers in tcp_md5_do_add()/tcp_md5_hash_key() MD5 keys are read with RCU protection, and tcp_md5_do_add() might update in-place a prior key. Normally, typical RCU updates would allocate a new piece of memory. In this case only key->key and key->keylen might be updated, and we do not care if an incoming packet could see the old key, the new one, or some intermediate value, since changing the key on a live flow is known to be problematic anyway. We only want to make sure that in the case key->keylen is changed, cpus in tcp_md5_hash_key() wont try to use uninitialized data, or crash because key->keylen was read twice to feed sg_init_one() and ahash_request_set_crypt() Fixes: 9ea88a153001 ("tcp: md5: check md5 signature without socket lock") Signed-off-by: Eric Dumazet Cc: Mathieu Desnoyers Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 7 +++++-- net/ipv4/tcp_ipv4.c | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 810cc164f795..f11166045324 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4033,10 +4033,13 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { + u8 keylen = key->keylen; struct scatterlist sg; - sg_init_one(&sg, key->key, key->keylen); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); + smp_rmb(); /* paired with smp_wmb() in tcp_md5_do_add() */ + + sg_init_one(&sg, key->key, keylen); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); return crypto_ahash_update(hp->md5_req); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad6435ba6d72..99916fcc15ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1113,6 +1113,9 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); + + smp_wmb(); /* pairs with smp_rmb() in tcp_md5_hash_key() */ + key->keylen = newkeylen; return 0; } -- cgit v1.2.3 From 8ff41cc21714704ef0158a546c3c4d07fae2c952 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 30 Jun 2020 14:46:15 +0300 Subject: net: qrtr: Fix an out of bounds read qrtr_endpoint_post() This code assumes that the user passed in enough data for a qrtr_hdr_v1 or qrtr_hdr_v2 struct, but it's not necessarily true. If the buffer is too small then it will read beyond the end. Reported-by: Manivannan Sadhasivam Reported-by: syzbot+b8fe393f999a291a9ea6@syzkaller.appspotmail.com Fixes: 194ccc88297a ("net: qrtr: Support decoding incoming v2 packets") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 059881330788..24a8c3c6da0d 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -429,7 +429,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) unsigned int ver; size_t hdrlen; - if (len & 3) + if (len == 0 || len & 3) return -EINVAL; skb = netdev_alloc_skb(NULL, len); @@ -443,6 +443,8 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) switch (ver) { case QRTR_PROTO_VER_1: + if (len < sizeof(*v1)) + goto err; v1 = data; hdrlen = sizeof(*v1); @@ -456,6 +458,8 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) size = le32_to_cpu(v1->size); break; case QRTR_PROTO_VER_2: + if (len < sizeof(*v2)) + goto err; v2 = data; hdrlen = sizeof(*v2) + v2->optlen; -- cgit v1.2.3 From d3c54f7f18cb82283777998c4bec52e49241c783 Mon Sep 17 00:00:00 2001 From: Luo bin Date: Wed, 1 Jul 2020 11:16:33 +0800 Subject: hinic: fix passing non negative value to ERR_PTR get_dev_cap and set_resources_state functions may return a positive value because of hardware failure, and the positive return value can not be passed to ERR_PTR directly. Fixes: 7dd29ee12865 ("hinic: add sriov feature support") Signed-off-by: Luo bin Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c index 0245da02efbb..b735bc537508 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c @@ -814,6 +814,8 @@ err_aeqs_init: err_init_msix: err_pfhwdev_alloc: hinic_free_hwif(hwif); + if (err > 0) + err = -EIO; return ERR_PTR(err); } -- cgit v1.2.3 From 1e82a62fec613844da9e558f3493540a5b7a7b67 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Tue, 30 Jun 2020 11:50:17 -0600 Subject: genetlink: remove genl_bind A potential deadlock can occur during registering or unregistering a new generic netlink family between the main nl_table_lock and the cb_lock where each thread wants the lock held by the other, as demonstrated below. 1) Thread 1 is performing a netlink_bind() operation on a socket. As part of this call, it will call netlink_lock_table(), incrementing the nl_table_users count to 1. 2) Thread 2 is registering (or unregistering) a genl_family via the genl_(un)register_family() API. The cb_lock semaphore will be taken for writing. 3) Thread 1 will call genl_bind() as part of the bind operation to handle subscribing to GENL multicast groups at the request of the user. It will attempt to take the cb_lock semaphore for reading, but it will fail and be scheduled away, waiting for Thread 2 to finish the write. 4) Thread 2 will call netlink_table_grab() during the (un)registration call. However, as Thread 1 has incremented nl_table_users, it will not be able to proceed, and both threads will be stuck waiting for the other. genl_bind() is a noop, unless a genl_family implements the mcast_bind() function to handle setting up family-specific multicast operations. Since no one in-tree uses this functionality as Cong pointed out, simply removing the genl_bind() function will remove the possibility for deadlock, as there is no attempt by Thread 1 above to take the cb_lock semaphore. Fixes: c380d9a7afff ("genetlink: pass multicast bind/unbind to families") Suggested-by: Cong Wang Acked-by: Johannes Berg Reported-by: kernel test robot Signed-off-by: Sean Tranchetti Signed-off-by: David S. Miller --- include/net/genetlink.h | 8 -------- net/netlink/genetlink.c | 49 ------------------------------------------------- 2 files changed, 57 deletions(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ad71ed4f55ff..6e5f1e1aa822 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -35,12 +35,6 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks - * @mcast_bind: a socket bound to the given multicast group (which - * is given as the offset into the groups array) - * @mcast_unbind: a socket was unbound from the given multicast group. - * Note that unbind() will not be called symmetrically if the - * generic netlink family is removed while there are still open - * sockets. * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -63,8 +57,6 @@ struct genl_family { void (*post_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); - int (*mcast_bind)(struct net *net, int group); - void (*mcast_unbind)(struct net *net, int group); const struct genl_ops * ops; const struct genl_multicast_group *mcgrps; unsigned int n_ops; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index a914b9365a46..9395ee8a868d 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1144,60 +1144,11 @@ static struct genl_family genl_ctrl __ro_after_init = { .netnsok = true, }; -static int genl_bind(struct net *net, int group) -{ - struct genl_family *f; - int err = -ENOENT; - unsigned int id; - - down_read(&cb_lock); - - idr_for_each_entry(&genl_fam_idr, f, id) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; - - if (!f->netnsok && net != &init_net) - err = -ENOENT; - else if (f->mcast_bind) - err = f->mcast_bind(net, fam_grp); - else - err = 0; - break; - } - } - up_read(&cb_lock); - - return err; -} - -static void genl_unbind(struct net *net, int group) -{ - struct genl_family *f; - unsigned int id; - - down_read(&cb_lock); - - idr_for_each_entry(&genl_fam_idr, f, id) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; - - if (f->mcast_unbind) - f->mcast_unbind(net, fam_grp); - break; - } - } - up_read(&cb_lock); -} - static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, - .bind = genl_bind, - .unbind = genl_unbind, }; /* we'll bump the group number right afterwards */ -- cgit v1.2.3 From e6ced831ef11a2a06e8d00aad9d4fc05b610bf38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Jul 2020 11:43:04 -0700 Subject: tcp: md5: refine tcp_md5_do_add()/tcp_md5_hash_key() barriers My prior fix went a bit too far, according to Herbert and Mathieu. Since we accept that concurrent TCP MD5 lookups might see inconsistent keys, we can use READ_ONCE()/WRITE_ONCE() instead of smp_rmb()/smp_wmb() Clearing all key->key[] is needed to avoid possible KMSAN reports, if key->keylen is increased. Since tcp_md5_do_add() is not fast path, using __GFP_ZERO to clear all struct tcp_md5sig_key is simpler. data_race() was added in linux-5.8 and will prevent KCSAN reports, this can safely be removed in stable backports, if data_race() is not yet backported. v2: use data_race() both in tcp_md5_hash_key() and tcp_md5_do_add() Fixes: 6a2febec338d ("tcp: md5: add missing memory barriers in tcp_md5_do_add()/tcp_md5_hash_key()") Signed-off-by: Eric Dumazet Cc: Mathieu Desnoyers Cc: Herbert Xu Cc: Marco Elver Reviewed-by: Mathieu Desnoyers Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 19 ++++++++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f11166045324..c33f7c6aff8e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4033,14 +4033,14 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { - u8 keylen = key->keylen; + u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; - smp_rmb(); /* paired with smp_wmb() in tcp_md5_do_add() */ - sg_init_one(&sg, key->key, keylen); ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); - return crypto_ahash_update(hp->md5_req); + + /* We use data_race() because tcp_md5_do_add() might change key->key under us */ + return data_race(crypto_ahash_update(hp->md5_req)); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 99916fcc15ca..04bfcbbfee83 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1111,12 +1111,21 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (key) { - /* Pre-existing entry - just update that one. */ - memcpy(key->key, newkey, newkeylen); + /* Pre-existing entry - just update that one. + * Note that the key might be used concurrently. + * data_race() is telling kcsan that we do not care of + * key mismatches, since changing MD5 key on live flows + * can lead to packet drops. + */ + data_race(memcpy(key->key, newkey, newkeylen)); - smp_wmb(); /* pairs with smp_rmb() in tcp_md5_hash_key() */ + /* Pairs with READ_ONCE() in tcp_md5_hash_key(). + * Also note that a reader could catch new key->keylen value + * but old key->key[], this is the reason we use __GFP_ZERO + * at sock_kmalloc() time below these lines. + */ + WRITE_ONCE(key->keylen, newkeylen); - key->keylen = newkeylen; return 0; } @@ -1132,7 +1141,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, rcu_assign_pointer(tp->md5sig_info, md5sig); } - key = sock_kmalloc(sk, sizeof(*key), gfp); + key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) return -ENOMEM; if (!tcp_alloc_md5sig_pool()) { -- cgit v1.2.3 From 9ef845f894c93416a1cbcbc6ec42525fb06aaf4e Mon Sep 17 00:00:00 2001 From: Rao Shoaib Date: Wed, 1 Jul 2020 12:23:38 -0700 Subject: rds: If one path needs re-connection, check all and re-connect In testing with mprds enabled, Oracle Cluster nodes after reboot were not able to communicate with others nodes and so failed to rejoin the cluster. Peers with lower IP address initiated connection but the node could not respond as it choose a different path and could not initiate a connection as it had a higher IP address. With this patch, when a node sends out a packet and the selected path is down, all other paths are also checked and any down paths are re-connected. Reviewed-by: Ka-cheong Poon Reviewed-by: David Edmondson Signed-off-by: Somasundaram Krishnasamy Signed-off-by: Rao Shoaib Signed-off-by: David S. Miller --- net/rds/connection.c | 11 +++++++++++ net/rds/rds.h | 7 +++++++ net/rds/send.c | 3 ++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index ed7f2133acc2..f2fcab182095 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -905,6 +905,17 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); +/* Check connectivity of all paths + */ +void rds_check_all_paths(struct rds_connection *conn) +{ + int i = 0; + + do { + rds_conn_path_connect_if_down(&conn->c_path[i]); + } while (++i < conn->c_npaths); +} + void rds_conn_connect_if_down(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); diff --git a/net/rds/rds.h b/net/rds/rds.h index 6019b0c004a9..106e862996b9 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -778,6 +778,7 @@ void rds_conn_drop(struct rds_connection *conn); void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_path_connect_if_down(struct rds_conn_path *cp); +void rds_check_all_paths(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, @@ -822,6 +823,12 @@ rds_conn_path_up(struct rds_conn_path *cp) return atomic_read(&cp->cp_state) == RDS_CONN_UP; } +static inline int +rds_conn_path_down(struct rds_conn_path *cp) +{ + return atomic_read(&cp->cp_state) == RDS_CONN_DOWN; +} + static inline int rds_conn_up(struct rds_connection *conn) { diff --git a/net/rds/send.c b/net/rds/send.c index 68e2bdb08fd0..9a529a01cdc6 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1340,7 +1340,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - rds_conn_path_connect_if_down(cpath); + if (rds_conn_path_down(cpath)) + rds_check_all_paths(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { -- cgit v1.2.3 From e114e1e8ac9d31f25b9dd873bab5d80c1fc482ca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Jul 2020 12:41:23 -0700 Subject: tcp: md5: do not send silly options in SYNCOOKIES Whenever cookie_init_timestamp() has been used to encode ECN,SACK,WSCALE options, we can not remove the TS option in the SYNACK. Otherwise, tcp_synack_options() will still advertize options like WSCALE that we can not deduce later when receiving the packet from the client to complete 3WHS. Note that modern linux TCP stacks wont use MD5+TS+SACK in a SYN packet, but we can not know for sure that all TCP stacks have the same logic. Before the fix a tcpdump would exhibit this wrong exchange : 10:12:15.464591 IP C > S: Flags [S], seq 4202415601, win 65535, options [nop,nop,md5 valid,mss 1400,sackOK,TS val 456965269 ecr 0,nop,wscale 8], length 0 10:12:15.464602 IP S > C: Flags [S.], seq 253516766, ack 4202415602, win 65535, options [nop,nop,md5 valid,mss 1400,nop,nop,sackOK,nop,wscale 8], length 0 10:12:15.464611 IP C > S: Flags [.], ack 1, win 256, options [nop,nop,md5 valid], length 0 10:12:15.464678 IP C > S: Flags [P.], seq 1:13, ack 1, win 256, options [nop,nop,md5 valid], length 12 10:12:15.464685 IP S > C: Flags [.], ack 13, win 65535, options [nop,nop,md5 valid], length 0 After this patch the exchange looks saner : 11:59:59.882990 IP C > S: Flags [S], seq 517075944, win 65535, options [nop,nop,md5 valid,mss 1400,sackOK,TS val 1751508483 ecr 0,nop,wscale 8], length 0 11:59:59.883002 IP S > C: Flags [S.], seq 1902939253, ack 517075945, win 65535, options [nop,nop,md5 valid,mss 1400,sackOK,TS val 1751508479 ecr 1751508483,nop,wscale 8], length 0 11:59:59.883012 IP C > S: Flags [.], ack 1, win 256, options [nop,nop,md5 valid,nop,nop,TS val 1751508483 ecr 1751508479], length 0 11:59:59.883114 IP C > S: Flags [P.], seq 1:13, ack 1, win 256, options [nop,nop,md5 valid,nop,nop,TS val 1751508483 ecr 1751508479], length 12 11:59:59.883122 IP S > C: Flags [.], ack 13, win 256, options [nop,nop,md5 valid,nop,nop,TS val 1751508483 ecr 1751508483], length 0 11:59:59.883152 IP S > C: Flags [P.], seq 1:13, ack 13, win 256, options [nop,nop,md5 valid,nop,nop,TS val 1751508484 ecr 1751508483], length 12 11:59:59.883170 IP C > S: Flags [.], ack 13, win 256, options [nop,nop,md5 valid,nop,nop,TS val 1751508484 ecr 1751508484], length 0 Of course, no SACK block will ever be added later, but nothing should break. Technically, we could remove the 4 nops included in MD5+TS options, but again some stacks could break seeing not conventional alignment. Fixes: 4957faade11b ("TCPCT part 1g: Responder Cookie => Initiator") Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Mathieu Desnoyers Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a50e1990a845..5f5b2f0b0e60 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -700,7 +700,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, - struct tcp_fastopen_cookie *foc) + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -715,7 +716,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ - ireq->tstamp_ok &= !ireq->sack_ok; + if (synack_type != TCP_SYNACK_COOKIE) + ireq->tstamp_ok &= !ireq->sack_ok; } #endif @@ -3394,7 +3396,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc) + sizeof(*th); + foc, synack_type) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); -- cgit v1.2.3 From 0da7536fb47f51df89ccfcb1fa09f249d9accec5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 Jul 2020 16:00:06 -0400 Subject: ip: Fix SO_MARK in RST, ACK and ICMP packets When no full socket is available, skbs are sent over a per-netns control socket. Its sk_mark is temporarily adjusted to match that of the real (request or timewait) socket or to reflect an incoming skb, so that the outgoing skb inherits this in __ip_make_skb. Introduction of the socket cookie mark field broke this. Now the skb is set through the cookie and cork: # init sockc.mark from sk_mark or cmsg ip_append_data ip_setup_cork # convert sockc.mark to cork mark ip_push_pending_frames ip_finish_skb __ip_make_skb # set skb->mark to cork mark But I missed these special control sockets. Update all callers of __ip(6)_make_skb that were originally missed. For IPv6, the same two icmp(v6) paths are affected. The third case is not, as commit 92e55f412cff ("tcp: don't annotate mark on control socket from tcp_v6_send_response()") replaced the ctl_sk->sk_mark with passing the mark field directly as a function argument. That commit predates the commit that introduced the bug. Fixes: c6af0c227a22 ("ip: support SO_MARK cmsg") Signed-off-by: Willem de Bruijn Reported-by: Martin KaFai Lau Reviewed-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/icmp.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 956a806649f7..e30515f89802 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; - sk->sk_mark = mark; + ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); @@ -710,10 +710,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; - sk->sk_mark = mark; ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; + ipc.sockc.mark = mark; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 090d3097ee15..17206677d503 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1702,7 +1702,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; - sk->sk_mark = fl4.flowi4_mark; + ipc.sockc.mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index fc5000370030..9df8737ae0d3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -566,7 +566,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); - sk->sk_mark = mark; np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6)) @@ -583,6 +582,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.flowi6_oif = np->ucast_oif; ipcm6_init_sk(&ipc6, np); + ipc6.sockc.mark = mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); @@ -751,7 +751,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) sk = icmpv6_xmit_lock(net); if (!sk) goto out_bh_enable; - sk->sk_mark = mark; np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) @@ -779,6 +778,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ipcm6_init_sk(&ipc6, np); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + ipc6.sockc.mark = mark; if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), -- cgit v1.2.3 From ba3bb0e76ccd464bb66665a1941fabe55dadb3ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jun 2020 13:51:28 -0700 Subject: tcp: fix SO_RCVLOWAT possible hangs under high mem pressure Whenever tcp_try_rmem_schedule() returns an error, we are under trouble and should make sure to wakeup readers so that they can drain socket queues and eventually make room. Fixes: 03f45c883c6f ("tcp: avoid extra wakeups for SO_RCVLOWAT users") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3a0eb139b76..9615e72656d1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4582,6 +4582,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); + sk->sk_data_ready(sk); tcp_drop(sk, skb); return; } @@ -4828,6 +4829,7 @@ queue_and_out: sk_forced_mem_schedule(sk, skb->truesize); else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); + sk->sk_data_ready(sk); goto drop; } -- cgit v1.2.3 From e4b9a72d76a47c11677b4bb0dee24a1fb6efb3e9 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 1 Jul 2020 13:22:20 +0200 Subject: net: dsa: microchip: enable ksz9893 via i2c in the ksz9477 driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KSZ9893 3-Port Gigabit Ethernet Switch can be controlled via SPI, I²C or MDIO (very limited and not supported by this driver). While there is already a compatible entry for the SPI bus, it was missing for I²C. Signed-off-by: Helmut Grohne Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz9477_i2c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c index 7d050fab0889..7951f52d860d 100644 --- a/drivers/net/dsa/microchip/ksz9477_i2c.c +++ b/drivers/net/dsa/microchip/ksz9477_i2c.c @@ -79,6 +79,7 @@ MODULE_DEVICE_TABLE(i2c, ksz9477_i2c_id); static const struct of_device_id ksz9477_dt_ids[] = { { .compatible = "microchip,ksz9477" }, { .compatible = "microchip,ksz9897" }, + { .compatible = "microchip,ksz9893" }, { .compatible = "microchip,ksz9567" }, {}, }; -- cgit v1.2.3 From 1ca0fafd73c5268e8fc4b997094b8bb2bfe8deea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Jul 2020 18:39:33 -0700 Subject: tcp: md5: allow changing MD5 keys in all socket states This essentially reverts commit 721230326891 ("tcp: md5: reject TCP_MD5SIG or TCP_MD5SIG_EXT on established sockets") Mathieu reported that many vendors BGP implementations can actually switch TCP MD5 on established flows. Quoting Mathieu : Here is a list of a few network vendors along with their behavior with respect to TCP MD5: - Cisco: Allows for password to be changed, but within the hold-down timer (~180 seconds). - Juniper: When password is initially set on active connection it will reset, but after that any subsequent password changes no network resets. - Nokia: No notes on if they flap the tcp connection or not. - Ericsson/RedBack: Allows for 2 password (old/new) to co-exist until both sides are ok with new passwords. - Meta-Switch: Expects the password to be set before a connection is attempted, but no further info on whether they reset the TCP connection on a change. - Avaya: Disable the neighbor, then set password, then re-enable. - Zebos: Would normally allow the change when socket connected. We can revert my prior change because commit 9424e2e7ad93 ("tcp: md5: fix potential overestimation of TCP option space") removed the leak of 4 kernel bytes to the wire that was the main reason for my patch. While doing my investigations, I found a bug when a MD5 key is changed, leading to these commits that stable teams want to consider before backporting this revert : Commit 6a2febec338d ("tcp: md5: add missing memory barriers in tcp_md5_do_add()/tcp_md5_hash_key()") Commit e6ced831ef11 ("tcp: md5: refine tcp_md5_do_add()/tcp_md5_hash_key() barriers") Fixes: 721230326891 "tcp: md5: reject TCP_MD5SIG or TCP_MD5SIG_EXT on established sockets" Signed-off-by: Eric Dumazet Reported-by: Mathieu Desnoyers Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c33f7c6aff8e..861fbd84c9cf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3246,10 +3246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: - if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) - err = tp->af_specific->md5_parse(sk, optname, optval, optlen); - else - err = -EINVAL; + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: -- cgit v1.2.3 From af199a1a9cb02ec0194804bd46c174b6db262075 Mon Sep 17 00:00:00 2001 From: Codrin Ciubotariu Date: Thu, 2 Jul 2020 12:44:50 +0300 Subject: net: dsa: microchip: set the correct number of ports The number of ports is incorrectly set to the maximum available for a DSA switch. Even if the extra ports are not used, this causes some functions to be called later, like port_disable() and port_stp_state_set(). If the driver doesn't check the port index, it will end up modifying unknown registers. Fixes: b987e98e50ab ("dsa: add DSA switch driver for Microchip KSZ9477") Signed-off-by: Codrin Ciubotariu Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz8795.c | 3 +++ drivers/net/dsa/microchip/ksz9477.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 47d65b77caf7..7c17b0f705ec 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -1268,6 +1268,9 @@ static int ksz8795_switch_init(struct ksz_device *dev) return -ENOMEM; } + /* set the real number of ports */ + dev->ds->num_ports = dev->port_cnt; + return 0; } diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index 9a51b8a4de5d..8d15c3016024 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -1588,6 +1588,9 @@ static int ksz9477_switch_init(struct ksz_device *dev) return -ENOMEM; } + /* set the real number of ports */ + dev->ds->num_ports = dev->port_cnt; + return 0; } -- cgit v1.2.3 From ad4e2b64839710e3b6e17a11b2684ceaaeae795e Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 2 Jul 2020 15:00:21 +0200 Subject: MAINTAINERS: net: macb: add Claudiu as co-maintainer I would like that Claudiu becomes co-maintainer of the Cadence macb driver. He's already participating to lots of reviews and enhancements to this driver and knows the different versions of this controller. Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 63c4cc4a04d6..5f14938a5985 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2929,6 +2929,7 @@ F: include/uapi/linux/atm* ATMEL MACB ETHERNET DRIVER M: Nicolas Ferre +M: Claudiu Beznea S: Supported F: drivers/net/ethernet/cadence/ -- cgit v1.2.3 From d005fbb855d3b5660d62ee5a6bd2d99c13ff8cf3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Jul 2020 13:17:40 +0200 Subject: netfilter: conntrack: refetch conntrack after nf_conntrack_update() __nf_conntrack_update() might refresh the conntrack object that is attached to the skbuff. Otherwise, this triggers UAF. [ 633.200434] ================================================================== [ 633.200472] BUG: KASAN: use-after-free in nf_conntrack_update+0x34e/0x770 [nf_conntrack] [ 633.200478] Read of size 1 at addr ffff888370804c00 by task nfqnl_test/6769 [ 633.200487] CPU: 1 PID: 6769 Comm: nfqnl_test Not tainted 5.8.0-rc2+ #388 [ 633.200490] Hardware name: LENOVO 23259H1/23259H1, BIOS G2ET32WW (1.12 ) 05/30/2012 [ 633.200491] Call Trace: [ 633.200499] dump_stack+0x7c/0xb0 [ 633.200526] ? nf_conntrack_update+0x34e/0x770 [nf_conntrack] [ 633.200532] print_address_description.constprop.6+0x1a/0x200 [ 633.200539] ? _raw_write_lock_irqsave+0xc0/0xc0 [ 633.200568] ? nf_conntrack_update+0x34e/0x770 [nf_conntrack] [ 633.200594] ? nf_conntrack_update+0x34e/0x770 [nf_conntrack] [ 633.200598] kasan_report.cold.9+0x1f/0x42 [ 633.200604] ? call_rcu+0x2c0/0x390 [ 633.200633] ? nf_conntrack_update+0x34e/0x770 [nf_conntrack] [ 633.200659] nf_conntrack_update+0x34e/0x770 [nf_conntrack] [ 633.200687] ? nf_conntrack_find_get+0x30/0x30 [nf_conntrack] Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1436 Fixes: ee04805ff54a ("netfilter: conntrack: make conntrack userspace helpers work again") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 79cd9dde457b..f33d72c5b06e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2158,6 +2158,8 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) err = __nf_conntrack_update(net, skb, ct, ctinfo); if (err < 0) return err; + + ct = nf_ct_get(skb, &ctinfo); } return nf_confirm_cthelper(skb, ct, ctinfo); -- cgit v1.2.3 From d7bf2ebebc2bd61ab95e2a8e33541ef282f303d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 3 Jul 2020 22:26:43 +0200 Subject: sched: consistently handle layer3 header accesses in the presence of VLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple of places in net/sched/ that check skb->protocol and act on the value there. However, in the presence of VLAN tags, the value stored in skb->protocol can be inconsistent based on whether VLAN acceleration is enabled. The commit quoted in the Fixes tag below fixed the users of skb->protocol to use a helper that will always see the VLAN ethertype. However, most of the callers don't actually handle the VLAN ethertype, but expect to find the IP header type in the protocol field. This means that things like changing the ECN field, or parsing diffserv values, stops working if there's a VLAN tag, or if there are multiple nested VLAN tags (QinQ). To fix this, change the helper to take an argument that indicates whether the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we make sure to skip all of them, so behaviour is consistent even in QinQ mode. To make the helper usable from the ECN code, move it to if_vlan.h instead of pkt_sched.h. v3: - Remove empty lines - Move vlan variable definitions inside loop in skb_protocol() - Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and bpf_skb_ecn_set_ce() v2: - Use eth_type_vlan() helper in skb_protocol() - Also fix code that reads skb->protocol directly - Change a couple of 'if/else if' statements to switch constructs to avoid calling the helper twice Reported-by: Ilya Ponetayev Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 28 ++++++++++++++++++++++++++++ include/net/inet_ecn.h | 25 +++++++++++++++++-------- include/net/pkt_sched.h | 11 ----------- net/core/filter.c | 10 +++++++--- net/sched/act_connmark.c | 9 ++++++--- net/sched/act_csum.c | 2 +- net/sched/act_ct.c | 9 ++++----- net/sched/act_ctinfo.c | 9 ++++++--- net/sched/act_mpls.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/cls_api.c | 2 +- net/sched/cls_flow.c | 8 ++++---- net/sched/cls_flower.c | 2 +- net/sched/em_ipset.c | 2 +- net/sched/em_ipt.c | 2 +- net/sched/em_meta.c | 2 +- net/sched/sch_cake.c | 4 ++-- net/sched/sch_dsmark.c | 6 +++--- net/sched/sch_teql.c | 2 +- 19 files changed, 86 insertions(+), 51 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index b05e855f1ddd..427a5b8597c2 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -308,6 +308,34 @@ static inline bool eth_type_vlan(__be16 ethertype) } } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); + __be16 proto = skb->protocol; + + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; + + while (eth_type_vlan(proto)) { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); + if (!vh) + break; + + proto = vh->h_vlan_encapsulated_proto; + offset += sizeof(vhdr); + } + + return proto; +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 0f0d1efe06dd..e1eaf1780288 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -172,7 +173,7 @@ static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) static inline int INET_ECN_set_ce(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -191,7 +192,7 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb) static inline int INET_ECN_set_ect1(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -272,12 +273,16 @@ static inline int IP_ECN_decapsulate(const struct iphdr *oiph, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, oiph->tos, inner); } @@ -287,12 +292,16 @@ static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9092e697059e..ac8c890a2657 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -136,17 +136,6 @@ static inline void qdisc_run(struct Qdisc *q) } } -static inline __be16 tc_skb_protocol(const struct sk_buff *skb) -{ - /* We need to take extra care in case the skb came via - * vlan accelerated path. In that case, use skb->vlan_proto - * as the original vlan header was already stripped. - */ - if (skb_vlan_tag_present(skb)) - return skb->vlan_proto; - return skb->protocol; -} - /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ diff --git a/net/core/filter.c b/net/core/filter.c index 73395384afe2..82e1b5b06167 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5853,12 +5853,16 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; - if (skb->protocol == cpu_to_be16(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); - else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + break; + case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); - else + break; + default: return 0; + } if (skb_headlen(skb) < iphdr_len) return 0; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 43a243081e7d..f901421b0634 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -43,17 +43,20 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&ca->tcf_tm); bstats_update(&ca->tcf_bstats, skb); - if (skb->protocol == htons(ETH_P_IP)) { + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): if (skb->len < sizeof(struct iphdr)) goto out; proto = NFPROTO_IPV4; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + break; + case htons(ETH_P_IPV6): if (skb->len < sizeof(struct ipv6hdr)) goto out; proto = NFPROTO_IPV6; - } else { + break; + default: goto out; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index cb8608f0a77a..c60674cf25c4 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -587,7 +587,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, goto drop; update_flags = params->update_flags; - protocol = tc_skb_protocol(skb); + protocol = skb_protocol(skb, false); again: switch (protocol) { case cpu_to_be16(ETH_P_IP): diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index e9f3576cbf71..86ed02487467 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -624,7 +624,7 @@ static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) { u8 family = NFPROTO_UNSPEC; - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): family = NFPROTO_IPV4; break; @@ -748,6 +748,7 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { + __be16 proto = skb_protocol(skb, true); int hooknum, err = NF_ACCEPT; /* See HOOK2MANIP(). */ @@ -759,14 +760,13 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: - if (skb->protocol == htons(ETH_P_IP) && + if (proto == htons(ETH_P_IP) && ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, hooknum)) err = NF_DROP; goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && - skb->protocol == htons(ETH_P_IPV6)) { + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { __be16 frag_off; u8 nexthdr = ipv6_hdr(skb)->nexthdr; int hdrlen = ipv6_skip_exthdr(skb, @@ -1550,4 +1550,3 @@ MODULE_AUTHOR("Yossi Kuperman "); MODULE_AUTHOR("Marcelo Ricardo Leitner "); MODULE_DESCRIPTION("Connection tracking action"); MODULE_LICENSE("GPL v2"); - diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 19649623493b..b5042f3ea079 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -96,19 +96,22 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); - if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV4; - } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { + break; + case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV6; - } else { + break; + default: goto out; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index be3f215cd027..8118e2640979 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -82,7 +82,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_MPLS_ACT_PUSH: - new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); + new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb_protocol(skb, true))); if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b125b2be4467..b2b3faa57294 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -41,7 +41,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index faa78b7dd962..e62beec0d844 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1538,7 +1538,7 @@ static inline int __tcf_classify(struct sk_buff *skb, reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { - __be16 protocol = tc_skb_protocol(skb); + __be16 protocol = skb_protocol(skb, false); int err; if (tp->protocol != protocol && diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 80ae7b9fa90a..ab53a93b2f2b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -80,7 +80,7 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) if (dst) return ntohl(dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_proto(const struct sk_buff *skb, @@ -104,7 +104,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb, if (flow->ports.ports) return ntohs(flow->ports.dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_iif(const struct sk_buff *skb) @@ -151,7 +151,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb) static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); case htons(ETH_P_IPV6): @@ -164,7 +164,7 @@ fallback: static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); case htons(ETH_P_IPV6): diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b2da37286082..e30bd969fc48 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -313,7 +313,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* skb_flow_dissect() does not set n_proto in case an unknown * protocol, so do it rather here. */ - skb_key.basic.n_proto = skb->protocol; + skb_key.basic.n_proto = skb_protocol(skb, false); skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index df00566d327d..c95cf86fb431 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -59,7 +59,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, }; int ret, network_offset; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): state.pf = NFPROTO_IPV4; if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index 18755d29fd15..3650117da47f 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -212,7 +212,7 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, struct nf_hook_state state; int ret; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) return 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index d99966a55c84..46254968d390 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -195,7 +195,7 @@ META_COLLECTOR(int_priority) META_COLLECTOR(int_protocol) { /* Let userspace take care of the byte ordering */ - dst->value = tc_skb_protocol(skb); + dst->value = skb_protocol(skb, false); } META_COLLECTOR(int_pkttype) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ca813697728e..ebaeec1e5c82 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -592,7 +592,7 @@ static bool cake_update_flowkeys(struct flow_keys *keys, bool rev = !skb->_nfct, upd = false; __be32 ip; - if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + if (skb_protocol(skb, true) != htons(ETH_P_IP)) return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) @@ -1557,7 +1557,7 @@ static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) u16 *buf, buf_; u8 dscp; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); if (unlikely(!buf)) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 05605b30bef3..2b88710994d7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -210,7 +210,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (p->set_tc_index) { int wlen = skb_network_offset(skb); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen) || @@ -303,7 +303,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) index = skb->tc_index & (p->indices - 1); pr_debug("index %d->%d\n", skb->tc_index, index); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, p->mv[index].value); @@ -320,7 +320,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) */ if (p->mv[index].mask != 0xff || p->mv[index].value) pr_warn("%s: unsupported protocol %d\n", - __func__, ntohs(tc_skb_protocol(skb))); + __func__, ntohs(skb_protocol(skb, true))); break; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 689ef6f3ded8..2f1f0a378408 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -239,7 +239,7 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, n, dev); - err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)), + err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)), haddr, NULL, skb->len); if (err < 0) -- cgit v1.2.3 From caebecb0326907e978b064926836b48f94fec62f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 15:41:09 -0700 Subject: Documentation: networking: arcnet: drop doubled word Drop the doubled word "to". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/networking/arcnet.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/arcnet.rst b/Documentation/networking/arcnet.rst index e93d9820f0f1..82fce606c0f0 100644 --- a/Documentation/networking/arcnet.rst +++ b/Documentation/networking/arcnet.rst @@ -434,7 +434,7 @@ can set up your network then: ifconfig arc0 insight route add insight arc0 route add freedom arc0 /* I would use the subnet here (like I said - to to in "single protocol" above), + to in "single protocol" above), but the rest of the subnet unfortunately lies across the PPP link on freedom, which confuses -- cgit v1.2.3 From e99094856d0ab192501c3e6e2ff77b3ae44f6445 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 15:41:10 -0700 Subject: Documentation: networking: ax25: drop doubled word Drop the doubled word "and". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: Ralf Baechle Cc: linux-hams@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/networking/ax25.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/ax25.rst b/Documentation/networking/ax25.rst index 824afd7002db..f060cfb1445a 100644 --- a/Documentation/networking/ax25.rst +++ b/Documentation/networking/ax25.rst @@ -6,7 +6,7 @@ AX.25 To use the amateur radio protocols within Linux you will need to get a suitable copy of the AX.25 Utilities. More detailed information about -AX.25, NET/ROM and ROSE, associated programs and and utilities can be +AX.25, NET/ROM and ROSE, associated programs and utilities can be found on http://www.linux-ax25.org. There is an active mailing list for discussing Linux amateur radio matters -- cgit v1.2.3 From 6d0fe3aea4a815929118a002a342849e6b9f0cfd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 15:41:11 -0700 Subject: Documentation: networking: can_ucan_protocol: drop doubled words Drop the doubled words "the" and "of". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: Wolfgang Grandegger Cc: Marc Kleine-Budde Cc: linux-can@vger.kernel.org Acked-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- Documentation/networking/can_ucan_protocol.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/can_ucan_protocol.rst b/Documentation/networking/can_ucan_protocol.rst index 4cef88d24fc7..638ac1ee7914 100644 --- a/Documentation/networking/can_ucan_protocol.rst +++ b/Documentation/networking/can_ucan_protocol.rst @@ -144,7 +144,7 @@ UCAN_COMMAND_SET_BITTIMING *Host2Dev; mandatory* -Setup bittiming by sending the the structure +Setup bittiming by sending the structure ``ucan_ctl_payload_t.cmd_set_bittiming`` (see ``struct bittiming`` for details) @@ -232,7 +232,7 @@ UCAN_IN_TX_COMPLETE zero The CAN device has sent a message to the CAN bus. It answers with a -list of of tuples . +list of tuples . The echo-id identifies the frame from (echos the id from a previous UCAN_OUT_TX message). The flag indicates the result of the -- cgit v1.2.3 From 4f6a009c8bdd69b611cf3a807128314992d07bf9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 15:41:12 -0700 Subject: Documentation: networking: dsa: drop doubled word Drop the doubled word "in". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: Andrew Lunn Cc: Vivien Didelot Cc: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/dsa/dsa.rst b/Documentation/networking/dsa/dsa.rst index 563d56c6a25c..a8d15dd2b42b 100644 --- a/Documentation/networking/dsa/dsa.rst +++ b/Documentation/networking/dsa/dsa.rst @@ -95,7 +95,7 @@ Ethernet switch. Networking stack hooks ---------------------- -When a master netdev is used with DSA, a small hook is placed in in the +When a master netdev is used with DSA, a small hook is placed in the networking stack is in order to have the DSA subsystem process the Ethernet switch specific tagging protocol. DSA accomplishes this by registering a specific (and fake) Ethernet type (later becoming ``skb->protocol``) with the -- cgit v1.2.3 From a7db3c766916e8a9a054925d036a2746d3e93596 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 15:41:13 -0700 Subject: Documentation: networking: ip-sysctl: drop doubled word Drop the doubled word "that". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index b72f89d5694c..837d51f9e1fa 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -741,7 +741,7 @@ tcp_fastopen - INTEGER Default: 0x1 - Note that that additional client or server features are only + Note that additional client or server features are only effective if the basic support (0x1 and 0x2) are enabled respectively. tcp_fastopen_blackhole_timeout_sec - INTEGER -- cgit v1.2.3 From 474112d57c70520ebd81a5ca578fee1d93fafd07 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 15:41:14 -0700 Subject: Documentation: networking: ipvs-sysctl: drop doubled word Drop the doubled word "that". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/networking/ipvs-sysctl.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index be36c4600e8f..2afccc63856e 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -114,7 +114,7 @@ drop_entry - INTEGER modes (when there is no enough available memory, the strategy is enabled and the variable is automatically set to 2, otherwise the strategy is disabled and the variable is set to - 1), and 3 means that that the strategy is always enabled. + 1), and 3 means that the strategy is always enabled. drop_packet - INTEGER - 0 - disabled (default) -- cgit v1.2.3 From e54ac95afb2fe9dabcbb139b1705d5a8fe96345a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 3 Jul 2020 15:41:15 -0700 Subject: Documentation: networking: rxrpc: drop doubled word Drop the doubled word "have". Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: David Howells Cc: linux-afs@lists.infradead.org Signed-off-by: David S. Miller --- Documentation/networking/rxrpc.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 68552b92dc44..39c2249c7aa7 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -186,7 +186,7 @@ About the AF_RXRPC driver: time [tunable] after the last connection using it discarded, in case a new connection is made that could use it. - (#) A client-side connection is only shared between calls if they have have + (#) A client-side connection is only shared between calls if they have the same key struct describing their security (and assuming the calls would otherwise share the connection). Non-secured calls would also be able to share connections with each other. -- cgit v1.2.3 From 6dbb89014dc303facc54d33ae64419d2f9c8ff32 Mon Sep 17 00:00:00 2001 From: Luo bin Date: Sat, 4 Jul 2020 15:32:43 +0800 Subject: hinic: fix sending mailbox timeout in aeq event work When sending mailbox in the work of aeq event, another aeq event will be triggered. because the last aeq work is not exited and only one work can be excuted simultaneously in the same workqueue, mailbox sending function will return failure of timeout. We create and use another workqueue to fix this. Signed-off-by: Luo bin Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c | 91 ++++++++++++++++++----- drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h | 16 ++++ 2 files changed, 88 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c index c33eb1147055..e0f5a81d8620 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c @@ -370,48 +370,89 @@ int hinic_msg_to_mgmt(struct hinic_pf_to_mgmt *pf_to_mgmt, MSG_NOT_RESP, timeout); } -/** - * mgmt_recv_msg_handler - handler for message from mgmt cpu - * @pf_to_mgmt: PF to MGMT channel - * @recv_msg: received message details - **/ -static void mgmt_recv_msg_handler(struct hinic_pf_to_mgmt *pf_to_mgmt, - struct hinic_recv_msg *recv_msg) +static void recv_mgmt_msg_work_handler(struct work_struct *work) { - struct hinic_hwif *hwif = pf_to_mgmt->hwif; - struct pci_dev *pdev = hwif->pdev; - u8 *buf_out = recv_msg->buf_out; + struct hinic_mgmt_msg_handle_work *mgmt_work = + container_of(work, struct hinic_mgmt_msg_handle_work, work); + struct hinic_pf_to_mgmt *pf_to_mgmt = mgmt_work->pf_to_mgmt; + struct pci_dev *pdev = pf_to_mgmt->hwif->pdev; + u8 *buf_out = pf_to_mgmt->mgmt_ack_buf; struct hinic_mgmt_cb *mgmt_cb; unsigned long cb_state; u16 out_size = 0; - if (recv_msg->mod >= HINIC_MOD_MAX) { + memset(buf_out, 0, MAX_PF_MGMT_BUF_SIZE); + + if (mgmt_work->mod >= HINIC_MOD_MAX) { dev_err(&pdev->dev, "Unknown MGMT MSG module = %d\n", - recv_msg->mod); + mgmt_work->mod); + kfree(mgmt_work->msg); + kfree(mgmt_work); return; } - mgmt_cb = &pf_to_mgmt->mgmt_cb[recv_msg->mod]; + mgmt_cb = &pf_to_mgmt->mgmt_cb[mgmt_work->mod]; cb_state = cmpxchg(&mgmt_cb->state, HINIC_MGMT_CB_ENABLED, HINIC_MGMT_CB_ENABLED | HINIC_MGMT_CB_RUNNING); if ((cb_state == HINIC_MGMT_CB_ENABLED) && (mgmt_cb->cb)) - mgmt_cb->cb(mgmt_cb->handle, recv_msg->cmd, - recv_msg->msg, recv_msg->msg_len, + mgmt_cb->cb(mgmt_cb->handle, mgmt_work->cmd, + mgmt_work->msg, mgmt_work->msg_len, buf_out, &out_size); else dev_err(&pdev->dev, "No MGMT msg handler, mod: %d, cmd: %d\n", - recv_msg->mod, recv_msg->cmd); + mgmt_work->mod, mgmt_work->cmd); mgmt_cb->state &= ~HINIC_MGMT_CB_RUNNING; - if (!recv_msg->async_mgmt_to_pf) + if (!mgmt_work->async_mgmt_to_pf) /* MGMT sent sync msg, send the response */ - msg_to_mgmt_async(pf_to_mgmt, recv_msg->mod, recv_msg->cmd, + msg_to_mgmt_async(pf_to_mgmt, mgmt_work->mod, mgmt_work->cmd, buf_out, out_size, MGMT_RESP, - recv_msg->msg_id); + mgmt_work->msg_id); + + kfree(mgmt_work->msg); + kfree(mgmt_work); +} + +/** + * mgmt_recv_msg_handler - handler for message from mgmt cpu + * @pf_to_mgmt: PF to MGMT channel + * @recv_msg: received message details + **/ +static void mgmt_recv_msg_handler(struct hinic_pf_to_mgmt *pf_to_mgmt, + struct hinic_recv_msg *recv_msg) +{ + struct hinic_mgmt_msg_handle_work *mgmt_work = NULL; + struct pci_dev *pdev = pf_to_mgmt->hwif->pdev; + + mgmt_work = kzalloc(sizeof(*mgmt_work), GFP_KERNEL); + if (!mgmt_work) { + dev_err(&pdev->dev, "Allocate mgmt work memory failed\n"); + return; + } + + if (recv_msg->msg_len) { + mgmt_work->msg = kzalloc(recv_msg->msg_len, GFP_KERNEL); + if (!mgmt_work->msg) { + dev_err(&pdev->dev, "Allocate mgmt msg memory failed\n"); + kfree(mgmt_work); + return; + } + } + + mgmt_work->pf_to_mgmt = pf_to_mgmt; + mgmt_work->msg_len = recv_msg->msg_len; + memcpy(mgmt_work->msg, recv_msg->msg, recv_msg->msg_len); + mgmt_work->msg_id = recv_msg->msg_id; + mgmt_work->mod = recv_msg->mod; + mgmt_work->cmd = recv_msg->cmd; + mgmt_work->async_mgmt_to_pf = recv_msg->async_mgmt_to_pf; + + INIT_WORK(&mgmt_work->work, recv_mgmt_msg_work_handler); + queue_work(pf_to_mgmt->workq, &mgmt_work->work); } /** @@ -546,6 +587,12 @@ static int alloc_msg_buf(struct hinic_pf_to_mgmt *pf_to_mgmt) if (!pf_to_mgmt->sync_msg_buf) return -ENOMEM; + pf_to_mgmt->mgmt_ack_buf = devm_kzalloc(&pdev->dev, + MAX_PF_MGMT_BUF_SIZE, + GFP_KERNEL); + if (!pf_to_mgmt->mgmt_ack_buf) + return -ENOMEM; + return 0; } @@ -571,6 +618,11 @@ int hinic_pf_to_mgmt_init(struct hinic_pf_to_mgmt *pf_to_mgmt, return 0; sema_init(&pf_to_mgmt->sync_msg_lock, 1); + pf_to_mgmt->workq = create_singlethread_workqueue("hinic_mgmt"); + if (!pf_to_mgmt->workq) { + dev_err(&pdev->dev, "Failed to initialize MGMT workqueue\n"); + return -ENOMEM; + } pf_to_mgmt->sync_msg_id = 0; err = alloc_msg_buf(pf_to_mgmt); @@ -605,4 +657,5 @@ void hinic_pf_to_mgmt_free(struct hinic_pf_to_mgmt *pf_to_mgmt) hinic_aeq_unregister_hw_cb(&hwdev->aeqs, HINIC_MSG_FROM_MGMT_CPU); hinic_api_cmd_free(pf_to_mgmt->cmd_chain); + destroy_workqueue(pf_to_mgmt->workq); } diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h index c2b142c08b0e..a824fbda59db 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h @@ -119,6 +119,7 @@ struct hinic_pf_to_mgmt { struct semaphore sync_msg_lock; u16 sync_msg_id; u8 *sync_msg_buf; + void *mgmt_ack_buf; struct hinic_recv_msg recv_resp_msg_from_mgmt; struct hinic_recv_msg recv_msg_from_mgmt; @@ -126,6 +127,21 @@ struct hinic_pf_to_mgmt { struct hinic_api_cmd_chain *cmd_chain[HINIC_API_CMD_MAX]; struct hinic_mgmt_cb mgmt_cb[HINIC_MOD_MAX]; + + struct workqueue_struct *workq; +}; + +struct hinic_mgmt_msg_handle_work { + struct work_struct work; + struct hinic_pf_to_mgmt *pf_to_mgmt; + + void *msg; + u16 msg_len; + + enum hinic_mod_type mod; + u8 cmd; + u16 msg_id; + int async_mgmt_to_pf; }; void hinic_register_mgmt_msg_cb(struct hinic_pf_to_mgmt *pf_to_mgmt, -- cgit v1.2.3 From ccfc9df1352be5b2f391091e18c4b2395d30ce78 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 2 Jul 2020 17:06:19 +0000 Subject: hsr: fix interface leak in error path of hsr_dev_finalize() To release hsr(upper) interface, it should release its own lower interfaces first. Then, hsr(upper) interface can be released safely. In the current code of error path of hsr_dev_finalize(), it releases hsr interface before releasing a lower interface. So, a warning occurs, which warns about the leak of lower interfaces. In order to fix this problem, changing the ordering of the error path of hsr_dev_finalize() is needed. Test commands: ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add dummy2 type dummy ip link add hsr0 type hsr slave1 dummy0 slave2 dummy1 ip link add hsr1 type hsr slave1 dummy2 slave2 dummy0 Splat looks like: [ 214.923127][ C2] WARNING: CPU: 2 PID: 1093 at net/core/dev.c:8992 rollback_registered_many+0x986/0xcf0 [ 214.923129][ C2] Modules linked in: hsr dummy openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipx [ 214.923154][ C2] CPU: 2 PID: 1093 Comm: ip Not tainted 5.8.0-rc2+ #623 [ 214.923156][ C2] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 214.923157][ C2] RIP: 0010:rollback_registered_many+0x986/0xcf0 [ 214.923160][ C2] Code: 41 8b 4e cc 45 31 c0 31 d2 4c 89 ee 48 89 df e8 e0 47 ff ff 85 c0 0f 84 cd fc ff ff 5 [ 214.923162][ C2] RSP: 0018:ffff8880c5156f28 EFLAGS: 00010287 [ 214.923165][ C2] RAX: ffff8880d1dad458 RBX: ffff8880bd1b9000 RCX: ffffffffb929d243 [ 214.923167][ C2] RDX: 1ffffffff77e63f0 RSI: 0000000000000008 RDI: ffffffffbbf31f80 [ 214.923168][ C2] RBP: dffffc0000000000 R08: fffffbfff77e63f1 R09: fffffbfff77e63f1 [ 214.923170][ C2] R10: ffffffffbbf31f87 R11: 0000000000000001 R12: ffff8880c51570a0 [ 214.923172][ C2] R13: ffff8880bd1b90b8 R14: ffff8880c5157048 R15: ffff8880d1dacc40 [ 214.923174][ C2] FS: 00007fdd257a20c0(0000) GS:ffff8880da200000(0000) knlGS:0000000000000000 [ 214.923175][ C2] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 214.923177][ C2] CR2: 00007ffd78beb038 CR3: 00000000be544005 CR4: 00000000000606e0 [ 214.923179][ C2] Call Trace: [ 214.923180][ C2] ? netif_set_real_num_tx_queues+0x780/0x780 [ 214.923182][ C2] ? dev_validate_mtu+0x140/0x140 [ 214.923183][ C2] ? synchronize_rcu.part.79+0x85/0xd0 [ 214.923185][ C2] ? synchronize_rcu_expedited+0xbb0/0xbb0 [ 214.923187][ C2] rollback_registered+0xc8/0x170 [ 214.923188][ C2] ? rollback_registered_many+0xcf0/0xcf0 [ 214.923190][ C2] unregister_netdevice_queue+0x18b/0x240 [ 214.923191][ C2] hsr_dev_finalize+0x56e/0x6e0 [hsr] [ 214.923192][ C2] hsr_newlink+0x36b/0x450 [hsr] [ 214.923194][ C2] ? hsr_dellink+0x70/0x70 [hsr] [ 214.923195][ C2] ? rtnl_create_link+0x2e4/0xb00 [ 214.923197][ C2] ? __netlink_ns_capable+0xc3/0xf0 [ 214.923198][ C2] __rtnl_newlink+0xbdb/0x1270 [ ... ] Fixes: e0a4b99773d3 ("hsr: use upper/lower device infrastructure") Reported-by: syzbot+7f1c020f68dab95aab59@syzkaller.appspotmail.com Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 478852ef98ef..a6f4e9f65b14 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -415,6 +415,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], unsigned char multicast_spec, u8 protocol_version, struct netlink_ext_ack *extack) { + bool unregister = false; struct hsr_priv *hsr; int res; @@ -466,25 +467,27 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_unregister; + unregister = true; + res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack); if (res) - goto err_add_slaves; + goto err_unregister; res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack); if (res) - goto err_add_slaves; + goto err_unregister; hsr_debugfs_init(hsr, hsr_dev); mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); return 0; -err_add_slaves: - unregister_netdevice(hsr_dev); err_unregister: hsr_del_ports(hsr); err_add_master: hsr_del_self_node(hsr); + if (unregister) + unregister_netdevice(hsr_dev); return res; } -- cgit v1.2.3 From 2a762e9e8cd1cf1242e4269a2244666ed02eecd1 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 2 Jul 2020 17:08:18 +0000 Subject: net: rmnet: fix lower interface leak There are two types of the lower interface of rmnet that are VND and BRIDGE. Each lower interface can have only one type either VND or BRIDGE. But, there is a case, which uses both lower interface types. Due to this unexpected behavior, lower interface leak occurs. Test commands: ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add rmnet0 link dummy0 type rmnet mux_id 1 ip link set dummy1 master rmnet0 ip link add rmnet1 link dummy1 type rmnet mux_id 2 ip link del rmnet0 The dummy1 was attached as BRIDGE interface of rmnet0. Then, it also was attached as VND interface of rmnet1. This is unexpected behavior and there is no code for handling this case. So that below splat occurs when the rmnet0 interface is deleted. Splat looks like: [ 53.254112][ C1] WARNING: CPU: 1 PID: 1192 at net/core/dev.c:8992 rollback_registered_many+0x986/0xcf0 [ 53.254117][ C1] Modules linked in: rmnet dummy openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nfx [ 53.254182][ C1] CPU: 1 PID: 1192 Comm: ip Not tainted 5.8.0-rc1+ #620 [ 53.254188][ C1] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 53.254192][ C1] RIP: 0010:rollback_registered_many+0x986/0xcf0 [ 53.254200][ C1] Code: 41 8b 4e cc 45 31 c0 31 d2 4c 89 ee 48 89 df e8 e0 47 ff ff 85 c0 0f 84 cd fc ff ff 0f 0b e5 [ 53.254205][ C1] RSP: 0018:ffff888050a5f2e0 EFLAGS: 00010287 [ 53.254214][ C1] RAX: ffff88805756d658 RBX: ffff88804d99c000 RCX: ffffffff8329d323 [ 53.254219][ C1] RDX: 1ffffffff0be6410 RSI: 0000000000000008 RDI: ffffffff85f32080 [ 53.254223][ C1] RBP: dffffc0000000000 R08: fffffbfff0be6411 R09: fffffbfff0be6411 [ 53.254228][ C1] R10: ffffffff85f32087 R11: 0000000000000001 R12: ffff888050a5f480 [ 53.254233][ C1] R13: ffff88804d99c0b8 R14: ffff888050a5f400 R15: ffff8880548ebe40 [ 53.254238][ C1] FS: 00007f6b86b370c0(0000) GS:ffff88806c200000(0000) knlGS:0000000000000000 [ 53.254243][ C1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 53.254248][ C1] CR2: 0000562c62438758 CR3: 000000003f600005 CR4: 00000000000606e0 [ 53.254253][ C1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 53.254257][ C1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 53.254261][ C1] Call Trace: [ 53.254266][ C1] ? lockdep_hardirqs_on_prepare+0x379/0x540 [ 53.254270][ C1] ? netif_set_real_num_tx_queues+0x780/0x780 [ 53.254275][ C1] ? rmnet_unregister_real_device+0x56/0x90 [rmnet] [ 53.254279][ C1] ? __kasan_slab_free+0x126/0x150 [ 53.254283][ C1] ? kfree+0xdc/0x320 [ 53.254288][ C1] ? rmnet_unregister_real_device+0x56/0x90 [rmnet] [ 53.254293][ C1] unregister_netdevice_many.part.135+0x13/0x1b0 [ 53.254297][ C1] rtnl_delete_link+0xbc/0x100 [ 53.254301][ C1] ? rtnl_af_register+0xc0/0xc0 [ 53.254305][ C1] rtnl_dellink+0x2dc/0x840 [ 53.254309][ C1] ? find_held_lock+0x39/0x1d0 [ 53.254314][ C1] ? valid_fdb_dump_strict+0x620/0x620 [ 53.254318][ C1] ? rtnetlink_rcv_msg+0x457/0x890 [ 53.254322][ C1] ? lock_contended+0xd20/0xd20 [ 53.254326][ C1] rtnetlink_rcv_msg+0x4a8/0x890 [ ... ] [ 73.813696][ T1192] unregister_netdevice: waiting for rmnet0 to become free. Usage count = 1 Fixes: 037f9cdf72fb ("net: rmnet: use upper/lower device infrastructure") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 40efe60eff8d..2c8c252b7b97 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -47,15 +47,23 @@ static int rmnet_unregister_real_device(struct net_device *real_dev) return 0; } -static int rmnet_register_real_device(struct net_device *real_dev) +static int rmnet_register_real_device(struct net_device *real_dev, + struct netlink_ext_ack *extack) { struct rmnet_port *port; int rc, entry; ASSERT_RTNL(); - if (rmnet_is_real_dev_registered(real_dev)) + if (rmnet_is_real_dev_registered(real_dev)) { + port = rmnet_get_port_rtnl(real_dev); + if (port->rmnet_mode != RMNET_EPMODE_VND) { + NL_SET_ERR_MSG_MOD(extack, "bridge device already exists"); + return -EINVAL; + } + return 0; + } port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) @@ -133,7 +141,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]); - err = rmnet_register_real_device(real_dev); + err = rmnet_register_real_device(real_dev, extack); if (err) goto err0; @@ -421,11 +429,6 @@ int rmnet_add_bridge(struct net_device *rmnet_dev, return -EINVAL; } - if (port->rmnet_mode != RMNET_EPMODE_VND) { - NL_SET_ERR_MSG_MOD(extack, "bridge device already exists"); - return -EINVAL; - } - if (rmnet_is_real_dev_registered(slave_dev)) { NL_SET_ERR_MSG_MOD(extack, "slave cannot be another rmnet dev"); @@ -433,7 +436,7 @@ int rmnet_add_bridge(struct net_device *rmnet_dev, return -EBUSY; } - err = rmnet_register_real_device(slave_dev); + err = rmnet_register_real_device(slave_dev, extack); if (err) return -EBUSY; -- cgit v1.2.3 From 2fb2799a2abb39d7dbb48abb3baa1133bf5e921a Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 2 Jul 2020 17:08:55 +0000 Subject: net: rmnet: do not allow to add multiple bridge interfaces rmnet can have only two bridge interface. One of them is a link interface and another one is added by the master operation. rmnet interface shouldn't allow adding additional bridge interfaces by mater operation. But, there is no code to deny additional interfaces. So, interface leak occurs. Test commands: ip link add dummy0 type dummy ip link add dummy1 type dummy ip link add dummy2 type dummy ip link add rmnet0 link dummy0 type rmnet mux_id 1 ip link set dummy1 master rmnet0 ip link set dummy2 master rmnet0 ip link del rmnet0 In the above test command, the dummy0 was attached to rmnet as VND mode. Then, dummy1 was attached to rmnet0 as BRIDGE mode. At this point, dummy0 mode is switched from VND to BRIDGE automatically. Then, dummy2 is attached to rmnet as BRIDGE mode. At this point, rmnet0 should deny this operation. But, rmnet0 doesn't deny this. So that below splat occurs when the rmnet0 interface is deleted. Splat looks like: [ 186.684787][ C2] WARNING: CPU: 2 PID: 1009 at net/core/dev.c:8992 rollback_registered_many+0x986/0xcf0 [ 186.684788][ C2] Modules linked in: rmnet dummy openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_x [ 186.684805][ C2] CPU: 2 PID: 1009 Comm: ip Not tainted 5.8.0-rc1+ #621 [ 186.684807][ C2] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 186.684808][ C2] RIP: 0010:rollback_registered_many+0x986/0xcf0 [ 186.684811][ C2] Code: 41 8b 4e cc 45 31 c0 31 d2 4c 89 ee 48 89 df e8 e0 47 ff ff 85 c0 0f 84 cd fc ff ff 5 [ 186.684812][ C2] RSP: 0018:ffff8880cd9472e0 EFLAGS: 00010287 [ 186.684815][ C2] RAX: ffff8880cc56da58 RBX: ffff8880ab21c000 RCX: ffffffff9329d323 [ 186.684816][ C2] RDX: 1ffffffff2be6410 RSI: 0000000000000008 RDI: ffffffff95f32080 [ 186.684818][ C2] RBP: dffffc0000000000 R08: fffffbfff2be6411 R09: fffffbfff2be6411 [ 186.684819][ C2] R10: ffffffff95f32087 R11: 0000000000000001 R12: ffff8880cd947480 [ 186.684820][ C2] R13: ffff8880ab21c0b8 R14: ffff8880cd947400 R15: ffff8880cdf10640 [ 186.684822][ C2] FS: 00007f00843890c0(0000) GS:ffff8880d4e00000(0000) knlGS:0000000000000000 [ 186.684823][ C2] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 186.684825][ C2] CR2: 000055b8ab1077b8 CR3: 00000000ab612006 CR4: 00000000000606e0 [ 186.684826][ C2] Call Trace: [ 186.684827][ C2] ? lockdep_hardirqs_on_prepare+0x379/0x540 [ 186.684829][ C2] ? netif_set_real_num_tx_queues+0x780/0x780 [ 186.684830][ C2] ? rmnet_unregister_real_device+0x56/0x90 [rmnet] [ 186.684831][ C2] ? __kasan_slab_free+0x126/0x150 [ 186.684832][ C2] ? kfree+0xdc/0x320 [ 186.684834][ C2] ? rmnet_unregister_real_device+0x56/0x90 [rmnet] [ 186.684835][ C2] unregister_netdevice_many.part.135+0x13/0x1b0 [ 186.684836][ C2] rtnl_delete_link+0xbc/0x100 [ ... ] [ 238.440071][ T1009] unregister_netdevice: waiting for rmnet0 to become free. Usage count = 1 Fixes: 037f9cdf72fb ("net: rmnet: use upper/lower device infrastructure") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 2c8c252b7b97..fcdecddb2812 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -429,6 +429,11 @@ int rmnet_add_bridge(struct net_device *rmnet_dev, return -EINVAL; } + if (port->rmnet_mode != RMNET_EPMODE_VND) { + NL_SET_ERR_MSG_MOD(extack, "more than one bridge dev attached"); + return -EINVAL; + } + if (rmnet_is_real_dev_registered(slave_dev)) { NL_SET_ERR_MSG_MOD(extack, "slave cannot be another rmnet dev"); -- cgit v1.2.3 From 9dc829a135fb5927f1519de11286e2bbb79f5b66 Mon Sep 17 00:00:00 2001 From: Xie He Date: Sun, 5 Jul 2020 17:45:21 -0700 Subject: drivers/net/wan/lapbether: Fixed the value of hard_header_len When this driver transmits data, first this driver will remove a pseudo header of 1 byte, then the lapb module will prepend the LAPB header of 2 or 3 bytes, then this driver will prepend a length field of 2 bytes, then the underlying Ethernet device will prepend its own header. So, the header length required should be: -1 + 3 + 2 + "the header length needed by the underlying device". This patch fixes kernel panic when this driver is used with AF_PACKET SOCK_DGRAM sockets. Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/lapbether.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index e30d91a38cfb..284832314f31 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -303,7 +303,6 @@ static void lapbeth_setup(struct net_device *dev) dev->netdev_ops = &lapbeth_netdev_ops; dev->needs_free_netdev = true; dev->type = ARPHRD_X25; - dev->hard_header_len = 3; dev->mtu = 1000; dev->addr_len = 0; } @@ -324,6 +323,14 @@ static int lapbeth_new_device(struct net_device *dev) if (!ndev) goto out; + /* When transmitting data: + * first this driver removes a pseudo header of 1 byte, + * then the lapb module prepends an LAPB header of at most 3 bytes, + * then this driver prepends a length field of 2 bytes, + * then the underlying Ethernet device prepends its own header. + */ + ndev->hard_header_len = -1 + 3 + 2 + dev->hard_header_len; + lapbeth = netdev_priv(ndev); lapbeth->axdev = ndev; -- cgit v1.2.3 From 7c8b1e855f94f88a0c569be6309fc8d5c8844cd1 Mon Sep 17 00:00:00 2001 From: Andre Edich Date: Mon, 6 Jul 2020 10:39:34 +0200 Subject: smsc95xx: check return value of smsc95xx_reset The return value of the function smsc95xx_reset() must be checked to avoid returning false success from the function smsc95xx_bind(). Fixes: 2f7ca802bdae2 ("net: Add SMSC LAN9500 USB2.0 10/100 ethernet adapter driver") Signed-off-by: Andre Edich Signed-off-by: Parthiban Veerasooran Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 3cf4dc3433f9..eb404bb74e18 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1287,6 +1287,8 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) /* Init all registers */ ret = smsc95xx_reset(dev); + if (ret) + goto free_pdata; /* detect device revision as different features may be available */ ret = smsc95xx_read_reg(dev, ID_REV, &val); @@ -1317,6 +1319,10 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) schedule_delayed_work(&pdata->carrier_check, CARRIER_CHECK_DELAY); return 0; + +free_pdata: + kfree(pdata); + return ret; } static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf) -- cgit v1.2.3 From 3ed58f96a70b85ef646d5427258f677f1395b62f Mon Sep 17 00:00:00 2001 From: Andre Edich Date: Mon, 6 Jul 2020 10:39:35 +0200 Subject: smsc95xx: avoid memory leak in smsc95xx_bind In a case where the ID_REV register read is failed, the memory for a private data structure has to be freed before returning error from the function smsc95xx_bind. Fixes: bbd9f9ee69242 ("smsc95xx: add wol support for more frame types") Signed-off-by: Andre Edich Signed-off-by: Parthiban Veerasooran Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index eb404bb74e18..bb4ccbda031a 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1293,7 +1293,8 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) /* detect device revision as different features may be available */ ret = smsc95xx_read_reg(dev, ID_REV, &val); if (ret < 0) - return ret; + goto free_pdata; + val >>= 16; pdata->chip_id = val; pdata->mdix_ctrl = get_mdix_status(dev->net); -- cgit v1.2.3 From bb3d866882c280a85e8950d4d72af1e294d2e69c Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 6 Jul 2020 19:25:59 +0800 Subject: net: hns3: check reset pending after FLR prepare If there is a PF reset pending before FLR prepare, FLR's preparatory work will not fail, but the FLR rebuild procedure will fail for this pending. So this PF reset pending should be handled in the FLR preparatory. Fixes: 8627bdedc435 ("net: hns3: refactor the precedure of PF FLR") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 96bfad52630d..d6bfdc6520df 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9859,7 +9859,7 @@ retry: set_bit(HCLGE_STATE_RST_HANDLING, &hdev->state); hdev->reset_type = HNAE3_FLR_RESET; ret = hclge_reset_prepare(hdev); - if (ret) { + if (ret || hdev->reset_pending) { dev_err(&hdev->pdev->dev, "fail to prepare FLR, ret=%d\n", ret); if (hdev->reset_pending || -- cgit v1.2.3 From cddd5648926d7a6e84526dadd8bfb21609a14fb7 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 6 Jul 2020 19:26:00 +0800 Subject: net: hns3: fix for mishandle of asserting VF reset fail When asserts VF reset fail, flag HCLGEVF_STATE_CMD_DISABLE and handshake status should not set, otherwise the retry will fail. So adds a check for asserting VF reset and returns directly when fails. Fixes: ef5f8e507ec9 ("net: hns3: stop handling command queue while resetting VF") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 1b9578d0bd80..a10b022d1951 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1793,6 +1793,11 @@ static int hclgevf_reset_prepare_wait(struct hclgevf_dev *hdev) if (hdev->reset_type == HNAE3_VF_FUNC_RESET) { hclgevf_build_send_msg(&send_msg, HCLGE_MBX_RESET, 0); ret = hclgevf_send_mbx_msg(hdev, &send_msg, true, NULL, 0); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to assert VF reset, ret = %d\n", ret); + return ret; + } hdev->rst_stats.vf_func_rst_cnt++; } -- cgit v1.2.3 From e22b5e728bbb179b912d3a3cd5c25894a89a26a2 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 6 Jul 2020 19:26:01 +0800 Subject: net: hns3: add a missing uninit debugfs when unload driver When unloading driver, if flag HNS3_NIC_STATE_INITED has been already cleared, the debugfs will not be uninitialized, so fix it. Fixes: b2292360bb2a ("net: hns3: Add debugfs framework registration") Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b14f2abc2425..c38f3bbe7d97 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4127,9 +4127,8 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset) hns3_put_ring_config(priv); - hns3_dbg_uninit(handle); - out_netdev_free: + hns3_dbg_uninit(handle); free_netdev(netdev); } -- cgit v1.2.3 From a06656211304fec653c1931c2ca6d644013b5bbb Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Mon, 6 Jul 2020 19:26:02 +0800 Subject: net: hns3: fix use-after-free when doing self test Enable promisc mode of PF, set VF link state to enable, and run iperf of the VF, then do self test of the PF. The self test will fail with a low frequency, and may cause a use-after-free problem. [ 87.142126] selftest:000004a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [ 87.159722] ================================================================== [ 87.174187] BUG: KASAN: use-after-free in hex_dump_to_buffer+0x140/0x608 [ 87.187600] Read of size 1 at addr ffff003b22828000 by task ethtool/1186 [ 87.201012] [ 87.203978] CPU: 7 PID: 1186 Comm: ethtool Not tainted 5.5.0-rc4-gfd51c473-dirty #4 [ 87.219306] Hardware name: Huawei TaiShan 2280 V2/BC82AMDA, BIOS TA BIOS 2280-A CS V2.B160.01 01/15/2020 [ 87.238292] Call trace: [ 87.243173] dump_backtrace+0x0/0x280 [ 87.250491] show_stack+0x24/0x30 [ 87.257114] dump_stack+0xe8/0x140 [ 87.263911] print_address_description.isra.8+0x70/0x380 [ 87.274538] __kasan_report+0x12c/0x230 [ 87.282203] kasan_report+0xc/0x18 [ 87.288999] __asan_load1+0x60/0x68 [ 87.295969] hex_dump_to_buffer+0x140/0x608 [ 87.304332] print_hex_dump+0x140/0x1e0 [ 87.312000] hns3_lb_check_skb_data+0x168/0x170 [ 87.321060] hns3_clean_rx_ring+0xa94/0xfe0 [ 87.329422] hns3_self_test+0x708/0x8c0 The length of packet sent by the selftest process is only 128 + 14 bytes, and the min buffer size of a BD is 256 bytes, and the receive process will make sure the packet sent by the selftest process is in the linear part, so only check the linear part in hns3_lb_check_skb_data(). So fix this use-after-free by using skb_headlen() to dump skb->data instead of skb->len. Fixes: c39c4d98dc65 ("net: hns3: Add mac loopback selftest support in hns3 driver") Signed-off-by: Yonglong Liu Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 6b1545f982aa..2622e04e8eed 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -180,18 +180,21 @@ static void hns3_lb_check_skb_data(struct hns3_enet_ring *ring, { struct hns3_enet_tqp_vector *tqp_vector = ring->tqp_vector; unsigned char *packet = skb->data; + u32 len = skb_headlen(skb); u32 i; - for (i = 0; i < skb->len; i++) + len = min_t(u32, len, HNS3_NIC_LB_TEST_PACKET_SIZE); + + for (i = 0; i < len; i++) if (packet[i] != (unsigned char)(i & 0xff)) break; /* The packet is correctly received */ - if (i == skb->len) + if (i == HNS3_NIC_LB_TEST_PACKET_SIZE) tqp_vector->rx_group.total_packets++; else print_hex_dump(KERN_ERR, "selftest:", DUMP_PREFIX_OFFSET, 16, 1, - skb->data, skb->len, true); + skb->data, len, true); dev_kfree_skb_any(skb); } -- cgit v1.2.3 From 34fe5a1cf95c3f114068fc16d919c9cf4b00e428 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 6 Jul 2020 11:45:07 -0600 Subject: ipv6: fib6_select_path can not use out path for nexthop objects Brian reported a crash in IPv6 code when using rpfilter with a setup running FRR and external nexthop objects. The root cause of the crash is fib6_select_path setting fib6_nh in the result to NULL because of an improper check for nexthop objects. More specifically, rpfilter invokes ip6_route_lookup with flowi6_oif set causing fib6_select_path to be called with have_oif_match set. fib6_select_path has early check on have_oif_match and jumps to the out label which presumes a builtin fib6_nh. This path is invalid for nexthop objects; for external nexthops fib6_select_path needs to just return if the fib6_nh has already been set in the result otherwise it returns after the call to nexthop_path_fib6_result. Update the check on have_oif_match to not bail on external nexthops. Update selftests for this problem. Fixes: f88d8ea67fbd ("ipv6: Plumb support for nexthop object in a fib6_info") Reported-by: Brian Rak Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 ++++- tools/testing/selftests/net/fib_nexthops.sh | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 82cbb46a2a4f..ea0be7cf3d93 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -431,9 +431,12 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; - if ((!match->fib6_nsiblings && !match->nh) || have_oif_match) + if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; + if (match->nh && have_oif_match && res->nh) + return; + /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index dee567f7576a..22dc2f3d428b 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -747,6 +747,19 @@ ipv6_fcnal_runtime() run_cmd "$IP nexthop add id 86 via 2001:db8:91::2 dev veth1" run_cmd "$IP ro add 2001:db8:101::1/128 nhid 81" + # rpfilter and default route + $IP nexthop flush >/dev/null 2>&1 + run_cmd "ip netns exec me ip6tables -t mangle -I PREROUTING 1 -m rpfilter --invert -j DROP" + run_cmd "$IP nexthop add id 91 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 92 via 2001:db8:92::2 dev veth3" + run_cmd "$IP nexthop add id 93 group 91/92" + run_cmd "$IP -6 ro add default nhid 91" + run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1" + log_test $? 0 "Nexthop with default route and rpfilter" + run_cmd "$IP -6 ro replace default nhid 93" + run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1" + log_test $? 0 "Nexthop with multipath default route and rpfilter" + # TO-DO: # existing route with old nexthop; append route with new nexthop # existing route with old nexthop; replace route with new -- cgit v1.2.3 From 74478ea4ded519db35cb1f059948b1e713bb4abf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 Jul 2020 18:10:08 -0500 Subject: net: ipa: fix QMI structure definition bugs Building with "W=1" did exactly what it was supposed to do, namely point out some suspicious-looking code to be verified not to contain bugs. Some QMI message structures defined in "ipa_qmi_msg.c" contained some bad field names (duplicating the "elem_size" field instead of defining the "offset" field), almost certainly due to copy/paste errors that weren't obvious in a scan of the code. Fix these bugs. Fixes: 530f9216a953 ("soc: qcom: ipa: AP/modem communications") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_qmi_msg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ipa/ipa_qmi_msg.c b/drivers/net/ipa/ipa_qmi_msg.c index 03a1d0e55964..73413371e3d3 100644 --- a/drivers/net/ipa/ipa_qmi_msg.c +++ b/drivers/net/ipa/ipa_qmi_msg.c @@ -119,7 +119,7 @@ struct qmi_elem_info ipa_driver_init_complete_rsp_ei[] = { sizeof_field(struct ipa_driver_init_complete_rsp, rsp), .tlv_type = 0x02, - .elem_size = offsetof(struct ipa_driver_init_complete_rsp, + .offset = offsetof(struct ipa_driver_init_complete_rsp, rsp), .ei_array = qmi_response_type_v01_ei, }, @@ -137,7 +137,7 @@ struct qmi_elem_info ipa_init_complete_ind_ei[] = { sizeof_field(struct ipa_init_complete_ind, status), .tlv_type = 0x02, - .elem_size = offsetof(struct ipa_init_complete_ind, + .offset = offsetof(struct ipa_init_complete_ind, status), .ei_array = qmi_response_type_v01_ei, }, @@ -218,7 +218,7 @@ struct qmi_elem_info ipa_init_modem_driver_req_ei[] = { sizeof_field(struct ipa_init_modem_driver_req, platform_type_valid), .tlv_type = 0x10, - .elem_size = offsetof(struct ipa_init_modem_driver_req, + .offset = offsetof(struct ipa_init_modem_driver_req, platform_type_valid), }, { -- cgit v1.2.3 From 3c90e95bd958285d416e83cffda82ddc92d91294 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 Jul 2020 18:10:09 -0500 Subject: net: ipa: declare struct types in "ipa_gsi.h" Pointers to two struct types are used in "ipa_gsi.h", without those struct types being forward-declared. Add these declarations. Fixes: c3f398b141a8 ("soc: qcom: ipa: IPA interface to GSI") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_gsi.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ipa/ipa_gsi.h b/drivers/net/ipa/ipa_gsi.h index 3cf18600c68e..0a40f3dc55fc 100644 --- a/drivers/net/ipa/ipa_gsi.h +++ b/drivers/net/ipa/ipa_gsi.h @@ -8,7 +8,9 @@ #include +struct gsi; struct gsi_trans; +struct ipa_gsi_endpoint_data; /** * ipa_gsi_trans_complete() - GSI transaction completion callback -- cgit v1.2.3 From a21c1f028fbae8c8e2e2602d3ea206fefa448d73 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 6 Jul 2020 18:10:10 -0500 Subject: net: ipa: include declarations in "ipa_gsi.c" Include "ipa_gsi.h" in "ipa_gsi.c", so the public functions are defined before they are used in "ipa_gsi.c". This addresses some warnings that are reported with a "W=1" build. Fixes: c3f398b141a8 ("soc: qcom: ipa: IPA interface to GSI") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_gsi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipa/ipa_gsi.c b/drivers/net/ipa/ipa_gsi.c index dc4a5c2196ae..d323adb03383 100644 --- a/drivers/net/ipa/ipa_gsi.c +++ b/drivers/net/ipa/ipa_gsi.c @@ -6,6 +6,7 @@ #include +#include "ipa_gsi.h" #include "gsi_trans.h" #include "ipa.h" #include "ipa_endpoint.h" -- cgit v1.2.3 From f815dd5cf48b905eeecf0a2b990e9b7ab048b4f1 Mon Sep 17 00:00:00 2001 From: AceLan Kao Date: Tue, 7 Jul 2020 16:14:45 +0800 Subject: net: usb: qmi_wwan: add support for Quectel EG95 LTE modem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Quectel Wireless Solutions Co., Ltd. EG95 LTE modem T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=02 Dev#= 5 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0195 Rev=03.18 S: Manufacturer=Android S: Product=Android C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#=0x0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) I: If#=0x1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#=0x4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) Signed-off-by: AceLan Kao Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 31b1d4b959f6..07c42c0719f5 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1370,6 +1370,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x1e0e, 0x9001, 5)}, /* SIMCom 7100E, 7230E, 7600E ++ */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0191, 4)}, /* Quectel EG91 */ + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ -- cgit v1.2.3 From aea23c323d89836bcdcee67e49def997ffca043b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 7 Jul 2020 07:39:24 -0600 Subject: ipv6: Fix use of anycast address with loopback Thomas reported a regression with IPv6 and anycast using the following reproducer: echo 1 > /proc/sys/net/ipv6/conf/all/forwarding ip -6 a add fc12::1/16 dev lo sleep 2 echo "pinging lo" ping6 -c 2 fc12:: The conversion of addrconf_f6i_alloc to use ip6_route_info_create missed the use of fib6_is_reject which checks addresses added to the loopback interface and sets the REJECT flag as needed. Update fib6_is_reject for loopback checks to handle RTF_ANYCAST addresses. Fixes: c7a1ce397ada ("ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create") Reported-by: thomas.gambier@nexedi.com Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ea0be7cf3d93..f3279810d765 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3405,7 +3405,7 @@ static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) if ((flags & RTF_REJECT) || (dev && (dev->flags & IFF_LOOPBACK) && !(addr_type & IPV6_ADDR_LOOPBACK) && - !(flags & RTF_LOCAL))) + !(flags & (RTF_ANYCAST | RTF_LOCAL)))) return true; return false; -- cgit v1.2.3 From ad0f75e5f57ccbceec13274e1e242f2b5a6397ed Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Jul 2020 11:52:56 -0700 Subject: cgroup: fix cgroup_sk_alloc() for sk_clone_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled. sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable. The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that. This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas Reported-by: Peter Geis Reported-by: Lu Fengqi Reported-by: Daniël Sonck Reported-by: Zhang Qiang Tested-by: Cameron Berkenpas Tested-by: Peter Geis Tested-by: Thomas Lamprecht Cc: Daniel Borkmann Cc: Zefan Li Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 6 ++++-- include/linux/cgroup.h | 4 +++- kernel/cgroup/cgroup.c | 31 +++++++++++++++++++------------ net/core/sock.c | 2 +- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 52661155f85f..4f1cd0edc57d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -790,7 +790,8 @@ struct sock_cgroup_data { union { #ifdef __LITTLE_ENDIAN struct { - u8 is_data; + u8 is_data : 1; + u8 no_refcnt : 1; u8 padding; u16 prioidx; u32 classid; @@ -800,7 +801,8 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; - u8 is_data; + u8 no_refcnt : 1; + u8 is_data : 1; } __packed; #endif u64 val; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4598e4da6b1b..618838c48313 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -822,6 +822,7 @@ extern spinlock_t cgroup_sk_update_lock; void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) @@ -835,7 +836,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) */ v = READ_ONCE(skcd->val); - if (v & 1) + if (v & 3) return &cgrp_dfl_root.cgrp; return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; @@ -847,6 +848,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1ea181a58465..dd247747ec14 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,18 +6439,8 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; } @@ -6475,10 +6465,27 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); } +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) + return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/net/core/sock.c b/net/core/sock.c index d832c650287c..2e5b7870e5d3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1926,7 +1926,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; - cgroup_sk_alloc(&newsk->sk_cgrp_data); + cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); -- cgit v1.2.3 From 28b18e4eb515af7c6661c3995c6e3c34412c2874 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 3 Jul 2020 06:33:59 -0700 Subject: net: sky2: initialize return of gm_phy_read clang static analysis flags this garbage return drivers/net/ethernet/marvell/sky2.c:208:2: warning: Undefined or garbage value returned to caller [core.uninitialized.UndefReturn] return v; ^~~~~~~~ static inline u16 gm_phy_read( ... { u16 v; __gm_phy_read(hw, port, reg, &v); return v; } __gm_phy_read can return without setting v. So handle similar to skge.c's gm_phy_read, initialize v. Signed-off-by: Tom Rix Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/sky2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 241f00716979..fe54764caea9 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -203,7 +203,7 @@ io_error: static inline u16 gm_phy_read(struct sky2_hw *hw, unsigned port, u16 reg) { - u16 v; + u16 v = 0; __gm_phy_read(hw, port, reg, &v); return v; } -- cgit v1.2.3 From 7cdaa4cc4bdfa252d515b307863f5a1972246dd6 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Fri, 3 Jul 2020 16:10:58 +0200 Subject: net: ethernet: fec: prevent tx starvation under high rx load In the ISR, we poll the event register for the queues in need of service and then enter polled mode. After this point, the event register will never be read again until we exit polled mode. In a scenario where a UDP flow is routed back out through the same interface, i.e. "router-on-a-stick" we'll typically only see an rx queue event initially. Once we start to process the incoming flow we'll be locked polled mode, but we'll never clean the tx rings since that event is never caught. Eventually the netdev watchdog will trip, causing all buffers to be dropped and then the process starts over again. Rework the NAPI poll to keep trying to consome the entire budget as long as new events are coming in, making sure to service all rx/tx queues, in priority order, on each pass. Fixes: 4d494cdc92b3 ("net: fec: change data structure to support multiqueue") Signed-off-by: Tobias Waldekranz Tested-by: Fugang Duan Reviewed-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.h | 5 -- drivers/net/ethernet/freescale/fec_main.c | 94 ++++++++++--------------------- 2 files changed, 31 insertions(+), 68 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index a6cdd5b61921..d8d76da51c5e 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -525,11 +525,6 @@ struct fec_enet_private { unsigned int total_tx_ring_size; unsigned int total_rx_ring_size; - unsigned long work_tx; - unsigned long work_rx; - unsigned long work_ts; - unsigned long work_mdio; - struct platform_device *pdev; int dev_id; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 2d0d313ee7c5..3982285ed020 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -75,8 +75,6 @@ static void fec_enet_itr_coal_init(struct net_device *ndev); #define DRIVER_NAME "fec" -#define FEC_ENET_GET_QUQUE(_x) ((_x == 0) ? 1 : ((_x == 1) ? 2 : 0)) - /* Pause frame feild and FIFO threshold */ #define FEC_ENET_FCE (1 << 5) #define FEC_ENET_RSEM_V 0x84 @@ -1248,8 +1246,6 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id) fep = netdev_priv(ndev); - queue_id = FEC_ENET_GET_QUQUE(queue_id); - txq = fep->tx_queue[queue_id]; /* get next bdp of dirty_tx */ nq = netdev_get_tx_queue(ndev, queue_id); @@ -1340,17 +1336,14 @@ skb_done: writel(0, txq->bd.reg_desc_active); } -static void -fec_enet_tx(struct net_device *ndev) +static void fec_enet_tx(struct net_device *ndev) { struct fec_enet_private *fep = netdev_priv(ndev); - u16 queue_id; - /* First process class A queue, then Class B and Best Effort queue */ - for_each_set_bit(queue_id, &fep->work_tx, FEC_ENET_MAX_TX_QS) { - clear_bit(queue_id, &fep->work_tx); - fec_enet_tx_queue(ndev, queue_id); - } - return; + int i; + + /* Make sure that AVB queues are processed first. */ + for (i = fep->num_tx_queues - 1; i >= 0; i--) + fec_enet_tx_queue(ndev, i); } static int @@ -1426,7 +1419,6 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id) #ifdef CONFIG_M532x flush_cache_all(); #endif - queue_id = FEC_ENET_GET_QUQUE(queue_id); rxq = fep->rx_queue[queue_id]; /* First, grab all of the stats for the incoming packet. @@ -1550,6 +1542,7 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id) htons(ETH_P_8021Q), vlan_tag); + skb_record_rx_queue(skb, queue_id); napi_gro_receive(&fep->napi, skb); if (is_copybreak) { @@ -1595,48 +1588,30 @@ rx_processing_done: return pkt_received; } -static int -fec_enet_rx(struct net_device *ndev, int budget) +static int fec_enet_rx(struct net_device *ndev, int budget) { - int pkt_received = 0; - u16 queue_id; struct fec_enet_private *fep = netdev_priv(ndev); + int i, done = 0; - for_each_set_bit(queue_id, &fep->work_rx, FEC_ENET_MAX_RX_QS) { - int ret; - - ret = fec_enet_rx_queue(ndev, - budget - pkt_received, queue_id); + /* Make sure that AVB queues are processed first. */ + for (i = fep->num_rx_queues - 1; i >= 0; i--) + done += fec_enet_rx_queue(ndev, budget - done, i); - if (ret < budget - pkt_received) - clear_bit(queue_id, &fep->work_rx); - - pkt_received += ret; - } - return pkt_received; + return done; } -static bool -fec_enet_collect_events(struct fec_enet_private *fep, uint int_events) +static bool fec_enet_collect_events(struct fec_enet_private *fep) { - if (int_events == 0) - return false; + uint int_events; + + int_events = readl(fep->hwp + FEC_IEVENT); - if (int_events & FEC_ENET_RXF_0) - fep->work_rx |= (1 << 2); - if (int_events & FEC_ENET_RXF_1) - fep->work_rx |= (1 << 0); - if (int_events & FEC_ENET_RXF_2) - fep->work_rx |= (1 << 1); + /* Don't clear MDIO events, we poll for those */ + int_events &= ~FEC_ENET_MII; - if (int_events & FEC_ENET_TXF_0) - fep->work_tx |= (1 << 2); - if (int_events & FEC_ENET_TXF_1) - fep->work_tx |= (1 << 0); - if (int_events & FEC_ENET_TXF_2) - fep->work_tx |= (1 << 1); + writel(int_events, fep->hwp + FEC_IEVENT); - return true; + return int_events != 0; } static irqreturn_t @@ -1644,18 +1619,9 @@ fec_enet_interrupt(int irq, void *dev_id) { struct net_device *ndev = dev_id; struct fec_enet_private *fep = netdev_priv(ndev); - uint int_events; irqreturn_t ret = IRQ_NONE; - int_events = readl(fep->hwp + FEC_IEVENT); - - /* Don't clear MDIO events, we poll for those */ - int_events &= ~FEC_ENET_MII; - - writel(int_events, fep->hwp + FEC_IEVENT); - fec_enet_collect_events(fep, int_events); - - if ((fep->work_tx || fep->work_rx) && fep->link) { + if (fec_enet_collect_events(fep) && fep->link) { ret = IRQ_HANDLED; if (napi_schedule_prep(&fep->napi)) { @@ -1672,17 +1638,19 @@ static int fec_enet_rx_napi(struct napi_struct *napi, int budget) { struct net_device *ndev = napi->dev; struct fec_enet_private *fep = netdev_priv(ndev); - int pkts; + int done = 0; - pkts = fec_enet_rx(ndev, budget); - - fec_enet_tx(ndev); + do { + done += fec_enet_rx(ndev, budget - done); + fec_enet_tx(ndev); + } while ((done < budget) && fec_enet_collect_events(fep)); - if (pkts < budget) { - napi_complete_done(napi, pkts); + if (done < budget) { + napi_complete_done(napi, done); writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK); } - return pkts; + + return done; } /* ------------------------------------------------------------------------- */ -- cgit v1.2.3 From 5eff06902394425c722f0a44d9545909a8800f79 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 3 Jul 2020 17:00:32 +0200 Subject: ipv4: fill fl4_icmp_{type,code} in ping_v4_sendmsg IPv4 ping sockets don't set fl4.fl4_icmp_{type,code}, which leads to incomplete IPsec ACQUIRE messages being sent to userspace. Currently, both raw sockets and IPv6 ping sockets set those fields. Expected output of "ip xfrm monitor": acquire proto esp sel src 10.0.2.15/32 dst 8.8.8.8/32 proto icmp type 8 code 0 dev ens4 policy src 10.0.2.15/32 dst 8.8.8.8/32 Currently with ping sockets: acquire proto esp sel src 10.0.2.15/32 dst 8.8.8.8/32 proto icmp type 0 code 0 dev ens4 policy src 10.0.2.15/32 dst 8.8.8.8/32 The Libreswan test suite found this problem after Fedora changed the value for the sysctl net.ipv4.ping_group_range. Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Reported-by: Paul Wouters Tested-by: Paul Wouters Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv4/ping.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 535427292194..df6fbefe44d4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -786,6 +786,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); + fl4.fl4_icmp_type = user_icmph.type; + fl4.fl4_icmp_code = user_icmph.code; + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { -- cgit v1.2.3 From 9c29e36152748fd623fcff6cc8f538550f9eeafc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 3 Jul 2020 18:06:04 +0200 Subject: mptcp: fix DSS map generation on fin retransmission The RFC 8684 mandates that no-data DATA FIN packets should carry a DSS with 0 sequence number and data len equal to 1. Currently, on FIN retransmission we re-use the existing mapping; if the previous fin transmission was part of a partially acked data packet, we could end-up writing in the egress packet a non-compliant DSS. The above will be detected by a "Bad mapping" warning on the receiver side. This change addresses the issue explicitly checking for 0 len packet when adding the DATA_FIN option. Fixes: 6d0060f600ad ("mptcp: Write MPTCP DSS headers to outgoing data packets") Reported-by: syzbot+42a07faa5923cfaeb9c9@syzkaller.appspotmail.com Tested-by: Christoph Paasch Reviewed-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index df9a51425c6f..8f940be42f98 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -449,9 +449,9 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, } static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, - struct mptcp_ext *ext) + struct sk_buff *skb, struct mptcp_ext *ext) { - if (!ext->use_map) { + if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped */ @@ -503,7 +503,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy = *mpext; if (skb && tcp_fin && subflow->data_fin_tx_enable) - mptcp_write_data_fin(subflow, &opts->ext_copy); + mptcp_write_data_fin(subflow, skb, &opts->ext_copy); ret = true; } -- cgit v1.2.3 From f0b594dfa47555d8d69e6865c882d65a9054cb81 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 3 Jul 2020 18:44:32 +0200 Subject: net/mlx5e: Do not include rwlock.h directly rwlock.h should not be included directly. Instead linux/splinlock.h should be included. Including it directly will break the RT build. Fixes: 549c243e4e010 ("net/mlx5e: Extract neigh-specific code from en_rep.c to rep/neigh.c") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Leon Romanovsky Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c index baa162432e75..c3d167fa944c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 8367b3ab6e9a26dec5d5d07ea3eae17a21035322 Mon Sep 17 00:00:00 2001 From: wenxu Date: Sat, 4 Jul 2020 15:42:47 +0800 Subject: net/sched: act_ct: add miss tcf_lastuse_update. When tcf_ct_act execute the tcf_lastuse_update should be update or the used stats never update filter protocol ip pref 3 flower chain 0 filter protocol ip pref 3 flower chain 0 handle 0x1 eth_type ipv4 dst_ip 1.1.1.1 ip_flags frag/firstfrag skip_hw not_in_hw action order 1: ct zone 1 nat pipe index 1 ref 1 bind 1 installed 103 sec used 103 sec Action statistics: Sent 151500 bytes 101 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 cookie 4519c04dc64a1a295787aab13b6a50fb Signed-off-by: wenxu Signed-off-by: David S. Miller --- net/sched/act_ct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 86ed02487467..67504aece9ae 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -925,6 +925,8 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, force = p->ct_action & TCA_CT_ACT_FORCE; tmpl = p->tmpl; + tcf_lastuse_update(&c->tcf_tm); + if (clear) { ct = nf_ct_get(skb, &ctinfo); if (ct) { -- cgit v1.2.3 From 394de110a73395de2ca4516b0de435e91b11b604 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Sun, 5 Jul 2020 14:23:49 +0530 Subject: net: Added pointer check for dst->ops->neigh_lookup in dst_neigh_lookup_skb The packets from tunnel devices (eg bareudp) may have only metadata in the dst pointer of skb. Hence a pointer check of neigh_lookup is needed in dst_neigh_lookup_skb Kernel crashes when packets from bareudp device is processed in the kernel neighbour subsytem. [ 133.384484] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 133.385240] #PF: supervisor instruction fetch in kernel mode [ 133.385828] #PF: error_code(0x0010) - not-present page [ 133.386603] PGD 0 P4D 0 [ 133.386875] Oops: 0010 [#1] SMP PTI [ 133.387275] CPU: 0 PID: 5045 Comm: ping Tainted: G W 5.8.0-rc2+ #15 [ 133.388052] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 133.391076] RIP: 0010:0x0 [ 133.392401] Code: Bad RIP value. [ 133.394029] RSP: 0018:ffffb79980003d50 EFLAGS: 00010246 [ 133.396656] RAX: 0000000080000102 RBX: ffff9de2fe0d6600 RCX: ffff9de2fe5e9d00 [ 133.399018] RDX: 0000000000000000 RSI: ffff9de2fe5e9d00 RDI: ffff9de2fc21b400 [ 133.399685] RBP: ffff9de2fe5e9d00 R08: 0000000000000000 R09: 0000000000000000 [ 133.400350] R10: ffff9de2fbc6be22 R11: ffff9de2fe0d6600 R12: ffff9de2fc21b400 [ 133.401010] R13: ffff9de2fe0d6628 R14: 0000000000000001 R15: 0000000000000003 [ 133.401667] FS: 00007fe014918740(0000) GS:ffff9de2fec00000(0000) knlGS:0000000000000000 [ 133.402412] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 133.402948] CR2: ffffffffffffffd6 CR3: 000000003bb72000 CR4: 00000000000006f0 [ 133.403611] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 133.404270] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 133.404933] Call Trace: [ 133.405169] [ 133.405367] __neigh_update+0x5a4/0x8f0 [ 133.405734] arp_process+0x294/0x820 [ 133.406076] ? __netif_receive_skb_core+0x866/0xe70 [ 133.406557] arp_rcv+0x129/0x1c0 [ 133.406882] __netif_receive_skb_one_core+0x95/0xb0 [ 133.407340] process_backlog+0xa7/0x150 [ 133.407705] net_rx_action+0x2af/0x420 [ 133.408457] __do_softirq+0xda/0x2a8 [ 133.408813] asm_call_on_stack+0x12/0x20 [ 133.409290] [ 133.409519] do_softirq_own_stack+0x39/0x50 [ 133.410036] do_softirq+0x50/0x60 [ 133.410401] __local_bh_enable_ip+0x50/0x60 [ 133.410871] ip_finish_output2+0x195/0x530 [ 133.411288] ip_output+0x72/0xf0 [ 133.411673] ? __ip_finish_output+0x1f0/0x1f0 [ 133.412122] ip_send_skb+0x15/0x40 [ 133.412471] raw_sendmsg+0x853/0xab0 [ 133.412855] ? insert_pfn+0xfe/0x270 [ 133.413827] ? vvar_fault+0xec/0x190 [ 133.414772] sock_sendmsg+0x57/0x80 [ 133.415685] __sys_sendto+0xdc/0x160 [ 133.416605] ? syscall_trace_enter+0x1d4/0x2b0 [ 133.417679] ? __audit_syscall_exit+0x1d9/0x280 [ 133.418753] ? __prepare_exit_to_usermode+0x5d/0x1a0 [ 133.419819] __x64_sys_sendto+0x24/0x30 [ 133.420848] do_syscall_64+0x4d/0x90 [ 133.421768] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 133.422833] RIP: 0033:0x7fe013689c03 [ 133.423749] Code: Bad RIP value. [ 133.424624] RSP: 002b:00007ffc7288f418 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 133.425940] RAX: ffffffffffffffda RBX: 000056151fc63720 RCX: 00007fe013689c03 [ 133.427225] RDX: 0000000000000040 RSI: 000056151fc63720 RDI: 0000000000000003 [ 133.428481] RBP: 00007ffc72890b30 R08: 000056151fc60500 R09: 0000000000000010 [ 133.429757] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 [ 133.431041] R13: 000056151fc636e0 R14: 000056151fc616bc R15: 0000000000000080 [ 133.432481] Modules linked in: mpls_iptunnel act_mirred act_tunnel_key cls_flower sch_ingress veth mpls_router ip_tunnel bareudp ip6_udp_tunnel udp_tunnel macsec udp_diag inet_diag unix_diag af_packet_diag netlink_diag binfmt_misc xt_MASQUERADE iptable_nat xt_addrtype xt_conntrack nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter bridge stp llc ebtable_filter ebtables overlay ip6table_filter ip6_tables iptable_filter sunrpc ext4 mbcache jbd2 pcspkr i2c_piix4 virtio_balloon joydev ip_tables xfs libcrc32c ata_generic qxl pata_acpi drm_ttm_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ata_piix libata virtio_net net_failover virtio_console failover virtio_blk i2c_core virtio_pci virtio_ring serio_raw floppy virtio dm_mirror dm_region_hash dm_log dm_mod [ 133.444045] CR2: 0000000000000000 [ 133.445082] ---[ end trace f4aeee1958fd1638 ]--- [ 133.446236] RIP: 0010:0x0 [ 133.447180] Code: Bad RIP value. [ 133.448152] RSP: 0018:ffffb79980003d50 EFLAGS: 00010246 [ 133.449363] RAX: 0000000080000102 RBX: ffff9de2fe0d6600 RCX: ffff9de2fe5e9d00 [ 133.450835] RDX: 0000000000000000 RSI: ffff9de2fe5e9d00 RDI: ffff9de2fc21b400 [ 133.452237] RBP: ffff9de2fe5e9d00 R08: 0000000000000000 R09: 0000000000000000 [ 133.453722] R10: ffff9de2fbc6be22 R11: ffff9de2fe0d6600 R12: ffff9de2fc21b400 [ 133.455149] R13: ffff9de2fe0d6628 R14: 0000000000000001 R15: 0000000000000003 [ 133.456520] FS: 00007fe014918740(0000) GS:ffff9de2fec00000(0000) knlGS:0000000000000000 [ 133.458046] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 133.459342] CR2: ffffffffffffffd6 CR3: 000000003bb72000 CR4: 00000000000006f0 [ 133.460782] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 133.462240] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 133.463697] Kernel panic - not syncing: Fatal exception in interrupt [ 133.465226] Kernel Offset: 0xfa00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 133.467025] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: aaa0c23cb901 ("Fix dst_neigh_lookup/dst_neigh_lookup_skb return value handling bug") Signed-off-by: Martin Varghese Signed-off-by: David S. Miller --- include/net/dst.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/net/dst.h b/include/net/dst.h index 07adfacd8088..852d8fb36ab7 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -400,7 +400,15 @@ static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, co static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { - struct neighbour *n = dst->ops->neigh_lookup(dst, skb, NULL); + struct neighbour *n = NULL; + + /* The packets from tunnel devices (eg bareudp) may have only + * metadata in the dst pointer of skb. Hence a pointer check of + * neigh_lookup is needed. + */ + if (dst->ops->neigh_lookup) + n = dst->ops->neigh_lookup(dst, skb, NULL); + return IS_ERR(n) ? NULL : n; } -- cgit v1.2.3 From 5fc6266af7b427243da24f3443a50cd4584aac06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Sun, 5 Jul 2020 21:10:17 +0200 Subject: bridge: mcast: Fix MLD2 Report IPv6 payload length check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e57f61858b7c ("net: bridge: mcast: fix stale nsrcs pointer in igmp3/mld2 report handling") introduced a bug in the IPv6 header payload length check which would potentially lead to rejecting a valid MLD2 Report: The check needs to take into account the 2 bytes for the "Number of Sources" field in the "Multicast Address Record" before reading it. And not the size of a pointer to this field. Fixes: e57f61858b7c ("net: bridge: mcast: fix stale nsrcs pointer in igmp3/mld2 report handling") Acked-by: Nikolay Aleksandrov Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 83490bf73a13..4c4a93abde68 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1007,7 +1007,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs); if (skb_transport_offset(skb) + ipv6_transport_len(skb) < - nsrcs_offset + sizeof(_nsrcs)) + nsrcs_offset + sizeof(__nsrcs)) return -EINVAL; _nsrcs = skb_header_pointer(skb, nsrcs_offset, -- cgit v1.2.3 From da3287111ab43b32cec54d7ca6b48640f210a196 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 6 Jul 2020 12:25:53 +0300 Subject: net: qed: fix buffer overflow on ethtool -d When generating debug dump, driver firstly collects all data in binary form, and then performs per-feature formatting to human-readable if it is supported. For ethtool -d, this is roughly incorrect for two reasons. First of all, drivers should always provide only original raw dumps to Ethtool without any changes. The second, and more critical, is that Ethtool's output buffer size is strictly determined by ethtool_ops::get_regs_len(), and all data *must* fit in it. The current version of driver always returns the size of raw data, but the size of the formatted buffer exceeds it in most cases. This leads to out-of-bound writes and memory corruption. Address both issues by adding an option to return original, non-formatted debug data, and using it for Ethtool case. v2: - Expand commit message to make it more clear; - No functional changes. Fixes: c965db444629 ("qed: Add support for debug data collection") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 2 ++ drivers/net/ethernet/qlogic/qed/qed_debug.c | 13 ++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index a49743d56b9c..6c2f9ff4a53e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -876,6 +876,8 @@ struct qed_dev { struct qed_dbg_feature dbg_features[DBG_FEATURE_NUM]; u8 engine_for_debug; bool disable_ilt_dump; + bool dbg_bin_dump; + DECLARE_HASHTABLE(connections, 10); const struct firmware *firmware; diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 81e8fbe4a05b..cb80863d5a77 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -7506,6 +7506,12 @@ static enum dbg_status format_feature(struct qed_hwfn *p_hwfn, if (p_hwfn->cdev->print_dbg_data) qed_dbg_print_feature(text_buf, text_size_bytes); + /* Just return the original binary buffer if requested */ + if (p_hwfn->cdev->dbg_bin_dump) { + vfree(text_buf); + return DBG_STATUS_OK; + } + /* Free the old dump_buf and point the dump_buf to the newly allocagted * and formatted text buffer. */ @@ -7733,7 +7739,9 @@ int qed_dbg_mcp_trace_size(struct qed_dev *cdev) #define REGDUMP_HEADER_SIZE_SHIFT 0 #define REGDUMP_HEADER_SIZE_MASK 0xffffff #define REGDUMP_HEADER_FEATURE_SHIFT 24 -#define REGDUMP_HEADER_FEATURE_MASK 0x3f +#define REGDUMP_HEADER_FEATURE_MASK 0x1f +#define REGDUMP_HEADER_BIN_DUMP_SHIFT 29 +#define REGDUMP_HEADER_BIN_DUMP_MASK 0x1 #define REGDUMP_HEADER_OMIT_ENGINE_SHIFT 30 #define REGDUMP_HEADER_OMIT_ENGINE_MASK 0x1 #define REGDUMP_HEADER_ENGINE_SHIFT 31 @@ -7771,6 +7779,7 @@ static u32 qed_calc_regdump_header(struct qed_dev *cdev, feature, feature_size); SET_FIELD(res, REGDUMP_HEADER_FEATURE, feature); + SET_FIELD(res, REGDUMP_HEADER_BIN_DUMP, 1); SET_FIELD(res, REGDUMP_HEADER_OMIT_ENGINE, omit_engine); SET_FIELD(res, REGDUMP_HEADER_ENGINE, engine); @@ -7794,6 +7803,7 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) omit_engine = 1; mutex_lock(&qed_dbg_lock); + cdev->dbg_bin_dump = true; org_engine = qed_get_debug_engine(cdev); for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) { @@ -7993,6 +8003,7 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) QED_NVM_IMAGE_MDUMP, "QED_NVM_IMAGE_MDUMP", rc); } + cdev->dbg_bin_dump = false; mutex_unlock(&qed_dbg_lock); return 0; -- cgit v1.2.3 From 469aceddfa3ed16e17ee30533fae45e90f62efd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 7 Jul 2020 13:03:25 +0200 Subject: vlan: consolidate VLAN parsing code and limit max parsing depth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toshiaki pointed out that we now have two very similar functions to extract the L3 protocol number in the presence of VLAN tags. And Daniel pointed out that the unbounded parsing loop makes it possible for maliciously crafted packets to loop through potentially hundreds of tags. Fix both of these issues by consolidating the two parsing functions and limiting the VLAN tag parsing to a max depth of 8 tags. As part of this, switch over __vlan_get_protocol() to use skb_header_pointer() instead of pskb_may_pull(), to avoid the possible side effects of the latter and keep the skb pointer 'const' through all the parsing functions. v2: - Use limit of 8 tags instead of 32 (matching XMIT_RECURSION_LIMIT) Reported-by: Toshiaki Makita Reported-by: Daniel Borkmann Fixes: d7bf2ebebc2b ("sched: consistently handle layer3 header accesses in the presence of VLANs") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 57 +++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 427a5b8597c2..41a518336673 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -25,6 +25,8 @@ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ +#define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ + /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID @@ -308,34 +310,6 @@ static inline bool eth_type_vlan(__be16 ethertype) } } -/* A getter for the SKB protocol field which will handle VLAN tags consistently - * whether VLAN acceleration is enabled or not. - */ -static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) -{ - unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); - __be16 proto = skb->protocol; - - if (!skip_vlan) - /* VLAN acceleration strips the VLAN header from the skb and - * moves it to skb->vlan_proto - */ - return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; - - while (eth_type_vlan(proto)) { - struct vlan_hdr vhdr, *vh; - - vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); - if (!vh) - break; - - proto = vh->h_vlan_encapsulated_proto; - offset += sizeof(vhdr); - } - - return proto; -} - static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -605,10 +579,10 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { - unsigned int vlan_depth = skb->mac_len; + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at @@ -623,13 +597,12 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vlan_depth = ETH_HLEN; } do { - struct vlan_hdr *vh; + struct vlan_hdr vhdr, *vh; - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) + vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); @@ -648,11 +621,25 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 vlan_get_protocol(struct sk_buff *skb) +static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; + + return vlan_get_protocol(skb); +} + static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { -- cgit v1.2.3 From 086c18f2452d0028f81e319f098bcb8e53133dbf Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 7 Jul 2020 14:13:26 -0700 Subject: ionic: centralize queue reset code The queue reset pattern is used in a couple different places, only slightly different from each other, and could cause issues if one gets changed and the other didn't. This puts them together so that only one version is needed, yet each can have slighty different effects by passing in a pointer to a work function to do whatever configuration twiddling is needed in the middle of the reset. This specifically addresses issues seen where under loops of changing ring size or queue count parameters we could occasionally bump into the netdev watchdog. v2: added more commit message commentary Fixes: 4d03e00a2140 ("ionic: Add initial ethtool support") Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../net/ethernet/pensando/ionic/ionic_ethtool.c | 52 +++++++--------------- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 17 ++++--- drivers/net/ethernet/pensando/ionic/ionic_lif.h | 4 +- 3 files changed, 32 insertions(+), 41 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index f7e3ce3de04d..e03ea9b18f95 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -468,12 +468,18 @@ static void ionic_get_ringparam(struct net_device *netdev, ring->rx_pending = lif->nrxq_descs; } +static void ionic_set_ringsize(struct ionic_lif *lif, void *arg) +{ + struct ethtool_ringparam *ring = arg; + + lif->ntxq_descs = ring->tx_pending; + lif->nrxq_descs = ring->rx_pending; +} + static int ionic_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) { struct ionic_lif *lif = netdev_priv(netdev); - bool running; - int err; if (ring->rx_mini_pending || ring->rx_jumbo_pending) { netdev_info(netdev, "Changing jumbo or mini descriptors not supported\n"); @@ -491,22 +497,7 @@ static int ionic_set_ringparam(struct net_device *netdev, ring->rx_pending == lif->nrxq_descs) return 0; - err = ionic_wait_for_bit(lif, IONIC_LIF_F_QUEUE_RESET); - if (err) - return err; - - running = test_bit(IONIC_LIF_F_UP, lif->state); - if (running) - ionic_stop(netdev); - - lif->ntxq_descs = ring->tx_pending; - lif->nrxq_descs = ring->rx_pending; - - if (running) - ionic_open(netdev); - clear_bit(IONIC_LIF_F_QUEUE_RESET, lif->state); - - return 0; + return ionic_reset_queues(lif, ionic_set_ringsize, ring); } static void ionic_get_channels(struct net_device *netdev, @@ -521,12 +512,17 @@ static void ionic_get_channels(struct net_device *netdev, ch->combined_count = lif->nxqs; } +static void ionic_set_queuecount(struct ionic_lif *lif, void *arg) +{ + struct ethtool_channels *ch = arg; + + lif->nxqs = ch->combined_count; +} + static int ionic_set_channels(struct net_device *netdev, struct ethtool_channels *ch) { struct ionic_lif *lif = netdev_priv(netdev); - bool running; - int err; if (!ch->combined_count || ch->other_count || ch->rx_count || ch->tx_count) @@ -535,21 +531,7 @@ static int ionic_set_channels(struct net_device *netdev, if (ch->combined_count == lif->nxqs) return 0; - err = ionic_wait_for_bit(lif, IONIC_LIF_F_QUEUE_RESET); - if (err) - return err; - - running = test_bit(IONIC_LIF_F_UP, lif->state); - if (running) - ionic_stop(netdev); - - lif->nxqs = ch->combined_count; - - if (running) - ionic_open(netdev); - clear_bit(IONIC_LIF_F_QUEUE_RESET, lif->state); - - return 0; + return ionic_reset_queues(lif, ionic_set_queuecount, ch); } static u32 ionic_get_priv_flags(struct net_device *netdev) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 3c9dde31f3fa..f49486b6d04d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1313,7 +1313,7 @@ static int ionic_change_mtu(struct net_device *netdev, int new_mtu) return err; netdev->mtu = new_mtu; - err = ionic_reset_queues(lif); + err = ionic_reset_queues(lif, NULL, NULL); return err; } @@ -1325,7 +1325,7 @@ static void ionic_tx_timeout_work(struct work_struct *ws) netdev_info(lif->netdev, "Tx Timeout recovery\n"); rtnl_lock(); - ionic_reset_queues(lif); + ionic_reset_queues(lif, NULL, NULL); rtnl_unlock(); } @@ -1988,7 +1988,7 @@ static const struct net_device_ops ionic_netdev_ops = { .ndo_get_vf_stats = ionic_get_vf_stats, }; -int ionic_reset_queues(struct ionic_lif *lif) +int ionic_reset_queues(struct ionic_lif *lif, ionic_reset_cb cb, void *arg) { bool running; int err = 0; @@ -2001,12 +2001,19 @@ int ionic_reset_queues(struct ionic_lif *lif) if (running) { netif_device_detach(lif->netdev); err = ionic_stop(lif->netdev); + if (err) + goto reset_out; } - if (!err && running) { - ionic_open(lif->netdev); + + if (cb) + cb(lif, arg); + + if (running) { + err = ionic_open(lif->netdev); netif_device_attach(lif->netdev); } +reset_out: clear_bit(IONIC_LIF_F_QUEUE_RESET, lif->state); return err; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h index c3428034a17b..ed126dd74e01 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h @@ -248,6 +248,8 @@ static inline u32 ionic_coal_hw_to_usec(struct ionic *ionic, u32 units) return (units * div) / mult; } +typedef void (*ionic_reset_cb)(struct ionic_lif *lif, void *arg); + void ionic_link_status_check_request(struct ionic_lif *lif); void ionic_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *ns); @@ -267,7 +269,7 @@ int ionic_lif_rss_config(struct ionic_lif *lif, u16 types, int ionic_open(struct net_device *netdev); int ionic_stop(struct net_device *netdev); -int ionic_reset_queues(struct ionic_lif *lif); +int ionic_reset_queues(struct ionic_lif *lif, ionic_reset_cb cb, void *arg); static inline void debug_stats_txq_post(struct ionic_qcq *qcq, struct ionic_txq_desc *desc, bool dbell) -- cgit v1.2.3 From a42e6aee7f47a8a68d09923c720fc8f605a04207 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 8 Jul 2020 17:17:10 +0300 Subject: net: atlantic: fix ip dst and ipv6 address filters This patch fixes ip dst and ipv6 address filters. There were 2 mistakes in the code, which led to the issue: * invalid register was used for ipv4 dst address; * incorrect write order of dwords for ipv6 addresses. Fixes: 23e7a718a49b ("net: aquantia: add rx-flow filter definitions") Signed-off-by: Dmitry Bogdanov Signed-off-by: Mark Starovoytov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c | 4 ++-- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c index 3c8e8047ea1e..d775b23025c1 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c @@ -1700,7 +1700,7 @@ void hw_atl_rpfl3l4_ipv6_src_addr_set(struct aq_hw_s *aq_hw, u8 location, for (i = 0; i < 4; ++i) aq_hw_write_reg(aq_hw, HW_ATL_RPF_L3_SRCA_ADR(location + i), - ipv6_src[i]); + ipv6_src[3 - i]); } void hw_atl_rpfl3l4_ipv6_dest_addr_set(struct aq_hw_s *aq_hw, u8 location, @@ -1711,7 +1711,7 @@ void hw_atl_rpfl3l4_ipv6_dest_addr_set(struct aq_hw_s *aq_hw, u8 location, for (i = 0; i < 4; ++i) aq_hw_write_reg(aq_hw, HW_ATL_RPF_L3_DSTA_ADR(location + i), - ipv6_dest[i]); + ipv6_dest[3 - i]); } u32 hw_atl_sem_ram_get(struct aq_hw_s *self) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h index 06220792daf1..7430ff025134 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h @@ -1360,7 +1360,7 @@ */ /* Register address for bitfield pif_rpf_l3_da0_i[31:0] */ -#define HW_ATL_RPF_L3_DSTA_ADR(filter) (0x000053B0 + (filter) * 0x4) +#define HW_ATL_RPF_L3_DSTA_ADR(filter) (0x000053D0 + (filter) * 0x4) /* Bitmask for bitfield l3_da0[1F:0] */ #define HW_ATL_RPF_L3_DSTA_MSK 0xFFFFFFFFu /* Inverted bitmask for bitfield l3_da0[1F:0] */ -- cgit v1.2.3 From 6778a6bed09b58beca936a675e9dd195c0986580 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 8 Jul 2020 17:05:11 +0200 Subject: net/smc: separate LLC wait queues for flow and messages There might be races in scenarios where both SMC link groups are on the same system. Prevent that by creating separate wait queues for LLC flows and messages. Switch to non-interruptable versions of wait_event() and wake_up() for the llc flow waiter to make sure the waiters get control sequentially. Fine tune the llc_flow_lock to include the assignment of the message. Write to system log when an unexpected message was dropped. And remove an extra indirection and use the existing local variable lgr in smc_llc_enqueue(). Fixes: 555da9af827d ("net/smc: add event-based llc_flow framework") Reviewed-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_core.c | 38 ++++++++++++++----------- net/smc/smc_core.h | 4 ++- net/smc/smc_llc.c | 83 ++++++++++++++++++++++++++++++++++-------------------- 3 files changed, 77 insertions(+), 48 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7964a21e5e6f..d695ce71837e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -247,7 +247,8 @@ static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) if (smc_link_usable(lnk)) lnk->state = SMC_LNK_INACTIVE; } - wake_up_interruptible_all(&lgr->llc_waiter); + wake_up_all(&lgr->llc_msg_waiter); + wake_up_all(&lgr->llc_flow_waiter); } static void smc_lgr_free(struct smc_link_group *lgr); @@ -1130,18 +1131,19 @@ static void smcr_link_up(struct smc_link_group *lgr, return; if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* some other llc task is ongoing */ - wait_event_interruptible_timeout(lgr->llc_waiter, - (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + wait_event_timeout(lgr->llc_flow_waiter, + (list_empty(&lgr->list) || + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); } - if (list_empty(&lgr->list) || - !smc_ib_port_active(smcibdev, ibport)) - return; /* lgr or device no longer active */ - link = smc_llc_usable_link(lgr); - if (!link) - return; - smc_llc_send_add_link(link, smcibdev->mac[ibport - 1], gid, - NULL, SMC_LLC_REQ); + /* lgr or device no longer active? */ + if (!list_empty(&lgr->list) && + smc_ib_port_active(smcibdev, ibport)) + link = smc_llc_usable_link(lgr); + if (link) + smc_llc_send_add_link(link, smcibdev->mac[ibport - 1], + gid, NULL, SMC_LLC_REQ); + wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */ } } @@ -1195,13 +1197,17 @@ static void smcr_link_down(struct smc_link *lnk) if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ mutex_unlock(&lgr->llc_conf_mutex); - wait_event_interruptible_timeout(lgr->llc_waiter, - (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + wait_event_timeout(lgr->llc_flow_waiter, + (list_empty(&lgr->list) || + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); mutex_lock(&lgr->llc_conf_mutex); } - smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, - SMC_LLC_DEL_LOST_PATH); + if (!list_empty(&lgr->list)) + smc_llc_send_delete_link(to_lnk, del_link_id, + SMC_LLC_REQ, true, + SMC_LLC_DEL_LOST_PATH); + wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */ } } @@ -1262,7 +1268,7 @@ static void smc_link_down_work(struct work_struct *work) if (list_empty(&lgr->list)) return; - wake_up_interruptible_all(&lgr->llc_waiter); + wake_up_all(&lgr->llc_msg_waiter); mutex_lock(&lgr->llc_conf_mutex); smcr_link_down(link); mutex_unlock(&lgr->llc_conf_mutex); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 86d160f0d187..c3ff512fd891 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -262,8 +262,10 @@ struct smc_link_group { struct work_struct llc_del_link_work; struct work_struct llc_event_work; /* llc event worker */ - wait_queue_head_t llc_waiter; + wait_queue_head_t llc_flow_waiter; /* w4 next llc event */ + wait_queue_head_t llc_msg_waiter; + /* w4 next llc msg */ struct smc_llc_flow llc_flow_lcl; /* llc local control field */ struct smc_llc_flow llc_flow_rmt; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 391237b601fe..df164232574b 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -186,6 +186,26 @@ static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, flow->qentry = qentry; } +static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type, + struct smc_llc_qentry *qentry) +{ + u8 msg_type = qentry->msg.raw.hdr.common.type; + + if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) && + flow_type != msg_type && !lgr->delayed_event) { + lgr->delayed_event = qentry; + return; + } + /* drop parallel or already-in-progress llc requests */ + if (flow_type != msg_type) + pr_warn_once("smc: SMC-R lg %*phN dropped parallel " + "LLC msg: msg %d flow %d role %d\n", + SMC_LGR_ID_SIZE, &lgr->id, + qentry->msg.raw.hdr.common.type, + flow_type, lgr->role); + kfree(qentry); +} + /* try to start a new llc flow, initiated by an incoming llc msg */ static bool smc_llc_flow_start(struct smc_llc_flow *flow, struct smc_llc_qentry *qentry) @@ -195,14 +215,7 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow, spin_lock_bh(&lgr->llc_flow_lock); if (flow->type) { /* a flow is already active */ - if ((qentry->msg.raw.hdr.common.type == SMC_LLC_ADD_LINK || - qentry->msg.raw.hdr.common.type == SMC_LLC_DELETE_LINK) && - !lgr->delayed_event) { - lgr->delayed_event = qentry; - } else { - /* forget this llc request */ - kfree(qentry); - } + smc_llc_flow_parallel(lgr, flow->type, qentry); spin_unlock_bh(&lgr->llc_flow_lock); return false; } @@ -222,8 +235,8 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow, } if (qentry == lgr->delayed_event) lgr->delayed_event = NULL; - spin_unlock_bh(&lgr->llc_flow_lock); smc_llc_flow_qentry_set(flow, qentry); + spin_unlock_bh(&lgr->llc_flow_lock); return true; } @@ -251,11 +264,11 @@ again: return 0; } spin_unlock_bh(&lgr->llc_flow_lock); - rc = wait_event_interruptible_timeout(lgr->llc_waiter, - (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && - (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || - lgr->llc_flow_rmt.type == allowed_remote)), - SMC_LLC_WAIT_TIME); + rc = wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote))), + SMC_LLC_WAIT_TIME * 10); if (!rc) return -ETIMEDOUT; goto again; @@ -272,7 +285,7 @@ void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) flow == &lgr->llc_flow_lcl) schedule_work(&lgr->llc_event_work); else - wake_up_interruptible(&lgr->llc_waiter); + wake_up(&lgr->llc_flow_waiter); } /* lnk is optional and used for early wakeup when link goes down, useful in @@ -283,26 +296,32 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, int time_out, u8 exp_msg) { struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + u8 rcv_msg; - wait_event_interruptible_timeout(lgr->llc_waiter, - (flow->qentry || - (lnk && !smc_link_usable(lnk)) || - list_empty(&lgr->list)), - time_out); + wait_event_timeout(lgr->llc_msg_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); if (!flow->qentry || (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { smc_llc_flow_qentry_del(flow); goto out; } - if (exp_msg && flow->qentry->msg.raw.hdr.common.type != exp_msg) { + rcv_msg = flow->qentry->msg.raw.hdr.common.type; + if (exp_msg && rcv_msg != exp_msg) { if (exp_msg == SMC_LLC_ADD_LINK && - flow->qentry->msg.raw.hdr.common.type == - SMC_LLC_DELETE_LINK) { + rcv_msg == SMC_LLC_DELETE_LINK) { /* flow_start will delay the unexpected msg */ smc_llc_flow_start(&lgr->llc_flow_lcl, smc_llc_flow_qentry_clr(flow)); return NULL; } + pr_warn_once("smc: SMC-R lg %*phN dropped unexpected LLC msg: " + "msg %d exp %d flow %d role %d flags %x\n", + SMC_LGR_ID_SIZE, &lgr->id, rcv_msg, exp_msg, + flow->type, lgr->role, + flow->qentry->msg.raw.hdr.flags); smc_llc_flow_qentry_del(flow); } out: @@ -1459,7 +1478,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) /* a flow is waiting for this message */ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); - wake_up_interruptible(&lgr->llc_waiter); + wake_up(&lgr->llc_msg_waiter); } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { schedule_work(&lgr->llc_add_link_work); @@ -1474,7 +1493,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* a flow is waiting for this message */ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); - wake_up_interruptible(&lgr->llc_waiter); + wake_up(&lgr->llc_msg_waiter); return; } break; @@ -1485,7 +1504,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) /* DEL LINK REQ during ADD LINK SEQ */ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); - wake_up_interruptible(&lgr->llc_waiter); + wake_up(&lgr->llc_msg_waiter); } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { schedule_work(&lgr->llc_del_link_work); @@ -1496,7 +1515,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) /* DEL LINK REQ during ADD LINK SEQ */ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); - wake_up_interruptible(&lgr->llc_waiter); + wake_up(&lgr->llc_msg_waiter); } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { schedule_work(&lgr->llc_del_link_work); @@ -1581,7 +1600,7 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_DELETE_RKEY: /* assign responses to the local flow, we requested them */ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); - wake_up_interruptible(&link->lgr->llc_waiter); + wake_up(&link->lgr->llc_msg_waiter); return; case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ @@ -1616,7 +1635,7 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) spin_lock_irqsave(&lgr->llc_event_q_lock, flags); list_add_tail(&qentry->list, &lgr->llc_event_q); spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); - schedule_work(&link->lgr->llc_event_work); + schedule_work(&lgr->llc_event_work); } /* copy received msg and add it to the event queue */ @@ -1677,7 +1696,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) INIT_LIST_HEAD(&lgr->llc_event_q); spin_lock_init(&lgr->llc_event_q_lock); spin_lock_init(&lgr->llc_flow_lock); - init_waitqueue_head(&lgr->llc_waiter); + init_waitqueue_head(&lgr->llc_flow_waiter); + init_waitqueue_head(&lgr->llc_msg_waiter); mutex_init(&lgr->llc_conf_mutex); lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; } @@ -1686,7 +1706,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) void smc_llc_lgr_clear(struct smc_link_group *lgr) { smc_llc_event_flush(lgr); - wake_up_interruptible_all(&lgr->llc_waiter); + wake_up_all(&lgr->llc_flow_waiter); + wake_up_all(&lgr->llc_msg_waiter); cancel_work_sync(&lgr->llc_event_work); cancel_work_sync(&lgr->llc_add_link_work); cancel_work_sync(&lgr->llc_del_link_work); -- cgit v1.2.3 From b7eede757883a9892dcb7bf0280f4890fc74bcf6 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 8 Jul 2020 17:05:12 +0200 Subject: net/smc: fix work request handling Wait for pending sends only when smc_switch_conns() found a link to move the connections to. Do not wait during link freeing, this can lead to permanent hang situations. And refuse to provide a new tx slot on an unusable link. Fixes: c6f02ebeea3a ("net/smc: switch connections to alternate link") Reviewed-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 8 ++++---- net/smc/smc_wr.c | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index df164232574b..c1a038689c63 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1241,8 +1241,8 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_llc_send_message(lnk, &qentry->msg); /* response */ if (smc_link_downing(&lnk_del->state)) { - smc_switch_conns(lgr, lnk_del, false); - smc_wr_tx_wait_no_pending_sends(lnk_del); + if (smc_switch_conns(lgr, lnk_del, false)) + smc_wr_tx_wait_no_pending_sends(lnk_del); } smcr_link_clear(lnk_del, true); @@ -1316,8 +1316,8 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) goto out; /* asymmetric link already deleted */ if (smc_link_downing(&lnk_del->state)) { - smc_switch_conns(lgr, lnk_del, false); - smc_wr_tx_wait_no_pending_sends(lnk_del); + if (smc_switch_conns(lgr, lnk_del, false)) + smc_wr_tx_wait_no_pending_sends(lnk_del); } if (!list_empty(&lgr->list)) { /* qentry is either a request from peer (send it back to diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 7239ba9b99dc..1e23cdd41eb1 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -169,6 +169,8 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) { *idx = link->wr_tx_cnt; + if (!smc_link_usable(link)) + return -ENOLINK; for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; @@ -560,15 +562,15 @@ void smc_wr_free_link(struct smc_link *lnk) { struct ib_device *ibdev; + if (!lnk->smcibdev) + return; + ibdev = lnk->smcibdev->ibdev; + if (smc_wr_tx_wait_no_pending_sends(lnk)) memset(lnk->wr_tx_mask, 0, BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); - if (!lnk->smcibdev) - return; - ibdev = lnk->smcibdev->ibdev; - if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, -- cgit v1.2.3 From 92f3cb0e11dda530d1daa42d7a11af5a92ed89e4 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 8 Jul 2020 17:05:13 +0200 Subject: net/smc: fix sleep bug in smc_pnet_find_roce_resource() Tests showed this BUG: [572555.252867] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:935 [572555.252876] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 131031, name: smcapp [572555.252879] INFO: lockdep is turned off. [572555.252883] CPU: 1 PID: 131031 Comm: smcapp Tainted: G O 5.7.0-rc3uschi+ #356 [572555.252885] Hardware name: IBM 3906 M03 703 (LPAR) [572555.252887] Call Trace: [572555.252896] [<00000000ac364554>] show_stack+0x94/0xe8 [572555.252901] [<00000000aca1f400>] dump_stack+0xa0/0xe0 [572555.252906] [<00000000ac3c8c10>] ___might_sleep+0x260/0x280 [572555.252910] [<00000000acdc0c98>] __mutex_lock+0x48/0x940 [572555.252912] [<00000000acdc15c2>] mutex_lock_nested+0x32/0x40 [572555.252975] [<000003ff801762d0>] mlx5_lag_get_roce_netdev+0x30/0xc0 [mlx5_core] [572555.252996] [<000003ff801fb3aa>] mlx5_ib_get_netdev+0x3a/0xe0 [mlx5_ib] [572555.253007] [<000003ff80063848>] smc_pnet_find_roce_resource+0x1d8/0x310 [smc] [572555.253011] [<000003ff800602f0>] __smc_connect+0x1f0/0x3e0 [smc] [572555.253015] [<000003ff80060634>] smc_connect+0x154/0x190 [smc] [572555.253022] [<00000000acbed8d4>] __sys_connect+0x94/0xd0 [572555.253025] [<00000000acbef620>] __s390x_sys_socketcall+0x170/0x360 [572555.253028] [<00000000acdc6800>] system_call+0x298/0x2b8 [572555.253030] INFO: lockdep is turned off. Function smc_pnet_find_rdma_dev() might be called from smc_pnet_find_roce_resource(). It holds the smc_ib_devices list spinlock while calling infiniband op get_netdev(). At least for mlx5 the get_netdev operation wants mutex serialization, which conflicts with the smc_ib_devices spinlock. This patch switches the smc_ib_devices spinlock into a mutex to allow sleeping when calling get_netdev(). Fixes: a4cf0443c414 ("smc: introduce SMC as an IB-client") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_core.c | 5 +++-- net/smc/smc_ib.c | 11 ++++++----- net/smc/smc_ib.h | 3 ++- net/smc/smc_pnet.c | 21 +++++++++++---------- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d695ce71837e..8bf34d9f27e5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1961,14 +1962,14 @@ static void smc_core_going_away(void) struct smc_ib_device *smcibdev; struct smcd_dev *smcd; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { int i; for (i = 0; i < SMC_MAX_PORTS; i++) set_bit(i, smcibdev->ports_going_away); } - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); spin_lock(&smcd_dev_list.lock); list_for_each_entry(smcd, &smcd_dev_list.list, list) { diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 562a52d01ad1..7637fdebbb78 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -33,7 +34,7 @@ #define SMC_QP_RNR_RETRY 7 /* 7: infinite */ struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ - .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock), + .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex), .list = LIST_HEAD_INIT(smc_ib_devices.list), }; @@ -565,9 +566,9 @@ static int smc_ib_add_dev(struct ib_device *ibdev) INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); atomic_set(&smcibdev->lnk_cnt, 0); init_waitqueue_head(&smcibdev->lnks_deleted); - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_add_tail(&smcibdev->list, &smc_ib_devices.list); - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); ib_set_client_data(ibdev, &smc_ib_client, smcibdev); INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, smc_ib_global_event_handler); @@ -602,9 +603,9 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { struct smc_ib_device *smcibdev = client_data; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); pr_warn_ratelimited("smc: removing ib device %s\n", smcibdev->ibdev->name); smc_smcr_terminate_all(smcibdev); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index e6a696ae15f3..ae6776e1e726 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -25,7 +26,7 @@ struct smc_ib_devices { /* list of smc ib devices definition */ struct list_head list; - spinlock_t lock; /* protects list of smc ib devices */ + struct mutex mutex; /* protects list of smc ib devices */ }; extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 014d91b9778e..d4aac31d39f5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -129,7 +130,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) return rc; /* remove ib devices */ - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { if (ibdev->pnetid_by_user[ibport] && @@ -149,7 +150,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) } } } - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ spin_lock(&smcd_dev_list.lock); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { @@ -240,14 +241,14 @@ static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; bool applied = false; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, SMC_MAX_PNETID_LEN); ib_dev->pnetid_by_user[ib_port - 1] = true; applied = true; } - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); return applied; } @@ -300,7 +301,7 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || @@ -311,7 +312,7 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) } ibdev = NULL; out: - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); return ibdev; } @@ -825,7 +826,7 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, int i; ini->ib_dev = NULL; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == known_dev) continue; @@ -844,7 +845,7 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, } } out: - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); } /* find alternate roce device with same pnet_id and vlan_id */ @@ -863,7 +864,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, { struct smc_ib_device *ibdev; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { struct net_device *ndev; int i; @@ -888,7 +889,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, } } } - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); } /* Determine the corresponding IB device port based on the hardware PNETID. -- cgit v1.2.3 From 82087c0330534d18e6db25869871e589d214b7fa Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 8 Jul 2020 17:05:14 +0200 Subject: net/smc: switch smcd_dev_list spinlock to mutex The similar smc_ib_devices spinlock has been converted to a mutex. Protecting the smcd_dev_list by a mutex is possible as well. This patch converts the smcd_dev_list spinlock to a mutex. Fixes: c6ba7c9ba43d ("net/smc: add base infrastructure for SMC-D and ISM") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_core.c | 8 ++++---- net/smc/smc_ism.c | 11 ++++++----- net/smc/smc_ism.h | 3 ++- net/smc/smc_pnet.c | 16 ++++++++-------- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 8bf34d9f27e5..f69d205b3e11 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1971,11 +1971,11 @@ static void smc_core_going_away(void) } mutex_unlock(&smc_ib_devices.mutex); - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { smcd->going_away = 1; } - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); } /* Clean up all SMC link groups */ @@ -1987,10 +1987,10 @@ static void smc_lgrs_shutdown(void) smc_smcr_terminate_all(NULL); - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) smc_smcd_terminate_all(smcd); - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); } static int smc_core_reboot_event(struct notifier_block *this, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 91f85fc09fb8..998c525de785 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -17,7 +18,7 @@ struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), - .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock) + .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; /* Test if an ISM communication is possible. */ @@ -317,9 +318,9 @@ EXPORT_SYMBOL_GPL(smcd_alloc_dev); int smcd_register_dev(struct smcd_dev *smcd) { - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_add_tail(&smcd->list, &smcd_dev_list.list); - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", dev_name(&smcd->dev), smcd->pnetid, @@ -333,9 +334,9 @@ void smcd_unregister_dev(struct smcd_dev *smcd) { pr_warn_ratelimited("smc: removing smcd device %s\n", dev_name(&smcd->dev)); - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); smcd->going_away = 1; smc_smcd_terminate_all(smcd); flush_workqueue(smcd->event_wq); diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 4da946cbfa29..81cc4537efd3 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -10,12 +10,13 @@ #define SMCD_ISM_H #include +#include #include "smc.h" struct smcd_dev_list { /* List of SMCD devices */ struct list_head list; - spinlock_t lock; /* Protects list of devices */ + struct mutex mutex; /* Protects list of devices */ }; extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index d4aac31d39f5..30e5fac7034e 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -152,7 +152,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) } mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (smcd_dev->pnetid_by_user && (!pnet_name || @@ -166,7 +166,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) rc = 0; } } - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); return rc; } @@ -259,13 +259,13 @@ static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; bool applied = false; - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = true; applied = true; } - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); return applied; } @@ -321,7 +321,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) { struct smcd_dev *smcd_dev; - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (!strncmp(dev_name(&smcd_dev->dev), smcd_name, IB_DEVICE_NAME_MAX - 1)) @@ -329,7 +329,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) } smcd_dev = NULL; out: - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); return smcd_dev; } @@ -925,7 +925,7 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) return; /* pnetid could not be determined */ - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away) { @@ -933,7 +933,7 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, break; } } - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); } /* PNET table analysis for a given sock: -- cgit v1.2.3 From fb4f79264c0fc6fd5a68ffe3e31bfff97311e1f1 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 8 Jul 2020 17:05:15 +0200 Subject: net/smc: tolerate future SMCD versions CLC proposal messages of future SMCD versions could be larger than SMCD V1 CLC proposal messages. To enable toleration in SMC V1 the receival of CLC proposal messages is adapted: * accept larger length values in CLC proposal * check trailing eye catcher for incoming CLC proposal with V1 length only * receive the whole CLC proposal even in cases it does not fit into the V1 buffer Fixes: e7b7a64a8493d ("smc: support variable CLC proposal messages") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 45 ++++++++++++++++++++++++++++++++------------- net/smc/smc_clc.h | 2 ++ 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index d5627df24215..779f4142a11d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -27,6 +27,7 @@ #define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 #define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 +#define SMC_CLC_RECV_BUF_LEN 100 /* eye catcher "SMCR" EBCDIC for CLC messages */ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; @@ -36,7 +37,7 @@ static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers */ -static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) +static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) { struct smc_clc_msg_proposal_prefix *pclc_prfx; struct smc_clc_msg_accept_confirm *clc; @@ -49,12 +50,9 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) return false; switch (clcm->type) { case SMC_CLC_PROPOSAL: - if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && - clcm->path != SMC_TYPE_B) - return false; pclc = (struct smc_clc_msg_proposal *)clcm; pclc_prfx = smc_clc_proposal_get_prefix(pclc); - if (ntohs(pclc->hdr.length) != + if (ntohs(pclc->hdr.length) < sizeof(*pclc) + ntohs(pclc->iparea_offset) + sizeof(*pclc_prfx) + pclc_prfx->ipv6_prefixes_cnt * @@ -86,7 +84,8 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) default: return false; } - if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + if (check_trl && + memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; return true; @@ -276,7 +275,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, struct msghdr msg = {NULL, 0}; int reason_code = 0; struct kvec vec = {buf, buflen}; - int len, datlen; + int len, datlen, recvlen; + bool check_trl = true; int krflags; /* peek the first few bytes to determine length of data to receive @@ -320,10 +320,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || - (datlen > buflen) || - (clcm->version != SMC_CLC_V1) || - (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && - clcm->path != SMC_TYPE_B) || + (clcm->version < SMC_CLC_V1) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -331,16 +328,38 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, goto out; } + if (clcm->type == SMC_CLC_PROPOSAL && clcm->path == SMC_TYPE_N) + reason_code = SMC_CLC_DECL_VERSMISMAT; /* just V2 offered */ + /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, datlen); + if (datlen > buflen) { + check_trl = false; + recvlen = buflen; + } else { + recvlen = datlen; + } + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); - if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { + if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } + datlen -= len; + while (datlen) { + u8 tmp[SMC_CLC_RECV_BUF_LEN]; + + vec.iov_base = &tmp; + vec.iov_len = SMC_CLC_RECV_BUF_LEN; + /* receive remaining proposal message */ + recvlen = datlen > SMC_CLC_RECV_BUF_LEN ? + SMC_CLC_RECV_BUF_LEN : datlen; + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); + len = sock_recvmsg(smc->clcsock, &msg, krflags); + datlen -= len; + } if (clcm->type == SMC_CLC_DECLINE) { struct smc_clc_msg_decline *dclc; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 465876701b75..76c2b150d040 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -25,6 +25,7 @@ #define SMC_CLC_V1 0x1 /* SMC version */ #define SMC_TYPE_R 0 /* SMC-R only */ #define SMC_TYPE_D 1 /* SMC-D only */ +#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */ #define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */ @@ -46,6 +47,7 @@ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ #define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ +#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ -- cgit v1.2.3 From 27d53323664c549b5bb2dfaaf6f7ad6e0376a64e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 7 Jul 2020 02:02:32 +0800 Subject: l2tp: remove skb_dst_set() from l2tp_xmit_skb() In the tx path of l2tp, l2tp_xmit_skb() calls skb_dst_set() to set skb's dst. However, it will eventually call inet6_csk_xmit() or ip_queue_xmit() where skb's dst will be overwritten by: skb_dst_set_noref(skb, dst); without releasing the old dst in skb. Then it causes dst/dev refcnt leak: unregister_netdevice: waiting for eth0 to become free. Usage count = 1 This can be reproduced by simply running: # modprobe l2tp_eth && modprobe l2tp_ip # sh ./tools/testing/selftests/net/l2tp.sh So before going to inet6_csk_xmit() or ip_queue_xmit(), skb's dst should be dropped. This patch is to fix it by removing skb_dst_set() from l2tp_xmit_skb() and moving skb_dst_drop() into l2tp_xmit_core(). Fixes: 3557baabf280 ("[L2TP]: PPP over L2TP driver core") Reported-by: Hangbin Liu Signed-off-by: Xin Long Acked-by: James Chapman Tested-by: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 6d7ef78c88af..6434d17e6e8e 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1028,6 +1028,7 @@ static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, /* Queue the packet to IP for output */ skb->ignore_df = 1; + skb_dst_drop(skb); #if IS_ENABLED(CONFIG_IPV6) if (l2tp_sk_is_v6(tunnel->sock)) error = inet6_csk_xmit(tunnel->sock, skb, NULL); @@ -1099,10 +1100,6 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len goto out_unlock; } - /* Get routing info from the tunnel socket */ - skb_dst_drop(skb); - skb_dst_set(skb, sk_dst_check(sk, 0)); - inet = inet_sk(sk); fl = &inet->cork.fl; switch (tunnel->encap) { -- cgit v1.2.3 From a34f829164f3c70d7f53bb532ddcc39fa890b722 Mon Sep 17 00:00:00 2001 From: Hamish Martin Date: Thu, 9 Jul 2020 09:06:44 +1200 Subject: tipc: fix retransmission on unicast links A scenario has been observed where a 'bc_init' message for a link is not retransmitted if it fails to be received by the peer. This leads to the peer never establishing the link fully and it discarding all other data received on the link. In this scenario the message is lost in transit to the peer. The issue is traced to the 'nxt_retr' field of the skb not being initialised for links that aren't a bc_sndlink. This leads to the comparison in tipc_link_advance_transmq() that gates whether to attempt retransmission of a message performing in an undesirable way. Depending on the relative value of 'jiffies', this comparison: time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr) may return true or false given that 'nxt_retr' remains at the uninitialised value of 0 for non bc_sndlinks. This is most noticeable shortly after boot when jiffies is initialised to a high value (to flush out rollover bugs) and we compare a jiffies of, say, 4294940189 to zero. In that case time_before returns 'true' leading to the skb not being retransmitted. The fix is to ensure that all skbs have a valid 'nxt_retr' time set for them and this is achieved by refactoring the setting of this value into a central function. With this fix, transmission losses of 'bc_init' messages do not stall the link establishment forever because the 'bc_init' message is retransmitted and the link eventually establishes correctly. Fixes: 382f598fb66b ("tipc: reduce duplicate packets for unicast traffic") Acked-by: Jon Maloy Signed-off-by: Hamish Martin Signed-off-by: David S. Miller --- net/tipc/link.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index ee3b8d0576b8..263d950e70e9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -921,6 +921,21 @@ static void link_prepare_wakeup(struct tipc_link *l) } +/** + * tipc_link_set_skb_retransmit_time - set the time at which retransmission of + * the given skb should be next attempted + * @skb: skb to set a future retransmission time for + * @l: link the skb will be transmitted on + */ +static void tipc_link_set_skb_retransmit_time(struct sk_buff *skb, + struct tipc_link *l) +{ + if (link_is_bc_sndlink(l)) + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; + else + TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; +} + void tipc_link_reset(struct tipc_link *l) { struct sk_buff_head list; @@ -1036,9 +1051,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return -ENOBUFS; } __skb_queue_tail(transmq, skb); - /* next retransmit attempt */ - if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; + tipc_link_set_skb_retransmit_time(skb, l); __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; @@ -1139,9 +1152,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l, if (unlikely(skb == l->backlog[imp].target_bskb)) l->backlog[imp].target_bskb = NULL; __skb_queue_tail(&l->transmq, skb); - /* next retransmit attempt */ - if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; + tipc_link_set_skb_retransmit_time(skb, l); __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; @@ -1584,8 +1595,7 @@ release: /* retransmit skb if unrestricted*/ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; - TIPC_SKB_CB(skb)->nxt_retr = (is_uc) ? - TIPC_UC_RETR_TIME : TIPC_BC_RETR_LIM; + tipc_link_set_skb_retransmit_time(skb, l); _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; -- cgit v1.2.3 From 76c4d85c9260c3d741cbd194c30c61983d0a4303 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Thu, 9 Jul 2020 03:14:27 +0530 Subject: cxgb4: fix all-mask IP address comparison Convert all-mask IP address to Big Endian, instead, for comparison. Fixes: f286dd8eaad5 ("cxgb4: use correct type for all-mask IP address comparison") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 7a7f61a8cdf4..d02d346629b3 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -1112,16 +1112,16 @@ static bool is_addr_all_mask(u8 *ipmask, int family) struct in_addr *addr; addr = (struct in_addr *)ipmask; - if (ntohl(addr->s_addr) == 0xffffffff) + if (addr->s_addr == htonl(0xffffffff)) return true; } else if (family == AF_INET6) { struct in6_addr *addr6; addr6 = (struct in6_addr *)ipmask; - if (ntohl(addr6->s6_addr32[0]) == 0xffffffff && - ntohl(addr6->s6_addr32[1]) == 0xffffffff && - ntohl(addr6->s6_addr32[2]) == 0xffffffff && - ntohl(addr6->s6_addr32[3]) == 0xffffffff) + if (addr6->s6_addr32[0] == htonl(0xffffffff) && + addr6->s6_addr32[1] == htonl(0xffffffff) && + addr6->s6_addr32[2] == htonl(0xffffffff) && + addr6->s6_addr32[3] == htonl(0xffffffff)) return true; } return false; -- cgit v1.2.3 From 13cf8aab7425a253070433b5a55b4209ceac8b19 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 8 Jul 2020 20:14:29 -0700 Subject: qed: Populate nvm-file attributes while reading nvm config partition. NVM config file address will be modified when the MBI image is upgraded. Driver would return stale config values if user reads the nvm-config (via ethtool -d) in this state. The fix is to re-populate nvm attribute info while reading the nvm config values/partition. Changes from previous version: ------------------------------- v3: Corrected the formatting in 'Fixes' tag. v2: Added 'Fixes' tag. Fixes: 1ac4329a1cff ("qed: Add configuration information to register dump and debug data") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 4 ++++ drivers/net/ethernet/qlogic/qed/qed_dev.c | 12 +++--------- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 7 +++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.h | 7 +++++++ 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index cb80863d5a77..3b9bbafafe68 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -7941,6 +7941,10 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer) DP_ERR(cdev, "qed_dbg_mcp_trace failed. rc = %d\n", rc); } + /* Re-populate nvm attribute info */ + qed_mcp_nvm_info_free(p_hwfn); + qed_mcp_nvm_info_populate(p_hwfn); + /* nvm cfg1 */ rc = qed_dbg_nvm_image(cdev, (u8 *)buffer + offset + diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 3aa51374e727..9c26fde663b3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -4472,12 +4472,6 @@ static int qed_get_dev_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) return 0; } -static void qed_nvm_info_free(struct qed_hwfn *p_hwfn) -{ - kfree(p_hwfn->nvm_info.image_att); - p_hwfn->nvm_info.image_att = NULL; -} - static int qed_hw_prepare_single(struct qed_hwfn *p_hwfn, void __iomem *p_regview, void __iomem *p_doorbells, @@ -4562,7 +4556,7 @@ static int qed_hw_prepare_single(struct qed_hwfn *p_hwfn, return rc; err3: if (IS_LEAD_HWFN(p_hwfn)) - qed_nvm_info_free(p_hwfn); + qed_mcp_nvm_info_free(p_hwfn); err2: if (IS_LEAD_HWFN(p_hwfn)) qed_iov_free_hw_info(p_hwfn->cdev); @@ -4623,7 +4617,7 @@ int qed_hw_prepare(struct qed_dev *cdev, if (rc) { if (IS_PF(cdev)) { qed_init_free(p_hwfn); - qed_nvm_info_free(p_hwfn); + qed_mcp_nvm_info_free(p_hwfn); qed_mcp_free(p_hwfn); qed_hw_hwfn_free(p_hwfn); } @@ -4657,7 +4651,7 @@ void qed_hw_remove(struct qed_dev *cdev) qed_iov_free_hw_info(cdev); - qed_nvm_info_free(p_hwfn); + qed_mcp_nvm_info_free(p_hwfn); } static void qed_chain_free_next_ptr(struct qed_dev *cdev, diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 9624616806e7..0fd4520d0666 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -3280,6 +3280,13 @@ err0: return rc; } +void qed_mcp_nvm_info_free(struct qed_hwfn *p_hwfn) +{ + kfree(p_hwfn->nvm_info.image_att); + p_hwfn->nvm_info.image_att = NULL; + p_hwfn->nvm_info.valid = false; +} + int qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn, enum qed_nvm_images image_id, diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index 5750b4c5ef63..12a705ed4bac 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -1220,6 +1220,13 @@ void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); */ int qed_mcp_nvm_info_populate(struct qed_hwfn *p_hwfn); +/** + * @brief Delete nvm info shadow in the given hardware function + * + * @param p_hwfn + */ +void qed_mcp_nvm_info_free(struct qed_hwfn *p_hwfn); + /** * @brief Get the engine affinity configuration. * -- cgit v1.2.3 From 306381aec7c2b5a658eebca008c8a1b666536cba Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 8 Jul 2020 20:13:59 -0700 Subject: net_sched: fix a memory leak in atm_tc_init() When tcf_block_get() fails inside atm_tc_init(), atm_tc_put() is called to release the qdisc p->link.q. But the flow->ref prevents it to do so, as the flow->ref is still zero. Fix this by moving the p->link.ref initialization before tcf_block_get(). Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Reported-and-tested-by: syzbot+d411cff6ab29cc2c311b@syzkaller.appspotmail.com Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_atm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index ee12ca9f55b4..1c281cc81f57 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -553,16 +553,16 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, if (!p->link.q) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); + p->link.vcc = NULL; + p->link.sock = NULL; + p->link.common.classid = sch->handle; + p->link.ref = 1; err = tcf_block_get(&p->link.block, &p->link.filter_list, sch, extack); if (err) return err; - p->link.vcc = NULL; - p->link.sock = NULL; - p->link.common.classid = sch->handle; - p->link.ref = 1; tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch); return 0; } -- cgit v1.2.3 From 365f9ae4ee36037e2a9268fe7296065356840b4c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 9 Jul 2020 12:11:50 +0200 Subject: ethtool: fix genlmsg_put() failure handling in ethnl_default_dumpit() If the genlmsg_put() call in ethnl_default_dumpit() fails, we bail out without checking if we already have some messages in current skb like we do with ethnl_default_dump_one() failure later. Therefore if existing messages almost fill up the buffer so that there is not enough space even for netlink and genetlink header, we lose all prepared messages and return and error. Rather than duplicating the skb->len check, move the genlmsg_put(), genlmsg_cancel() and genlmsg_end() calls into ethnl_default_dump_one(). This is also more logical as all message composition will be in ethnl_default_dump_one() and only iteration logic will be left in ethnl_default_dumpit(). Fixes: 728480f12442 ("ethtool: default handlers for GET requests") Reported-by: Jakub Kicinski Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 88fd07f47040..dd8a1c1dc07d 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -376,10 +376,17 @@ err_dev: } static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, - const struct ethnl_dump_ctx *ctx) + const struct ethnl_dump_ctx *ctx, + struct netlink_callback *cb) { + void *ehdr; int ret; + ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + ðtool_genl_family, 0, ctx->ops->reply_cmd); + if (!ehdr) + return -EMSGSIZE; + ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, NULL); @@ -395,6 +402,10 @@ out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); ctx->reply_data->dev = NULL; + if (ret < 0) + genlmsg_cancel(skb, ehdr); + else + genlmsg_end(skb, ehdr); return ret; } @@ -411,7 +422,6 @@ static int ethnl_default_dumpit(struct sk_buff *skb, int s_idx = ctx->pos_idx; int h, idx = 0; int ret = 0; - void *ehdr; rtnl_lock(); for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { @@ -431,26 +441,15 @@ restart_chain: dev_hold(dev); rtnl_unlock(); - ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - ðtool_genl_family, 0, - ctx->ops->reply_cmd); - if (!ehdr) { - dev_put(dev); - ret = -EMSGSIZE; - goto out; - } - ret = ethnl_default_dump_one(skb, dev, ctx); + ret = ethnl_default_dump_one(skb, dev, ctx, cb); dev_put(dev); if (ret < 0) { - genlmsg_cancel(skb, ehdr); if (ret == -EOPNOTSUPP) goto lock_and_cont; if (likely(skb->len)) ret = skb->len; goto out; } - genlmsg_end(skb, ehdr); lock_and_cont: rtnl_lock(); if (net->dev_base_seq != seq) { -- cgit v1.2.3 From f3dda7a679df183e798b86e7b6ec05ab35476de3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jul 2020 23:11:04 -0700 Subject: bpf: net: Avoid copying sk_user_data of reuseport_array during sk_clone It makes little sense for copying sk_user_data of reuseport_array during sk_clone_lock(). This patch reuses the SK_USER_DATA_NOCOPY bit introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). It is used to mark the sk_user_data is not supposed to be copied to its clone. Although the cloned sk's sk_user_data will not be used/freed in bpf_sk_reuseport_detach(), this change can still allow the cloned sk's sk_user_data to be used by some other means. Freeing the reuseport_array's sk_user_data does not require a rcu grace period. Thus, the existing rcu_assign_sk_user_data_nocopy() is not used. Fixes: 5dc4c4b7d4e8 ("bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200709061104.4018798-1-kafai@fb.com --- kernel/bpf/reuseport_array.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 21cde24386db..a95bc8d7e812 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -20,11 +20,14 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - struct sock __rcu **socks; + uintptr_t sk_user_data; write_lock_bh(&sk->sk_callback_lock); - socks = sk->sk_user_data; - if (socks) { + sk_user_data = (uintptr_t)sk->sk_user_data; + if (sk_user_data) { + struct sock __rcu **socks; + + socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of @@ -252,6 +255,7 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; + uintptr_t sk_user_data; struct socket *socket; int err, fd; @@ -305,7 +309,8 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY; + WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; -- cgit v1.2.3 From c9a368f1c0fbe2e3a21ebf231caeae58b18b2681 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jul 2020 23:11:10 -0700 Subject: bpf: net: Avoid incorrect bpf_sk_reuseport_detach call bpf_sk_reuseport_detach is currently called when sk->sk_user_data is not NULL. It is incorrect because sk->sk_user_data may not be managed by the bpf's reuseport_array. It has been reported in [1] that, the bpf_sk_reuseport_detach() which is called from udp_lib_unhash() has corrupted the sk_user_data managed by l2tp. This patch solves it by using another bit (defined as SK_USER_DATA_BPF) of the sk_user_data pointer value. It marks that a sk_user_data is managed/owned by BPF. The patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). [ Note: sk->sk_user_data is used by bpf's reuseport_array only when a sk is added to the bpf's reuseport_array. i.e. doing setsockopt(SO_REUSEPORT) and having "sk->sk_reuseport == 1" alone will not stop sk->sk_user_data being used by other means. ] [1]: https://lore.kernel.org/netdev/20200706121259.GA20199@katalix.com/ Fixes: 5dc4c4b7d4e8 ("bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY") Reported-by: James Chapman Reported-by: syzbot+9f092552ba9a5efca5df@syzkaller.appspotmail.com Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: James Chapman Acked-by: James Chapman Link: https://lore.kernel.org/bpf/20200709061110.4019316-1-kafai@fb.com --- include/net/sock.h | 3 ++- kernel/bpf/reuseport_array.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 3428619faae4..1183507df95b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -533,7 +533,8 @@ enum sk_pacing { * be copied. */ #define SK_USER_DATA_NOCOPY 1UL -#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY) +#define SK_USER_DATA_BPF 2UL /* Managed by BPF */ +#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a95bc8d7e812..cae9d505e04a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -24,7 +24,7 @@ void bpf_sk_reuseport_detach(struct sock *sk) write_lock_bh(&sk->sk_callback_lock); sk_user_data = (uintptr_t)sk->sk_user_data; - if (sk_user_data) { + if (sk_user_data & SK_USER_DATA_BPF) { struct sock __rcu **socks; socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); @@ -309,7 +309,8 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY; + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | + SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; -- cgit v1.2.3 From ce69e563b325f620863830c246a8698ccea52048 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Wed, 8 Jul 2020 16:18:34 -0700 Subject: tcp: make sure listeners don't initialize congestion-control state syzkaller found its way into setsockopt with TCP_CONGESTION "cdg". tcp_cdg_init() does a kcalloc to store the gradients. As sk_clone_lock just copies all the memory, the allocated pointer will be copied as well, if the app called setsockopt(..., TCP_CONGESTION) on the listener. If now the socket will be destroyed before the congestion-control has properly been initialized (through a call to tcp_init_transfer), we will end up freeing memory that does not belong to that particular socket, opening the door to a double-free: [ 11.413102] ================================================================== [ 11.414181] BUG: KASAN: double-free or invalid-free in tcp_cleanup_congestion_control+0x58/0xd0 [ 11.415329] [ 11.415560] CPU: 3 PID: 4884 Comm: syz-executor.5 Not tainted 5.8.0-rc2 #80 [ 11.416544] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 11.418148] Call Trace: [ 11.418534] [ 11.418834] dump_stack+0x7d/0xb0 [ 11.419297] print_address_description.constprop.0+0x1a/0x210 [ 11.422079] kasan_report_invalid_free+0x51/0x80 [ 11.423433] __kasan_slab_free+0x15e/0x170 [ 11.424761] kfree+0x8c/0x230 [ 11.425157] tcp_cleanup_congestion_control+0x58/0xd0 [ 11.425872] tcp_v4_destroy_sock+0x57/0x5a0 [ 11.426493] inet_csk_destroy_sock+0x153/0x2c0 [ 11.427093] tcp_v4_syn_recv_sock+0xb29/0x1100 [ 11.427731] tcp_get_cookie_sock+0xc3/0x4a0 [ 11.429457] cookie_v4_check+0x13d0/0x2500 [ 11.433189] tcp_v4_do_rcv+0x60e/0x780 [ 11.433727] tcp_v4_rcv+0x2869/0x2e10 [ 11.437143] ip_protocol_deliver_rcu+0x23/0x190 [ 11.437810] ip_local_deliver+0x294/0x350 [ 11.439566] __netif_receive_skb_one_core+0x15d/0x1a0 [ 11.441995] process_backlog+0x1b1/0x6b0 [ 11.443148] net_rx_action+0x37e/0xc40 [ 11.445361] __do_softirq+0x18c/0x61a [ 11.445881] asm_call_on_stack+0x12/0x20 [ 11.446409] [ 11.446716] do_softirq_own_stack+0x34/0x40 [ 11.447259] do_softirq.part.0+0x26/0x30 [ 11.447827] __local_bh_enable_ip+0x46/0x50 [ 11.448406] ip_finish_output2+0x60f/0x1bc0 [ 11.450109] __ip_queue_xmit+0x71c/0x1b60 [ 11.451861] __tcp_transmit_skb+0x1727/0x3bb0 [ 11.453789] tcp_rcv_state_process+0x3070/0x4d3a [ 11.456810] tcp_v4_do_rcv+0x2ad/0x780 [ 11.457995] __release_sock+0x14b/0x2c0 [ 11.458529] release_sock+0x4a/0x170 [ 11.459005] __inet_stream_connect+0x467/0xc80 [ 11.461435] inet_stream_connect+0x4e/0xa0 [ 11.462043] __sys_connect+0x204/0x270 [ 11.465515] __x64_sys_connect+0x6a/0xb0 [ 11.466088] do_syscall_64+0x3e/0x70 [ 11.466617] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 11.467341] RIP: 0033:0x7f56046dc469 [ 11.467844] Code: Bad RIP value. [ 11.468282] RSP: 002b:00007f5604dccdd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a [ 11.469326] RAX: ffffffffffffffda RBX: 000000000068bf00 RCX: 00007f56046dc469 [ 11.470379] RDX: 0000000000000010 RSI: 0000000020000000 RDI: 0000000000000004 [ 11.471311] RBP: 00000000ffffffff R08: 0000000000000000 R09: 0000000000000000 [ 11.472286] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 11.473341] R13: 000000000041427c R14: 00007f5604dcd5c0 R15: 0000000000000003 [ 11.474321] [ 11.474527] Allocated by task 4884: [ 11.475031] save_stack+0x1b/0x40 [ 11.475548] __kasan_kmalloc.constprop.0+0xc2/0xd0 [ 11.476182] tcp_cdg_init+0xf0/0x150 [ 11.476744] tcp_init_congestion_control+0x9b/0x3a0 [ 11.477435] tcp_set_congestion_control+0x270/0x32f [ 11.478088] do_tcp_setsockopt.isra.0+0x521/0x1a00 [ 11.478744] __sys_setsockopt+0xff/0x1e0 [ 11.479259] __x64_sys_setsockopt+0xb5/0x150 [ 11.479895] do_syscall_64+0x3e/0x70 [ 11.480395] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 11.481097] [ 11.481321] Freed by task 4872: [ 11.481783] save_stack+0x1b/0x40 [ 11.482230] __kasan_slab_free+0x12c/0x170 [ 11.482839] kfree+0x8c/0x230 [ 11.483240] tcp_cleanup_congestion_control+0x58/0xd0 [ 11.483948] tcp_v4_destroy_sock+0x57/0x5a0 [ 11.484502] inet_csk_destroy_sock+0x153/0x2c0 [ 11.485144] tcp_close+0x932/0xfe0 [ 11.485642] inet_release+0xc1/0x1c0 [ 11.486131] __sock_release+0xc0/0x270 [ 11.486697] sock_close+0xc/0x10 [ 11.487145] __fput+0x277/0x780 [ 11.487632] task_work_run+0xeb/0x180 [ 11.488118] __prepare_exit_to_usermode+0x15a/0x160 [ 11.488834] do_syscall_64+0x4a/0x70 [ 11.489326] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Wei Wang fixed a part of these CDG-malloc issues with commit c12014440750 ("tcp: memset ca_priv data to 0 properly"). This patch here fixes the listener-scenario: We make sure that listeners setting the congestion-control through setsockopt won't initialize it (thus CDG never allocates on listeners). For those who use AF_UNSPEC to reuse a socket, tcp_disconnect() is changed to cleanup afterwards. (The issue can be reproduced at least down to v4.4.x.) Cc: Wei Wang Cc: Eric Dumazet Fixes: 2b0a8c9eee81 ("tcp: add CDG congestion control") Signed-off-by: Christoph Paasch Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_cong.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 861fbd84c9cf..6f0caf9a866d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2691,6 +2691,9 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3172e31987be..62878cf26d9c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -197,7 +197,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); - if (sk->sk_state != TCP_CLOSE) + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } -- cgit v1.2.3 From f43cb0d672aa8eb09bfdb779de5900c040487d1d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 9 Jul 2020 12:51:51 +0100 Subject: selftests: bpf: Fix detach from sockmap tests Fix sockmap tests which rely on old bpf_prog_dispatch behaviour. In the first case, the tests check that detaching without giving a program succeeds. Since these are not the desired semantics, invert the condition. In the second case, the clean up code doesn't supply the necessary program fds. Fixes: bb0de3131f4c ("bpf: sockmap: Require attach_bpf_fd when detaching a program") Reported-by: Martin KaFai Lau Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200709115151.75829-1-lmb@cloudflare.com --- tools/testing/selftests/bpf/test_maps.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 6a12a0e01e07..754cf611723e 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -789,19 +789,19 @@ static void test_sockmap(unsigned int tasks, void *data) } err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_PARSER); - if (err) { + if (!err) { printf("Failed empty parser prog detach\n"); goto out_sockmap; } err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_VERDICT); - if (err) { + if (!err) { printf("Failed empty verdict prog detach\n"); goto out_sockmap; } err = bpf_prog_detach(fd, BPF_SK_MSG_VERDICT); - if (err) { + if (!err) { printf("Failed empty msg verdict prog detach\n"); goto out_sockmap; } @@ -1090,19 +1090,19 @@ static void test_sockmap(unsigned int tasks, void *data) assert(status == 0); } - err = bpf_prog_detach(map_fd_rx, __MAX_BPF_ATTACH_TYPE); + err = bpf_prog_detach2(parse_prog, map_fd_rx, __MAX_BPF_ATTACH_TYPE); if (!err) { printf("Detached an invalid prog type.\n"); goto out_sockmap; } - err = bpf_prog_detach(map_fd_rx, BPF_SK_SKB_STREAM_PARSER); + err = bpf_prog_detach2(parse_prog, map_fd_rx, BPF_SK_SKB_STREAM_PARSER); if (err) { printf("Failed parser prog detach\n"); goto out_sockmap; } - err = bpf_prog_detach(map_fd_rx, BPF_SK_SKB_STREAM_VERDICT); + err = bpf_prog_detach2(verdict_prog, map_fd_rx, BPF_SK_SKB_STREAM_VERDICT); if (err) { printf("Failed parser prog detach\n"); goto out_sockmap; -- cgit v1.2.3 From 14b032b8f8fce03a546dcf365454bec8c4a58d7d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 9 Jul 2020 16:28:44 -0700 Subject: cgroup: Fix sock_cgroup_data on big-endian. In order for no_refcnt and is_data to be the lowest order two bits in the 'val' we have to pad out the bitfield of the u8. Fixes: ad0f75e5f57c ("cgroup: fix cgroup_sk_alloc() for sk_clone_lock()") Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4f1cd0edc57d..fee0b5547cd0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -792,6 +792,7 @@ struct sock_cgroup_data { struct { u8 is_data : 1; u8 no_refcnt : 1; + u8 unused : 6; u8 padding; u16 prioidx; u32 classid; @@ -801,6 +802,7 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; + u8 unused : 6; u8 no_refcnt : 1; u8 is_data : 1; } __packed; -- cgit v1.2.3 From 47afbdd2fa4c5775c383ba376a3d1da7d7f694dc Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 14 Jun 2020 17:31:26 +0300 Subject: net/mlx5: Fix eeprom support for SFP module Fix eeprom SFP query support by setting i2c_addr, offset and page number correctly. Unlike QSFP modules, SFP eeprom params are as follow: - i2c_addr is 0x50 for offset 0 - 255 and 0x51 for offset 256 - 511. - Page number is always zero. - Page offset is always relative to zero. As part of eeprom query, query the module ID (SFP / QSFP*) via helper function to set the params accordingly. In addition, change mlx5_qsfp_eeprom_page() input type to be u16 to avoid unnecessary casting. Fixes: a708fb7b1f8d ("net/mlx5e: ethtool, Add support for EEPROM high pages query") Signed-off-by: Eran Ben Elisha Signed-off-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 93 +++++++++++++++++++++----- 1 file changed, 77 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 9f829e68fc73..e4186e84b3ff 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -293,7 +293,40 @@ static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num) return 0; } -static int mlx5_eeprom_page(int offset) +static int mlx5_query_module_id(struct mlx5_core_dev *dev, int module_num, + u8 *module_id) +{ + u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {}; + u32 out[MLX5_ST_SZ_DW(mcia_reg)]; + int err, status; + u8 *ptr; + + MLX5_SET(mcia_reg, in, i2c_device_address, MLX5_I2C_ADDR_LOW); + MLX5_SET(mcia_reg, in, module, module_num); + MLX5_SET(mcia_reg, in, device_address, 0); + MLX5_SET(mcia_reg, in, page_number, 0); + MLX5_SET(mcia_reg, in, size, 1); + MLX5_SET(mcia_reg, in, l, 0); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCIA, 0, 0); + if (err) + return err; + + status = MLX5_GET(mcia_reg, out, status); + if (status) { + mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", + status); + return -EIO; + } + ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); + + *module_id = ptr[0]; + + return 0; +} + +static int mlx5_qsfp_eeprom_page(u16 offset) { if (offset < MLX5_EEPROM_PAGE_LENGTH) /* Addresses between 0-255 - page 00 */ @@ -307,7 +340,7 @@ static int mlx5_eeprom_page(int offset) MLX5_EEPROM_HIGH_PAGE_LENGTH); } -static int mlx5_eeprom_high_page_offset(int page_num) +static int mlx5_qsfp_eeprom_high_page_offset(int page_num) { if (!page_num) /* Page 0 always start from low page */ return 0; @@ -316,35 +349,62 @@ static int mlx5_eeprom_high_page_offset(int page_num) return page_num * MLX5_EEPROM_HIGH_PAGE_LENGTH; } +static void mlx5_qsfp_eeprom_params_set(u16 *i2c_addr, int *page_num, u16 *offset) +{ + *i2c_addr = MLX5_I2C_ADDR_LOW; + *page_num = mlx5_qsfp_eeprom_page(*offset); + *offset -= mlx5_qsfp_eeprom_high_page_offset(*page_num); +} + +static void mlx5_sfp_eeprom_params_set(u16 *i2c_addr, int *page_num, u16 *offset) +{ + *i2c_addr = MLX5_I2C_ADDR_LOW; + *page_num = 0; + + if (*offset < MLX5_EEPROM_PAGE_LENGTH) + return; + + *i2c_addr = MLX5_I2C_ADDR_HIGH; + *offset -= MLX5_EEPROM_PAGE_LENGTH; +} + int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, u16 offset, u16 size, u8 *data) { - int module_num, page_num, status, err; + int module_num, status, err, page_num = 0; + u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {}; u32 out[MLX5_ST_SZ_DW(mcia_reg)]; - u32 in[MLX5_ST_SZ_DW(mcia_reg)]; - u16 i2c_addr; - void *ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); + u16 i2c_addr = 0; + u8 module_id; + void *ptr; err = mlx5_query_module_num(dev, &module_num); if (err) return err; - memset(in, 0, sizeof(in)); - size = min_t(int, size, MLX5_EEPROM_MAX_BYTES); - - /* Get the page number related to the given offset */ - page_num = mlx5_eeprom_page(offset); + err = mlx5_query_module_id(dev, module_num, &module_id); + if (err) + return err; - /* Set the right offset according to the page number, - * For page_num > 0, relative offset is always >= 128 (high page). - */ - offset -= mlx5_eeprom_high_page_offset(page_num); + switch (module_id) { + case MLX5_MODULE_ID_SFP: + mlx5_sfp_eeprom_params_set(&i2c_addr, &page_num, &offset); + break; + case MLX5_MODULE_ID_QSFP: + case MLX5_MODULE_ID_QSFP_PLUS: + case MLX5_MODULE_ID_QSFP28: + mlx5_qsfp_eeprom_params_set(&i2c_addr, &page_num, &offset); + break; + default: + mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); + return -EINVAL; + } if (offset + size > MLX5_EEPROM_PAGE_LENGTH) /* Cross pages read, read until offset 256 in low page */ size -= offset + size - MLX5_EEPROM_PAGE_LENGTH; - i2c_addr = MLX5_I2C_ADDR_LOW; + size = min_t(int, size, MLX5_EEPROM_MAX_BYTES); MLX5_SET(mcia_reg, in, l, 0); MLX5_SET(mcia_reg, in, module, module_num); @@ -365,6 +425,7 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, return -EIO; } + ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); memcpy(data, ptr, size); return size; -- cgit v1.2.3 From 01f3d5db4a9add67f53630a7f71664538b3b9783 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Wed, 17 Jun 2020 15:11:24 -0700 Subject: net/mlx5: E-Switch, Fix vlan or qos setting in legacy mode Refactoring eswitch ingress acl codes accidentally inserts extra memset zero that removes vlan and/or qos setting in legacy mode. Fixes: 07bab9502641 ("net/mlx5: E-Switch, Refactor eswitch ingress acl codes") Signed-off-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index 5dc335e621c5..b68976b378b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -217,7 +217,6 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, } /* Create ingress allow rule */ - memset(spec, 0, sizeof(*spec)); spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; vport->ingress.allow_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, -- cgit v1.2.3 From 2fb15e72c0d7fc5fb05aefd3a7f0d70cf39d3ad4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 17 Jun 2020 17:26:33 +0300 Subject: net/mxl5e: Verify that rpriv is not NULL In helper function is_flow_rule_duplicate_allowed() verify that rpviv pointer is not NULL before dereferencing it. This can happen when device is in NIC mode and leads to following crash: [90444.046419] BUG: kernel NULL pointer dereference, address: 0000000000000000 [90444.048149] #PF: supervisor read access in kernel mode [90444.049781] #PF: error_code(0x0000) - not-present page [90444.051386] PGD 80000003d35a4067 P4D 80000003d35a4067 PUD 3d35a3067 PMD 0 [90444.053051] Oops: 0000 [#1] SMP PTI [90444.054683] CPU: 16 PID: 31736 Comm: tc Not tainted 5.8.0-rc1+ #1157 [90444.056340] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [90444.058079] RIP: 0010:mlx5e_configure_flower+0x3aa/0x9b0 [mlx5_core] [90444.059753] Code: 24 50 49 8b 95 08 02 00 00 48 b8 00 08 00 00 04 00 00 00 48 21 c2 48 39 c2 74 0a 41 f6 85 0d 02 00 00 20 74 16 48 8b 44 24 20 <48> 8b 00 66 83 78 20 ff 74 07 4d 89 aa e0 00 00 00 48 83 7d 28 00 [90444.063232] RSP: 0018:ffffabe9c61ff768 EFLAGS: 00010246 [90444.065014] RAX: 0000000000000000 RBX: ffff9b13c4c91e80 RCX: 00000000000093fa [90444.066784] RDX: 0000000400000800 RSI: 0000000000000000 RDI: 000000000002d5e0 [90444.068533] RBP: ffff9b174d308468 R08: 0000000000000000 R09: ffff9b17d63003f0 [90444.070285] R10: ffff9b17ea288600 R11: 0000000000000000 R12: ffffabe9c61ff878 [90444.072032] R13: ffff9b174d300000 R14: ffffabe9c61ffbb8 R15: ffff9b174d300880 [90444.073760] FS: 00007f3c23775480(0000) GS:ffff9b13efc80000(0000) knlGS:0000000000000000 [90444.075492] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [90444.077266] CR2: 0000000000000000 CR3: 00000003e2a60002 CR4: 00000000001606e0 [90444.079024] Call Trace: [90444.080753] tc_setup_cb_add+0xca/0x1e0 [90444.082415] fl_hw_replace_filter+0x15f/0x1f0 [cls_flower] [90444.084119] fl_change+0xa59/0x13dc [cls_flower] [90444.085772] ? wait_for_completion+0xa8/0xf0 [90444.087364] tc_new_tfilter+0x3f5/0xa60 [90444.088960] rtnetlink_rcv_msg+0xeb/0x360 [90444.090514] ? __d_lookup_done+0x76/0xe0 [90444.092034] ? proc_alloc_inode+0x16/0x70 [90444.093560] ? prep_new_page+0x8c/0xf0 [90444.095048] ? _cond_resched+0x15/0x30 [90444.096483] ? rtnl_calcit.isra.0+0x110/0x110 [90444.097907] netlink_rcv_skb+0x49/0x110 [90444.099289] netlink_unicast+0x191/0x230 [90444.100629] netlink_sendmsg+0x243/0x480 [90444.101984] sock_sendmsg+0x5e/0x60 [90444.103305] ____sys_sendmsg+0x1f3/0x260 [90444.104597] ? copy_msghdr_from_user+0x5c/0x90 [90444.105916] ? __mod_lruvec_state+0x3c/0xe0 [90444.107210] ___sys_sendmsg+0x81/0xc0 [90444.108484] ? do_filp_open+0xa5/0x100 [90444.109732] ? handle_mm_fault+0x117b/0x1e00 [90444.110970] ? __check_object_size+0x46/0x147 [90444.112205] ? __check_object_size+0x136/0x147 [90444.113402] __sys_sendmsg+0x59/0xa0 [90444.114587] do_syscall_64+0x4d/0x90 [90444.115782] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [90444.116953] RIP: 0033:0x7f3c2393b7b8 [90444.118101] Code: Bad RIP value. [90444.119240] RSP: 002b:00007ffc6ad8e6c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [90444.120408] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3c2393b7b8 [90444.121583] RDX: 0000000000000000 RSI: 00007ffc6ad8e740 RDI: 0000000000000003 [90444.122750] RBP: 000000005eea0c3a R08: 0000000000000001 R09: 00007ffc6ad8e68c [90444.123928] R10: 0000000000404fa8 R11: 0000000000000246 R12: 0000000000000001 [90444.125073] R13: 0000000000000000 R14: 00007ffc6ad92a00 R15: 00000000004866a0 [90444.126221] Modules linked in: act_skbedit act_tunnel_key act_mirred bonding vxlan ip6_udp_tunnel udp_tunnel nfnetlink act_gact cls_flower sch_ingress openvswitch nsh nf_conncount nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_r apl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel mlxfw kvm act_ct nf_flow_table nf_nat nf_conntrack irqbypass crct10dif_pclmul nf_defrag_ipv6 igb ipmi_ssif libcrc32c crc32_pclmul crc32c_intel ipmi_si nf_defrag_ipv4 ptp ghash_clmulni_intel mei_me ses iTCO_wdt i2c_i801 pps_core ioatdma iTCO_vendor_support joydev mei enclosure intel_cstate i2c_smbus wmi dca ipmi_devintf intel_uncore lpc_ich ipmi_msghandler pcspkr acpi_pad acpi_power_meter ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas [90444.136253] CR2: 0000000000000000 [90444.137621] ---[ end trace 924af62aa2b151bd ]--- Fixes: 553f9328385d ("net/mlx5e: Support tc block sharing for representors") Reported-by: David Ahern Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 7fc84f58e28a..75f169aef1cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4670,9 +4670,10 @@ static bool is_flow_rule_duplicate_allowed(struct net_device *dev, struct mlx5e_rep_priv *rpriv) { /* Offloaded flow rule is allowed to duplicate on non-uplink representor - * sharing tc block with other slaves of a lag device. + * sharing tc block with other slaves of a lag device. Rpriv can be NULL if this + * function is called from NIC mode. */ - return netif_is_lag_port(dev) && rpriv->rep->vport != MLX5_VPORT_UPLINK; + return netif_is_lag_port(dev) && rpriv && rpriv->rep->vport != MLX5_VPORT_UPLINK; } int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, -- cgit v1.2.3 From c1aea9e1765a047c7397da30f52315c43e3501fb Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 17 Jun 2020 17:51:53 +0300 Subject: net/mlx5e: Fix usage of rcu-protected pointer In mlx5e_configure_flower() flow pointer is protected by rcu read lock. However, after cited commit the pointer is being used outside of rcu read block. Extend the block to protect all pointer accesses. Fixes: 553f9328385d ("net/mlx5e: Support tc block sharing for representors") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 75f169aef1cf..cc8412151ca0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4687,13 +4687,12 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, rcu_read_lock(); flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params); - rcu_read_unlock(); if (flow) { /* Same flow rule offloaded to non-uplink representor sharing tc block, * just return 0. */ if (is_flow_rule_duplicate_allowed(dev, rpriv) && flow->orig_dev != dev) - goto out; + goto rcu_unlock; NL_SET_ERR_MSG_MOD(extack, "flow cookie already exists, ignoring"); @@ -4701,8 +4700,12 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, "flow cookie %lx already exists, ignoring\n", f->cookie); err = -EEXIST; - goto out; + goto rcu_unlock; } +rcu_unlock: + rcu_read_unlock(); + if (flow) + goto out; trace_mlx5e_configure_flower(f); err = mlx5e_tc_add_flow(priv, f, flags, dev, &flow); -- cgit v1.2.3 From b3c2ed21c0bdf35ba498a9974aa587f99a03b658 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 24 Jun 2020 19:04:03 +0300 Subject: net/mlx5e: Fix VXLAN configuration restore after function reload When detaching netdev, remove vxlan port configuration using udp_tunnel_drop_rx_info. During function reload, configuration will be restored using udp_tunnel_get_rx_info. This ensures sync between firmware and driver. Use udp_tunnel_get_rx_info even if its physical interface is down. Fixes: 4383cfcc65e7 ("net/mlx5: Add devlink reload") Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a836a02a2116..888e38b21c3d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3104,9 +3104,6 @@ int mlx5e_open(struct net_device *netdev) mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP); mutex_unlock(&priv->state_lock); - if (mlx5_vxlan_allowed(priv->mdev->vxlan)) - udp_tunnel_get_rx_info(netdev); - return err; } @@ -5202,6 +5199,8 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) rtnl_lock(); if (netif_running(netdev)) mlx5e_open(netdev); + if (mlx5_vxlan_allowed(priv->mdev->vxlan)) + udp_tunnel_get_rx_info(netdev); netif_device_attach(netdev); rtnl_unlock(); } @@ -5216,6 +5215,8 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) rtnl_lock(); if (netif_running(priv->netdev)) mlx5e_close(priv->netdev); + if (mlx5_vxlan_allowed(priv->mdev->vxlan)) + udp_tunnel_drop_rx_info(priv->netdev); netif_device_detach(priv->netdev); rtnl_unlock(); -- cgit v1.2.3 From f4aebbfb56ed0c186adbeb2799df836da50f78e3 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 20 May 2020 10:37:42 +0300 Subject: net/mlx5e: Fix CPU mapping after function reload to avoid aRFS RX crash After function reload, CPU mapping used by aRFS RX is broken, leading to a kernel panic. Fix by moving initialization of rx_cpu_rmap from netdev_init to netdev_attach. IRQ table is re-allocated on mlx5_load, but netdev is not re-initialize. Trace of the panic: [ 22.055672] general protection fault, probably for non-canonical address 0x785634120000ff1c: 0000 [#1] SMP PTI [ 22.065010] CPU: 4 PID: 0 Comm: swapper/4 Not tainted 5.7.0-rc2-for-upstream-perf-2020-04-21_16-34-03-31 #1 [ 22.067967] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 22.071174] RIP: 0010:get_rps_cpu+0x267/0x300 [ 22.075692] RSP: 0018:ffffc90000244d60 EFLAGS: 00010202 [ 22.076888] RAX: ffff888459b0e400 RBX: 0000000000000000 RCX:0000000000000007 [ 22.078364] RDX: 0000000000008884 RSI: ffff888467cb5b00 RDI:0000000000000000 [ 22.079815] RBP: 00000000ff342b27 R08: 0000000000000007 R09:0000000000000003 [ 22.081289] R10: ffffffffffffffff R11: 00000000000070cc R12:ffff888454900000 [ 22.082767] R13: ffffc90000e5a950 R14: ffffc90000244dc0 R15:0000000000000007 [ 22.084190] FS: 0000000000000000(0000) GS:ffff88846fc80000(0000)knlGS:0000000000000000 [ 22.086161] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 22.087427] CR2: ffffffffffffffff CR3: 0000000464426003 CR4:0000000000760ee0 [ 22.088888] DR0: 0000000000000000 DR1: 0000000000000000 DR2:0000000000000000 [ 22.090336] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:0000000000000400 [ 22.091764] PKRU: 55555554 [ 22.092618] Call Trace: [ 22.093442] [ 22.094211] ? kvm_clock_get_cycles+0xd/0x10 [ 22.095272] netif_receive_skb_list_internal+0x258/0x2a0 [ 22.096460] gro_normal_list.part.137+0x19/0x40 [ 22.097547] napi_complete_done+0xc6/0x110 [ 22.098685] mlx5e_napi_poll+0x190/0x670 [mlx5_core] [ 22.099859] net_rx_action+0x2a0/0x400 [ 22.100848] __do_softirq+0xd8/0x2a8 [ 22.101829] irq_exit+0xa5/0xb0 [ 22.102750] do_IRQ+0x52/0xd0 [ 22.103654] common_interrupt+0xf/0xf [ 22.104641] Fixes: 4383cfcc65e7 ("net/mlx5: Add devlink reload") Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 888e38b21c3d..081f15074cac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5118,6 +5118,10 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) if (err) goto err_destroy_flow_steering; +#ifdef CONFIG_MLX5_EN_ARFS + priv->netdev->rx_cpu_rmap = mlx5_eq_table_get_rmap(priv->mdev); +#endif + return 0; err_destroy_flow_steering: @@ -5289,10 +5293,6 @@ int mlx5e_netdev_init(struct net_device *netdev, /* netdev init */ netif_carrier_off(netdev); -#ifdef CONFIG_MLX5_EN_ARFS - netdev->rx_cpu_rmap = mlx5_eq_table_get_rmap(mdev); -#endif - return 0; err_free_cpumask: -- cgit v1.2.3 From 6a1cf4e443a3b0a4d690d3c93b84b1e9cbfcb1bd Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 15 Jun 2020 12:48:47 +0300 Subject: net/mlx5e: Fix 50G per lane indication Some released FW versions mistakenly don't set the capability that 50G per lane link-modes are supported for VFs (ptys_extended_ethernet capability bit). When the capability is unset, read PTYS.ext_eth_proto_capability (always reliable). If PTYS.ext_eth_proto_capability is valid (has a non-zero value) conclude that the HCA supports 50G per lane. Otherwise, conclude that the HCA doesn't support 50G per lane. Fixes: a08b4ed1373d ("net/mlx5: Add support to ext_* fields introduced in Port Type and Speed register") Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/port.c | 21 ++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/en/port.h | 2 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 8 ++++---- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c index 2a8950b3056f..3cf3e35053f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c @@ -78,11 +78,26 @@ static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = { [MLX5E_400GAUI_8] = 400000, }; +bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev) +{ + struct mlx5e_port_eth_proto eproto; + int err; + + if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet)) + return true; + + err = mlx5_port_query_eth_proto(mdev, 1, true, &eproto); + if (err) + return false; + + return !!eproto.cap; +} + static void mlx5e_port_get_speed_arr(struct mlx5_core_dev *mdev, const u32 **arr, u32 *size, bool force_legacy) { - bool ext = force_legacy ? false : MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); + bool ext = force_legacy ? false : mlx5e_ptys_ext_supported(mdev); *size = ext ? ARRAY_SIZE(mlx5e_ext_link_speed) : ARRAY_SIZE(mlx5e_link_speed); @@ -177,7 +192,7 @@ int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) bool ext; int err; - ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); + ext = mlx5e_ptys_ext_supported(mdev); err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); if (err) goto out; @@ -205,7 +220,7 @@ int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) int err; int i; - ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); + ext = mlx5e_ptys_ext_supported(mdev); err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h index a2ddd446dd59..7a7defe60792 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h @@ -54,7 +54,7 @@ int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed, bool force_legacy); - +bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev); int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out); int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in); int mlx5e_port_query_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index ec5658bbe3c5..c2464c349117 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -200,7 +200,7 @@ static void mlx5e_ethtool_get_speed_arr(struct mlx5_core_dev *mdev, struct ptys2ethtool_config **arr, u32 *size) { - bool ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); + bool ext = mlx5e_ptys_ext_supported(mdev); *arr = ext ? ptys2ext_ethtool_table : ptys2legacy_ethtool_table; *size = ext ? ARRAY_SIZE(ptys2ext_ethtool_table) : @@ -883,7 +883,7 @@ static void get_lp_advertising(struct mlx5_core_dev *mdev, u32 eth_proto_lp, struct ethtool_link_ksettings *link_ksettings) { unsigned long *lp_advertising = link_ksettings->link_modes.lp_advertising; - bool ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); + bool ext = mlx5e_ptys_ext_supported(mdev); ptys2ethtool_adver_link(lp_advertising, eth_proto_lp, ext); } @@ -913,7 +913,7 @@ int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv, __func__, err); goto err_query_regs; } - ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); + ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_capability); eth_proto_admin = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, @@ -1066,7 +1066,7 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv, autoneg = link_ksettings->base.autoneg; speed = link_ksettings->base.speed; - ext_supported = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); + ext_supported = mlx5e_ptys_ext_supported(mdev); ext = ext_requested(autoneg, adver, ext_supported); if (!ext_supported && ext) return -EOPNOTSUPP; -- cgit v1.2.3 From 88b3d5c90e9685be54dd5bc441970044020eca76 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 22 Jun 2020 09:03:31 +0300 Subject: net/mlx5e: Fix port buffers cell size value Device unit for port buffers size, xoff_threshold and xon_threshold is cells. Fix a bug in driver where cell unit size was hard-coded to 128 bytes. This hard-coded value is buggy, as it is wrong for some hardware versions. Driver to read cell size from SBCAM register and translate bytes to cell units accordingly. In order to fix the bug, this patch exposes SBCAM (Shared buffer capabilities mask) layout and defines. If SBCAM.cap_cell_size is valid, use it for all bytes to cells calculations. If not valid, fallback to 128. Cell size do not change on the fly per device. Instead of issuing SBCAM access reg command every time such translation is needed, cache it in mlx5e_dcbx as part of mlx5e_dcbnl_initialize(). Pass dcbx.port_buff_cell_sz as a param to every function that needs bytes to cells translation. While fixing the bug, move MLX5E_BUFFER_CELL_SHIFT macro to en_dcbnl.c, as it is only used by that file. Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") Signed-off-by: Eran Ben Elisha Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h | 1 + .../ethernet/mellanox/mlx5/core/en/port_buffer.c | 53 ++++++++++++---------- .../ethernet/mellanox/mlx5/core/en/port_buffer.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 19 ++++++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 28 ++++++++++++ 6 files changed, 78 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h index 7be6b2d36b60..9976de8b9047 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -29,6 +29,7 @@ struct mlx5e_dcbx { bool manual_buffer; u32 cable_len; u32 xoff; + u16 port_buff_cell_sz; }; #define MLX5E_MAX_DSCP (64) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index ae99fac08b53..673f1c82d381 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -34,6 +34,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, struct mlx5e_port_buffer *port_buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); u32 total_used = 0; @@ -57,11 +58,11 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, port_buffer->buffer[i].epsb = MLX5_GET(bufferx_reg, buffer, epsb); port_buffer->buffer[i].size = - MLX5_GET(bufferx_reg, buffer, size) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, size) * port_buff_cell_sz; port_buffer->buffer[i].xon = - MLX5_GET(bufferx_reg, buffer, xon_threshold) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, xon_threshold) * port_buff_cell_sz; port_buffer->buffer[i].xoff = - MLX5_GET(bufferx_reg, buffer, xoff_threshold) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, xoff_threshold) * port_buff_cell_sz; total_used += port_buffer->buffer[i].size; mlx5e_dbg(HW, priv, "buffer %d: size=%d, xon=%d, xoff=%d, epsb=%d, lossy=%d\n", i, @@ -73,7 +74,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, } port_buffer->port_buffer_size = - MLX5_GET(pbmc_reg, out, port_buffer_size) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(pbmc_reg, out, port_buffer_size) * port_buff_cell_sz; port_buffer->spare_buffer_size = port_buffer->port_buffer_size - total_used; @@ -88,9 +89,9 @@ out: static int port_set_buffer(struct mlx5e_priv *priv, struct mlx5e_port_buffer *port_buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); - void *buffer; void *in; int err; int i; @@ -104,16 +105,18 @@ static int port_set_buffer(struct mlx5e_priv *priv, goto out; for (i = 0; i < MLX5E_MAX_BUFFER; i++) { - buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); - - MLX5_SET(bufferx_reg, buffer, size, - port_buffer->buffer[i].size >> MLX5E_BUFFER_CELL_SHIFT); - MLX5_SET(bufferx_reg, buffer, lossy, - port_buffer->buffer[i].lossy); - MLX5_SET(bufferx_reg, buffer, xoff_threshold, - port_buffer->buffer[i].xoff >> MLX5E_BUFFER_CELL_SHIFT); - MLX5_SET(bufferx_reg, buffer, xon_threshold, - port_buffer->buffer[i].xon >> MLX5E_BUFFER_CELL_SHIFT); + void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + u64 size = port_buffer->buffer[i].size; + u64 xoff = port_buffer->buffer[i].xoff; + u64 xon = port_buffer->buffer[i].xon; + + do_div(size, port_buff_cell_sz); + do_div(xoff, port_buff_cell_sz); + do_div(xon, port_buff_cell_sz); + MLX5_SET(bufferx_reg, buffer, size, size); + MLX5_SET(bufferx_reg, buffer, lossy, port_buffer->buffer[i].lossy); + MLX5_SET(bufferx_reg, buffer, xoff_threshold, xoff); + MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); } err = mlx5e_port_set_pbmc(mdev, in); @@ -143,7 +146,7 @@ static u32 calculate_xoff(struct mlx5e_priv *priv, unsigned int mtu) } static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, - u32 xoff, unsigned int max_mtu) + u32 xoff, unsigned int max_mtu, u16 port_buff_cell_sz) { int i; @@ -155,7 +158,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, } if (port_buffer->buffer[i].size < - (xoff + max_mtu + (1 << MLX5E_BUFFER_CELL_SHIFT))) { + (xoff + max_mtu + port_buff_cell_sz)) { pr_err("buffer_size[%d]=%d is not enough for lossless buffer\n", i, port_buffer->buffer[i].size); return -ENOMEM; @@ -175,6 +178,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, * @pfc_en: current pfc configuration * @buffer: current prio to buffer mapping * @xoff: xoff value + * @port_buff_cell_sz: port buffer cell_size * @port_buffer: port receive buffer configuration * @change: * @@ -189,7 +193,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, * sets change to true if buffer configuration was modified. */ static int update_buffer_lossy(unsigned int max_mtu, - u8 pfc_en, u8 *buffer, u32 xoff, + u8 pfc_en, u8 *buffer, u32 xoff, u16 port_buff_cell_sz, struct mlx5e_port_buffer *port_buffer, bool *change) { @@ -225,7 +229,7 @@ static int update_buffer_lossy(unsigned int max_mtu, } if (changed) { - err = update_xoff_threshold(port_buffer, xoff, max_mtu); + err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; @@ -262,6 +266,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, u32 *buffer_size, u8 *prio2buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5e_port_buffer port_buffer; u32 xoff = calculate_xoff(priv, mtu); bool update_prio2buffer = false; @@ -282,7 +287,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (change & MLX5E_PORT_BUFFER_CABLE_LEN) { update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } @@ -292,7 +297,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (err) return err; - err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff, + err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff, port_buff_cell_sz, &port_buffer, &update_buffer); if (err) return err; @@ -304,7 +309,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (err) return err; - err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer, + err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer, port_buff_cell_sz, xoff, &port_buffer, &update_buffer); if (err) return err; @@ -329,7 +334,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, return -EINVAL; update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } @@ -337,7 +342,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, /* Need to update buffer configuration if xoff value is changed */ if (!update_buffer && xoff != priv->dcbx.xoff) { update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h index 34f55b81a0de..80af7a5ac604 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h @@ -36,7 +36,6 @@ #include "port.h" #define MLX5E_MAX_BUFFER 8 -#define MLX5E_BUFFER_CELL_SHIFT 7 #define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */ #define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index bc102d094bbd..d20243d6a032 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -1217,6 +1217,24 @@ static int mlx5e_trust_initialize(struct mlx5e_priv *priv) return 0; } +#define MLX5E_BUFFER_CELL_SHIFT 7 + +static u16 mlx5e_query_port_buffers_cell_size(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + if (mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), + MLX5_REG_SBCAM, 0, 0)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + return MLX5_GET(sbcam_reg, out, cap_cell_size); +} + void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) { struct mlx5e_dcbx *dcbx = &priv->dcbx; @@ -1234,6 +1252,7 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) priv->dcbx.cap |= DCB_CAP_DCBX_HOST; + priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); priv->dcbx.manual_buffer = false; priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 13c0e4556eda..1e6ca716635a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -147,6 +147,7 @@ enum { MLX5_REG_MCDA = 0x9063, MLX5_REG_MCAM = 0x907f, MLX5_REG_MIRC = 0x9162, + MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ca1887dd0423..073b79eacc99 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9960,6 +9960,34 @@ struct mlx5_ifc_pptb_reg_bits { u8 untagged_buff[0x4]; }; +struct mlx5_ifc_sbcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + + u8 reserved_at_20[0x20]; + + u8 sb_access_reg_cap_mask[4][0x20]; + + u8 reserved_at_c0[0x80]; + + u8 sb_feature_cap_mask[4][0x20]; + + u8 reserved_at_1c0[0x40]; + + u8 cap_total_buffer_size[0x20]; + + u8 cap_cell_size[0x10]; + u8 cap_max_pg_buffers[0x8]; + u8 cap_num_pool_supported[0x8]; + + u8 reserved_at_240[0x8]; + u8 cap_sbsr_stat_size[0x8]; + u8 cap_max_tclass_data[0x8]; + u8 cap_max_cpu_ingress_tclass_sb[0x8]; +}; + struct mlx5_ifc_pbmc_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.2.3 From eb32b3f53d283e8d68b6d86c3a6ed859b24dacae Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Sun, 28 Jun 2020 15:42:26 +0300 Subject: net/mlx5e: CT: Fix memory leak in cleanup CT entries are deleted via a workqueue from netfilter. If removing the module before that, the rules are cleaned by the driver itself, but the memory entries for them are not freed. Fix that. Fixes: ac991b48d43c ("net/mlx5e: CT: Offload established flows") Signed-off-by: Eli Britstein Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index 430025550fad..aad1c29b23db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -1097,6 +1097,7 @@ mlx5_tc_ct_flush_ft_entry(void *ptr, void *arg) struct mlx5_ct_entry *entry = ptr; mlx5_tc_ct_entry_del_rules(ct_priv, entry); + kfree(entry); } static void -- cgit v1.2.3 From b2f9f1535bb93ee5fa2ea30ac1c26fa0d676154c Mon Sep 17 00:00:00 2001 From: Jakub Bogusz Date: Thu, 9 Jul 2020 15:57:23 -0700 Subject: libbpf: Fix libbpf hashmap on (I)LP32 architectures On ILP32, 64-bit result was shifted by value calculated for 32-bit long type and returned value was much outside hashmap capacity. As advised by Andrii Nakryiko, this patch uses different hashing variant for architectures with size_t shorter than long long. Fixes: e3b924224028 ("libbpf: add resizable non-thread safe internal hashmap") Signed-off-by: Jakub Bogusz Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200709225723.1069937-1-andriin@fb.com --- tools/lib/bpf/hashmap.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index df59fd4fc95b..e0af36b0e5d8 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -11,14 +11,18 @@ #include #include #include -#ifndef __WORDSIZE -#define __WORDSIZE (__SIZEOF_LONG__ * 8) -#endif static inline size_t hash_bits(size_t h, int bits) { /* shuffle bits and return requested number of upper bits */ - return (h * 11400714819323198485llu) >> (__WORDSIZE - bits); +#if (__SIZEOF_SIZE_T__ == __SIZEOF_LONG_LONG__) + /* LP64 case */ + return (h * 11400714819323198485llu) >> (__SIZEOF_LONG_LONG__ * 8 - bits); +#elif (__SIZEOF_SIZE_T__ <= __SIZEOF_LONG__) + return (h * 2654435769lu) >> (__SIZEOF_LONG__ * 8 - bits); +#else +# error "Unsupported size_t size" +#endif } typedef size_t (*hashmap_hash_fn)(const void *key, void *ctx); -- cgit v1.2.3 From c8b1d7436045d3599bae56aef1682813ecccaad7 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 10 Jul 2020 12:55:08 +0200 Subject: bnxt_en: fix NULL dereference in case SR-IOV configuration fails we need to set 'active_vfs' back to 0, if something goes wrong during the allocation of SR-IOV resources: otherwise, further VF configurations will wrongly assume that bp->pf.vf[x] are valid memory locations, and commands like the ones in the following sequence: # echo 2 >/sys/bus/pci/devices/${ADDR}/sriov_numvfs # ip link set dev ens1f0np0 up # ip link set dev ens1f0np0 vf 0 trust on will cause a kernel crash similar to this: bnxt_en 0000:3b:00.0: not enough MMIO resources for SR-IOV BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 43 PID: 2059 Comm: ip Tainted: G I 5.8.0-rc2.upstream+ #871 Hardware name: Dell Inc. PowerEdge R740/08D89F, BIOS 2.2.11 06/13/2019 RIP: 0010:bnxt_set_vf_trust+0x5b/0x110 [bnxt_en] Code: 44 24 58 31 c0 e8 f5 fb ff ff 85 c0 0f 85 b6 00 00 00 48 8d 1c 5b 41 89 c6 b9 0b 00 00 00 48 c1 e3 04 49 03 9c 24 f0 0e 00 00 <8b> 43 14 89 c2 83 c8 10 83 e2 ef 45 84 ed 49 89 e5 0f 44 c2 4c 89 RSP: 0018:ffffac6246a1f570 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000000b RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff98b28f538900 RBP: ffff98b28f538900 R08: 0000000000000000 R09: 0000000000000008 R10: ffffffffb9515be0 R11: ffffac6246a1f678 R12: ffff98b28f538000 R13: 0000000000000001 R14: 0000000000000000 R15: ffffffffc05451e0 FS: 00007fde0f688800(0000) GS:ffff98baffd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 000000104bb0a003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: do_setlink+0x994/0xfe0 __rtnl_newlink+0x544/0x8d0 rtnl_newlink+0x47/0x70 rtnetlink_rcv_msg+0x29f/0x350 netlink_rcv_skb+0x4a/0x110 netlink_unicast+0x21d/0x300 netlink_sendmsg+0x329/0x450 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x204/0x280 ___sys_sendmsg+0x88/0xd0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x47/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: c0c050c58d840 ("bnxt_en: New Broadcom ethernet driver.") Reported-by: Fei Liu CC: Jonathan Toppins CC: Michael Chan Signed-off-by: Davide Caratti Reviewed-by: Michael Chan Acked-by: Jonathan Toppins Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 3a9a51f7063a..392e32c7122a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -396,6 +396,7 @@ static void bnxt_free_vf_resources(struct bnxt *bp) } } + bp->pf.active_vfs = 0; kfree(bp->pf.vf); bp->pf.vf = NULL; } @@ -835,7 +836,6 @@ void bnxt_sriov_disable(struct bnxt *bp) bnxt_free_vf_resources(bp); - bp->pf.active_vfs = 0; /* Reclaim all resources for the PF. */ rtnl_lock(); bnxt_restore_pf_fw_resources(bp); -- cgit v1.2.3 From 515a10a701d570e26dfbe6ee373f77c8bf11053f Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 10 Jul 2020 14:46:41 +0200 Subject: net: macb: fix wakeup test in runtime suspend/resume routines Use the proper struct device pointer to check if the wakeup flag and wakeup source are positioned. Use the one passed by function call which is equivalent to &bp->dev->dev.parent. It's preventing the trigger of a spurious interrupt in case the Wake-on-Lan feature is used. Fixes: d54f89af6cc4 ("net: macb: Add pm runtime support") Cc: Claudiu Beznea Cc: Harini Katakam Reviewed-by: Florian Fainelli Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 52582e8ed90e..55e680f35022 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4654,7 +4654,7 @@ static int __maybe_unused macb_runtime_suspend(struct device *dev) struct net_device *netdev = dev_get_drvdata(dev); struct macb *bp = netdev_priv(netdev); - if (!(device_may_wakeup(&bp->dev->dev))) { + if (!(device_may_wakeup(dev))) { clk_disable_unprepare(bp->tx_clk); clk_disable_unprepare(bp->hclk); clk_disable_unprepare(bp->pclk); @@ -4670,7 +4670,7 @@ static int __maybe_unused macb_runtime_resume(struct device *dev) struct net_device *netdev = dev_get_drvdata(dev); struct macb *bp = netdev_priv(netdev); - if (!(device_may_wakeup(&bp->dev->dev))) { + if (!(device_may_wakeup(dev))) { clk_prepare_enable(bp->pclk); clk_prepare_enable(bp->hclk); clk_prepare_enable(bp->tx_clk); -- cgit v1.2.3 From ced4799d06375929e013eea04ba6908207afabbe Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 10 Jul 2020 14:46:42 +0200 Subject: net: macb: mark device wake capable when "magic-packet" property present Change the way the "magic-packet" DT property is handled in the macb_probe() function, matching DT binding documentation. Now we mark the device as "wakeup capable" instead of calling the device_init_wakeup() function that would enable the wakeup source. For Ethernet WoL, enabling the wakeup_source is done by using ethtool and associated macb_set_wol() function that already calls device_set_wakeup_enable() for this purpose. That would reduce power consumption by cutting more clocks if "magic-packet" property is set but WoL is not configured by ethtool. Fixes: 3e2a5e153906 ("net: macb: add wake-on-lan support via magic packet") Cc: Claudiu Beznea Cc: Harini Katakam Cc: Sergio Prado Reviewed-by: Florian Fainelli Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 55e680f35022..4cafe343c0a2 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4422,7 +4422,7 @@ static int macb_probe(struct platform_device *pdev) bp->wol = 0; if (of_get_property(np, "magic-packet", NULL)) bp->wol |= MACB_WOL_HAS_MAGIC_PACKET; - device_init_wakeup(&pdev->dev, bp->wol & MACB_WOL_HAS_MAGIC_PACKET); + device_set_wakeup_capable(&pdev->dev, bp->wol & MACB_WOL_HAS_MAGIC_PACKET); spin_lock_init(&bp->lock); -- cgit v1.2.3 From 253fe09435045ab9346a8e364299d971185ae031 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 10 Jul 2020 14:46:43 +0200 Subject: net: macb: fix macb_get/set_wol() when moving to phylink Keep previous function goals and integrate phylink actions to them. phylink_ethtool_get_wol() is not enough to figure out if Ethernet driver supports Wake-on-Lan. Initialization of "supported" and "wolopts" members is done in phylink function, no need to keep them in calling function. phylink_ethtool_set_wol() return value is considered and determines if the MAC has to handle WoL or not. The case where the PHY doesn't implement WoL leads to the MAC configuring it to provide this feature. Fixes: 7897b071ac3b ("net: macb: convert to phylink") Cc: Claudiu Beznea Cc: Harini Katakam Cc: Antoine Tenart Cc: Florian Fainelli Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 4cafe343c0a2..79c2fe054303 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -2821,11 +2821,13 @@ static void macb_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct macb *bp = netdev_priv(netdev); - wol->supported = 0; - wol->wolopts = 0; - - if (bp->wol & MACB_WOL_HAS_MAGIC_PACKET) + if (bp->wol & MACB_WOL_HAS_MAGIC_PACKET) { phylink_ethtool_get_wol(bp->phylink, wol); + wol->supported |= WAKE_MAGIC; + + if (bp->wol & MACB_WOL_ENABLED) + wol->wolopts |= WAKE_MAGIC; + } } static int macb_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) @@ -2833,9 +2835,13 @@ static int macb_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) struct macb *bp = netdev_priv(netdev); int ret; + /* Pass the order to phylink layer */ ret = phylink_ethtool_set_wol(bp->phylink, wol); - if (!ret) - return 0; + /* Don't manage WoL on MAC if handled by the PHY + * or if there's a failure in talking to the PHY + */ + if (!ret || ret != -EOPNOTSUPP) + return ret; if (!(bp->wol & MACB_WOL_HAS_MAGIC_PACKET) || (wol->wolopts & ~WAKE_MAGIC)) -- cgit v1.2.3 From 64febc5e56c9a748162f206dcc5be1a44436087a Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 10 Jul 2020 14:46:44 +0200 Subject: net: macb: fix macb_suspend() by removing call to netif_carrier_off() As we now use the phylink call to phylink_stop() in the non-WoL path, there is no need for this call to netif_carrier_off() anymore. It can disturb the underlying phylink FSM. Fixes: 7897b071ac3b ("net: macb: convert to phylink") Cc: Claudiu Beznea Cc: Harini Katakam Cc: Antoine Tenart Reviewed-by: Florian Fainelli Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 79c2fe054303..548815255e22 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4604,7 +4604,6 @@ static int __maybe_unused macb_suspend(struct device *dev) bp->pm_data.scrt2 = gem_readl_n(bp, ETHT, SCRT2_ETHT); } - netif_carrier_off(netdev); if (bp->ptp_info) bp->ptp_info->ptp_remove(netdev); pm_runtime_force_suspend(dev); -- cgit v1.2.3 From 6c8f85cac98a4c6b767c4c4f6af7283724c32b47 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 10 Jul 2020 14:46:45 +0200 Subject: net: macb: fix call to pm_runtime in the suspend/resume functions The calls to pm_runtime_force_suspend/resume() functions are only relevant if the device is not configured to act as a WoL wakeup source. Add the device_may_wakeup() test before calling them. Fixes: 3e2a5e153906 ("net: macb: add wake-on-lan support via magic packet") Cc: Claudiu Beznea Cc: Harini Katakam Cc: Sergio Prado Reviewed-by: Florian Fainelli Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 548815255e22..f1f0976e7669 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4606,7 +4606,8 @@ static int __maybe_unused macb_suspend(struct device *dev) if (bp->ptp_info) bp->ptp_info->ptp_remove(netdev); - pm_runtime_force_suspend(dev); + if (!device_may_wakeup(dev)) + pm_runtime_force_suspend(dev); return 0; } @@ -4621,7 +4622,8 @@ static int __maybe_unused macb_resume(struct device *dev) if (!netif_running(netdev)) return 0; - pm_runtime_force_resume(dev); + if (!device_may_wakeup(dev)) + pm_runtime_force_resume(dev); if (bp->wol & MACB_WOL_ENABLED) { macb_writel(bp, IDR, MACB_BIT(WOL)); -- cgit v1.2.3 From d9d5420273997664a1c09151ca86ac993f2f89c1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 10 Jul 2020 16:41:38 +0300 Subject: mlxsw: spectrum_router: Remove inappropriate usage of WARN_ON() We should not trigger a warning when a memory allocation fails. Remove the WARN_ON(). The warning is constantly triggered by syzkaller when it is injecting faults: [ 2230.758664] FAULT_INJECTION: forcing a failure. [ 2230.758664] name failslab, interval 1, probability 0, space 0, times 0 [ 2230.762329] CPU: 3 PID: 1407 Comm: syz-executor.0 Not tainted 5.8.0-rc2+ #28 ... [ 2230.898175] WARNING: CPU: 3 PID: 1407 at drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c:6265 mlxsw_sp_router_fib_event+0xfad/0x13e0 [ 2230.898179] Kernel panic - not syncing: panic_on_warn set ... [ 2230.898183] CPU: 3 PID: 1407 Comm: syz-executor.0 Not tainted 5.8.0-rc2+ #28 [ 2230.898190] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Fixes: 3057224e014c ("mlxsw: spectrum_router: Implement FIB offload in deferred work") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 770de0222e7b..019ed503aadf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -6262,7 +6262,7 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, } fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC); - if (WARN_ON(!fib_work)) + if (!fib_work) return NOTIFY_BAD; fib_work->mlxsw_sp = router->mlxsw_sp; -- cgit v1.2.3 From c4317b11675b99af6641662ebcbd3c6010600e64 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 10 Jul 2020 16:41:39 +0300 Subject: mlxsw: pci: Fix use-after-free in case of failed devlink reload In case devlink reload failed, it is possible to trigger a use-after-free when querying the kernel for device info via 'devlink dev info' [1]. This happens because as part of the reload error path the PCI command interface is de-initialized and its mailboxes are freed. When the devlink '->info_get()' callback is invoked the device is queried via the command interface and the freed mailboxes are accessed. Fix this by initializing the command interface once during probe and not during every reload. This is consistent with the other bus used by mlxsw (i.e., 'mlxsw_i2c') and also allows user space to query the running firmware version (for example) from the device after a failed reload. [1] BUG: KASAN: use-after-free in memcpy include/linux/string.h:406 [inline] BUG: KASAN: use-after-free in mlxsw_pci_cmd_exec+0x177/0xa60 drivers/net/ethernet/mellanox/mlxsw/pci.c:1675 Write of size 4096 at addr ffff88810ae32000 by task syz-executor.1/2355 CPU: 1 PID: 2355 Comm: syz-executor.1 Not tainted 5.8.0-rc2+ #29 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xf6/0x16e lib/dump_stack.c:118 print_address_description.constprop.0+0x1c/0x250 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 check_memory_region_inline mm/kasan/generic.c:186 [inline] check_memory_region+0x14e/0x1b0 mm/kasan/generic.c:192 memcpy+0x39/0x60 mm/kasan/common.c:106 memcpy include/linux/string.h:406 [inline] mlxsw_pci_cmd_exec+0x177/0xa60 drivers/net/ethernet/mellanox/mlxsw/pci.c:1675 mlxsw_cmd_exec+0x249/0x550 drivers/net/ethernet/mellanox/mlxsw/core.c:2335 mlxsw_cmd_access_reg drivers/net/ethernet/mellanox/mlxsw/cmd.h:859 [inline] mlxsw_core_reg_access_cmd drivers/net/ethernet/mellanox/mlxsw/core.c:1938 [inline] mlxsw_core_reg_access+0x2f6/0x540 drivers/net/ethernet/mellanox/mlxsw/core.c:1985 mlxsw_reg_query drivers/net/ethernet/mellanox/mlxsw/core.c:2000 [inline] mlxsw_devlink_info_get+0x17f/0x6e0 drivers/net/ethernet/mellanox/mlxsw/core.c:1090 devlink_nl_info_fill.constprop.0+0x13c/0x2d0 net/core/devlink.c:4588 devlink_nl_cmd_info_get_dumpit+0x246/0x460 net/core/devlink.c:4648 genl_lock_dumpit+0x85/0xc0 net/netlink/genetlink.c:575 netlink_dump+0x515/0xe50 net/netlink/af_netlink.c:2245 __netlink_dump_start+0x53d/0x830 net/netlink/af_netlink.c:2353 genl_family_rcv_msg_dumpit.isra.0+0x296/0x300 net/netlink/genetlink.c:638 genl_family_rcv_msg net/netlink/genetlink.c:733 [inline] genl_rcv_msg+0x78d/0x9d0 net/netlink/genetlink.c:753 netlink_rcv_skb+0x152/0x440 net/netlink/af_netlink.c:2469 genl_rcv+0x24/0x40 net/netlink/genetlink.c:764 netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline] netlink_unicast+0x53a/0x750 net/netlink/af_netlink.c:1329 netlink_sendmsg+0x850/0xd90 net/netlink/af_netlink.c:1918 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0x150/0x190 net/socket.c:672 ____sys_sendmsg+0x6d8/0x840 net/socket.c:2363 ___sys_sendmsg+0xff/0x170 net/socket.c:2417 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2450 do_syscall_64+0x56/0xa0 arch/x86/entry/common.c:359 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: a9c8336f6544 ("mlxsw: core: Add support for devlink info command") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 54 ++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index fd0e97de44e7..c04ec1a92826 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1414,23 +1414,12 @@ static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core, u16 num_pages; int err; - mutex_init(&mlxsw_pci->cmd.lock); - init_waitqueue_head(&mlxsw_pci->cmd.wait); - mlxsw_pci->core = mlxsw_core; mbox = mlxsw_cmd_mbox_alloc(); if (!mbox) return -ENOMEM; - err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); - if (err) - goto mbox_put; - - err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); - if (err) - goto err_out_mbox_alloc; - err = mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id); if (err) goto err_sw_reset; @@ -1537,9 +1526,6 @@ err_query_fw: mlxsw_pci_free_irq_vectors(mlxsw_pci); err_alloc_irq: err_sw_reset: - mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); -err_out_mbox_alloc: - mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); mbox_put: mlxsw_cmd_mbox_free(mbox); return err; @@ -1553,8 +1539,6 @@ static void mlxsw_pci_fini(void *bus_priv) mlxsw_pci_aqs_fini(mlxsw_pci); mlxsw_pci_fw_area_fini(mlxsw_pci); mlxsw_pci_free_irq_vectors(mlxsw_pci); - mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); - mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); } static struct mlxsw_pci_queue * @@ -1776,6 +1760,37 @@ static const struct mlxsw_bus mlxsw_pci_bus = { .features = MLXSW_BUS_F_TXRX | MLXSW_BUS_F_RESET, }; +static int mlxsw_pci_cmd_init(struct mlxsw_pci *mlxsw_pci) +{ + int err; + + mutex_init(&mlxsw_pci->cmd.lock); + init_waitqueue_head(&mlxsw_pci->cmd.wait); + + err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); + if (err) + goto err_in_mbox_alloc; + + err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); + if (err) + goto err_out_mbox_alloc; + + return 0; + +err_out_mbox_alloc: + mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); +err_in_mbox_alloc: + mutex_destroy(&mlxsw_pci->cmd.lock); + return err; +} + +static void mlxsw_pci_cmd_fini(struct mlxsw_pci *mlxsw_pci) +{ + mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox); + mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox); + mutex_destroy(&mlxsw_pci->cmd.lock); +} + static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { const char *driver_name = pdev->driver->name; @@ -1831,6 +1846,10 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) mlxsw_pci->pdev = pdev; pci_set_drvdata(pdev, mlxsw_pci); + err = mlxsw_pci_cmd_init(mlxsw_pci); + if (err) + goto err_pci_cmd_init; + mlxsw_pci->bus_info.device_kind = driver_name; mlxsw_pci->bus_info.device_name = pci_name(mlxsw_pci->pdev); mlxsw_pci->bus_info.dev = &pdev->dev; @@ -1848,6 +1867,8 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return 0; err_bus_device_register: + mlxsw_pci_cmd_fini(mlxsw_pci); +err_pci_cmd_init: iounmap(mlxsw_pci->hw_addr); err_ioremap: err_pci_resource_len_check: @@ -1865,6 +1886,7 @@ static void mlxsw_pci_remove(struct pci_dev *pdev) struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev); mlxsw_core_bus_device_unregister(mlxsw_pci->core, false); + mlxsw_pci_cmd_fini(mlxsw_pci); iounmap(mlxsw_pci->hw_addr); pci_release_regions(mlxsw_pci->pdev); pci_disable_device(mlxsw_pci->pdev); -- cgit v1.2.3