From 71a06f1034b91e15d3ba6b5539c7d3a2d7f13030 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Sat, 17 Dec 2022 00:57:42 +0100 Subject: mac802154: Fix possible double free upon parsing error Commit 4d1c7d87030b ("mac802154: Move an skb free within the rx path") tried to simplify error handling within the receive path by moving the kfree_skb() call at the very end of the top-level function but missed one kfree_skb() called upon frame parsing error. Prevent this possible double free from happening. Fixes: 4d1c7d87030b ("mac802154: Move an skb free within the rx path") Reported-by: Dan Carpenter Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20221216235742.646134-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/rx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index c2aae2a6d6a6..97bb4401dd3e 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -213,7 +213,6 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, ret = ieee802154_parse_frame_start(skb, &hdr); if (ret) { pr_debug("got invalid frame\n"); - kfree_skb(skb); return; } -- cgit v1.2.3 From 36024d023d139a0c8b552dc3b7f4dc7b4c139e8f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 18 Jan 2023 16:46:30 +0800 Subject: bpf: Fix off-by-one error in bpf_mem_cache_idx() According to the definition of sizes[NUM_CACHES], when the size passed to bpf_mem_cache_size() is 256, it should return 6 instead 7. Fixes: 7c8199e24fa0 ("bpf: Introduce any context BPF specific memory allocator.") Signed-off-by: Hou Tao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230118084630.3750680-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/memalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index ebcc3dd0fa19..1db156405b68 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size) if (size <= 192) return size_index[(size - 1) / 8] - 1; - return fls(size - 1) - 1; + return fls(size - 1) - 2; } #define NUM_CACHES 11 -- cgit v1.2.3 From bdb7fdb0aca8b96cef9995d3a57e251c2289322f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 18 Jan 2023 12:48:15 -0800 Subject: bpf: Fix a possible task gone issue with bpf_send_signal[_thread]() helpers In current bpf_send_signal() and bpf_send_signal_thread() helper implementation, irq_work is used to handle nmi context. Hao Sun reported in [1] that the current task at the entry of the helper might be gone during irq_work callback processing. To fix the issue, a reference is acquired for the current task before enqueuing into the irq_work so that the queued task is still available during irq_work callback processing. [1] https://lore.kernel.org/bpf/20230109074425.12556-1-sunhao.th@gmail.com/ Fixes: 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") Tested-by: Hao Sun Reported-by: Hao Sun Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230118204815.3331855-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f47274de012b..c09792c551bf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -833,6 +833,7 @@ static void do_bpf_send_signal(struct irq_work *entry) work = container_of(entry, struct send_signal_irq_work, irq_work); group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + put_task_struct(work->task); } static int bpf_send_signal_common(u32 sig, enum pid_type type) @@ -867,7 +868,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) * to the irq_work. The current task may change when queued * irq works get executed. */ - work->task = current; + work->task = get_task_struct(current); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); -- cgit v1.2.3 From 71f656a50176915d6813751188b5758daa8d012b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 6 Jan 2023 16:22:13 +0200 Subject: bpf: Fix to preserve reg parent/live fields when copying range info Register range information is copied in several places. The intent is to transfer range/id information from one register/stack spill to another. Currently this is done using direct register assignment, e.g.: static void find_equal_scalars(..., struct bpf_reg_state *known_reg) { ... struct bpf_reg_state *reg; ... *reg = *known_reg; ... } However, such assignments also copy the following bpf_reg_state fields: struct bpf_reg_state { ... struct bpf_reg_state *parent; ... enum bpf_reg_liveness live; ... }; Copying of these fields is accidental and incorrect, as could be demonstrated by the following example: 0: call ktime_get_ns() 1: r6 = r0 2: call ktime_get_ns() 3: r7 = r0 4: if r0 > r6 goto +1 ; r0 & r6 are unbound thus generated ; branch states are identical 5: *(u64 *)(r10 - 8) = 0xdeadbeef ; 64-bit write to fp[-8] --- checkpoint --- 6: r1 = 42 ; r1 marked as written 7: *(u8 *)(r10 - 8) = r1 ; 8-bit write, fp[-8] parent & live ; overwritten 8: r2 = *(u64 *)(r10 - 8) 9: r0 = 0 10: exit This example is unsafe because 64-bit write to fp[-8] at (5) is conditional, thus not all bytes of fp[-8] are guaranteed to be set when it is read at (8). However, currently the example passes verification. First, the execution path 1-10 is examined by verifier. Suppose that a new checkpoint is created by is_state_visited() at (6). After checkpoint creation: - r1.parent points to checkpoint.r1, - fp[-8].parent points to checkpoint.fp[-8]. At (6) the r1.live is set to REG_LIVE_WRITTEN. At (7) the fp[-8].parent is set to r1.parent and fp[-8].live is set to REG_LIVE_WRITTEN, because of the following code called in check_stack_write_fixed_off(): static void save_register_state(struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { ... state->stack[spi].spilled_ptr = *reg; // <--- parent & live copied if (size == BPF_REG_SIZE) state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; ... } Note the intent to mark stack spill as written only if 8 bytes are spilled to a slot, however this intent is spoiled by a 'live' field copy. At (8) the checkpoint.fp[-8] should be marked as REG_LIVE_READ but this does not happen: - fp[-8] in a current state is already marked as REG_LIVE_WRITTEN; - fp[-8].parent points to checkpoint.r1, parentage chain is used by mark_reg_read() to mark checkpoint states. At (10) the verification is finished for path 1-10 and jump 4-6 is examined. The checkpoint.fp[-8] never gets REG_LIVE_READ mark and this spill is pruned from the cached states by clean_live_states(). Hence verifier state obtained via path 1-4,6 is deemed identical to one obtained via path 1-6 and program marked as safe. Note: the example should be executed with BPF_F_TEST_STATE_FREQ flag set to force creation of intermediate verifier states. This commit revisits the locations where bpf_reg_state instances are copied and replaces the direct copies with a call to a function copy_register_state(dst, src) that preserves 'parent' and 'live' fields of the 'dst'. Fixes: 679c782de14b ("bpf/verifier: per-register parent pointers") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20230106142214.1040390-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbef0b0967ae..7ee218827259 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3243,13 +3243,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +/* Copy src state preserving dst->parent and dst->live fields */ +static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) +{ + struct bpf_reg_state *parent = dst->parent; + enum bpf_reg_liveness live = dst->live; + + *dst = *src; + dst->parent = parent; + dst->live = live; +} + static void save_register_state(struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { int i; - state->stack[spi].spilled_ptr = *reg; + copy_register_state(&state->stack[spi].spilled_ptr, reg); if (size == BPF_REG_SIZE) state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; @@ -3577,7 +3588,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; } else { for (i = 0; i < size; i++) { @@ -3598,7 +3609,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -9592,7 +9603,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - *dst_reg = *ptr_reg; + copy_register_state(dst_reg, ptr_reg); } ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); @@ -10845,7 +10856,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * to propagate min/max range. */ src_reg->id = ++env->id_gen; - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { @@ -10856,7 +10867,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared otherwise * dst_reg min/max could be incorrectly * propagated into src_reg by find_equal_scalars() @@ -11655,7 +11666,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; + copy_register_state(reg, known_reg); })); } -- cgit v1.2.3 From b9fa9bc839291020b362ab5392e5f18ba79657ac Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 6 Jan 2023 16:22:14 +0200 Subject: selftests/bpf: Verify copy_register_state() preserves parent/live fields A testcase to check that verifier.c:copy_register_state() preserves register parentage chain and livness information. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20230106142214.1040390-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/search_pruning.c | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/search_pruning.c b/tools/testing/selftests/bpf/verifier/search_pruning.c index 68b14fdfebdb..d63fd8991b03 100644 --- a/tools/testing/selftests/bpf/verifier/search_pruning.c +++ b/tools/testing/selftests/bpf/verifier/search_pruning.c @@ -225,3 +225,39 @@ .result_unpriv = ACCEPT, .insn_processed = 15, }, +/* The test performs a conditional 64-bit write to a stack location + * fp[-8], this is followed by an unconditional 8-bit write to fp[-8], + * then data is read from fp[-8]. This sequence is unsafe. + * + * The test would be mistakenly marked as safe w/o dst register parent + * preservation in verifier.c:copy_register_state() function. + * + * Note the usage of BPF_F_TEST_STATE_FREQ to force creation of the + * checkpoint state after conditional 64-bit assignment. + */ +{ + "write tracking and register parent chain bug", + .insns = { + /* r6 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* r0 = ktime_get_ns() */ + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + /* if r0 > r6 goto +1 */ + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_6, 1), + /* *(u64 *)(r10 - 8) = 0xdeadbeef */ + BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0xdeadbeef), + /* r1 = 42 */ + BPF_MOV64_IMM(BPF_REG_1, 42), + /* *(u8 *)(r10 - 8) = r1 */ + BPF_STX_MEM(BPF_B, BPF_REG_FP, BPF_REG_1, -8), + /* r2 = *(u64 *)(r10 - 8) */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_FP, -8), + /* exit(0) */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .flags = BPF_F_TEST_STATE_FREQ, + .errstr = "invalid read from stack off -8+1 size 8", + .result = REJECT, +}, -- cgit v1.2.3 From 74bc3a5acc82f020d2e126f56c535d02d1e74e37 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 20 Jan 2023 13:21:48 +0100 Subject: bpf: Add missing btf_put to register_btf_id_dtor_kfuncs We take the BTF reference before we register dtors and we need to put it back when it's done. We probably won't se a problem with kernel BTF, but module BTF would stay loaded (because of the extra ref) even when its module is removed. Cc: Kumar Kartikeya Dwivedi Fixes: 5ce937d613a4 ("bpf: Populate pairs of btf_id and destructor kfunc in btf") Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20230120122148.1522359-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7dd8af06413..b7017cae6fd1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7782,9 +7782,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); - return 0; end: - btf_free_dtor_kfunc_tab(btf); + if (ret) + btf_free_dtor_kfunc_tab(btf); btf_put(btf); return ret; } -- cgit v1.2.3 From 5b4a79ba65a1ab479903fff2e604865d229b70a9 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sat, 21 Jan 2023 13:41:43 +0100 Subject: bpf, sockmap: Don't let sock_map_{close,destroy,unhash} call itself sock_map proto callbacks should never call themselves by design. Protect against bugs like [1] and break out of the recursive loop to avoid a stack overflow in favor of a resource leak. [1] https://lore.kernel.org/all/00000000000073b14905ef2e7401@google.com/ Suggested-by: Eric Dumazet Signed-off-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/r/20230113-sockmap-fix-v2-1-1e0ee7ac2f90@cloudflare.com Signed-off-by: Alexei Starovoitov --- net/core/sock_map.c | 61 +++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 22fa2c5bc6ec..a68a7290a3b2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1569,15 +1569,16 @@ void sock_map_unhash(struct sock *sk) psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; + saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + } else { + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); } - - saved_unhash = psock->saved_unhash; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; + if (saved_unhash) + saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); @@ -1590,17 +1591,18 @@ void sock_map_destroy(struct sock *sk) psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); - if (sk->sk_prot->destroy) - sk->sk_prot->destroy(sk); - return; + saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + } else { + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); } - - saved_destroy = psock->saved_destroy; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - sk_psock_put(sk, psock); - saved_destroy(sk); + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; + if (saved_destroy) + saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); @@ -1615,16 +1617,21 @@ void sock_map_close(struct sock *sk, long timeout) if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); - return sk->sk_prot->close(sk, timeout); + saved_close = READ_ONCE(sk->sk_prot)->close; + } else { + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); } - - saved_close = psock->saved_close; - sock_map_remove_links(sk, psock); - rcu_read_unlock(); - sk_psock_stop(psock); - release_sock(sk); - cancel_work_sync(&psock->work); - sk_psock_put(sk, psock); + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); -- cgit v1.2.3 From ddce1e091757d0259107c6c0c7262df201de2b66 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sat, 21 Jan 2023 13:41:44 +0100 Subject: bpf, sockmap: Check for any of tcp_bpf_prots when cloning a listener A listening socket linked to a sockmap has its sk_prot overridden. It points to one of the struct proto variants in tcp_bpf_prots. The variant depends on the socket's family and which sockmap programs are attached. A child socket cloned from a TCP listener initially inherits their sk_prot. But before cloning is finished, we restore the child's proto to the listener's original non-tcp_bpf_prots one. This happens in tcp_create_openreq_child -> tcp_bpf_clone. Today, in tcp_bpf_clone we detect if the child's proto should be restored by checking only for the TCP_BPF_BASE proto variant. This is not correct. The sk_prot of listening socket linked to a sockmap can point to to any variant in tcp_bpf_prots. If the listeners sk_prot happens to be not the TCP_BPF_BASE variant, then the child socket unintentionally is left if the inherited sk_prot by tcp_bpf_clone. This leads to issues like infinite recursion on close [1], because the child state is otherwise not set up for use with tcp_bpf_prot operations. Adjust the check in tcp_bpf_clone to detect all of tcp_bpf_prots variants. Note that it wouldn't be sufficient to check the socket state when overriding the sk_prot in tcp_bpf_update_proto in order to always use the TCP_BPF_BASE variant for listening sockets. Since commit b8b8315e39ff ("bpf, sockmap: Remove unhash handler for BPF sockmap usage") it is possible for a socket to transition to TCP_LISTEN state while already linked to a sockmap, e.g. connect() -> insert into map -> connect(AF_UNSPEC) -> listen(). [1]: https://lore.kernel.org/all/00000000000073b14905ef2e7401@google.com/ Fixes: e80251555f0b ("tcp_bpf: Don't let child socket inherit parent protocol ops on copy") Reported-by: syzbot+04c21ed96d861dccc5cd@syzkaller.appspotmail.com Signed-off-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/r/20230113-sockmap-fix-v2-2-1e0ee7ac2f90@cloudflare.com Signed-off-by: Alexei Starovoitov --- include/linux/util_macros.h | 12 ++++++++++++ net/ipv4/tcp_bpf.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 72299f261b25..43db6e47503c 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -38,4 +38,16 @@ */ #define find_closest_descending(x, a, as) __find_closest(x, a, as, >=) +/** + * is_insidevar - check if the @ptr points inside the @var memory range. + * @ptr: the pointer to a memory address. + * @var: the variable which address and size identify the memory range. + * + * Evaluates to true if the address in @ptr lies within the memory + * range allocated to @var. + */ +#define is_insidevar(ptr, var) \ + ((uintptr_t)(ptr) >= (uintptr_t)(var) && \ + (uintptr_t)(ptr) < (uintptr_t)(var) + sizeof(var)) + #endif diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 94aad3870c5f..cf26d65ca389 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -639,10 +640,9 @@ EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; struct proto *prot = newsk->sk_prot; - if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) + if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From b4ea530d024ca6095fc80290075893a5b7136516 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sat, 21 Jan 2023 13:41:45 +0100 Subject: selftests/bpf: Pass BPF skeleton to sockmap_listen ops tests Following patch extends the sockmap ops tests to cover the scenario when a sockmap with attached programs holds listening sockets. Pass the BPF skeleton to sockmap ops test so that the can access and attach the BPF programs. Signed-off-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/r/20230113-sockmap-fix-v2-3-1e0ee7ac2f90@cloudflare.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_listen.c | 55 +++++++++++++++------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 2cf0c7a3fe23..499fba8f55b9 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -30,6 +30,8 @@ #define MAX_STRERR_LEN 256 #define MAX_TEST_NAME 80 +#define __always_unused __attribute__((__unused__)) + #define _FAIL(errnum, fmt...) \ ({ \ error_at_line(0, (errnum), __func__, __LINE__, fmt); \ @@ -321,7 +323,8 @@ static int socket_loopback(int family, int sotype) return socket_loopback_reuseport(family, sotype, -1); } -static void test_insert_invalid(int family, int sotype, int mapfd) +static void test_insert_invalid(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key = 0; u64 value; @@ -338,7 +341,8 @@ static void test_insert_invalid(int family, int sotype, int mapfd) FAIL_ERRNO("map_update: expected EBADF"); } -static void test_insert_opened(int family, int sotype, int mapfd) +static void test_insert_opened(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key = 0; u64 value; @@ -359,7 +363,8 @@ static void test_insert_opened(int family, int sotype, int mapfd) xclose(s); } -static void test_insert_bound(int family, int sotype, int mapfd) +static void test_insert_bound(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -386,7 +391,8 @@ close: xclose(s); } -static void test_insert(int family, int sotype, int mapfd) +static void test_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 value; u32 key; @@ -402,7 +408,8 @@ static void test_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_delete_after_insert(int family, int sotype, int mapfd) +static void test_delete_after_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 value; u32 key; @@ -419,7 +426,8 @@ static void test_delete_after_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_delete_after_close(int family, int sotype, int mapfd) +static void test_delete_after_close(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int err, s; u64 value; @@ -442,7 +450,8 @@ static void test_delete_after_close(int family, int sotype, int mapfd) FAIL_ERRNO("map_delete: expected EINVAL/EINVAL"); } -static void test_lookup_after_insert(int family, int sotype, int mapfd) +static void test_lookup_after_insert(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u64 cookie, value; socklen_t len; @@ -470,7 +479,8 @@ static void test_lookup_after_insert(int family, int sotype, int mapfd) xclose(s); } -static void test_lookup_after_delete(int family, int sotype, int mapfd) +static void test_lookup_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int err, s; u64 value; @@ -493,7 +503,8 @@ static void test_lookup_after_delete(int family, int sotype, int mapfd) xclose(s); } -static void test_lookup_32_bit_value(int family, int sotype, int mapfd) +static void test_lookup_32_bit_value(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { u32 key, value32; int err, s; @@ -523,7 +534,8 @@ close: xclose(s); } -static void test_update_existing(int family, int sotype, int mapfd) +static void test_update_existing(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { int s1, s2; u64 value; @@ -551,7 +563,8 @@ close_s1: /* Exercise the code path where we destroy child sockets that never * got accept()'ed, aka orphans, when parent socket gets closed. */ -static void test_destroy_orphan_child(int family, int sotype, int mapfd) +static void test_destroy_orphan_child(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -585,7 +598,8 @@ close_srv: /* Perform a passive open after removing listening socket from SOCKMAP * to ensure that callbacks get restored properly. */ -static void test_clone_after_delete(int family, int sotype, int mapfd) +static void test_clone_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -621,7 +635,8 @@ close_srv: * SOCKMAP, but got accept()'ed only after the parent has been removed * from SOCKMAP, gets cloned without parent psock state or callbacks. */ -static void test_accept_after_delete(int family, int sotype, int mapfd) +static void test_accept_after_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; const u32 zero = 0; @@ -675,7 +690,8 @@ close_srv: /* Check that child socket that got created and accepted while parent * was in a SOCKMAP is cloned without parent psock state or callbacks. */ -static void test_accept_before_delete(int family, int sotype, int mapfd) +static void test_accept_before_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct sockaddr_storage addr; const u32 zero = 0, one = 1; @@ -784,7 +800,8 @@ done: return NULL; } -static void test_syn_recv_insert_delete(int family, int sotype, int mapfd) +static void test_syn_recv_insert_delete(struct test_sockmap_listen *skel __always_unused, + int family, int sotype, int mapfd) { struct connect_accept_ctx ctx = { 0 }; struct sockaddr_storage addr; @@ -847,7 +864,8 @@ static void *listen_thread(void *arg) return NULL; } -static void test_race_insert_listen(int family, int socktype, int mapfd) +static void test_race_insert_listen(struct test_sockmap_listen *skel __always_unused, + int family, int socktype, int mapfd) { struct connect_accept_ctx ctx = { 0 }; const u32 zero = 0; @@ -1473,7 +1491,8 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, int family, int sotype) { const struct op_test { - void (*fn)(int family, int sotype, int mapfd); + void (*fn)(struct test_sockmap_listen *skel, + int family, int sotype, int mapfd); const char *name; int sotype; } tests[] = { @@ -1520,7 +1539,7 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map, if (!test__start_subtest(s)) continue; - t->fn(family, sotype, map_fd); + t->fn(skel, family, sotype, map_fd); test_ops_cleanup(map); } } -- cgit v1.2.3 From c88ea16a8f892bce3bfb3f6a0d91b2bb27df8f59 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sat, 21 Jan 2023 13:41:46 +0100 Subject: selftests/bpf: Cover listener cloning with progs attached to sockmap Today we test if a child socket is cloned properly from a listening socket inside a sockmap only when there are no BPF programs attached to the map. A bug has been reported [1] for the case when sockmap has a verdict program attached. So cover this case as well to prevent regressions. [1]: https://lore.kernel.org/r/00000000000073b14905ef2e7401@google.com Signed-off-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/r/20230113-sockmap-fix-v2-4-1e0ee7ac2f90@cloudflare.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_listen.c | 30 ++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 499fba8f55b9..567e07c19ecc 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -563,8 +563,7 @@ close_s1: /* Exercise the code path where we destroy child sockets that never * got accept()'ed, aka orphans, when parent socket gets closed. */ -static void test_destroy_orphan_child(struct test_sockmap_listen *skel __always_unused, - int family, int sotype, int mapfd) +static void do_destroy_orphan_child(int family, int sotype, int mapfd) { struct sockaddr_storage addr; socklen_t len; @@ -595,6 +594,33 @@ close_srv: xclose(s); } +static void test_destroy_orphan_child(struct test_sockmap_listen *skel, + int family, int sotype, int mapfd) +{ + int msg_verdict = bpf_program__fd(skel->progs.prog_msg_verdict); + int skb_verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + const struct test { + int progfd; + enum bpf_attach_type atype; + } tests[] = { + { -1, -1 }, + { msg_verdict, BPF_SK_MSG_VERDICT }, + { skb_verdict, BPF_SK_SKB_VERDICT }, + }; + const struct test *t; + + for (t = tests; t < tests + ARRAY_SIZE(tests); t++) { + if (t->progfd != -1 && + xbpf_prog_attach(t->progfd, mapfd, t->atype, 0) != 0) + return; + + do_destroy_orphan_child(family, sotype, mapfd); + + if (t->progfd != -1) + xbpf_prog_detach2(t->progfd, mapfd, t->atype); + } +} + /* Perform a passive open after removing listening socket from SOCKMAP * to ensure that callbacks get restored properly. */ -- cgit v1.2.3 From 5416c9aea8323583e8696f0500b6142dfae80821 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 26 Jan 2023 16:17:32 -0800 Subject: bpf: Fix the kernel crash caused by bpf_setsockopt(). The kernel crash was caused by a BPF program attached to the "lsm_cgroup/socket_sock_rcv_skb" hook, which performed a call to `bpf_setsockopt()` in order to set the TCP_NODELAY flag as an example. Flags like TCP_NODELAY can prompt the kernel to flush a socket's outgoing queue, and this hook "lsm_cgroup/socket_sock_rcv_skb" is frequently triggered by softirqs. The issue was that in certain circumstances, when `tcp_write_xmit()` was called to flush the queue, it would also allow BH (bottom-half) to run. This could lead to our program attempting to flush the same socket recursively, which caused a `skbuff` to be unlinked twice. `security_sock_rcv_skb()` is triggered by `tcp_filter()`. This occurs before the sock ownership is checked in `tcp_v4_rcv()`. Consequently, if a bpf program runs on `security_sock_rcv_skb()` while under softirq conditions, it may not possess the lock needed for `bpf_setsockopt()`, thus presenting an issue. The patch fixes this issue by ensuring that a BPF program attached to the "lsm_cgroup/socket_sock_rcv_skb" hook is not allowed to call `bpf_setsockopt()`. The differences from v1 are - changing commit log to explain holding the lock of the sock, - emphasizing that TCP_NODELAY is not the only flag, and - adding the fixes tag. v1: https://lore.kernel.org/bpf/20230125000244.1109228-1-kuifeng@meta.com/ Signed-off-by: Kui-Feng Lee Fixes: 9113d7e48e91 ("bpf: expose bpf_{g,s}etsockopt to lsm cgroup") Link: https://lore.kernel.org/r/20230127001732.4162630-1-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_lsm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index a4a41ee3e80b..e14c822f8911 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks) */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) #ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) -- cgit v1.2.3 From a6a0974aae4209d039ba81226ded5246eea14961 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 24 Jan 2023 09:19:43 -0800 Subject: ice: Prevent set_channel from changing queues while RDMA active The PF controls the set of queues that the RDMA auxiliary_driver requests resources from. The set_channel command will alter that pool and trigger a reconfiguration of the VSI, which breaks RDMA functionality. Prevent set_channel from executing when RDMA driver bound to auxiliary device. Adding a locked variable to pass down the call chain to avoid double locking the device_lock. Fixes: 348048e724a0 ("ice: Implement iidc operations") Signed-off-by: Dave Ertman Tested-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 2 +- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 23 +++++++++++++---------- drivers/net/ethernet/intel/ice/ice_dcb_lib.h | 4 ++-- drivers/net/ethernet/intel/ice/ice_ethtool.c | 28 ++++++++++++++++++++++++---- drivers/net/ethernet/intel/ice/ice_main.c | 5 +++-- 5 files changed, 43 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2f0b604abc5e..713069f809ec 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -880,7 +880,7 @@ void ice_set_ethtool_repr_ops(struct net_device *netdev); void ice_set_ethtool_safe_mode_ops(struct net_device *netdev); u16 ice_get_avail_txq_count(struct ice_pf *pf); u16 ice_get_avail_rxq_count(struct ice_pf *pf); -int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx); +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked); void ice_update_vsi_stats(struct ice_vsi *vsi); void ice_update_pf_stats(struct ice_pf *pf); void diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 4f24d441c35e..0a55c552189a 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -441,7 +441,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) goto out; } - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, false); out: /* enable previously downed VSIs */ @@ -731,12 +731,13 @@ static int ice_dcb_noncontig_cfg(struct ice_pf *pf) /** * ice_pf_dcb_recfg - Reconfigure all VEBs and VSIs * @pf: pointer to the PF struct + * @locked: is adev device lock held * * Assumed caller has already disabled all VSIs before * calling this function. Reconfiguring DCB based on * local_dcbx_cfg. */ -void ice_pf_dcb_recfg(struct ice_pf *pf) +void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; struct iidc_event *event; @@ -783,14 +784,16 @@ void ice_pf_dcb_recfg(struct ice_pf *pf) if (vsi->type == ICE_VSI_PF) ice_dcbnl_set_all(vsi); } - /* Notify the AUX drivers that TC change is finished */ - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return; + if (!locked) { + /* Notify the AUX drivers that TC change is finished */ + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return; - set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); - ice_send_event_to_aux(pf, event); - kfree(event); + set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); + ice_send_event_to_aux(pf, event); + kfree(event); + } } /** @@ -1044,7 +1047,7 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, } /* changes in configuration update VSI */ - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, false); /* enable previously downed VSIs */ ice_dcb_ena_dis_vsi(pf, true, true); diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 4c421c842a13..800879a88c5e 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -23,7 +23,7 @@ u8 ice_dcb_get_tc(struct ice_vsi *vsi, int queue_index); int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked); int ice_dcb_bwchk(struct ice_pf *pf, struct ice_dcbx_cfg *dcbcfg); -void ice_pf_dcb_recfg(struct ice_pf *pf); +void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked); void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi); int ice_init_pf_dcb(struct ice_pf *pf, bool locked); void ice_update_dcb_stats(struct ice_pf *pf); @@ -128,7 +128,7 @@ static inline u8 ice_get_pfc_mode(struct ice_pf *pf) return 0; } -static inline void ice_pf_dcb_recfg(struct ice_pf *pf) { } +static inline void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { } static inline void ice_vsi_cfg_dcb_rings(struct ice_vsi *vsi) { } static inline void ice_update_dcb_stats(struct ice_pf *pf) { } static inline void diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 4191994d8f3a..a359f1610fc1 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3641,7 +3641,9 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; int new_rx = 0, new_tx = 0; + bool locked = false; u32 curr_combined; + int ret = 0; /* do not support changing channels in Safe Mode */ if (ice_is_safe_mode(pf)) { @@ -3705,15 +3707,33 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) return -EINVAL; } - ice_vsi_recfg_qs(vsi, new_rx, new_tx); + if (pf->adev) { + mutex_lock(&pf->adev_mutex); + device_lock(&pf->adev->dev); + locked = true; + if (pf->adev->dev.driver) { + netdev_err(dev, "Cannot change channels when RDMA is active\n"); + ret = -EBUSY; + goto adev_unlock; + } + } + + ice_vsi_recfg_qs(vsi, new_rx, new_tx, locked); - if (!netif_is_rxfh_configured(dev)) - return ice_vsi_set_dflt_rss_lut(vsi, new_rx); + if (!netif_is_rxfh_configured(dev)) { + ret = ice_vsi_set_dflt_rss_lut(vsi, new_rx); + goto adev_unlock; + } /* Update rss_size due to change in Rx queues */ vsi->rss_size = ice_get_valid_rss_size(&pf->hw, new_rx); - return 0; +adev_unlock: + if (locked) { + device_unlock(&pf->adev->dev); + mutex_unlock(&pf->adev_mutex); + } + return ret; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 237ede2cffb0..5f86e4111fa9 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4195,12 +4195,13 @@ bool ice_is_wol_supported(struct ice_hw *hw) * @vsi: VSI being changed * @new_rx: new number of Rx queues * @new_tx: new number of Tx queues + * @locked: is adev device_lock held * * Only change the number of queues if new_tx, or new_rx is non-0. * * Returns 0 on success. */ -int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) +int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx, bool locked) { struct ice_pf *pf = vsi->back; int err = 0, timeout = 50; @@ -4229,7 +4230,7 @@ int ice_vsi_recfg_qs(struct ice_vsi *vsi, int new_rx, int new_tx) ice_vsi_close(vsi); ice_vsi_rebuild(vsi, false); - ice_pf_dcb_recfg(pf); + ice_pf_dcb_recfg(pf, locked); ice_vsi_open(vsi); done: clear_bit(ICE_CFG_BUSY, pf->state); -- cgit v1.2.3 From 53b9b77dcf48dad1c6111b0c0a7310b3f9364776 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Fri, 25 Nov 2022 14:34:39 +0100 Subject: ice: Fix broken link in ice NAPI doc Current link for NAPI documentation in ice driver doesn't work - it returns 404. Update the link to the working one. Signed-off-by: Michal Wilczynski Acked-by: Jesse Brandeburg Signed-off-by: Tony Nguyen --- Documentation/networking/device_drivers/ethernet/intel/ice.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst index dc2e60ced927..b481b81f3be5 100644 --- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst +++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst @@ -819,7 +819,7 @@ NAPI ---- This driver supports NAPI (Rx polling mode). For more information on NAPI, see -https://www.linuxfoundation.org/collaborate/workgroups/networking/napi +https://wiki.linuxfoundation.org/networking/napi MACVLAN -- cgit v1.2.3 From 2ccce20d51faa0178086163ccb6c84a099a87ab4 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Jan 2023 08:48:57 +0100 Subject: qede: execute xdp_do_flush() before napi_complete_done() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that xdp_do_flush() is always executed before napi_complete_done(). This is important for two reasons. First, a redirect to an XSKMAP assumes that a call to xdp_do_redirect() from napi context X on CPU Y will be followed by a xdp_do_flush() from the same napi context and CPU. This is not guaranteed if the napi_complete_done() is executed before xdp_do_flush(), as it tells the napi logic that it is fine to schedule napi context X on another CPU. Details from a production system triggering this bug using the veth driver can be found following the first link below. The second reason is that the XDP_REDIRECT logic in itself relies on being inside a single NAPI instance through to the xdp_do_flush() call for RCU protection of all in-kernel data structures. Details can be found in the second link below. Fixes: d1b25b79e162b ("qede: add .ndo_xdp_xmit() and XDP_REDIRECT support") Signed-off-by: Magnus Karlsson Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221220185903.1105011-1-sbohrer@cloudflare.com Link: https://lore.kernel.org/all/20210624160609.292325-1-toke@redhat.com/ Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 7c2af482192d..cb1746bc0e0c 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1438,6 +1438,10 @@ int qede_poll(struct napi_struct *napi, int budget) rx_work_done = (likely(fp->type & QEDE_FASTPATH_RX) && qede_has_rx_work(fp->rxq)) ? qede_rx_int(fp, budget) : 0; + + if (fp->xdp_xmit & QEDE_XDP_REDIRECT) + xdp_do_flush(); + /* Handle case where we are called by netpoll with a budget of 0 */ if (rx_work_done < budget || !budget) { if (!qede_poll_is_more_work(fp)) { @@ -1457,9 +1461,6 @@ int qede_poll(struct napi_struct *napi, int budget) qede_update_tx_producer(fp->xdp_tx); } - if (fp->xdp_xmit & QEDE_XDP_REDIRECT) - xdp_do_flush_map(); - return rx_work_done; } -- cgit v1.2.3 From 12b5717990c81fc2f6f2aa9d53d960b916478d4f Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Jan 2023 08:48:58 +0100 Subject: lan966x: execute xdp_do_flush() before napi_complete_done() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that xdp_do_flush() is always executed before napi_complete_done(). This is important for two reasons. First, a redirect to an XSKMAP assumes that a call to xdp_do_redirect() from napi context X on CPU Y will be followed by a xdp_do_flush() from the same napi context and CPU. This is not guaranteed if the napi_complete_done() is executed before xdp_do_flush(), as it tells the napi logic that it is fine to schedule napi context X on another CPU. Details from a production system triggering this bug using the veth driver can be found following the first link below. The second reason is that the XDP_REDIRECT logic in itself relies on being inside a single NAPI instance through to the xdp_do_flush() call for RCU protection of all in-kernel data structures. Details can be found in the second link below. Fixes: a825b611c7c1 ("net: lan966x: Add support for XDP_REDIRECT") Signed-off-by: Magnus Karlsson Acked-by: Toke Høiland-Jørgensen Acked-by: Steen Hegelund Link: https://lore.kernel.org/r/20221220185903.1105011-1-sbohrer@cloudflare.com Link: https://lore.kernel.org/all/20210624160609.292325-1-toke@redhat.com/ Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c index 5314c064ceae..55b484b10562 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_fdma.c @@ -608,12 +608,12 @@ allocate_new: lan966x_fdma_rx_reload(rx); } - if (counter < weight && napi_complete_done(napi, counter)) - lan_wr(0xff, lan966x, FDMA_INTR_DB_ENA); - if (redirect) xdp_do_flush(); + if (counter < weight && napi_complete_done(napi, counter)) + lan_wr(0xff, lan966x, FDMA_INTR_DB_ENA); + return counter; } -- cgit v1.2.3 From ad7e615f646c9b5b2cf655cdfb9d91a28db4f25a Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Jan 2023 08:48:59 +0100 Subject: virtio-net: execute xdp_do_flush() before napi_complete_done() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that xdp_do_flush() is always executed before napi_complete_done(). This is important for two reasons. First, a redirect to an XSKMAP assumes that a call to xdp_do_redirect() from napi context X on CPU Y will be followed by a xdp_do_flush() from the same napi context and CPU. This is not guaranteed if the napi_complete_done() is executed before xdp_do_flush(), as it tells the napi logic that it is fine to schedule napi context X on another CPU. Details from a production system triggering this bug using the veth driver can be found following the first link below. The second reason is that the XDP_REDIRECT logic in itself relies on being inside a single NAPI instance through to the xdp_do_flush() call for RCU protection of all in-kernel data structures. Details can be found in the second link below. Fixes: 186b3c998c50 ("virtio-net: support XDP_REDIRECT") Signed-off-by: Magnus Karlsson Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221220185903.1105011-1-sbohrer@cloudflare.com Link: https://lore.kernel.org/all/20210624160609.292325-1-toke@redhat.com/ Acked-by: Michael S. Tsirkin Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 18b3de854aeb..6df14dd5bf46 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1677,13 +1677,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget) received = virtnet_receive(rq, budget, &xdp_xmit); + if (xdp_xmit & VIRTIO_XDP_REDIR) + xdp_do_flush(); + /* Out of packets? */ if (received < budget) virtqueue_napi_complete(napi, rq->vq, received); - if (xdp_xmit & VIRTIO_XDP_REDIR) - xdp_do_flush(); - if (xdp_xmit & VIRTIO_XDP_TX) { sq = virtnet_xdp_get_sq(vi); if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { -- cgit v1.2.3 From b534013798b77f81a36f36dafd59bab9de837619 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Jan 2023 08:49:00 +0100 Subject: dpaa_eth: execute xdp_do_flush() before napi_complete_done() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that xdp_do_flush() is always executed before napi_complete_done(). This is important for two reasons. First, a redirect to an XSKMAP assumes that a call to xdp_do_redirect() from napi context X on CPU Y will be followed by a xdp_do_flush() from the same napi context and CPU. This is not guaranteed if the napi_complete_done() is executed before xdp_do_flush(), as it tells the napi logic that it is fine to schedule napi context X on another CPU. Details from a production system triggering this bug using the veth driver can be found following the first link below. The second reason is that the XDP_REDIRECT logic in itself relies on being inside a single NAPI instance through to the xdp_do_flush() call for RCU protection of all in-kernel data structures. Details can be found in the second link below. Fixes: a1e031ffb422 ("dpaa_eth: add XDP_REDIRECT support") Signed-off-by: Magnus Karlsson Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221220185903.1105011-1-sbohrer@cloudflare.com Link: https://lore.kernel.org/all/20210624160609.292325-1-toke@redhat.com/ Acked-by: Camelia Groza Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 3f8032947d86..027fff9f7db0 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2410,6 +2410,9 @@ static int dpaa_eth_poll(struct napi_struct *napi, int budget) cleaned = qman_p_poll_dqrr(np->p, budget); + if (np->xdp_act & XDP_REDIRECT) + xdp_do_flush(); + if (cleaned < budget) { napi_complete_done(napi, cleaned); qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); @@ -2417,9 +2420,6 @@ static int dpaa_eth_poll(struct napi_struct *napi, int budget) qman_p_irqsource_add(np->p, QM_PIRQ_DQRI); } - if (np->xdp_act & XDP_REDIRECT) - xdp_do_flush(); - return cleaned; } -- cgit v1.2.3 From a3191c4d86c5d3bd35b00dfde6910b88391436a0 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Jan 2023 08:49:01 +0100 Subject: dpaa2-eth: execute xdp_do_flush() before napi_complete_done() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that xdp_do_flush() is always executed before napi_complete_done(). This is important for two reasons. First, a redirect to an XSKMAP assumes that a call to xdp_do_redirect() from napi context X on CPU Y will be followed by a xdp_do_flush() from the same napi context and CPU. This is not guaranteed if the napi_complete_done() is executed before xdp_do_flush(), as it tells the napi logic that it is fine to schedule napi context X on another CPU. Details from a production system triggering this bug using the veth driver can be found following the first link below. The second reason is that the XDP_REDIRECT logic in itself relies on being inside a single NAPI instance through to the xdp_do_flush() call for RCU protection of all in-kernel data structures. Details can be found in the second link below. Fixes: d678be1dc1ec ("dpaa2-eth: add XDP_REDIRECT support") Signed-off-by: Magnus Karlsson Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221220185903.1105011-1-sbohrer@cloudflare.com Link: https://lore.kernel.org/all/20210624160609.292325-1-toke@redhat.com/ Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0c35abb7d065..2e79d18fc3c7 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -1993,10 +1993,15 @@ static int dpaa2_eth_poll(struct napi_struct *napi, int budget) if (rx_cleaned >= budget || txconf_cleaned >= DPAA2_ETH_TXCONF_PER_NAPI) { work_done = budget; + if (ch->xdp.res & XDP_REDIRECT) + xdp_do_flush(); goto out; } } while (store_cleaned); + if (ch->xdp.res & XDP_REDIRECT) + xdp_do_flush(); + /* Update NET DIM with the values for this CDAN */ dpaa2_io_update_net_dim(ch->dpio, ch->stats.frames_per_cdan, ch->stats.bytes_per_cdan); @@ -2032,9 +2037,7 @@ out: txc_fq->dq_bytes = 0; } - if (ch->xdp.res & XDP_REDIRECT) - xdp_do_flush_map(); - else if (rx_cleaned && ch->xdp.res & XDP_TX) + if (rx_cleaned && ch->xdp.res & XDP_TX) dpaa2_eth_xdp_tx_flush(priv, ch, &priv->fq[flowid]); return work_done; -- cgit v1.2.3 From ff445b8397745ea94675162faa0e9b0079ca521d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Thu, 26 Jan 2023 22:01:11 +0300 Subject: net: dsa: mt7530: fix tristate and help description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix description for tristate and help sections which include inaccurate information. Signed-off-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20230126190110.9124-1-arinc.unal@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index c26755f662c1..f6f3b43dfb06 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -35,12 +35,13 @@ config NET_DSA_LANTIQ_GSWIP the xrx200 / VR9 SoC. config NET_DSA_MT7530 - tristate "MediaTek MT753x and MT7621 Ethernet switch support" + tristate "MediaTek MT7530 and MT7531 Ethernet switch support" select NET_DSA_TAG_MTK select MEDIATEK_GE_PHY help - This enables support for the MediaTek MT7530, MT7531, and MT7621 - Ethernet switch chips. + This enables support for the MediaTek MT7530 and MT7531 Ethernet + switch chips. Multi-chip module MT7530 in MT7621AT, MT7621DAT, + MT7621ST and MT7623AI SoCs is supported. config NET_DSA_MV88E6060 tristate "Marvell 88E6060 ethernet switch chip support" -- cgit v1.2.3 From 7d2c89b325874a35564db5630a459966afab04cc Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 26 Jan 2023 11:06:59 -0800 Subject: skb: Do mix page pool and page referenced frags in GRO GSO should not merge page pool recycled frames with standard reference counted frames. Traditionally this didn't occur, at least not often. However as we start looking at adding support for wireless adapters there becomes the potential to mix the two due to A-MSDU repartitioning frames in the receive path. There are possibly other places where this may have occurred however I suspect they must be few and far between as we have not seen this issue until now. Fixes: 53e0961da1c7 ("page_pool: add frag page recycling support in page pool") Reported-by: Felix Fietkau Signed-off-by: Alexander Duyck Acked-by: Ilias Apalodimas Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/167475990764.1934330.11960904198087757911.stgit@localhost.localdomain Signed-off-by: Jakub Kicinski --- net/core/gro.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/core/gro.c b/net/core/gro.c index 506f83d715f8..4bac7ea6e025 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -162,6 +162,15 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) struct sk_buff *lp; int segs; + /* Do not splice page pool based packets w/ non-page pool + * packets. This can result in reference count issues as page + * pool pages will not decrement the reference count and will + * instead be immediately returned to the pool or have frag + * count decremented. + */ + if (p->pp_recycle != skb->pp_recycle) + return -ETOOMANYREFS; + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size); -- cgit v1.2.3 From ffffd2454a7a1bc9f7242b12c4cc0b05c12692b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Wed, 25 Jan 2023 15:35:13 +0100 Subject: sfc: correctly advertise tunneled IPv6 segmentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent sfc NICs are TSO capable for some tunnel protocols. However, it was not working properly because the feature was not advertised in hw_enc_features, but in hw_features only. Setting up a GENEVE tunnel and using iperf3 to send IPv4 and IPv6 traffic to the tunnel show, with tcpdump, that the IPv4 packets still had ~64k size but the IPv6 ones had only ~1500 bytes (they had been segmented by software, not offloaded). With this patch segmentation is offloaded as expected and the traffic is correctly received at the other end. Fixes: 24b2c3751aa3 ("sfc: advertise encapsulated offloads on EF10") Reported-by: Tianhao Zhao Signed-off-by: Íñigo Huguet Acked-by: Martin Habets Link: https://lore.kernel.org/r/20230125143513.25841-1-ihuguet@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/efx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 0556542d7a6b..3a86f1213a05 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -1003,8 +1003,11 @@ static int efx_pci_probe_post_io(struct efx_nic *efx) /* Determine netdevice features */ net_dev->features |= (efx->type->offload_features | NETIF_F_SG | NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_RXALL); - if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) + if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) { net_dev->features |= NETIF_F_TSO6; + if (efx_has_cap(efx, TX_TSO_V2_ENCAP)) + net_dev->hw_enc_features |= NETIF_F_TSO6; + } /* Check whether device supports TSO */ if (!efx->type->tso_versions || !efx->type->tso_versions(efx)) net_dev->features &= ~NETIF_F_ALL_TSO; -- cgit v1.2.3 From 14caefcf9837a2be765a566005ad82cd0d2a429f Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Wed, 25 Jan 2023 02:59:44 -0800 Subject: net/rose: Fix to not accept on connected socket If you call listen() and accept() on an already connect()ed rose socket, accept() can successfully connect. This is because when the peer socket sends data to sendmsg, the skb with its own sk stored in the connected socket's sk->sk_receive_queue is connected, and rose_accept() dequeues the skb waiting in the sk->sk_receive_queue. This creates a child socket with the sk of the parent rose socket, which can cause confusion. Fix rose_listen() to return -EINVAL if the socket has already been successfully connected, and add lock_sock to prevent this issue. Signed-off-by: Hyunwoo Kim Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230125105944.GA133314@ubuntu Signed-off-by: Jakub Kicinski --- net/rose/af_rose.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 36fefc3957d7..ca2b17f32670 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -488,6 +488,12 @@ static int rose_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { struct rose_sock *rose = rose_sk(sk); @@ -497,8 +503,10 @@ static int rose_listen(struct socket *sock, int backlog) memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; + release_sock(sk); return 0; } + release_sock(sk); return -EOPNOTSUPP; } -- cgit v1.2.3 From 29de68c2b32ce58d64dea496d281e25ad0f551bd Mon Sep 17 00:00:00 2001 From: Natalia Petrova Date: Wed, 25 Jan 2023 16:48:31 +0300 Subject: net: qrtr: free memory on error path in radix_tree_insert() Function radix_tree_insert() returns errors if the node hasn't been initialized and added to the tree. "kfree(node)" and return value "NULL" of node_get() help to avoid using unclear node in other calls. Found by Linux Verification Center (linuxtesting.org) with SVACE. Cc: # 5.7 Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace") Signed-off-by: Natalia Petrova Reviewed-by: Simon Horman Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20230125134831.8090-1-n.petrova@fintech.ru Signed-off-by: Jakub Kicinski --- net/qrtr/ns.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 1990d496fcfc..e595079c2caf 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -83,7 +83,10 @@ static struct qrtr_node *node_get(unsigned int node_id) node->id = node_id; - radix_tree_insert(&nodes, node_id, node); + if (radix_tree_insert(&nodes, node_id, node)) { + kfree(node); + return NULL; + } return node; } -- cgit v1.2.3 From 422ae7d9c7221e8d4c8526d0f54106307d69d2dc Mon Sep 17 00:00:00 2001 From: Andre Kalb Date: Wed, 25 Jan 2023 19:23:26 +0100 Subject: net: phy: dp83822: Fix null pointer access on DP83825/DP83826 devices The probe() function is only used for the DP83822 PHY, leaving the private data pointer uninitialized for the smaller DP83825/26 models. While all uses of the private data structure are hidden in 82822 specific callbacks, configuring the interrupt is shared across all models. This causes a NULL pointer dereference on the smaller PHYs as it accesses the private data unchecked. Verifying the pointer avoids that. Fixes: 5dc39fd5ef35 ("net: phy: DP83822: Add ability to advertise Fiber connection") Signed-off-by: Andre Kalb Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/Y9FzniUhUtbaGKU7@pc6682 Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index a6f05e35d91f..b7cb71817780 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -233,7 +233,8 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_ENERGY_DET_INT_EN | DP83822_LINK_QUAL_INT_EN); - if (!dp83822->fx_enabled) + /* Private data pointer is NULL on DP83825/26 */ + if (!dp83822 || !dp83822->fx_enabled) misr_status |= DP83822_ANEG_COMPLETE_INT_EN | DP83822_DUP_MODE_CHANGE_INT_EN | DP83822_SPEED_CHANGED_INT_EN; @@ -253,7 +254,8 @@ static int dp83822_config_intr(struct phy_device *phydev) DP83822_PAGE_RX_INT_EN | DP83822_EEE_ERROR_CHANGE_INT_EN); - if (!dp83822->fx_enabled) + /* Private data pointer is NULL on DP83825/26 */ + if (!dp83822 || !dp83822->fx_enabled) misr_status |= DP83822_ANEG_ERR_INT_EN | DP83822_WOL_PKT_INT_EN; -- cgit v1.2.3 From 60bd1d9008a50cc78c4033a16a6f5d78210d481c Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 26 Jan 2023 14:45:51 +0800 Subject: net: mctp: purge receive queues on sk destruction We may have pending skbs in the receive queue when the sk is being destroyed; add a destructor to purge the queue. MCTP doesn't use the error queue, so only the receive_queue is purged. Fixes: 833ef3b91de6 ("mctp: Populate socket implementation") Signed-off-by: Jeremy Kerr Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20230126064551.464468-1-jk@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 45bbe3e54cc2..3150f3f0c872 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -587,6 +587,11 @@ static void mctp_sk_unhash(struct sock *sk) del_timer_sync(&msk->key_expiry); } +static void mctp_sk_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + static struct proto mctp_proto = { .name = "MCTP", .owner = THIS_MODULE, @@ -623,6 +628,7 @@ static int mctp_pf_create(struct net *net, struct socket *sock, return -ENOMEM; sock_init_data(sock, sk); + sk->sk_destruct = mctp_sk_destruct; rc = 0; if (sk->sk_prot->init) -- cgit v1.2.3 From 364d0221f1788e5225006ba7a0026e5968431c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kornel=20Dul=C4=99ba?= Date: Thu, 26 Jan 2023 13:25:34 +0000 Subject: net: wwan: t7xx: Fix Runtime PM resume sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resume device before calling napi_schedule, instead of doing in the napi poll routine. Polling is done in softrq context. We can't call the PM resume logic from there as it's blocking and not irq safe. In order to make it work modify the interrupt handler to be run from irq handler thread. Fixes: 5545b7b9f294 ("net: wwan: t7xx: Add NAPI support") Signed-off-by: Kornel Dulęba Signed-off-by: David S. Miller --- drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c | 11 ++++++++++- drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c | 29 ++++++++++++++++++++--------- drivers/net/wwan/t7xx/t7xx_netdev.c | 16 +++++++++++++++- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c index 7eff3531b9a5..7ff33c1d6ac7 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif.c @@ -152,6 +152,15 @@ static irqreturn_t t7xx_dpmaif_isr_handler(int irq, void *data) } t7xx_pcie_mac_clear_int(dpmaif_ctrl->t7xx_dev, isr_para->pcie_int); + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t t7xx_dpmaif_isr_thread(int irq, void *data) +{ + struct dpmaif_isr_para *isr_para = data; + struct dpmaif_ctrl *dpmaif_ctrl = isr_para->dpmaif_ctrl; + t7xx_dpmaif_irq_cb(isr_para); t7xx_pcie_mac_set_int(dpmaif_ctrl->t7xx_dev, isr_para->pcie_int); return IRQ_HANDLED; @@ -188,7 +197,7 @@ static void t7xx_dpmaif_register_pcie_irq(struct dpmaif_ctrl *dpmaif_ctrl) t7xx_pcie_mac_clear_int(t7xx_dev, int_type); t7xx_dev->intr_handler[int_type] = t7xx_dpmaif_isr_handler; - t7xx_dev->intr_thread[int_type] = NULL; + t7xx_dev->intr_thread[int_type] = t7xx_dpmaif_isr_thread; t7xx_dev->callback_param[int_type] = isr_para; t7xx_pcie_mac_clear_int_status(t7xx_dev, int_type); diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c index aa2174a10437..f4ff2198b5ef 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c @@ -840,14 +840,13 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget) if (!rxq->que_started) { atomic_set(&rxq->rx_processing, 0); + pm_runtime_put_autosuspend(rxq->dpmaif_ctrl->dev); dev_err(rxq->dpmaif_ctrl->dev, "Work RXQ: %d has not been started\n", rxq->index); return work_done; } - if (!rxq->sleep_lock_pending) { - pm_runtime_get_noresume(rxq->dpmaif_ctrl->dev); + if (!rxq->sleep_lock_pending) t7xx_pci_disable_sleep(t7xx_dev); - } ret = try_wait_for_completion(&t7xx_dev->sleep_lock_acquire); if (!ret) { @@ -876,22 +875,22 @@ int t7xx_dpmaif_napi_rx_poll(struct napi_struct *napi, const int budget) napi_complete_done(napi, work_done); t7xx_dpmaif_clr_ip_busy_sts(&rxq->dpmaif_ctrl->hw_info); t7xx_dpmaif_dlq_unmask_rx_done(&rxq->dpmaif_ctrl->hw_info, rxq->index); + t7xx_pci_enable_sleep(rxq->dpmaif_ctrl->t7xx_dev); + pm_runtime_mark_last_busy(rxq->dpmaif_ctrl->dev); + pm_runtime_put_autosuspend(rxq->dpmaif_ctrl->dev); + atomic_set(&rxq->rx_processing, 0); } else { t7xx_dpmaif_clr_ip_busy_sts(&rxq->dpmaif_ctrl->hw_info); } - t7xx_pci_enable_sleep(rxq->dpmaif_ctrl->t7xx_dev); - pm_runtime_mark_last_busy(rxq->dpmaif_ctrl->dev); - pm_runtime_put_noidle(rxq->dpmaif_ctrl->dev); - atomic_set(&rxq->rx_processing, 0); - return work_done; } void t7xx_dpmaif_irq_rx_done(struct dpmaif_ctrl *dpmaif_ctrl, const unsigned int que_mask) { struct dpmaif_rx_queue *rxq; - int qno; + struct dpmaif_ctrl *ctrl; + int qno, ret; qno = ffs(que_mask) - 1; if (qno < 0 || qno > DPMAIF_RXQ_NUM - 1) { @@ -900,6 +899,18 @@ void t7xx_dpmaif_irq_rx_done(struct dpmaif_ctrl *dpmaif_ctrl, const unsigned int } rxq = &dpmaif_ctrl->rxq[qno]; + ctrl = rxq->dpmaif_ctrl; + /* We need to make sure that the modem has been resumed before + * calling napi. This can't be done inside the polling function + * as we could be blocked waiting for device to be resumed, + * which can't be done from softirq context the poll function + * is running in. + */ + ret = pm_runtime_resume_and_get(ctrl->dev); + if (ret < 0 && ret != -EACCES) { + dev_err(ctrl->dev, "Failed to resume device: %d\n", ret); + return; + } napi_schedule(&rxq->napi); } diff --git a/drivers/net/wwan/t7xx/t7xx_netdev.c b/drivers/net/wwan/t7xx/t7xx_netdev.c index 494a28e386a3..3ef4a8a4f8fd 100644 --- a/drivers/net/wwan/t7xx/t7xx_netdev.c +++ b/drivers/net/wwan/t7xx/t7xx_netdev.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -45,12 +46,25 @@ static void t7xx_ccmni_enable_napi(struct t7xx_ccmni_ctrl *ctlb) { - int i; + struct dpmaif_ctrl *ctrl; + int i, ret; + + ctrl = ctlb->hif_ctrl; if (ctlb->is_napi_en) return; for (i = 0; i < RXQ_NUM; i++) { + /* The usage count has to be bumped every time before calling + * napi_schedule. It will be decresed in the poll routine, + * right after napi_complete_done is called. + */ + ret = pm_runtime_resume_and_get(ctrl->dev); + if (ret < 0) { + dev_err(ctrl->dev, "Failed to resume device: %d\n", + ret); + return; + } napi_enable(ctlb->napi[i]); napi_schedule(ctlb->napi[i]); } -- cgit v1.2.3 From e3d6d152a1cbdee25f2e3962009a2751b54e2297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kornel=20Dul=C4=99ba?= Date: Thu, 26 Jan 2023 13:25:35 +0000 Subject: net: wwan: t7xx: Fix Runtime PM initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For PCI devices the Runtime PM refcount is incremented twice: 1. During device enumeration with a call to pm_runtime_forbid. 2. Just before a driver probe logic is called. Because of that in order to enable Runtime PM on a given device we have to call both pm_runtime_allow and pm_runtime_put_noidle, once it's ready to be runtime suspended. The former was missing causing the pm refcount to never reach 0. Fixes: d10b3a695ba0 ("net: wwan: t7xx: Runtime PM") Signed-off-by: Kornel Dulęba Signed-off-by: David S. Miller --- drivers/net/wwan/t7xx/t7xx_pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wwan/t7xx/t7xx_pci.c b/drivers/net/wwan/t7xx/t7xx_pci.c index 871f2a27a398..226fc1703e90 100644 --- a/drivers/net/wwan/t7xx/t7xx_pci.c +++ b/drivers/net/wwan/t7xx/t7xx_pci.c @@ -121,6 +121,8 @@ void t7xx_pci_pm_init_late(struct t7xx_pci_dev *t7xx_dev) iowrite32(T7XX_L1_BIT(0), IREG_BASE(t7xx_dev) + ENABLE_ASPM_LOWPWR); atomic_set(&t7xx_dev->md_pm_state, MTK_PM_RESUMED); + pm_runtime_mark_last_busy(&t7xx_dev->pdev->dev); + pm_runtime_allow(&t7xx_dev->pdev->dev); pm_runtime_put_noidle(&t7xx_dev->pdev->dev); } -- cgit v1.2.3 From a6efc42a86c0c87cfe2f1c3d1f09a4c9b13ba890 Mon Sep 17 00:00:00 2001 From: Andrei Gherzan Date: Thu, 26 Jan 2023 16:55:48 +0000 Subject: selftest: net: Improve IPV6_TCLASS/IPV6_HOPLIMIT tests apparmor compatibility "tcpdump" is used to capture traffic in these tests while using a random, temporary and not suffixed file for it. This can interfere with apparmor configuration where the tool is only allowed to read from files with 'known' extensions. The MINE type application/vnd.tcpdump.pcap was registered with IANA for pcap files and .pcap is the extension that is both most common but also aligned with standard apparmor configurations. See TCPDUMP(8) for more details. This improves compatibility with standard apparmor configurations by using ".pcap" as the file extension for the tests' temporary files. Signed-off-by: Andrei Gherzan Signed-off-by: David S. Miller --- tools/testing/selftests/net/cmsg_ipv6.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ipv6.sh index 2d89cb0ad288..330d0b1ceced 100755 --- a/tools/testing/selftests/net/cmsg_ipv6.sh +++ b/tools/testing/selftests/net/cmsg_ipv6.sh @@ -6,7 +6,7 @@ ksft_skip=4 NS=ns IP6=2001:db8:1::1/64 TGT6=2001:db8:1::2 -TMPF=`mktemp` +TMPF=$(mktemp --suffix ".pcap") cleanup() { -- cgit v1.2.3 From 54aa39a513dbf2164ca462a19f04519b2407a224 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 27 Jan 2023 00:35:39 +0300 Subject: net: stmmac: do not stop RX_CLK in Rx LPI state for qcs404 SoC Currently in phy_init_eee() the driver unconditionally configures the PHY to stop RX_CLK after entering Rx LPI state. This causes an LPI interrupt storm on my qcs404-base board. Change the PHY initialization so that for "qcom,qcs404-ethqos" compatible device RX_CLK continues to run even in Rx LPI state. Signed-off-by: Andrey Konovalov Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 ++- include/linux/stmmac.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 835caa15d55f..732774645c1a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -560,6 +560,8 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->has_gmac4 = 1; plat_dat->pmt = 1; plat_dat->tso_en = of_property_read_bool(np, "snps,tso"); + if (of_device_is_compatible(np, "qcom,qcs404-ethqos")) + plat_dat->rx_clk_runs_in_lpi = 1; ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b7e5af58ab75..1a5b8dab5e9b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1080,7 +1080,8 @@ static void stmmac_mac_link_up(struct phylink_config *config, stmmac_mac_set(priv, priv->ioaddr, true); if (phy && priv->dma_cap.eee) { - priv->eee_active = phy_init_eee(phy, 1) >= 0; + priv->eee_active = + phy_init_eee(phy, !priv->plat->rx_clk_runs_in_lpi) >= 0; priv->eee_enabled = stmmac_eee_init(priv); priv->tx_lpi_enabled = priv->eee_enabled; stmmac_set_eee_pls(priv, priv->hw, true); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 83ca2e8eb6b5..a152678b82b7 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -252,6 +252,7 @@ struct plat_stmmacenet_data { int rss_en; int mac_port_sel_speed; bool en_tx_lpi_clockgating; + bool rx_clk_runs_in_lpi; int has_xgmac; bool vlan_fail_q_en; u8 vlan_fail_q; -- cgit v1.2.3 From 611792920925fb088ddccbe2783c7f92fdfb6b64 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Thu, 26 Jan 2023 18:32:50 -0800 Subject: netrom: Fix use-after-free caused by accept on already connected socket If you call listen() and accept() on an already connect()ed AF_NETROM socket, accept() can successfully connect. This is because when the peer socket sends data to sendmsg, the skb with its own sk stored in the connected socket's sk->sk_receive_queue is connected, and nr_accept() dequeues the skb waiting in the sk->sk_receive_queue. As a result, nr_accept() allocates and returns a sock with the sk of the parent AF_NETROM socket. And here use-after-free can happen through complex race conditions: ``` cpu0 cpu1 1. socket_2 = socket(AF_NETROM) . . listen(socket_2) accepted_socket = accept(socket_2) 2. socket_1 = socket(AF_NETROM) nr_create() // sk refcount : 1 connect(socket_1) 3. write(accepted_socket) nr_sendmsg() nr_output() nr_kick() nr_send_iframe() nr_transmit_buffer() nr_route_frame() nr_loopback_queue() nr_loopback_timer() nr_rx_frame() nr_process_rx_frame(sk, skb); // sk : socket_1's sk nr_state3_machine() nr_queue_rx_frame() sock_queue_rcv_skb() sock_queue_rcv_skb_reason() __sock_queue_rcv_skb() __skb_queue_tail(list, skb); // list : socket_1's sk->sk_receive_queue 4. listen(socket_1) nr_listen() uaf_socket = accept(socket_1) nr_accept() skb_dequeue(&sk->sk_receive_queue); 5. close(accepted_socket) nr_release() nr_write_internal(sk, NR_DISCREQ) nr_transmit_buffer() // NR_DISCREQ nr_route_frame() nr_loopback_queue() nr_loopback_timer() nr_rx_frame() // sk : socket_1's sk nr_process_rx_frame() // NR_STATE_3 nr_state3_machine() // NR_DISCREQ nr_disconnect() nr_sk(sk)->state = NR_STATE_0; 6. close(socket_1) // sk refcount : 3 nr_release() // NR_STATE_0 sock_put(sk); // sk refcount : 0 sk_free(sk); close(uaf_socket) nr_release() sock_hold(sk); // UAF ``` KASAN report by syzbot: ``` BUG: KASAN: use-after-free in nr_release+0x66/0x460 net/netrom/af_netrom.c:520 Write of size 4 at addr ffff8880235d8080 by task syz-executor564/5128 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd1/0x138 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:306 [inline] print_report+0x15e/0x461 mm/kasan/report.c:417 kasan_report+0xbf/0x1f0 mm/kasan/report.c:517 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x141/0x190 mm/kasan/generic.c:189 instrument_atomic_read_write include/linux/instrumented.h:102 [inline] atomic_fetch_add_relaxed include/linux/atomic/atomic-instrumented.h:116 [inline] __refcount_add include/linux/refcount.h:193 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] sock_hold include/net/sock.h:775 [inline] nr_release+0x66/0x460 net/netrom/af_netrom.c:520 __sock_release+0xcd/0x280 net/socket.c:650 sock_close+0x1c/0x20 net/socket.c:1365 __fput+0x27c/0xa90 fs/file_table.c:320 task_work_run+0x16f/0x270 kernel/task_work.c:179 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xaa8/0x2950 kernel/exit.c:867 do_group_exit+0xd4/0x2a0 kernel/exit.c:1012 get_signal+0x21c3/0x2450 kernel/signal.c:2859 arch_do_signal_or_restart+0x79/0x5c0 arch/x86/kernel/signal.c:306 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x15f/0x250 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x1d/0x50 kernel/entry/common.c:296 do_syscall_64+0x46/0xb0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f6c19e3c9b9 Code: Unable to access opcode bytes at 0x7f6c19e3c98f. RSP: 002b:00007fffd4ba2ce8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: 0000000000000116 RBX: 0000000000000003 RCX: 00007f6c19e3c9b9 RDX: 0000000000000318 RSI: 00000000200bd000 RDI: 0000000000000006 RBP: 0000000000000003 R08: 000000000000000d R09: 000000000000000d R10: 0000000000000000 R11: 0000000000000246 R12: 000055555566a2c0 R13: 0000000000000011 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 5128: kasan_save_stack+0x22/0x40 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:371 [inline] ____kasan_kmalloc mm/kasan/common.c:330 [inline] __kasan_kmalloc+0xa3/0xb0 mm/kasan/common.c:380 kasan_kmalloc include/linux/kasan.h:211 [inline] __do_kmalloc_node mm/slab_common.c:968 [inline] __kmalloc+0x5a/0xd0 mm/slab_common.c:981 kmalloc include/linux/slab.h:584 [inline] sk_prot_alloc+0x140/0x290 net/core/sock.c:2038 sk_alloc+0x3a/0x7a0 net/core/sock.c:2091 nr_create+0xb6/0x5f0 net/netrom/af_netrom.c:433 __sock_create+0x359/0x790 net/socket.c:1515 sock_create net/socket.c:1566 [inline] __sys_socket_create net/socket.c:1603 [inline] __sys_socket_create net/socket.c:1588 [inline] __sys_socket+0x133/0x250 net/socket.c:1636 __do_sys_socket net/socket.c:1649 [inline] __se_sys_socket net/socket.c:1647 [inline] __x64_sys_socket+0x73/0xb0 net/socket.c:1647 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 5128: kasan_save_stack+0x22/0x40 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:518 ____kasan_slab_free mm/kasan/common.c:236 [inline] ____kasan_slab_free+0x13b/0x1a0 mm/kasan/common.c:200 kasan_slab_free include/linux/kasan.h:177 [inline] __cache_free mm/slab.c:3394 [inline] __do_kmem_cache_free mm/slab.c:3580 [inline] __kmem_cache_free+0xcd/0x3b0 mm/slab.c:3587 sk_prot_free net/core/sock.c:2074 [inline] __sk_destruct+0x5df/0x750 net/core/sock.c:2166 sk_destruct net/core/sock.c:2181 [inline] __sk_free+0x175/0x460 net/core/sock.c:2192 sk_free+0x7c/0xa0 net/core/sock.c:2203 sock_put include/net/sock.h:1991 [inline] nr_release+0x39e/0x460 net/netrom/af_netrom.c:554 __sock_release+0xcd/0x280 net/socket.c:650 sock_close+0x1c/0x20 net/socket.c:1365 __fput+0x27c/0xa90 fs/file_table.c:320 task_work_run+0x16f/0x270 kernel/task_work.c:179 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xaa8/0x2950 kernel/exit.c:867 do_group_exit+0xd4/0x2a0 kernel/exit.c:1012 get_signal+0x21c3/0x2450 kernel/signal.c:2859 arch_do_signal_or_restart+0x79/0x5c0 arch/x86/kernel/signal.c:306 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x15f/0x250 kernel/entry/common.c:203 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x1d/0x50 kernel/entry/common.c:296 do_syscall_64+0x46/0xb0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x63/0xcd ``` To fix this issue, nr_listen() returns -EINVAL for sockets that successfully nr_connect(). Reported-by: syzbot+caa188bdfc1eeafeb418@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Hyunwoo Kim Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/netrom/af_netrom.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 6f7f4392cffb..5a4cb796150f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -400,6 +400,11 @@ static int nr_listen(struct socket *sock, int backlog) struct sock *sk = sock->sk; lock_sock(sk); + if (sock->state != SS_UNCONNECTED) { + release_sock(sk); + return -EINVAL; + } + if (sk->sk_state != TCP_LISTEN) { memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; -- cgit v1.2.3 From 73a876022273cbc9c1db7869a54444ce8b9d085e Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Fri, 27 Jan 2023 09:14:27 -0800 Subject: net: phy: fix null dereference in phy_attach_direct Commit bc66fa87d4fd ("net: phy: Add link between phy dev and mac dev") introduced a link between net devices and phy devices. It fails to check whether dev is NULL, leading to a NULL dereference error. Fixes: bc66fa87d4fd ("net: phy: Add link between phy dev and mac dev") Signed-off-by: Colin Foster Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 716870a4499c..607aa786c8cb 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1517,7 +1517,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, * another mac interface, so we should create a device link between * phy dev and mac dev. */ - if (phydev->mdio.bus->parent && dev->dev.parent != phydev->mdio.bus->parent) + if (dev && phydev->mdio.bus->parent && dev->dev.parent != phydev->mdio.bus->parent) phydev->devlink = device_link_add(dev->dev.parent, &phydev->mdio.dev, DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); -- cgit v1.2.3 From ffe2a22562444720b05bdfeb999c03e810d84cbb Mon Sep 17 00:00:00 2001 From: Pietro Borrello Date: Sat, 28 Jan 2023 16:29:17 +0000 Subject: net/tls: tls_is_tx_ready() checked list_entry tls_is_tx_ready() checks that list_first_entry() does not return NULL. This condition can never happen. For empty lists, list_first_entry() returns the list_entry() of the head, which is a type confusion. Use list_first_entry_or_null() which returns NULL in case of empty lists. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Signed-off-by: Pietro Borrello Link: https://lore.kernel.org/r/20230128-list-entry-null-check-tls-v1-1-525bbfe6f0d0@diag.uniroma1.it Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9ed978634125..a83d2b4275fa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2427,7 +2427,7 @@ static bool tls_is_tx_ready(struct tls_sw_context_tx *ctx) { struct tls_rec *rec; - rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); + rec = list_first_entry_or_null(&ctx->tx_list, struct tls_rec, list); if (!rec) return false; -- cgit v1.2.3 From f3eceaed9edd7c0e0d9fb057613131f92973626f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Jan 2023 14:38:54 -0800 Subject: net: ethernet: mtk_eth_soc: Avoid truncating allocation There doesn't appear to be a reason to truncate the allocation used for flow_info, so do a full allocation and remove the unused empty struct. GCC does not like having a reference to an object that has been partially allocated, as bounds checking may become impossible when such an object is passed to other code. Seen with GCC 13: ../drivers/net/ethernet/mediatek/mtk_ppe.c: In function 'mtk_foe_entry_commit_subflow': ../drivers/net/ethernet/mediatek/mtk_ppe.c:623:18: warning: array subscript 'struct mtk_flow_entry[0]' is partly outside array bounds of 'unsigned char[48]' [-Warray-bounds=] 623 | flow_info->l2_data.base_flow = entry; | ^~ Cc: Felix Fietkau Cc: John Crispin Cc: Sean Wang Cc: Mark Lee Cc: Lorenzo Bianconi Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Matthias Brugger Cc: netdev@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mediatek@lists.infradead.org Signed-off-by: Kees Cook Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230127223853.never.014-kees@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_ppe.c | 3 +-- drivers/net/ethernet/mediatek/mtk_ppe.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 269208a841c7..1ff024f42444 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -615,8 +615,7 @@ mtk_foe_entry_commit_subflow(struct mtk_ppe *ppe, struct mtk_flow_entry *entry, u32 ib1_mask = mtk_get_ib1_pkt_type_mask(ppe->eth) | MTK_FOE_IB1_UDP; int type; - flow_info = kzalloc(offsetof(struct mtk_flow_entry, l2_data.end), - GFP_ATOMIC); + flow_info = kzalloc(sizeof(*flow_info), GFP_ATOMIC); if (!flow_info) return; diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h index ea64fac1d425..b5e432031340 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.h +++ b/drivers/net/ethernet/mediatek/mtk_ppe.h @@ -279,7 +279,6 @@ struct mtk_flow_entry { struct { struct mtk_flow_entry *base_flow; struct hlist_node list; - struct {} end; } l2_data; }; struct rhash_head node; -- cgit v1.2.3 From de5ca4c3852f896cacac2bf259597aab5e17d9e3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Jan 2023 14:40:37 -0800 Subject: net: sched: sch: Bounds check priority Nothing was explicitly bounds checking the priority index used to access clpriop[]. WARN and bail out early if it's pathological. Seen with GCC 13: ../net/sched/sch_htb.c: In function 'htb_activate_prios': ../net/sched/sch_htb.c:437:44: warning: array subscript [0, 31] is outside array bounds of 'struct htb_prio[8]' [-Warray-bounds=] 437 | if (p->inner.clprio[prio].feed.rb_node) | ~~~~~~~~~~~~~~~^~~~~~ ../net/sched/sch_htb.c:131:41: note: while referencing 'clprio' 131 | struct htb_prio clprio[TC_HTB_NUMPRIO]; | ^~~~~~ Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Reviewed-by: Simon Horman Reviewed-by: Cong Wang Link: https://lore.kernel.org/r/20230127224036.never.561-kees@kernel.org Signed-off-by: Paolo Abeni --- net/sched/sch_htb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index f46643850df8..cc28e41fb745 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -431,7 +431,10 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) while (cl->cmode == HTB_MAY_BORROW && p && mask) { m = mask; while (m) { - int prio = ffz(~m); + unsigned int prio = ffz(~m); + + if (WARN_ON_ONCE(prio > ARRAY_SIZE(p->inner.clprio))) + break; m &= ~(1 << prio); if (p->inner.clprio[prio].feed.rb_node) -- cgit v1.2.3 From 2b272bb558f1d3a5aa95ed8a82253786fd1a48ba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 30 Jan 2023 11:39:29 +0100 Subject: netfilter: br_netfilter: disable sabotage_in hook after first suppression When using a xfrm interface in a bridged setup (the outgoing device is bridged), the incoming packets in the xfrm interface are only tracked in the outgoing direction. $ brctl show bridge name interfaces br_eth1 eth1 $ conntrack -L tcp 115 SYN_SENT src=192... dst=192... [UNREPLIED] ... If br_netfilter is enabled, the first (encrypted) packet is received onR eth1, conntrack hooks are called from br_netfilter emulation which allocates nf_bridge info for this skb. If the packet is for local machine, skb gets passed up the ip stack. The skb passes through ip prerouting a second time. br_netfilter ip_sabotage_in supresses the re-invocation of the hooks. After this, skb gets decrypted in xfrm layer and appears in network stack a second time (after decryption). Then, ip_sabotage_in is called again and suppresses netfilter hook invocation, even though the bridge layer never called them for the plaintext incarnation of the packet. Free the bridge info after the first suppression to avoid this. I was unable to figure out where the regression comes from, as far as i can see br_netfilter always had this problem; i did not expect that skb is looped again with different headers. Fixes: c4b0e771f906 ("netfilter: avoid using skb->nf_bridge directly") Reported-and-tested-by: Wolfgang Nothdurft Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index f20f4373ff40..9554abcfd5b4 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -871,6 +871,7 @@ static unsigned int ip_sabotage_in(void *priv, if (nf_bridge && !nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev) && !netif_is_l3_slave(skb->dev)) { + nf_bridge_info_free(skb); state->okfn(state->net, state->sk, skb); return NF_STOLEN; } -- cgit v1.2.3 From bd0e06f0def75ba26572a94e5350324474a55562 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Jan 2023 02:35:21 +0100 Subject: Revert "netfilter: conntrack: fix bug in for_each_sctp_chunk" There is no bug. If sch->length == 0, this would result in an infinite loop, but first caller, do_basic_checks(), errors out in this case. After this change, packets with bogus zero-length chunks are no longer detected as invalid, so revert & add comment wrt. 0 length check. Fixes: 98ee00774525 ("netfilter: conntrack: fix bug in for_each_sctp_chunk") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_sctp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 945dd40e7077..011d414038ea 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -142,10 +142,11 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) } #endif +/* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ - ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))) && \ - (sch)->length; \ + (offset) < (skb)->len && \ + ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ -- cgit v1.2.3 From 8f35ae17ef565a605de5f409e04bcd49a55d7646 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 30 Jan 2023 11:25:33 -0500 Subject: sctp: do not check hb_timer.expires when resetting hb_timer It tries to avoid the frequently hb_timer refresh in commit ba6f5e33bdbb ("sctp: avoid refreshing heartbeat timer too often"), and it only allows mod_timer when the new expires is after hb_timer.expires. It means even a much shorter interval for hb timer gets applied, it will have to wait until the current hb timer to time out. In sctp_do_8_2_transport_strike(), when a transport enters PF state, it expects to update the hb timer to resend a heartbeat every rto after calling sctp_transport_reset_hb_timer(), which will not work as the change mentioned above. The frequently hb_timer refresh was caused by sctp_transport_reset_timers() called in sctp_outq_flush() and it was already removed in the commit above. So we don't have to check hb_timer.expires when resetting hb_timer as it is now not called very often. Fixes: ba6f5e33bdbb ("sctp: avoid refreshing heartbeat timer too often") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/d958c06985713ec84049a2d5664879802710179a.1675095933.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/transport.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index ca1eba95c293..2f66a2006517 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -196,9 +196,7 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport) /* When a data chunk is sent, reset the heartbeat interval. */ expires = jiffies + sctp_transport_timeout(transport); - if ((time_before(transport->hb_timer.expires, expires) || - !timer_pending(&transport->hb_timer)) && - !mod_timer(&transport->hb_timer, + if (!mod_timer(&transport->hb_timer, expires + get_random_u32_below(transport->rto))) sctp_transport_hold(transport); } -- cgit v1.2.3 From efec2e2a722ed609fc7b64feef720fd601633b73 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 30 Jan 2023 21:30:51 +0200 Subject: net: fman: memac: free mdio device if lynx_pcs_create() fails When memory allocation fails in lynx_pcs_create() and it returns NULL, there remains a dangling reference to the mdiodev returned by of_mdio_find_device() which is leaked as soon as memac_pcs_create() returns empty-handed. Fixes: a7c2a32e7f22 ("net: fman: memac: Use lynx pcs driver") Signed-off-by: Vladimir Oltean Reviewed-by: Sean Anderson Acked-by: Madalin Bucur Link: https://lore.kernel.org/r/20230130193051.563315-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fman/fman_memac.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.c b/drivers/net/ethernet/freescale/fman/fman_memac.c index 9349f841bd06..587ad81a2dc3 100644 --- a/drivers/net/ethernet/freescale/fman/fman_memac.c +++ b/drivers/net/ethernet/freescale/fman/fman_memac.c @@ -1055,6 +1055,9 @@ static struct phylink_pcs *memac_pcs_create(struct device_node *mac_node, return ERR_PTR(-EPROBE_DEFER); pcs = lynx_pcs_create(mdiodev); + if (!pcs) + mdio_device_free(mdiodev); + return pcs; } -- cgit v1.2.3 From 876e8ca8366735a604bac86ff7e2732fc9d85d2d Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Mon, 30 Jan 2023 12:51:48 -0800 Subject: net: fix NULL pointer in skb_segment_list Commit 3a1296a38d0c ("net: Support GRO/GSO fraglist chaining.") introduced UDP listifyed GRO. The segmentation relies on frag_list being untouched when passing through the network stack. This assumption can be broken sometimes, where frag_list itself gets pulled into linear area, leaving frag_list being NULL. When this happens it can trigger following NULL pointer dereference, and panic the kernel. Reverse the test condition should fix it. [19185.577801][ C1] BUG: kernel NULL pointer dereference, address: ... [19185.663775][ C1] RIP: 0010:skb_segment_list+0x1cc/0x390 ... [19185.834644][ C1] Call Trace: [19185.841730][ C1] [19185.848563][ C1] __udp_gso_segment+0x33e/0x510 [19185.857370][ C1] inet_gso_segment+0x15b/0x3e0 [19185.866059][ C1] skb_mac_gso_segment+0x97/0x110 [19185.874939][ C1] __skb_gso_segment+0xb2/0x160 [19185.883646][ C1] udp_queue_rcv_skb+0xc3/0x1d0 [19185.892319][ C1] udp_unicast_rcv_skb+0x75/0x90 [19185.900979][ C1] ip_protocol_deliver_rcu+0xd2/0x200 [19185.910003][ C1] ip_local_deliver_finish+0x44/0x60 [19185.918757][ C1] __netif_receive_skb_one_core+0x8b/0xa0 [19185.927834][ C1] process_backlog+0x88/0x130 [19185.935840][ C1] __napi_poll+0x27/0x150 [19185.943447][ C1] net_rx_action+0x27e/0x5f0 [19185.951331][ C1] ? mlx5_cq_tasklet_cb+0x70/0x160 [mlx5_core] [19185.960848][ C1] __do_softirq+0xbc/0x25d [19185.968607][ C1] irq_exit_rcu+0x83/0xb0 [19185.976247][ C1] common_interrupt+0x43/0xa0 [19185.984235][ C1] asm_common_interrupt+0x22/0x40 ... [19186.094106][ C1] Fixes: 3a1296a38d0c ("net: Support GRO/GSO fraglist chaining.") Suggested-by: Daniel Borkmann Reviewed-by: Willem de Bruijn Signed-off-by: Yan Zhai Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/Y9gt5EUizK1UImEP@debian Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4a0eb5593275..a31ff4d83ecc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4100,7 +4100,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_shinfo(skb)->frag_list = NULL; - do { + while (list_skb) { nskb = list_skb; list_skb = list_skb->next; @@ -4146,8 +4146,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, if (skb_needs_linearize(nskb, features) && __skb_linearize(nskb)) goto err_linearize; - - } while (list_skb); + } skb->truesize = skb->truesize - delta_truesize; skb->data_len = skb->data_len - delta_len; -- cgit v1.2.3 From afc2336f89dc0fc0ef25b92366814524b0fd90fb Mon Sep 17 00:00:00 2001 From: Chris Healy Date: Mon, 30 Jan 2023 15:14:02 -0800 Subject: net: phy: meson-gxl: Add generic dummy stubs for MMD register access The Meson G12A Internal PHY does not support standard IEEE MMD extended register access, therefore add generic dummy stubs to fail the read and write MMD calls. This is necessary to prevent the core PHY code from erroneously believing that EEE is supported by this PHY even though this PHY does not support EEE, as MMD register access returns all FFFFs. Fixes: 5c3407abb338 ("net: phy: meson-gxl: add g12a support") Reviewed-by: Heiner Kallweit Signed-off-by: Chris Healy Reviewed-by: Jerome Brunet Link: https://lore.kernel.org/r/20230130231402.471493-1-cphealy@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/meson-gxl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/meson-gxl.c b/drivers/net/phy/meson-gxl.c index c49062ad72c6..5e41658b1e2f 100644 --- a/drivers/net/phy/meson-gxl.c +++ b/drivers/net/phy/meson-gxl.c @@ -271,6 +271,8 @@ static struct phy_driver meson_gxl_phy[] = { .handle_interrupt = meson_gxl_handle_interrupt, .suspend = genphy_suspend, .resume = genphy_resume, + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, }, }; -- cgit v1.2.3 From 23ca0c2c93406bdb1150659e720bda1cec1fad04 Mon Sep 17 00:00:00 2001 From: Thomas Winter Date: Tue, 31 Jan 2023 16:46:45 +1300 Subject: ip/ip6_gre: Fix changing addr gen mode not generating IPv6 link local address For our point-to-point GRE tunnels, they have IN6_ADDR_GEN_MODE_NONE when they are created then we set IN6_ADDR_GEN_MODE_EUI64 when they come up to generate the IPv6 link local address for the interface. Recently we found that they were no longer generating IPv6 addresses. This issue would also have affected SIT tunnels. Commit e5dd729460ca changed the code path so that GRE tunnels generate an IPv6 address based on the tunnel source address. It also changed the code path so GRE tunnels don't call addrconf_addr_gen in addrconf_dev_config which is called by addrconf_sysctl_addr_gen_mode when the IN6_ADDR_GEN_MODE is changed. This patch aims to fix this issue by moving the code in addrconf_notify which calls the addr gen for GRE and SIT into a separate function and calling it in the places that expect the IPv6 address to be generated. The previous addrconf_dev_config is renamed to addrconf_eth_config since it only expected eth type interfaces and follows the addrconf_gre/sit_config format. A part of this changes means that the loopback address will be attempted to be configured when changing addr_gen_mode for lo. This should not be a problem because the address should exist anyway and if does already exist then no error is produced. Fixes: e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") Signed-off-by: Thomas Winter Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f7a84a4acffc..f4ba68e096ac 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3447,6 +3447,30 @@ static void addrconf_gre_config(struct net_device *dev) } #endif +static void addrconf_init_auto_addrs(struct net_device *dev) +{ + switch (dev->type) { +#if IS_ENABLED(CONFIG_IPV6_SIT) + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; +#endif +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) + case ARPHRD_IP6GRE: + case ARPHRD_IPGRE: + addrconf_gre_config(dev); + break; +#endif + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + } +} + static int fixup_permanent_addr(struct net *net, struct inet6_dev *idev, struct inet6_ifaddr *ifp) @@ -3615,26 +3639,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } - switch (dev->type) { -#if IS_ENABLED(CONFIG_IPV6_SIT) - case ARPHRD_SIT: - addrconf_sit_config(dev); - break; -#endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) - case ARPHRD_IP6GRE: - case ARPHRD_IPGRE: - addrconf_gre_config(dev); - break; -#endif - case ARPHRD_LOOPBACK: - init_loopback(dev); - break; - - default: - addrconf_dev_config(dev); - break; - } + addrconf_init_auto_addrs(dev); if (!IS_ERR_OR_NULL(idev)) { if (run_pending) @@ -6397,7 +6402,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; @@ -6408,7 +6413,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, if (idev && idev->cnf.addr_gen_mode != new_val) { idev->cnf.addr_gen_mode = new_val; - addrconf_dev_config(idev->dev); + addrconf_init_auto_addrs(idev->dev); } } } -- cgit v1.2.3 From 30e2291f61f93f7132c060190f8360df52644ec1 Mon Sep 17 00:00:00 2001 From: Thomas Winter Date: Tue, 31 Jan 2023 16:46:46 +1300 Subject: ip/ip6_gre: Fix non-point-to-point tunnel not generating IPv6 link local address We recently found that our non-point-to-point tunnels were not generating any IPv6 link local address and instead generating an IPv6 compat address, breaking IPv6 communication on the tunnel. Previously, addrconf_gre_config always would call addrconf_addr_gen and generate a EUI64 link local address for the tunnel. Then commit e5dd729460ca changed the code path so that add_v4_addrs is called but this only generates a compat IPv6 address for non-point-to-point tunnels. I assume the compat address is specifically for SIT tunnels so have kept that only for SIT - GRE tunnels now always generate link local addresses. Fixes: e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") Signed-off-by: Thomas Winter Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f4ba68e096ac..faa47f9ea73a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3127,17 +3127,17 @@ static void add_v4_addrs(struct inet6_dev *idev) offset = sizeof(struct in6_addr) - 4; memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); - if (idev->dev->flags&IFF_POINTOPOINT) { + if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { + scope = IPV6_ADDR_COMPATv4; + plen = 96; + pflags |= RTF_NONEXTHOP; + } else { if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE) return; addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; plen = 64; - } else { - scope = IPV6_ADDR_COMPATv4; - plen = 96; - pflags |= RTF_NONEXTHOP; } if (addr.s6_addr32[3]) { -- cgit v1.2.3 From 9c6b9cbafdc010b38f4077c8252654381eb46028 Mon Sep 17 00:00:00 2001 From: Yanguo Li Date: Tue, 31 Jan 2023 09:03:13 +0100 Subject: nfp: flower: avoid taking mutex in atomic context A mutex may sleep, which is not permitted in atomic context. Avoid a case where this may arise by moving the to nfp_flower_lag_get_info_from_netdev() in nfp_tun_write_neigh() spinlock. Fixes: abc210952af7 ("nfp: flower: tunnel neigh support bond offload") Reported-by: Dan Carpenter Signed-off-by: Yanguo Li Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20230131080313.2076060-1-simon.horman@corigine.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index a8678d5612ee..060a77f2265d 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -460,6 +460,7 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, sizeof(struct nfp_tun_neigh_v4); unsigned long cookie = (unsigned long)neigh; struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_neigh_lag lag_info; struct nfp_neigh_entry *nn_entry; u32 port_id; u8 mtype; @@ -468,6 +469,11 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, if (!port_id) return; + if ((port_id & NFP_FL_LAG_OUT) == NFP_FL_LAG_OUT) { + memset(&lag_info, 0, sizeof(struct nfp_tun_neigh_lag)); + nfp_flower_lag_get_info_from_netdev(app, netdev, &lag_info); + } + spin_lock_bh(&priv->predt_lock); nn_entry = rhashtable_lookup_fast(&priv->neigh_table, &cookie, neigh_table_params); @@ -515,7 +521,7 @@ nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, neigh_ha_snapshot(common->dst_addr, neigh, netdev); if ((port_id & NFP_FL_LAG_OUT) == NFP_FL_LAG_OUT) - nfp_flower_lag_get_info_from_netdev(app, netdev, lag); + memcpy(lag, &lag_info, sizeof(struct nfp_tun_neigh_lag)); common->port_id = cpu_to_be32(port_id); if (rhashtable_insert_fast(&priv->neigh_table, -- cgit v1.2.3 From a2df8463e15c10a8a882090f3d7a760fdb7b189d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 31 Jan 2023 13:54:37 -0800 Subject: igc: return an error if the mac type is unknown in igc_ptp_systim_to_hwtstamp() clang static analysis reports drivers/net/ethernet/intel/igc/igc_ptp.c:673:3: warning: The left operand of '+' is a garbage value [core.UndefinedBinaryOperatorResult] ktime_add_ns(shhwtstamps.hwtstamp, adjust); ^ ~~~~~~~~~~~~~~~~~~~~ igc_ptp_systim_to_hwtstamp() silently returns without setting the hwtstamp if the mac type is unknown. This should be treated as an error. Fixes: 81b055205e8b ("igc: Add support for RX timestamping") Signed-off-by: Tom Rix Reviewed-by: Simon Horman Acked-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230131215437.1528994-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_ptp.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index c34734d432e0..4e10ced736db 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -417,10 +417,12 @@ static int igc_ptp_verify_pin(struct ptp_clock_info *ptp, unsigned int pin, * * We need to convert the system time value stored in the RX/TXSTMP registers * into a hwtstamp which can be used by the upper level timestamping functions. + * + * Returns 0 on success. **/ -static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, - struct skb_shared_hwtstamps *hwtstamps, - u64 systim) +static int igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, + struct skb_shared_hwtstamps *hwtstamps, + u64 systim) { switch (adapter->hw.mac.type) { case igc_i225: @@ -430,8 +432,9 @@ static void igc_ptp_systim_to_hwtstamp(struct igc_adapter *adapter, systim & 0xFFFFFFFF); break; default: - break; + return -EINVAL; } + return 0; } /** @@ -652,7 +655,8 @@ static void igc_ptp_tx_hwtstamp(struct igc_adapter *adapter) regval = rd32(IGC_TXSTMPL); regval |= (u64)rd32(IGC_TXSTMPH) << 32; - igc_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval); + if (igc_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval)) + return; switch (adapter->link_speed) { case SPEED_10: -- cgit v1.2.3 From 917d5e04d4dd2bbbf36fc6976ba442e284ccc42d Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Tue, 31 Jan 2023 11:46:59 +0530 Subject: octeontx2-af: Fix devlink unregister Exact match feature is only available in CN10K-B. Unregister exact match devlink entry only for this silicon variant. Fixes: 87e4ea29b030 ("octeontx2-af: Debugsfs support for exact match.") Signed-off-by: Ratheesh Kannoth Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230131061659.1025137-1-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/rvu_devlink.c | 35 +++++++++++++++++----- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index bda1a6fa2ec4..e4407f09c9d3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -1500,6 +1500,9 @@ static const struct devlink_param rvu_af_dl_params[] = { BIT(DEVLINK_PARAM_CMODE_RUNTIME), rvu_af_dl_dwrr_mtu_get, rvu_af_dl_dwrr_mtu_set, rvu_af_dl_dwrr_mtu_validate), +}; + +static const struct devlink_param rvu_af_dl_param_exact_match[] = { DEVLINK_PARAM_DRIVER(RVU_AF_DEVLINK_PARAM_ID_NPC_EXACT_FEATURE_DISABLE, "npc_exact_feature_disable", DEVLINK_PARAM_TYPE_STRING, BIT(DEVLINK_PARAM_CMODE_RUNTIME), @@ -1556,7 +1559,6 @@ int rvu_register_dl(struct rvu *rvu) { struct rvu_devlink *rvu_dl; struct devlink *dl; - size_t size; int err; dl = devlink_alloc(&rvu_devlink_ops, sizeof(struct rvu_devlink), @@ -1578,21 +1580,32 @@ int rvu_register_dl(struct rvu *rvu) goto err_dl_health; } + err = devlink_params_register(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + if (err) { + dev_err(rvu->dev, + "devlink params register failed with error %d", err); + goto err_dl_health; + } + /* Register exact match devlink only for CN10K-B */ - size = ARRAY_SIZE(rvu_af_dl_params); if (!rvu_npc_exact_has_match_table(rvu)) - size -= 1; + goto done; - err = devlink_params_register(dl, rvu_af_dl_params, size); + err = devlink_params_register(dl, rvu_af_dl_param_exact_match, + ARRAY_SIZE(rvu_af_dl_param_exact_match)); if (err) { dev_err(rvu->dev, - "devlink params register failed with error %d", err); - goto err_dl_health; + "devlink exact match params register failed with error %d", err); + goto err_dl_exact_match; } +done: devlink_register(dl); return 0; +err_dl_exact_match: + devlink_params_unregister(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + err_dl_health: rvu_health_reporters_destroy(rvu); devlink_free(dl); @@ -1605,8 +1618,14 @@ void rvu_unregister_dl(struct rvu *rvu) struct devlink *dl = rvu_dl->dl; devlink_unregister(dl); - devlink_params_unregister(dl, rvu_af_dl_params, - ARRAY_SIZE(rvu_af_dl_params)); + + devlink_params_unregister(dl, rvu_af_dl_params, ARRAY_SIZE(rvu_af_dl_params)); + + /* Unregister exact match devlink only for CN10K-B */ + if (rvu_npc_exact_has_match_table(rvu)) + devlink_params_unregister(dl, rvu_af_dl_param_exact_match, + ARRAY_SIZE(rvu_af_dl_param_exact_match)); + rvu_health_reporters_destroy(rvu); devlink_free(dl); } -- cgit v1.2.3 From 99f1c46011cc0feb47d4f4f7bee70a0341442d14 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 30 Jan 2023 19:33:06 -0800 Subject: hv_netvsc: Fix missed pagebuf entries in netvsc_dma_map/unmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit netvsc_dma_map() and netvsc_dma_unmap() currently check the cp_partial flag and adjust the page_count so that pagebuf entries for the RNDIS portion of the message are skipped when it has already been copied into a send buffer. But this adjustment has already been made by code in netvsc_send(). The duplicate adjustment causes some pagebuf entries to not be mapped. In a normal VM, this doesn't break anything because the mapping doesn’t change the PFN. But in a Confidential VM, dma_map_single() does bounce buffering and provides a different PFN. Failing to do the mapping causes the wrong PFN to be passed to Hyper-V, and various errors ensue. Fix this by removing the duplicate adjustment in netvsc_dma_map() and netvsc_dma_unmap(). Fixes: 846da38de0e8 ("net: netvsc: Add Isolation VM support for netvsc driver") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Reviewed-by: Haiyang Zhang Link: https://lore.kernel.org/r/1675135986-254490-1-git-send-email-mikelley@microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 9352dad58996..e02d1e3ef672 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -987,9 +987,6 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, void netvsc_dma_unmap(struct hv_device *hv_dev, struct hv_netvsc_packet *packet) { - u32 page_count = packet->cp_partial ? - packet->page_buf_cnt - packet->rmsg_pgcnt : - packet->page_buf_cnt; int i; if (!hv_is_isolation_supported()) @@ -998,7 +995,7 @@ void netvsc_dma_unmap(struct hv_device *hv_dev, if (!packet->dma_range) return; - for (i = 0; i < page_count; i++) + for (i = 0; i < packet->page_buf_cnt; i++) dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma, packet->dma_range[i].mapping_size, DMA_TO_DEVICE); @@ -1028,9 +1025,7 @@ static int netvsc_dma_map(struct hv_device *hv_dev, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 page_count = packet->cp_partial ? - packet->page_buf_cnt - packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->page_buf_cnt; dma_addr_t dma; int i; -- cgit v1.2.3 From d0553680f94c49bbe0e39eb50d033ba563b4212d Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 6 Sep 2021 17:42:00 +0800 Subject: can: j1939: fix errant WARN_ON_ONCE in j1939_session_deactivate The conclusion "j1939_session_deactivate() should be called with a session ref-count of at least 2" is incorrect. In some concurrent scenarios, j1939_session_deactivate can be called with the session ref-count less than 2. But there is not any problem because it will check the session active state before session putting in j1939_session_deactivate_locked(). Here is the concurrent scenario of the problem reported by syzbot and my reproduction log. cpu0 cpu1 j1939_xtp_rx_eoma j1939_xtp_rx_abort_one j1939_session_get_by_addr [kref == 2] j1939_session_get_by_addr [kref == 3] j1939_session_deactivate [kref == 2] j1939_session_put [kref == 1] j1939_session_completed j1939_session_deactivate WARN_ON_ONCE(kref < 2) ===================================================== WARNING: CPU: 1 PID: 21 at net/can/j1939/transport.c:1088 j1939_session_deactivate+0x5f/0x70 CPU: 1 PID: 21 Comm: ksoftirqd/1 Not tainted 5.14.0-rc7+ #32 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 RIP: 0010:j1939_session_deactivate+0x5f/0x70 Call Trace: j1939_session_deactivate_activate_next+0x11/0x28 j1939_xtp_rx_eoma+0x12a/0x180 j1939_tp_recv+0x4a2/0x510 j1939_can_recv+0x226/0x380 can_rcv_filter+0xf8/0x220 can_receive+0x102/0x220 ? process_backlog+0xf0/0x2c0 can_rcv+0x53/0xf0 __netif_receive_skb_one_core+0x67/0x90 ? process_backlog+0x97/0x2c0 __netif_receive_skb+0x22/0x80 Fixes: 0c71437dd50d ("can: j1939: j1939_session_deactivate(): clarify lifetime of session object") Reported-by: syzbot+9981a614060dcee6eeca@syzkaller.appspotmail.com Signed-off-by: Ziyang Xuan Acked-by: Oleksij Rempel Link: https://lore.kernel.org/all/20210906094200.95868-1-william.xuanziyang@huawei.com Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 5c722b55fe23..fce9b9ebf13f 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1092,10 +1092,6 @@ static bool j1939_session_deactivate(struct j1939_session *session) bool active; j1939_session_list_lock(priv); - /* This function should be called with a session ref-count of at - * least 2. - */ - WARN_ON_ONCE(kref_read(&session->kref) < 2); active = j1939_session_deactivate_locked(session); j1939_session_list_unlock(priv); -- cgit v1.2.3 From 3793301cbaa4a62d83e21f685307da7671f812ab Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 31 Jan 2023 11:56:13 +0100 Subject: can: raw: fix CAN FD frame transmissions over CAN XL devices A CAN XL device is always capable to process CAN FD frames. The former check when sending CAN FD frames relied on the existence of a CAN FD device and did not check for a CAN XL device that would be correct too. With this patch the CAN FD feature is enabled automatically when CAN XL is switched on - and CAN FD cannot be switch off while CAN XL is enabled. This precondition also leads to a clean up and reduction of checks in the hot path in raw_rcv() and raw_sendmsg(). Some conditions are reordered to handle simple checks first. changes since v1: https://lore.kernel.org/all/20230131091012.50553-1-socketcan@hartkopp.net - fixed typo: devive -> device changes since v2: https://lore.kernel.org/all/20230131091824.51026-1-socketcan@hartkopp.net/ - reorder checks in if statements to handle simple checks first Fixes: 626332696d75 ("can: raw: add CAN XL support") Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20230131105613.55228-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/net/can/raw.c b/net/can/raw.c index 81071cdb0301..ba86782ba8bb 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -132,8 +132,8 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; /* make sure to not pass oversized frames to the socket */ - if ((can_is_canfd_skb(oskb) && !ro->fd_frames && !ro->xl_frames) || - (can_is_canxl_skb(oskb) && !ro->xl_frames)) + if ((!ro->fd_frames && can_is_canfd_skb(oskb)) || + (!ro->xl_frames && can_is_canxl_skb(oskb))) return; /* eliminate multiple filter matches for the same skb */ @@ -670,6 +670,11 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (copy_from_sockptr(&ro->fd_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames && !ro->fd_frames) { + ro->fd_frames = ro->xl_frames; + return -EINVAL; + } break; case CAN_RAW_XL_FRAMES: @@ -679,6 +684,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) return -EFAULT; + /* Enabling CAN XL includes CAN FD */ + if (ro->xl_frames) + ro->fd_frames = ro->xl_frames; break; case CAN_RAW_JOIN_FILTERS: @@ -786,6 +794,25 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return 0; } +static bool raw_bad_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +{ + /* Classical CAN -> no checks for flags and device capabilities */ + if (can_is_can_skb(skb)) + return false; + + /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ + if (ro->fd_frames && can_is_canfd_skb(skb) && + (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) + return false; + + /* CAN XL -> needs to be enabled and a CAN XL device */ + if (ro->xl_frames && can_is_canxl_skb(skb) && + can_is_canxl_dev_mtu(mtu)) + return false; + + return true; +} + static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; @@ -833,20 +860,8 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto free_skb; err = -EINVAL; - if (ro->xl_frames && can_is_canxl_dev_mtu(dev->mtu)) { - /* CAN XL, CAN FD and Classical CAN */ - if (!can_is_canxl_skb(skb) && !can_is_canfd_skb(skb) && - !can_is_can_skb(skb)) - goto free_skb; - } else if (ro->fd_frames && dev->mtu == CANFD_MTU) { - /* CAN FD and Classical CAN */ - if (!can_is_canfd_skb(skb) && !can_is_can_skb(skb)) - goto free_skb; - } else { - /* Classical CAN */ - if (!can_is_can_skb(skb)) - goto free_skb; - } + if (raw_bad_txframe(ro, skb, dev->mtu)) + goto free_skb; sockcm_init(&sockc, sk); if (msg->msg_controllen) { -- cgit v1.2.3 From 823b2e42720f96f277940c37ea438b7c5ead51a4 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 12 Jan 2023 20:23:47 +0100 Subject: can: isotp: handle wait_event_interruptible() return values When wait_event_interruptible() has been interrupted by a signal the tx.state value might not be ISOTP_IDLE. Force the state machines into idle state to inhibit the timer handlers to continue working. Fixes: 866337865f37 ("can: isotp: fix tx state handling for echo tx processing") Cc: stable@vger.kernel.org Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20230112192347.1944-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/can/isotp.c b/net/can/isotp.c index 608f8c24ae46..dae421f6c901 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1162,6 +1162,10 @@ static int isotp_release(struct socket *sock) /* wait for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + /* force state machines to be idle also when a signal occurred */ + so->tx.state = ISOTP_IDLE; + so->rx.state = ISOTP_IDLE; + spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); -- cgit v1.2.3 From 4f027cba8216f42a18b544842efab134f8b1f9f4 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 4 Jan 2023 15:57:01 +0100 Subject: can: isotp: split tx timer into transmission and timeout The timer for the transmission of isotp PDUs formerly had two functions: 1. send two consecutive frames with a given time gap 2. monitor the timeouts for flow control frames and the echo frames This led to larger txstate checks and potentially to a problem discovered by syzbot which enabled the panic_on_warn feature while testing. The former 'txtimer' function is split into 'txfrtimer' and 'txtimer' to handle the two above functionalities with separate timer callbacks. The two simplified timers now run in one-shot mode and make the state transitions (especially with isotp_rcv_echo) better understandable. Fixes: 866337865f37 ("can: isotp: fix tx state handling for echo tx processing") Reported-by: syzbot+5aed6c3aaba661f5b917@syzkaller.appspotmail.com Cc: stable@vger.kernel.org # >= v6.0 Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20230104145701.2422-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 65 +++++++++++++++++++++++++-------------------------------- 1 file changed, 29 insertions(+), 36 deletions(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index dae421f6c901..fc81d77724a1 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -140,7 +140,7 @@ struct isotp_sock { canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; - struct hrtimer rxtimer, txtimer; + struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; @@ -871,7 +871,7 @@ static void isotp_rcv_echo(struct sk_buff *skb, void *data) } /* start timer to send next consecutive frame with correct delay */ - hrtimer_start(&so->txtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) @@ -879,49 +879,39 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; - enum hrtimer_restart restart = HRTIMER_NORESTART; - switch (so->tx.state) { - case ISOTP_SENDING: + /* don't handle timeouts in IDLE state */ + if (so->tx.state == ISOTP_IDLE) + return HRTIMER_NORESTART; - /* cfecho should be consumed by isotp_rcv_echo() here */ - if (!so->cfecho) { - /* start timeout for unlikely lost echo skb */ - hrtimer_set_expires(&so->txtimer, - ktime_add(ktime_get(), - ktime_set(ISOTP_ECHO_TIMEOUT, 0))); - restart = HRTIMER_RESTART; + /* we did not get any flow control or echo frame in time */ - /* push out the next consecutive frame */ - isotp_send_cframe(so); - break; - } - - /* cfecho has not been cleared in isotp_rcv_echo() */ - pr_notice_once("can-isotp: cfecho %08X timeout\n", so->cfecho); - fallthrough; + /* report 'communication error on send' */ + sk->sk_err = ECOMM; + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); - case ISOTP_WAIT_FC: - case ISOTP_WAIT_FIRST_FC: + /* reset tx state */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); - /* we did not get any flow control frame in time */ + return HRTIMER_NORESTART; +} - /* report 'communication error on send' */ - sk->sk_err = ECOMM; - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); +static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + txfrtimer); - /* reset tx state */ - so->tx.state = ISOTP_IDLE; - wake_up_interruptible(&so->wait); - break; + /* start echo timeout handling and cover below protocol error */ + hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), + HRTIMER_MODE_REL_SOFT); - default: - WARN_ONCE(1, "can-isotp: tx timer state %08X cfecho %08X\n", - so->tx.state, so->cfecho); - } + /* cfecho should be consumed by isotp_rcv_echo() here */ + if (so->tx.state == ISOTP_SENDING && !so->cfecho) + isotp_send_cframe(so); - return restart; + return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -1198,6 +1188,7 @@ static int isotp_release(struct socket *sock) } } + hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); @@ -1601,6 +1592,8 @@ static int isotp_init(struct sock *sk) so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; + hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); -- cgit v1.2.3 From 1613fff7a32e1d9e2ac09db73feba0e71a188445 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 23 Jan 2023 09:03:42 +0100 Subject: can: mcp251xfd: mcp251xfd_ring_set_ringparam(): assign missing tx_obj_num_coalesce_irq If the a new ring layout is set, the max coalesced frames for RX and TX are re-calculated, too. Add the missing assignment of the newly calculated TX max coalesced frames. Fixes: 656fc12ddaf8 ("can: mcp251xfd: add TX IRQ coalescing ethtool support") Link: https://lore.kernel.org/all/20230130154334.1578518-1-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c index 3585f02575df..57eeb066a945 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ethtool.c @@ -48,6 +48,7 @@ mcp251xfd_ring_set_ringparam(struct net_device *ndev, priv->rx_obj_num = layout.cur_rx; priv->rx_obj_num_coalesce_irq = layout.rx_coalesce; priv->tx->obj_num = layout.cur_tx; + priv->tx_obj_num_coalesce_irq = layout.tx_coalesce; return 0; } -- cgit v1.2.3 From c03c80e3a03ffb4f790901d60797e9810539d946 Mon Sep 17 00:00:00 2001 From: Andrei Gherzan Date: Wed, 1 Feb 2023 00:16:10 +0000 Subject: selftests: net: udpgso_bench_rx: Fix 'used uninitialized' compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change fixes the following compiler warning: /usr/include/x86_64-linux-gnu/bits/error.h:40:5: warning: ‘gso_size’ may be used uninitialized [-Wmaybe-uninitialized] 40 | __error_noreturn (__status, __errnum, __format, __va_arg_pack ()); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ udpgso_bench_rx.c: In function ‘main’: udpgso_bench_rx.c:253:23: note: ‘gso_size’ was declared here 253 | int ret, len, gso_size, budget = 256; Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO") Signed-off-by: Andrei Gherzan Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230201001612.515730-1-andrei.gherzan@canonical.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/udpgso_bench_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index 6a193425c367..d0895bd1933f 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -250,7 +250,7 @@ static int recv_msg(int fd, char *buf, int len, int *gso_size) static void do_flush_udp(int fd) { static char rbuf[ETH_MAX_MTU]; - int ret, len, gso_size, budget = 256; + int ret, len, gso_size = 0, budget = 256; len = cfg_read_all ? sizeof(rbuf) : 0; while (budget--) { -- cgit v1.2.3 From db9b47ee9f5f375ab0c5daeb20321c75b4fa657d Mon Sep 17 00:00:00 2001 From: Andrei Gherzan Date: Wed, 1 Feb 2023 00:16:12 +0000 Subject: selftests: net: udpgso_bench_rx/tx: Stop when wrong CLI args are provided Leaving unrecognized arguments buried in the output, can easily hide a CLI/script typo. Avoid this by exiting when wrong arguments are provided to the udpgso_bench test programs. Fixes: 3a687bef148d ("selftests: udp gso benchmark") Signed-off-by: Andrei Gherzan Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230201001612.515730-2-andrei.gherzan@canonical.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/udpgso_bench_rx.c | 2 ++ tools/testing/selftests/net/udpgso_bench_tx.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index d0895bd1933f..4058c7451e70 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -336,6 +336,8 @@ static void parse_opts(int argc, char **argv) cfg_verify = true; cfg_read_all = true; break; + default: + exit(1); } } diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c index f1fdaa270291..b47b5c32039f 100644 --- a/tools/testing/selftests/net/udpgso_bench_tx.c +++ b/tools/testing/selftests/net/udpgso_bench_tx.c @@ -490,6 +490,8 @@ static void parse_opts(int argc, char **argv) case 'z': cfg_zerocopy = true; break; + default: + exit(1); } } -- cgit v1.2.3 From dafe93b9ee21028d625dce347118b82659652eff Mon Sep 17 00:00:00 2001 From: Andrei Gherzan Date: Wed, 1 Feb 2023 00:16:14 +0000 Subject: selftests: net: udpgso_bench: Fix racing bug between the rx/tx programs "udpgro_bench.sh" invokes udpgso_bench_rx/udpgso_bench_tx programs subsequently and while doing so, there is a chance that the rx one is not ready to accept socket connections. This racing bug could fail the test with at least one of the following: ./udpgso_bench_tx: connect: Connection refused ./udpgso_bench_tx: sendmsg: Connection refused ./udpgso_bench_tx: write: Connection refused This change addresses this by making udpgro_bench.sh wait for the rx program to be ready before firing off the tx one - up to a 10s timeout. Fixes: 3a687bef148d ("selftests: udp gso benchmark") Signed-off-by: Andrei Gherzan Cc: Paolo Abeni Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230201001612.515730-3-andrei.gherzan@canonical.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/udpgso_bench.sh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh index dc932fd65363..640bc43452fa 100755 --- a/tools/testing/selftests/net/udpgso_bench.sh +++ b/tools/testing/selftests/net/udpgso_bench.sh @@ -7,6 +7,7 @@ readonly GREEN='\033[0;92m' readonly YELLOW='\033[0;33m' readonly RED='\033[0;31m' readonly NC='\033[0m' # No Color +readonly TESTPORT=8000 readonly KSFT_PASS=0 readonly KSFT_FAIL=1 @@ -56,11 +57,26 @@ trap wake_children EXIT run_one() { local -r args=$@ + local nr_socks=0 + local i=0 + local -r timeout=10 + + ./udpgso_bench_rx -p "$TESTPORT" & + ./udpgso_bench_rx -p "$TESTPORT" -t & + + # Wait for the above test program to get ready to receive connections. + while [ "$i" -lt "$timeout" ]; do + nr_socks="$(ss -lnHi | grep -c "\*:${TESTPORT}")" + [ "$nr_socks" -eq 2 ] && break + i=$((i + 1)) + sleep 1 + done + if [ "$nr_socks" -ne 2 ]; then + echo "timed out while waiting for udpgso_bench_rx" + exit 1 + fi - ./udpgso_bench_rx & - ./udpgso_bench_rx -t & - - ./udpgso_bench_tx ${args} + ./udpgso_bench_tx -p "$TESTPORT" ${args} } run_in_netns() { -- cgit v1.2.3 From 329c9cd769c2e306957df031efff656c40922c76 Mon Sep 17 00:00:00 2001 From: Andrei Gherzan Date: Wed, 1 Feb 2023 00:16:16 +0000 Subject: selftests: net: udpgso_bench_tx: Cater for pending datagrams zerocopy benchmarking The test tool can check that the zerocopy number of completions value is valid taking into consideration the number of datagram send calls. This can catch the system into a state where the datagrams are still in the system (for example in a qdisk, waiting for the network interface to return a completion notification, etc). This change adds a retry logic of computing the number of completions up to a configurable (via CLI) timeout (default: 2 seconds). Fixes: 79ebc3c26010 ("net/udpgso_bench_tx: options to exercise TX CMSG") Signed-off-by: Andrei Gherzan Cc: Willem de Bruijn Cc: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230201001612.515730-4-andrei.gherzan@canonical.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/udpgso_bench_tx.c | 34 +++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c index b47b5c32039f..477392715a9a 100644 --- a/tools/testing/selftests/net/udpgso_bench_tx.c +++ b/tools/testing/selftests/net/udpgso_bench_tx.c @@ -62,6 +62,7 @@ static int cfg_payload_len = (1472 * 42); static int cfg_port = 8000; static int cfg_runtime_ms = -1; static bool cfg_poll; +static int cfg_poll_loop_timeout_ms = 2000; static bool cfg_segment; static bool cfg_sendmmsg; static bool cfg_tcp; @@ -235,16 +236,17 @@ static void flush_errqueue_recv(int fd) } } -static void flush_errqueue(int fd, const bool do_poll) +static void flush_errqueue(int fd, const bool do_poll, + unsigned long poll_timeout, const bool poll_err) { if (do_poll) { struct pollfd fds = {0}; int ret; fds.fd = fd; - ret = poll(&fds, 1, 500); + ret = poll(&fds, 1, poll_timeout); if (ret == 0) { - if (cfg_verbose) + if ((cfg_verbose) && (poll_err)) fprintf(stderr, "poll timeout\n"); } else if (ret < 0) { error(1, errno, "poll"); @@ -254,6 +256,20 @@ static void flush_errqueue(int fd, const bool do_poll) flush_errqueue_recv(fd); } +static void flush_errqueue_retry(int fd, unsigned long num_sends) +{ + unsigned long tnow, tstop; + bool first_try = true; + + tnow = gettimeofday_ms(); + tstop = tnow + cfg_poll_loop_timeout_ms; + do { + flush_errqueue(fd, true, tstop - tnow, first_try); + first_try = false; + tnow = gettimeofday_ms(); + } while ((stat_zcopies != num_sends) && (tnow < tstop)); +} + static int send_tcp(int fd, char *data) { int ret, done = 0, count = 0; @@ -413,7 +429,8 @@ static int send_udp_segment(int fd, char *data) static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-46acmHPtTuvz] [-C cpu] [-D dst ip] [-l secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]", + error(1, 0, "Usage: %s [-46acmHPtTuvz] [-C cpu] [-D dst ip] [-l secs] " + "[-L secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]", filepath); } @@ -423,7 +440,7 @@ static void parse_opts(int argc, char **argv) int max_len, hdrlen; int c; - while ((c = getopt(argc, argv, "46acC:D:Hl:mM:p:s:PS:tTuvz")) != -1) { + while ((c = getopt(argc, argv, "46acC:D:Hl:L:mM:p:s:PS:tTuvz")) != -1) { switch (c) { case '4': if (cfg_family != PF_UNSPEC) @@ -452,6 +469,9 @@ static void parse_opts(int argc, char **argv) case 'l': cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000; break; + case 'L': + cfg_poll_loop_timeout_ms = strtoul(optarg, NULL, 10) * 1000; + break; case 'm': cfg_sendmmsg = true; break; @@ -679,7 +699,7 @@ int main(int argc, char **argv) num_sends += send_udp(fd, buf[i]); num_msgs++; if ((cfg_zerocopy && ((num_msgs & 0xF) == 0)) || cfg_tx_tstamp) - flush_errqueue(fd, cfg_poll); + flush_errqueue(fd, cfg_poll, 500, true); if (cfg_msg_nr && num_msgs >= cfg_msg_nr) break; @@ -698,7 +718,7 @@ int main(int argc, char **argv) } while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop)); if (cfg_zerocopy || cfg_tx_tstamp) - flush_errqueue(fd, true); + flush_errqueue_retry(fd, num_sends); if (close(fd)) error(1, errno, "close"); -- cgit v1.2.3 From 63b114042d8a9c02d9939889177c36dbdb17a588 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 2 Feb 2023 18:35:16 +0200 Subject: virtio-net: Keep stop() to follow mirror sequence of open() Cited commit in fixes tag frees rxq xdp info while RQ NAPI is still enabled and packet processing may be ongoing. Follow the mirror sequence of open() in the stop() callback. This ensures that when rxq info is unregistered, no rx packet processing is ongoing. Fixes: 754b8a21a96d ("virtio_net: setup xdp_rxq_info") Acked-by: Michael S. Tsirkin Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20230202163516.12559-1-parav@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 6df14dd5bf46..61e33e4dd0cd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2158,8 +2158,8 @@ static int virtnet_close(struct net_device *dev) cancel_delayed_work_sync(&vi->refill); for (i = 0; i < vi->max_queue_pairs; i++) { - xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); napi_disable(&vi->rq[i].napi); + xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); virtnet_napi_tx_disable(&vi->sq[i].napi); } -- cgit v1.2.3 From a1f47752fd6275b2502fb075945022d6cf264855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Sat, 28 Jan 2023 12:42:32 +0300 Subject: net: ethernet: mtk_eth_soc: disable hardware DSA untagging for second MAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to my tests on MT7621AT and MT7623NI SoCs, hardware DSA untagging won't work on the second MAC. Therefore, disable this feature when the second MAC of the MT7621 and MT7623 SoCs is being used. Fixes: 2d7605a72906 ("net: ethernet: mtk_eth_soc: enable hardware DSA untagging") Link: https://lore.kernel.org/netdev/6249fc14-b38a-c770-36b4-5af6d41c21d3@arinc9.com/ Tested-by: Arınç ÜNAL Signed-off-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20230128094232.2451947-1-arinc.unal@arinc9.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index e3de9a53b2d9..d56eda6397a4 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -3177,7 +3177,8 @@ static int mtk_open(struct net_device *dev) struct mtk_eth *eth = mac->hw; int i, err; - if (mtk_uses_dsa(dev) && !eth->prog) { + if ((mtk_uses_dsa(dev) && !eth->prog) && + !(mac->id == 1 && MTK_HAS_CAPS(eth->soc->caps, MTK_GMAC1_TRGMII))) { for (i = 0; i < ARRAY_SIZE(eth->dsa_meta); i++) { struct metadata_dst *md_dst = eth->dsa_meta[i]; @@ -3194,7 +3195,8 @@ static int mtk_open(struct net_device *dev) } } else { /* Hardware special tag parsing needs to be disabled if at least - * one MAC does not use DSA. + * one MAC does not use DSA, or the second MAC of the MT7621 and + * MT7623 SoCs is being used. */ u32 val = mtk_r32(eth, MTK_CDMP_IG_CTRL); val &= ~MTK_CDMP_STAG_EN; -- cgit v1.2.3 From 0c598aed445eb45b0ee7ba405f7ece99ee349c30 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 2 Feb 2023 00:02:18 +0300 Subject: net: openvswitch: fix flow memory leak in ovs_flow_cmd_new Syzkaller reports a memory leak of new_flow in ovs_flow_cmd_new() as it is not freed when an allocation of a key fails. BUG: memory leak unreferenced object 0xffff888116668000 (size 632): comm "syz-executor231", pid 1090, jiffies 4294844701 (age 18.871s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000defa3494>] kmem_cache_zalloc include/linux/slab.h:654 [inline] [<00000000defa3494>] ovs_flow_alloc+0x19/0x180 net/openvswitch/flow_table.c:77 [<00000000c67d8873>] ovs_flow_cmd_new+0x1de/0xd40 net/openvswitch/datapath.c:957 [<0000000010a539a8>] genl_family_rcv_msg_doit+0x22d/0x330 net/netlink/genetlink.c:739 [<00000000dff3302d>] genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] [<00000000dff3302d>] genl_rcv_msg+0x328/0x590 net/netlink/genetlink.c:800 [<000000000286dd87>] netlink_rcv_skb+0x153/0x430 net/netlink/af_netlink.c:2515 [<0000000061fed410>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:811 [<000000009dc0f111>] netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] [<000000009dc0f111>] netlink_unicast+0x545/0x7f0 net/netlink/af_netlink.c:1339 [<000000004a5ee816>] netlink_sendmsg+0x8e7/0xde0 net/netlink/af_netlink.c:1934 [<00000000482b476f>] sock_sendmsg_nosec net/socket.c:651 [inline] [<00000000482b476f>] sock_sendmsg+0x152/0x190 net/socket.c:671 [<00000000698574ba>] ____sys_sendmsg+0x70a/0x870 net/socket.c:2356 [<00000000d28d9e11>] ___sys_sendmsg+0xf3/0x170 net/socket.c:2410 [<0000000083ba9120>] __sys_sendmsg+0xe5/0x1b0 net/socket.c:2439 [<00000000c00628f8>] do_syscall_64+0x30/0x40 arch/x86/entry/common.c:46 [<000000004abfdcf4>] entry_SYSCALL_64_after_hwframe+0x61/0xc6 To fix this the patch rearranges the goto labels to reflect the order of object allocations and adds appropriate goto statements on the error paths. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 68bb10101e6b ("openvswitch: Fix flow lookup to use unmasked key") Signed-off-by: Fedor Pchelkin Signed-off-by: Alexey Khoroshilov Acked-by: Eelco Chaudron Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230201210218.361970-1-pchelkin@ispras.ru Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a71795355aec..fcee6012293b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1004,14 +1004,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) { error = -ENOMEM; - goto err_kfree_key; + goto err_kfree_flow; } ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) - goto err_kfree_flow; + goto err_kfree_key; ovs_flow_mask_key(&new_flow->key, key, true, &mask); @@ -1019,14 +1019,14 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], key, log); if (error) - goto err_kfree_flow; + goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); - goto err_kfree_flow; + goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, @@ -1126,10 +1126,10 @@ err_unlock_ovs: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); -err_kfree_flow: - ovs_flow_free(new_flow, false); err_kfree_key: kfree(key); +err_kfree_flow: + ovs_flow_free(new_flow, false); error: return error; } -- cgit v1.2.3 From 57b24f8c30a0bdbe851f4a181d9c9c29f077e0a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Feb 2023 10:20:11 -0800 Subject: MAINTAINERS: bonding: move Veaceslav Falico to CREDITS Veaceslav has stepped away from netdev: Subsystem BONDING DRIVER Changes 96 / 319 (30%) Last activity: 2022-12-01 Jay Vosburgh : Author 4f5d33f4f798 2022-08-11 00:00:00 3 Tags e5214f363dab 2022-12-01 00:00:00 48 Veaceslav Falico : Andy Gospodarek : Tags 47f706262f1d 2019-02-24 00:00:00 4 Top reviewers: [42]: jay.vosburgh@canonical.com [18]: jiri@nvidia.com [10]: jtoppins@redhat.com INACTIVE MAINTAINER Veaceslav Falico Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index acac06b6563e..a440474a7206 100644 --- a/CREDITS +++ b/CREDITS @@ -1173,6 +1173,10 @@ D: Future Domain TMC-16x0 SCSI driver (author) D: APM driver (early port) D: DRM drivers (author of several) +N: Veaceslav Falico +E: vfalico@gmail.com +D: Co-maintainer and co-author of the network bonding driver. + N: János Farkas E: chexum@shadow.banki.hu D: romfs, various (mostly networking) fixes diff --git a/MAINTAINERS b/MAINTAINERS index 7c884556eb8e..67b9f0c585a7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3766,7 +3766,6 @@ F: net/bluetooth/ BONDING DRIVER M: Jay Vosburgh -M: Veaceslav Falico M: Andy Gospodarek L: netdev@vger.kernel.org S: Supported -- cgit v1.2.3 From a35965625649b5b65153d51aed466c4b3008ce2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Feb 2023 10:20:12 -0800 Subject: mailmap: add John Crispin's entry John has not been CCed on some of the fixes which perhaps resulted in the lack of review tags: Subsystem MEDIATEK ETHERNET DRIVER Changes 50 / 295 (16%) Last activity: 2023-01-17 Felix Fietkau : Author 8bd8dcc5e47f 2022-11-18 00:00:00 33 Tags 8bd8dcc5e47f 2022-11-18 00:00:00 38 John Crispin : Sean Wang : Author 880c2d4b2fdf 2019-06-03 00:00:00 7 Tags a5d75538295b 2020-04-07 00:00:00 10 Mark Lee : Author 8d66a8183d0c 2019-11-14 00:00:00 4 Tags 8d66a8183d0c 2019-11-14 00:00:00 4 Lorenzo Bianconi : Author 08a764a7c51b 2023-01-17 00:00:00 68 Tags 08a764a7c51b 2023-01-17 00:00:00 74 Top reviewers: [12]: leonro@nvidia.com [6]: f.fainelli@gmail.com [6]: andrew@lunn.ch INACTIVE MAINTAINER John Crispin map his old address to the up to date one. Acked-by: John Crispin Signed-off-by: Jakub Kicinski --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 8deff4cec169..ac3decd2c756 100644 --- a/.mailmap +++ b/.mailmap @@ -214,6 +214,7 @@ Jisheng Zhang Jisheng Zhang Johan Hovold Johan Hovold +John Crispin John Paul Adrian Glaubitz John Stultz Jordan Crouse -- cgit v1.2.3 From c71a70c267eb40455489c0df10ca986e3969b007 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Feb 2023 10:20:13 -0800 Subject: MAINTAINERS: ipv6: retire Hideaki Yoshifuji We very rarely hear from Hideaki Yoshifuji and the IPv4/IPv6 entry covers a lot of code. Asking people to CC someone who rarely responds feels wrong. Note that Hideaki Yoshifuji already has an entry in CREDITS for IPv6 so not adding another one. Cc: Hideaki YOSHIFUJI Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 67b9f0c585a7..f11d5386d1ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14598,7 +14598,6 @@ F: tools/testing/selftests/net/ipsec.c NETWORKING [IPv4/IPv6] M: "David S. Miller" -M: Hideaki YOSHIFUJI M: David Ahern L: netdev@vger.kernel.org S: Maintained -- cgit v1.2.3 From cd101f40a419f72d54b517bc51ca710bc5b07d55 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 1 Feb 2023 10:20:14 -0800 Subject: MAINTAINERS: update SCTP maintainers Vlad has stepped away from SCTP related duties. Move him to CREDITS and add Xin Long. Subsystem SCTP PROTOCOL Changes 237 / 629 (37%) Last activity: 2022-12-12 Vlad Yasevich : Neil Horman : Author 20a785aa52c8 2020-05-19 00:00:00 4 Tags 20a785aa52c8 2020-05-19 00:00:00 84 Marcelo Ricardo Leitner : Author 557fb5862c92 2021-07-28 00:00:00 41 Tags da05cecc4939 2022-12-12 00:00:00 197 Top reviewers: [15]: lucien.xin@gmail.com INACTIVE MAINTAINER Vlad Yasevich Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- CREDITS | 4 ++++ MAINTAINERS | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CREDITS b/CREDITS index a440474a7206..5f5d70c9c038 100644 --- a/CREDITS +++ b/CREDITS @@ -4183,6 +4183,10 @@ S: B-1206 Jingmao Guojigongyu S: 16 Baliqiao Nanjie, Beijing 101100 S: People's Repulic of China +N: Vlad Yasevich +E: vyasevich@gmail.com +D: SCTP protocol maintainer. + N: Aviad Yehezkel E: aviadye@nvidia.com D: Kernel TLS implementation and offload support. diff --git a/MAINTAINERS b/MAINTAINERS index f11d5386d1ad..6f22075603cc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18679,9 +18679,9 @@ F: drivers/target/ F: include/target/ SCTP PROTOCOL -M: Vlad Yasevich M: Neil Horman M: Marcelo Ricardo Leitner +M: Xin Long L: linux-sctp@vger.kernel.org S: Maintained W: http://lksctp.sourceforge.net -- cgit v1.2.3 From 7ff82416de8295c61423ef6fd75f052d3837d2f7 Mon Sep 17 00:00:00 2001 From: Alexander Couzens Date: Wed, 1 Feb 2023 19:23:29 +0100 Subject: net: mediatek: sgmii: ensure the SGMII PHY is powered down on configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code expect the PHY to be in power down which is only true after reset. Allow changes of the SGMII parameters more than once. Only power down when reconfiguring to avoid bouncing the link when there's no reason to - based on code from Russell King. There are cases when the SGMII_PHYA_PWD register contains 0x9 which prevents SGMII from working. The SGMII still shows link but no traffic can flow. Writing 0x0 to the PHYA_PWD register fix the issue. 0x0 was taken from a good working state of the SGMII interface. Fixes: 42c03844e93d ("net-next: mediatek: add support for MediaTek MT7622 SoC") Suggested-by: Russell King (Oracle) Signed-off-by: Alexander Couzens [ bmork: rebased and squashed into one patch ] Reviewed-by: Russell King (Oracle) Signed-off-by: Bjørn Mork Acked-by: Daniel Golle Tested-by: Daniel Golle Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.h | 2 ++ drivers/net/ethernet/mediatek/mtk_sgmii.c | 39 +++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 18a50529ce7b..01a38f5145b1 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -1036,11 +1036,13 @@ struct mtk_soc_data { * @regmap: The register map pointing at the range used to setup * SGMII modes * @ana_rgc3: The offset refers to register ANA_RGC3 related to regmap + * @interface: Currently configured interface mode * @pcs: Phylink PCS structure */ struct mtk_pcs { struct regmap *regmap; u32 ana_rgc3; + phy_interface_t interface; struct phylink_pcs pcs; }; diff --git a/drivers/net/ethernet/mediatek/mtk_sgmii.c b/drivers/net/ethernet/mediatek/mtk_sgmii.c index 5c286f2c9418..0a06995099cf 100644 --- a/drivers/net/ethernet/mediatek/mtk_sgmii.c +++ b/drivers/net/ethernet/mediatek/mtk_sgmii.c @@ -43,11 +43,6 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, int advertise, link_timer; bool changed, use_an; - if (interface == PHY_INTERFACE_MODE_2500BASEX) - rgc3 = RG_PHY_SPEED_3_125G; - else - rgc3 = 0; - advertise = phylink_mii_c22_pcs_encode_advertisement(interface, advertising); if (advertise < 0) @@ -88,9 +83,22 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, bmcr = 0; } - /* Configure the underlying interface speed */ - regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3, - RG_PHY_SPEED_3_125G, rgc3); + if (mpcs->interface != interface) { + /* PHYA power down */ + regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, + SGMII_PHYA_PWD, SGMII_PHYA_PWD); + + if (interface == PHY_INTERFACE_MODE_2500BASEX) + rgc3 = RG_PHY_SPEED_3_125G; + else + rgc3 = 0; + + /* Configure the underlying interface speed */ + regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3, + RG_PHY_SPEED_3_125G, rgc3); + + mpcs->interface = interface; + } /* Update the advertisement, noting whether it has changed */ regmap_update_bits_check(mpcs->regmap, SGMSYS_PCS_ADVERTISE, @@ -108,9 +116,17 @@ static int mtk_pcs_config(struct phylink_pcs *pcs, unsigned int mode, regmap_update_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1, SGMII_AN_RESTART | SGMII_AN_ENABLE, bmcr); - /* Release PHYA power down state */ - regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, - SGMII_PHYA_PWD, 0); + /* Release PHYA power down state + * Only removing bit SGMII_PHYA_PWD isn't enough. + * There are cases when the SGMII_PHYA_PWD register contains 0x9 which + * prevents SGMII from working. The SGMII still shows link but no traffic + * can flow. Writing 0x0 to the PHYA_PWD register fix the issue. 0x0 was + * taken from a good working state of the SGMII interface. + * Unknown how much the QPHY needs but it is racy without a sleep. + * Tested on mt7622 & mt7986. + */ + usleep_range(50, 100); + regmap_write(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, 0); return changed; } @@ -171,6 +187,7 @@ int mtk_sgmii_init(struct mtk_sgmii *ss, struct device_node *r, u32 ana_rgc3) return PTR_ERR(ss->pcs[i].regmap); ss->pcs[i].pcs.ops = &mtk_pcs_ops; + ss->pcs[i].interface = PHY_INTERFACE_MODE_NA; } return 0; -- cgit v1.2.3 From 9d32637122de88f1ef614c29703f0e050cad342e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 1 Feb 2023 19:23:30 +0100 Subject: net: mediatek: sgmii: fix duplex configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The logic of the duplex bit is inverted. Setting it means half duplex, not full duplex. Fix and rename macro to avoid confusion. Fixes: 7e538372694b ("net: ethernet: mediatek: Re-add support SGMII") Reviewed-by: Russell King (Oracle) Signed-off-by: Bjørn Mork Acked-by: Daniel Golle Tested-by: Daniel Golle Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.h | 2 +- drivers/net/ethernet/mediatek/mtk_sgmii.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 01a38f5145b1..2d9186d32bc0 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -519,7 +519,7 @@ #define SGMII_SPEED_10 FIELD_PREP(SGMII_SPEED_MASK, 0) #define SGMII_SPEED_100 FIELD_PREP(SGMII_SPEED_MASK, 1) #define SGMII_SPEED_1000 FIELD_PREP(SGMII_SPEED_MASK, 2) -#define SGMII_DUPLEX_FULL BIT(4) +#define SGMII_DUPLEX_HALF BIT(4) #define SGMII_IF_MODE_BIT5 BIT(5) #define SGMII_REMOTE_FAULT_DIS BIT(8) #define SGMII_CODE_SYNC_SET_VAL BIT(9) diff --git a/drivers/net/ethernet/mediatek/mtk_sgmii.c b/drivers/net/ethernet/mediatek/mtk_sgmii.c index 0a06995099cf..c4261069b521 100644 --- a/drivers/net/ethernet/mediatek/mtk_sgmii.c +++ b/drivers/net/ethernet/mediatek/mtk_sgmii.c @@ -154,11 +154,11 @@ static void mtk_pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, else sgm_mode = SGMII_SPEED_1000; - if (duplex == DUPLEX_FULL) - sgm_mode |= SGMII_DUPLEX_FULL; + if (duplex != DUPLEX_FULL) + sgm_mode |= SGMII_DUPLEX_HALF; regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE, - SGMII_DUPLEX_FULL | SGMII_SPEED_MASK, + SGMII_DUPLEX_HALF | SGMII_SPEED_MASK, sgm_mode); } } -- cgit v1.2.3 From 3337a6e04ddf2923a1bdcf3d31b3b52412bf82dd Mon Sep 17 00:00:00 2001 From: Alexander Couzens Date: Wed, 1 Feb 2023 19:23:31 +0100 Subject: mtk_sgmii: enable PCS polling to allow SFP work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is no IRQ handling (even the SGMII supports it). Enable polling to support SFP ports. Fixes: 14a44ab0330d ("net: mtk_eth_soc: partially convert to phylink_pcs") Reviewed-by: Russell King (Oracle) Signed-off-by: Alexander Couzens [ bmork: changed "1" => "true" ] Signed-off-by: Bjørn Mork Acked-by: Daniel Golle Tested-by: Daniel Golle Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_sgmii.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mediatek/mtk_sgmii.c b/drivers/net/ethernet/mediatek/mtk_sgmii.c index c4261069b521..bb00de1003ac 100644 --- a/drivers/net/ethernet/mediatek/mtk_sgmii.c +++ b/drivers/net/ethernet/mediatek/mtk_sgmii.c @@ -187,6 +187,7 @@ int mtk_sgmii_init(struct mtk_sgmii *ss, struct device_node *r, u32 ana_rgc3) return PTR_ERR(ss->pcs[i].regmap); ss->pcs[i].pcs.ops = &mtk_pcs_ops; + ss->pcs[i].pcs.poll = true; ss->pcs[i].interface = PHY_INTERFACE_MODE_NA; } -- cgit v1.2.3