From 304e024216a802a7dc8ba75d36de82fa136bbf3e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 28 Mar 2020 12:12:59 -0700 Subject: net_sched: add a temporary refcnt for struct tcindex_data Although we intentionally use an ordered workqueue for all tc filter works, the ordering is not guaranteed by RCU work, given that tcf_queue_work() is essentially a call_rcu(). This problem is demonstrated by Thomas: CPU 0: tcf_queue_work() tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); -> Migration to CPU 1 CPU 1: tcf_queue_work(&p->rwork, tcindex_destroy_work); so the 2nd work could be queued before the 1st one, which leads to a free-after-free. Enforcing this order in RCU work is hard as it requires changing RCU code too. Fortunately we can work around this problem in tcindex filter by taking a temporary refcnt, we only refcnt it right before we begin to destroy it. This simplifies the code a lot as a full refcnt requires many more changes in tcindex_set_parms(). Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Paul E. McKenney Signed-off-by: David S. 
Miller --- net/sched/cls_tcindex.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 9904299424a1..065345832a69 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,12 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ +struct tcindex_data; + struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; + struct tcindex_data *p; struct rcu_work rwork; }; @@ -49,6 +53,7 @@ struct tcindex_data { u32 hash; /* hash table size; 0 if undefined */ u32 alloc_hash; /* allocated size */ u32 fall_through; /* 0: only classify if explicit match */ + refcount_t refcnt; /* a temporary refcnt for perfect hash */ struct rcu_work rwork; }; @@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) return tcf_exts_has_actions(&r->exts) || r->res.classid; } +static void tcindex_data_get(struct tcindex_data *p) +{ + refcount_inc(&p->refcnt); +} + +static void tcindex_data_put(struct tcindex_data *p) +{ + if (refcount_dec_and_test(&p->refcnt)) { + kfree(p->perfect); + kfree(p->h); + kfree(p); + } +} + static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, u16 key) { @@ -141,6 +160,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) { tcf_exts_destroy(&r->exts); tcf_exts_put_net(&r->exts); + tcindex_data_put(r->p); } static void tcindex_destroy_rexts_work(struct work_struct *work) @@ -212,6 +232,8 @@ found: else __tcindex_destroy_fexts(f); } else { + tcindex_data_get(p); + if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); else @@ -228,9 +250,7 @@ static void tcindex_destroy_work(struct work_struct *work) struct tcindex_data, rwork); - kfree(p->perfect); - kfree(p->h); - kfree(p); + tcindex_data_put(p); } static 
inline int @@ -248,9 +268,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { }; static int tcindex_filter_result_init(struct tcindex_filter_result *r, + struct tcindex_data *p, struct net *net) { memset(r, 0, sizeof(*r)); + r->p = p; return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } @@ -290,6 +312,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) goto errout; + cp->perfect[i].p = cp; } return 0; @@ -334,6 +357,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->alloc_hash = p->alloc_hash; cp->fall_through = p->fall_through; cp->tp = tp; + refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ if (tb[TCA_TCINDEX_HASH]) cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -366,7 +390,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } cp->h = p->h; - err = tcindex_filter_result_init(&new_filter_result, net); + err = tcindex_filter_result_init(&new_filter_result, cp, net); if (err < 0) goto errout_alloc; if (old_r) @@ -434,7 +458,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, goto errout_alloc; f->key = handle; f->next = NULL; - err = tcindex_filter_result_init(&f->result, net); + err = tcindex_filter_result_init(&f->result, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -447,7 +471,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, net); + err = tcindex_filter_result_init(old_r, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -571,6 +595,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, for (i = 0; i < p->hash; i++) { struct tcindex_filter_result *r = p->perfect + i; + /* tcf_queue_work() does not guarantee the ordering we + * want, so we have to take this refcnt temporarily to + 
* ensure 'p' is freed after all tcindex_filter_result + * here. Imperfect hash does not need this, because it + * uses linked lists rather than an array. + */ + tcindex_data_get(p); + tcf_unbind_filter(tp, &r->res); if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork, -- cgit v1.2.3 From 744fdc8233f6aa9582ce08a51ca06e59796a3196 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 30 Mar 2020 11:22:19 -0400 Subject: ipv6: don't auto-add link-local address to lag ports Bonding slave and team port devices should not have link-local addresses automatically added to them, as it can interfere with openvswitch being able to properly add tc ingress. Basic reproducer, courtesy of Marcelo: $ ip link add name bond0 type bond $ ip link set dev ens2f0np0 master bond0 $ ip link set dev ens2f1np2 master bond0 $ ip link set dev bond0 up $ ip a s 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft forever 2: ens2f0np0: mtu 1500 qdisc mq master bond0 state UP group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff 5: ens2f1np2: mtu 1500 qdisc mq master bond0 state DOWN group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff 11: bond0: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff inet6 fe80::20f:53ff:fe2f:ea40/64 scope link valid_lft forever preferred_lft forever (above trimmed to relevant entries, obviously) $ sysctl net.ipv6.conf.ens2f0np0.addr_gen_mode=0 net.ipv6.conf.ens2f0np0.addr_gen_mode = 0 $ sysctl net.ipv6.conf.ens2f1np2.addr_gen_mode=0 net.ipv6.conf.ens2f1np2.addr_gen_mode = 0 $ ip a l ens2f0np0 2: ens2f0np0: mtu 1500 qdisc mq master bond0 state UP group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff inet6 fe80::20f:53ff:fe2f:ea40/64 scope link 
tentative valid_lft forever preferred_lft forever $ ip a l ens2f1np2 5: ens2f1np2: mtu 1500 qdisc mq master bond0 state DOWN group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff inet6 fe80::20f:53ff:fe2f:ea40/64 scope link tentative valid_lft forever preferred_lft forever Looks like addrconf_sysctl_addr_gen_mode() bypasses the original "is this a slave interface?" check added by commit c2edacf80e15, and results in an address getting added, while w/the proposed patch added, no address gets added. This simply adds the same gating check to another code path, and thus should prevent the same devices from erroneously obtaining an ipv6 link-local address. Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address generation mode") Reported-by: Moshe Levi CC: Stephen Hemminger CC: Marcelo Ricardo Leitner CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a11fd4d67832..84a28b539c43 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3298,6 +3298,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (netif_is_l3_master(idev->dev)) return; + /* no link local addresses on devices flagged as slaves */ + if (idev->dev->flags & IFF_SLAVE) + return; + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); switch (idev->cnf.addr_gen_mode) { -- cgit v1.2.3 From bf88dc327de8c311078da557788af5d88b74c8e5 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 2 Apr 2020 09:25:48 +0800 Subject: net: dsa: dsa_bridge_mtu_normalization() can be static Fixes: f41071407c85 ("net: dsa: implement auto-normalization of MTU for bridge hardware datapath") Signed-off-by: kbuild test robot Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5390ff541658..e94eb1aac602 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1338,7 +1338,7 @@ static void dsa_hw_port_list_free(struct list_head *hw_port_list) } /* Make the hardware datapath to/from @dev limited to a common MTU */ -void dsa_bridge_mtu_normalization(struct dsa_port *dp) +static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; -- cgit v1.2.3 From d16fa759253ff7a42b5257d0db9784caef2da9c0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Apr 2020 11:18:59 +0100 Subject: net: ipv6: rpl_iptunnel: remove redundant assignments to variable err The variable err is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ipv6/rpl_iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index a49ddc6cd020..c3ececd7cfc1 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -210,7 +210,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; - int err = -EINVAL; + int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); -- cgit v1.2.3 From 0b4f33def7bbde1ce2fea05f116639270e7acdc7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Apr 2020 13:44:51 +0200 Subject: mptcp: fix tcp fallback crash Christoph Paasch reports following crash: general protection fault [..] CPU: 0 PID: 2874 Comm: syz-executor072 Not tainted 5.6.0-rc5 #62 RIP: 0010:__pv_queued_spin_lock_slowpath kernel/locking/qspinlock.c:471 [..] 
queued_spin_lock_slowpath arch/x86/include/asm/qspinlock.h:50 [inline] do_raw_spin_lock include/linux/spinlock.h:181 [inline] spin_lock_bh include/linux/spinlock.h:343 [inline] __mptcp_flush_join_list+0x44/0xb0 net/mptcp/protocol.c:278 mptcp_shutdown+0xb3/0x230 net/mptcp/protocol.c:1882 [..] Problem is that mptcp_shutdown() socket isn't an mptcp socket, its a plain tcp_sk. Thus, trying to access mptcp_sk specific members accesses garbage. Root cause is that accept() returns a fallback (tcp) socket, not an mptcp one. There is code in getpeername to detect this and override the sockets stream_ops. But this will only run when accept() caller provided a sockaddr struct. "accept(fd, NULL, 0)" will therefore result in mptcp stream ops, but with sock->sk pointing at a tcp_sk. Update the existing fallback handling to detect this as well. Moreover, mptcp_shutdown did not have fallback handling, and mptcp_poll did it too late so add that there as well. Reported-by: Christoph Paasch Tested-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1833bc1f4a43..4cf88e3d5121 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -57,10 +57,43 @@ static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) return msk->first && !sk_is_mptcp(msk->first); } +static struct socket *mptcp_is_tcpsk(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sock->sk != sk) + return NULL; + + if (unlikely(sk->sk_prot == &tcp_prot)) { + /* we are being invoked after mptcp_accept() has + * accepted a non-mp-capable flow: sk is a tcp_sk, + * not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. 
+ */ + sock->ops = &inet_stream_ops; + return sock; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { + sock->ops = &inet6_stream_ops; + return sock; +#endif + } + + return NULL; +} + static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) { + struct socket *sock; + sock_owned_by_me((const struct sock *)msk); + sock = mptcp_is_tcpsk((struct sock *)msk); + if (unlikely(sock)) + return sock; + if (likely(!__mptcp_needs_tcp_fallback(msk))) return NULL; @@ -84,6 +117,10 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) struct socket *ssock; int err; + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) + return ssock; + ssock = __mptcp_nmpc_socket(msk); if (ssock) goto set_state; @@ -1752,7 +1789,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, msk = mptcp_sk(sk); lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); + ssock = __mptcp_tcp_fallback(msk); + if (!ssock) + ssock = __mptcp_nmpc_socket(msk); if (ssock) { mask = ssock->ops->poll(file, ssock, wait); release_sock(sk); @@ -1762,9 +1801,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, release_sock(sk); sock_poll_wait(file, sock, wait); lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - return ssock->ops->poll(file, ssock, NULL); if (test_bit(MPTCP_DATA_READY, &msk->flags)) mask = EPOLLIN | EPOLLRDNORM; @@ -1783,11 +1819,17 @@ static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; + struct socket *ssock; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); + ssock = __mptcp_tcp_fallback(msk); + if (ssock) { + release_sock(sock->sk); + return inet_shutdown(ssock, how); + } if (how == SHUT_WR || how == SHUT_RDWR) inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); -- cgit v1.2.3 From 59832e246515ab6a4f5aa878073e6f415aa35166 Mon Sep 17 00:00:00 2001 From: Florian 
Westphal Date: Thu, 2 Apr 2020 13:44:52 +0200 Subject: mptcp: subflow: check parent mptcp socket on subflow state change This is needed at least until proper MPTCP-Level fin/reset signalling gets added: We wake parent when a subflow changes, but we should do this only when all subflows have closed, not just one. Schedule the mptcp worker and tell it to check eof state on all subflows. Only flag mptcp socket as closed and wake userspace processes blocking in poll if all subflows have closed. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 2 ++ net/mptcp/subflow.c | 3 +-- 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4cf88e3d5121..8cc9dd2cc828 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -327,6 +327,15 @@ void mptcp_data_acked(struct sock *sk) sock_hold(sk); } +void mptcp_subflow_eof(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && + schedule_work(&msk->work)) + sock_hold(sk); +} + static void mptcp_stop_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -1031,6 +1040,27 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } +static void mptcp_check_for_eof(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int receivers = 0; + + mptcp_for_each_subflow(msk, subflow) + receivers += !subflow->rx_eof; + + if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + /* hopefully temporary hack: propagate shutdown status + * to msk, when all subflows agree on it + */ + sk->sk_shutdown |= RCV_SHUTDOWN; + + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + sk->sk_data_ready(sk); + } +} + 
static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1047,6 +1077,9 @@ static void mptcp_worker(struct work_struct *work) __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) + mptcp_check_for_eof(msk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f733c5425552..67448002a2d7 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -89,6 +89,7 @@ #define MPTCP_DATA_READY 0 #define MPTCP_SEND_SPACE 1 #define MPTCP_WORK_RTX 2 +#define MPTCP_WORK_EOF 3 static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { @@ -339,6 +340,7 @@ void mptcp_finish_connect(struct sock *sk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); +void mptcp_subflow_eof(struct sock *sk); int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(u32 token); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index b5180c81588e..50a8bea987c6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -994,8 +994,7 @@ static void subflow_state_change(struct sock *sk) if (!(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; - parent->sk_shutdown |= RCV_SHUTDOWN; - __subflow_state_change(parent); + mptcp_subflow_eof(parent); } } -- cgit v1.2.3 From de06f57392b60e4d92135fbbedad4aea7d1107e2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Apr 2020 13:44:53 +0200 Subject: mptcp: re-check dsn before reading from subflow mptcp_subflow_data_available() is commonly called via ssk->sk_data_ready(), in this case the mptcp socket lock cannot be acquired. 
Therefore, while we can safely discard subflow data that was already received up to msk->ack_seq, we cannot be sure that 'subflow->data_avail' will still be valid at the time userspace wants to read the data -- a previous read on a different subflow might have carried this data already. In that (unlikely) event, msk->ack_seq will have been updated and will be ahead of the subflow dsn. We can check for this condition and skip/resync to the expected sequence number. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cc9dd2cc828..939a5045181a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -158,6 +158,27 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->offset = offset; } +/* both sockets must be locked */ +static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, + struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u64 dsn = mptcp_subflow_get_mapped_dsn(subflow); + + /* revalidate data sequence number. + * + * mptcp_subflow_data_available() is usually called + * without msk lock. Its unlikely (but possible) + * that msk->ack_seq has been advanced since the last + * call found in-sequence data. 
+ */ + if (likely(dsn == msk->ack_seq)) + return true; + + subflow->data_avail = 0; + return mptcp_subflow_data_available(ssk); +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) @@ -169,6 +190,11 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct tcp_sock *tp; bool done = false; + if (!mptcp_subflow_dsn_valid(msk, ssk)) { + *bytes = 0; + return false; + } + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf); -- cgit v1.2.3 From 564cf2f3953678e08a97c43198badf77b042cf96 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 2 Apr 2020 13:44:54 +0200 Subject: mptcp: fix "fn parameter not described" warnings Obtained with: $ make W=1 net/mptcp/token.o net/mptcp/token.c:53: warning: Function parameter or member 'req' not described in 'mptcp_token_new_request' net/mptcp/token.c:98: warning: Function parameter or member 'sk' not described in 'mptcp_token_new_connect' net/mptcp/token.c:133: warning: Function parameter or member 'conn' not described in 'mptcp_token_new_accept' net/mptcp/token.c:178: warning: Function parameter or member 'token' not described in 'mptcp_token_destroy_request' net/mptcp/token.c:191: warning: Function parameter or member 'token' not described in 'mptcp_token_destroy' Fixes: 79c0949e9a09 (mptcp: Add key generation and token tree) Fixes: 58b09919626b (mptcp: create msk early) Signed-off-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/mptcp/token.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 129a5ad1bc35..33352dd99d4d 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -40,7 +40,7 @@ static int token_used __read_mostly; /** * mptcp_token_new_request - create new key/idsn/token for subflow_request - * @req - the request socket + * @req: the request socket * * This function is called when a new mptcp connection is coming in. * @@ -80,7 +80,7 @@ int mptcp_token_new_request(struct request_sock *req) /** * mptcp_token_new_connect - create new key/idsn/token for subflow - * @sk - the socket that will initiate a connection + * @sk: the socket that will initiate a connection * * This function is called when a new outgoing mptcp connection is * initiated. @@ -125,6 +125,7 @@ int mptcp_token_new_connect(struct sock *sk) /** * mptcp_token_new_accept - insert token for later processing * @token: the token to insert to the tree + * @conn: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. @@ -169,7 +170,7 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token) /** * mptcp_token_destroy_request - remove mptcp connection/token - * @token - token of mptcp connection to remove + * @token: token of mptcp connection to remove * * Remove not-yet-fully-established incoming connection identified * by @token. @@ -183,7 +184,7 @@ void mptcp_token_destroy_request(u32 token) /** * mptcp_token_destroy - remove mptcp connection/token - * @token - token of mptcp connection to remove + * @token: token of mptcp connection to remove * * Remove the connection identified by @token. 
 */ -- cgit v1.2.3 From c427bfec18f2190b8f4718785ee8ed2db4f84ee6 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 31 Mar 2020 15:20:10 +0200 Subject: net: core: enable SO_BINDTODEVICE for non-root users Currently, SO_BINDTODEVICE requires CAP_NET_RAW. This change allows a non-root user to bind a socket to an interface if it is not already bound. This is useful to allow an application to bind itself to a specific VRF for outgoing or incoming connections. Currently, an application wanting to manage connections through several VRFs needs to be privileged. Previously, IP_UNICAST_IF and IPV6_UNICAST_IF were added for Wine (76e21053b5bf3 and c4062dfc425e9) specifically for use by non-root processes. However, they are restricted to sendmsg() and not usable with TCP. Allowing SO_BINDTODEVICE would allow TCP clients to get the same privilege. As for TCP servers, outside the VRF use case, SO_BINDTODEVICE would only further restrict connections a server could accept. When an application is restricted to a VRF (with `ip vrf exec`), the socket is bound to an interface at creation and therefore, a non-privileged call to SO_BINDTODEVICE to escape the VRF fails. When an application binds a socket with SO_BINDTODEVICE and transmits it to a non-privileged process through a Unix socket, an attempt to change the bound device also fails. 
Before: >>> import socket >>> s=socket.socket(socket.AF_INET, socket.SOCK_STREAM) >>> s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"dummy0") Traceback (most recent call last): File "", line 1, in PermissionError: [Errno 1] Operation not permitted After: >>> import socket >>> s=socket.socket(socket.AF_INET, socket.SOCK_STREAM) >>> s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"dummy0") >>> s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"dummy0") Traceback (most recent call last): File "", line 1, in PermissionError: [Errno 1] Operation not permitted Signed-off-by: Vincent Bernat Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index da32d9b6d09f..ce1d8dce9b7a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) /* Sorry... */ ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_RAW)) + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; -- cgit v1.2.3 From 64948427a63f49dd0ce403388d232f22cc1971a8 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 26 Mar 2020 04:27:24 +0800 Subject: net: openvswitch: use hlist_for_each_entry_rcu instead of hlist_for_each_entry The struct sw_flow is protected by RCU, when traversing them, use hlist_for_each_entry_rcu. Signed-off-by: Tonghao Zhang Tested-by: Greg Rose Reviewed-by: Greg Rose Signed-off-by: David S. 
Miller --- net/openvswitch/flow_table.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index fd8a01ca7a2d..2398d7238300 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -462,12 +462,14 @@ static void flow_table_copy_flows(struct table_instance *old, struct hlist_head *head = &old->buckets[i]; if (ufid) - hlist_for_each_entry(flow, head, - ufid_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + ufid_table.node[old_ver], + lockdep_ovsl_is_held()) ufid_table_instance_insert(new, flow); else - hlist_for_each_entry(flow, head, - flow_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + flow_table.node[old_ver], + lockdep_ovsl_is_held()) table_instance_insert(new, flow); } -- cgit v1.2.3 From 19e16d220f0adbf899a652dfb1fde2e3a95153e9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 1 Apr 2020 14:46:20 +0800 Subject: neigh: support smaller retrans_time settting Currently, we limited the retrans_time to be greater than HZ/2. i.e. setting retrans_time less than 500ms will not work. This makes the user unable to achieve a more accurate control for bonding arp fast failover. Update the sanity check to HZ/100, which is 10ms, to let users have more ability on the retrans_time control. v3: sync the behavior with IPv6 and update all the timer handler v2: use HZ instead of hard code number Signed-off-by: Hangbin Liu Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 10 ++++++---- net/ipv6/addrconf.c | 7 ++++--- net/ipv6/ndisc.c | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5bf8d22a47ec..39d37d0ef575 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t) neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/2); + HZ/100); neigh_add_timer(neigh, next); immediate_probe = true; } else { @@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh) neigh->nud_state = NUD_INCOMPLETE; atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, - jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); + jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 84a28b539c43..24e319dfb510 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1357,7 +1357,7 @@ retry: regen_advance = idev->cnf.regen_max_retry * idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; /* recalculate max_desync_factor each time and update * idev->desync_factor if it's larger @@ -4121,7 +4121,8 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_probes--; 
addrconf_mod_dad_work(ifp, - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), + HZ/100)); spin_unlock(&ifp->lock); write_unlock_bh(&idev->lock); @@ -4527,7 +4528,7 @@ restart: !(ifp->flags&IFA_F_TENTATIVE)) { unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * ifp->idev->cnf.dad_transmits * - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; if (age >= ifp->prefered_lft - regen_advance) { struct inet6_ifaddr *ifpub = ifp->ifpub; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 6ffa153e5166..1ecd4e9b0bdf 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1359,8 +1359,8 @@ skip_defrtr: if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; - if (rtime < HZ/10) - rtime = HZ/10; + if (rtime < HZ/100) + rtime = HZ/100; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; -- cgit v1.2.3 From a8eab6d35e22f4f21471f16147be79529cd6aaf7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Apr 2020 20:58:51 -0700 Subject: net_sched: fix a missing refcnt in tcindex_init() The initial refcnt of struct tcindex_data should be 1, it is clear that I forgot to set it to 1 in tcindex_init(). This leads to a dec-after-zero warning. Reported-by: syzbot+8325e509a1bf83ec741d@syzkaller.appspotmail.com Fixes: 304e024216a8 ("net_sched: add a temporary refcnt for struct tcindex_data") Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: Paul E. McKenney Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_tcindex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 065345832a69..61e95029c18f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -151,6 +151,7 @@ static int tcindex_init(struct tcf_proto *tp) p->mask = 0xffff; p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; + refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ rcu_assign_pointer(tp->root, p); return 0; -- cgit v1.2.3 From c85adced953af8eb443852c12e8ea1142de91b7c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 3 Apr 2020 17:14:08 +0800 Subject: mptcp: add some missing pr_fmt defines Some of the mptcp logs didn't print out the format string: [ 185.651493] DSS [ 185.651494] data_fin=0 dsn64=0 use_map=0 ack64=1 use_ack=1 [ 185.651494] data_ack=13792750332298763796 [ 185.651495] MPTCP: msk=00000000c4b81cfc ssk=000000009743af53 data_avail=0 skb=0000000063dc595d [ 185.651495] MPTCP: msk=00000000c4b81cfc ssk=000000009743af53 status=0 [ 185.651495] MPTCP: msk ack_seq=9bbc894565aa2f9a subflow ack_seq=9bbc894565aa2f9a [ 185.651496] MPTCP: msk=00000000c4b81cfc ssk=000000009743af53 data_avail=1 skb=0000000012e809e1 So this patch added these missing pr_fmt defines. Then we can get the same format string "MPTCP" in all mptcp logs like this: [ 142.795829] MPTCP: DSS [ 142.795829] MPTCP: data_fin=0 dsn64=0 use_map=0 ack64=1 use_ack=1 [ 142.795829] MPTCP: data_ack=8089704603109242421 [ 142.795830] MPTCP: msk=00000000133a24e0 ssk=000000002e508c64 data_avail=0 skb=00000000d5f230df [ 142.795830] MPTCP: msk=00000000133a24e0 ssk=000000002e508c64 status=0 [ 142.795831] MPTCP: msk ack_seq=66790290f1199d9b subflow ack_seq=66790290f1199d9b [ 142.795831] MPTCP: msk=00000000133a24e0 ssk=000000002e508c64 data_avail=1 skb=00000000de5aca2e Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/options.c | 2 ++ net/mptcp/pm.c | 2 ++ net/mptcp/pm_netlink.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bd220ee4aac9..faf57585b892 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 - 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 064639f72487..977d9c8b1453 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -3,6 +3,8 @@ * * Copyright (c) 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a0ce7f324499..86d61ab34c7c 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -4,6 +4,8 @@ * Copyright (c) 2020, Red Hat, Inc. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include -- cgit v1.2.3 From a7f9a6f4cc5fb4d5c5a03f3f898b7cb86db05cbf Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 4 Apr 2020 11:22:57 -0400 Subject: ipv6: rpl: fix loop iteration This patch fixes the loop iteration by not walking over the last iteration. The cmpri compression value exempts the last segment. As the code shows, the last iteration will be overwritten by the cmpre value handling, which is for the last segment. I think this doesn't result in any buffer overflows because we work on worst-case temporary buffer sizes, but it results in suboptimal compression settings in some cases. Fixes: 8610c7c6e3bd ("net: ipv6: add support for rpl sr exthdr") Signed-off-by: Alexander Aring Signed-off-by: David S. 
Miller --- net/ipv6/rpl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index dc4f20e23bf7..d38b476fc7f2 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -48,7 +48,7 @@ void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = 0; outhdr->cmpre = 0; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr, ipv6_rpl_segdata_pos(inhdr, i), inhdr->cmpri); @@ -66,7 +66,7 @@ static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr int i; for (plen = 0; plen < sizeof(*daddr); plen++) { - for (i = 0; i <= n; i++) { + for (i = 0; i < n; i++) { if (daddr->s6_addr[plen] != inhdr->rpl_segaddr[i].s6_addr[plen]) return plen; @@ -114,7 +114,7 @@ void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = cmpri; outhdr->cmpre = cmpre; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i), &inhdr->rpl_segaddr[i], cmpri); -- cgit v1.2.3