diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-15 13:22:29 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-15 13:22:29 -0800 |
commit | d635a69dd4981cc51f90293f5f64268620ed1565 (patch) | |
tree | 5e0a758b402ea7d624c25c3a343545dd29e80f31 /net/mptcp | |
parent | ac73e3dc8acd0a3be292755db30388c3580f5674 (diff) | |
parent | efd5a1584537698220578227e6467638307c2a0b (diff) | |
download | linux-d635a69dd4981cc51f90293f5f64268620ed1565.tar.gz linux-d635a69dd4981cc51f90293f5f64268620ed1565.tar.bz2 linux-d635a69dd4981cc51f90293f5f64268620ed1565.zip |
Merge tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski:
"Core:
- support "prefer busy polling" NAPI operation mode, where we defer
softirq for some time expecting applications to periodically busy
poll
- AF_XDP: improve efficiency by more batching and hindering the
adjacency cache prefetcher
- af_packet: make packet_fanout.arr size configurable up to 64K
- tcp: optimize TCP zero copy receive in presence of partial or
unaligned reads making zero copy a performance win for much smaller
messages
- XDP: add bulk APIs for returning / freeing frames
- sched: support fragmenting IP packets as they come out of conntrack
- net: allow virtual netdevs to forward UDP L4 and fraglist GSO skbs
BPF:
- BPF switch from crude rlimit-based to memcg-based memory accounting
- BPF type format information for kernel modules and related tracing
enhancements
- BPF implement task local storage for BPF LSM
- allow the FENTRY/FEXIT/RAW_TP tracing programs to use
bpf_sk_storage
Protocols:
- mptcp: improve multiple xmit streams support, memory accounting and
many smaller improvements
- TLS: support CHACHA20-POLY1305 cipher
- seg6: add support for SRv6 End.DT4/DT6 behavior
- sctp: Implement RFC 6951: UDP Encapsulation of SCTP
- ppp_generic: add ability to bridge channels directly
- bridge: Connectivity Fault Management (CFM) support as is defined
in IEEE 802.1Q section 12.14.
Drivers:
- mlx5: make use of the new auxiliary bus to organize the driver
internals
- mlx5: more accurate port TX timestamping support
- mlxsw:
- improve the efficiency of offloaded next hop updates by using
the new nexthop object API
- support blackhole nexthops
- support IEEE 802.1ad (Q-in-Q) bridging
- rtw88: major bluetooth co-existance improvements
- iwlwifi: support new 6 GHz frequency band
- ath11k: Fast Initial Link Setup (FILS)
- mt7915: dual band concurrent (DBDC) support
- net: ipa: add basic support for IPA v4.5
Refactor:
- a few pieces of in_interrupt() cleanup work from Sebastian Andrzej
Siewior
- phy: add support for shared interrupts; get rid of multiple driver
APIs and have the drivers write a full IRQ handler, slight growth
of driver code should be compensated by the simpler API which also
allows shared IRQs
- add common code for handling netdev per-cpu counters
- move TX packet re-allocation from Ethernet switch tag drivers to a
central place
- improve efficiency and rename nla_strlcpy
- number of W=1 warning cleanups as we now catch those in a patchwork
build bot
Old code removal:
- wan: delete the DLCI / SDLA drivers
- wimax: move to staging
- wifi: remove old WDS wifi bridging support"
* tag 'net-next-5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1922 commits)
net: hns3: fix expression that is currently always true
net: fix proc_fs init handling in af_packet and tls
nfc: pn533: convert comma to semicolon
af_vsock: Assign the vsock transport considering the vsock address flags
af_vsock: Set VMADDR_FLAG_TO_HOST flag on the receive path
vsock_addr: Check for supported flag values
vm_sockets: Add VMADDR_FLAG_TO_HOST vsock flag
vm_sockets: Add flags field in the vsock address data structure
net: Disable NETIF_F_HW_TLS_TX when HW_CSUM is disabled
tcp: Add logic to check for SYN w/ data in tcp_simple_retransmit
net: mscc: ocelot: install MAC addresses in .ndo_set_rx_mode from process context
nfc: s3fwrn5: Release the nfc firmware
net: vxget: clean up sparse warnings
mlxsw: spectrum_router: Use eXtended mezzanine to offload IPv4 router
mlxsw: spectrum: Set KVH XLT cache mode for Spectrum2/3
mlxsw: spectrum_router_xm: Introduce basic XM cache flushing
mlxsw: reg: Add Router LPM Cache Enable Register
mlxsw: reg: Add Router LPM Cache ML Delete Register
mlxsw: spectrum_router_xm: Implement L-value tracking for M-index
mlxsw: reg: Add XM Router M Table Register
...
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/ctrl.c | 14 | ||||
-rw-r--r-- | net/mptcp/mptcp_diag.c | 2 | ||||
-rw-r--r-- | net/mptcp/options.c | 218 | ||||
-rw-r--r-- | net/mptcp/pm.c | 72 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 84 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 1813 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 192 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 165 |
8 files changed, 1803 insertions, 757 deletions
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 54b888f94009..96ba616f59bf 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -18,6 +18,7 @@ struct mptcp_pernet { struct ctl_table_header *ctl_table_hdr; int mptcp_enabled; + unsigned int add_addr_timeout; }; static struct mptcp_pernet *mptcp_get_pernet(struct net *net) @@ -30,6 +31,11 @@ int mptcp_is_enabled(struct net *net) return mptcp_get_pernet(net)->mptcp_enabled; } +unsigned int mptcp_get_add_addr_timeout(struct net *net) +{ + return mptcp_get_pernet(net)->add_addr_timeout; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -40,12 +46,19 @@ static struct ctl_table mptcp_sysctl_table[] = { */ .proc_handler = proc_dointvec, }, + { + .procname = "add_addr_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, {} }; static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; + pernet->add_addr_timeout = TCP_RTO_MAX; } static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -61,6 +74,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) } table[0].data = &pernet->mptcp_enabled; + table[1].data = &pernet->add_addr_timeout; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 5f390a97f556..b70ae4ba3000 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -140,7 +140,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, info->mptcpi_flags = flags; info->mptcpi_token = READ_ONCE(msk->token); info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); unlock_sock_fast(sk, slow); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 90cd52df99a6..c5328f407aab 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -242,7 +242,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->add_addr = 1; mp_opt->addr_id = *ptr++; - pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo); if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); ptr += 4; @@ -267,6 +266,9 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } + pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", + (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", + mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port); break; case MPTCPOPT_RM_ADDR: @@ -280,6 +282,16 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); break; + case MPTCPOPT_MP_FASTCLOSE: + if (opsize != TCPOLEN_MPTCP_FASTCLOSE) + break; + + ptr += 2; + mp_opt->rcvr_key = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->fastclose = 1; + break; + default: break; } @@ -297,6 +309,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->fastclose = 0; mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; @@ -492,7 +505,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool ret = false; mpext = skb ? mptcp_get_ext(skb) : NULL; - snd_data_fin_enable = READ_ONCE(msk->snd_data_fin_enable); + snd_data_fin_enable = mptcp_data_fin_enabled(msk); if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size; @@ -528,6 +541,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; + WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -573,27 +587,43 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, } #endif -static bool mptcp_established_options_add_addr(struct sock *sk, +static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); + bool drop_other_suboptions = false; + unsigned int opt_size = *size; struct mptcp_addr_info saddr; bool echo; + bool port; int len; + if ((mptcp_pm_should_add_signal_ipv6(msk) || + mptcp_pm_should_add_signal_port(msk)) && + skb && skb_is_tcp_pure_ack(skb)) { + pr_debug("drop other suboptions"); + opts->suboptions = 0; + remaining += opt_size; + drop_other_suboptions = true; + } + if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo))) + !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port))) return false; - len = mptcp_add_addr_len(saddr.family, echo); + len = mptcp_add_addr_len(saddr.family, echo, port); if (remaining < len) return false; *size = len; + if (drop_other_suboptions) + *size -= opt_size; opts->addr_id = saddr.id; + if (port) + opts->port = ntohs(saddr.port); if (saddr.family == AF_INET) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR; opts->addr = saddr.addr; @@ -616,7 +646,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, } } #endif - pr_debug("addr_id=%d, ahmac=%llu, echo=%d", opts->addr_id, opts->ahmac, echo); + pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", + opts->addr_id, opts->ahmac, echo, opts->port); return true; } @@ -678,7 +709,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; - if (mptcp_established_options_add_addr(sk, &opt_size, remaining, opts)) { + if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; @@ -759,6 +790,11 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto fully_established; } + if (mp_opt->add_addr) { + WRITE_ONCE(msk->fully_established, true); + return true; + } + /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. @@ -777,7 +813,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, mptcp_subflow_fully_established(subflow, mp_opt); fully_established: - if (likely(subflow->pm_notified)) + /* if the subflow is not already linked into the conn_list, we can't + * notify the PM: this subflow is still on the listener queue + * and the PM possibly acquiring the subflow lock could race with + * the listener close + */ + if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; @@ -809,31 +850,39 @@ static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) return cur_ack; } -static void update_una(struct mptcp_sock *msk, - struct mptcp_options_received *mp_opt) +static void ack_update_msk(struct mptcp_sock *msk, + struct sock *ssk, + struct mptcp_options_received *mp_opt) { - u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); - u64 write_seq = READ_ONCE(msk->write_seq); + u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); + struct sock *sk = (struct sock *)msk; + u64 old_snd_una; + + mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ + old_snd_una = msk->snd_una; new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ - if (after64(new_snd_una, write_seq)) + if (after64(new_snd_una, snd_nxt)) new_snd_una = old_snd_una; - while (after64(new_snd_una, old_snd_una)) { - snd_una = old_snd_una; - old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, - new_snd_una); - if (old_snd_una == snd_una) { - mptcp_data_acked((struct sock *)msk); - break; - } + new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; + + if (after64(new_wnd_end, msk->wnd_end)) { + msk->wnd_end = new_wnd_end; + __mptcp_wnd_updated(sk, ssk); + } + + if (after64(new_snd_una, old_snd_una)) { + msk->snd_una = new_snd_una; + __mptcp_data_acked(sk); } + mptcp_data_unlock(sk); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) @@ -886,13 +935,30 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; - if (__mptcp_check_fallback(msk)) + if (__mptcp_check_fallback(msk)) { + /* Keep it simple and unconditionally trigger send data cleanup and + * pending queue spooling. We will need to acquire the data lock + * for more accurate checks, and once the lock is acquired, such + * helpers are cheap. + */ + mptcp_data_lock(subflow->conn); + if (mptcp_send_head(subflow->conn)) + __mptcp_wnd_updated(subflow->conn, sk); + __mptcp_data_acked(subflow->conn); + mptcp_data_unlock(subflow->conn); return; + } mptcp_get_options(skb, &mp_opt); if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; + if (mp_opt.fastclose && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } + if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { struct mptcp_addr_info addr; @@ -930,7 +996,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) * monodirectional flows will stuck */ if (mp_opt.use_ack) - update_una(msk, &mp_opt); + ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not @@ -975,7 +1041,24 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } } -void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) +static void mptcp_set_rwin(const struct tcp_sock *tp) +{ + const struct sock *ssk = (const struct sock *)tp; + const struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + u64 ack_seq; + + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + + ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + + if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); +} + +void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, + struct mptcp_out_options *opts) { if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions) { @@ -1014,44 +1097,66 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) } mp_capable_done: - if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { - if (opts->ahmac) - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR, 0, - opts->addr_id); - else - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR_BASE, - MPTCP_ADDR_ECHO, - opts->addr_id); - memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); - ptr += 1; + if ((OPTION_MPTCP_ADD_ADDR +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + | OPTION_MPTCP_ADD_ADDR6 +#endif + ) & opts->suboptions) { + u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; + u8 echo = MPTCP_ADDR_ECHO; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) + len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; +#endif + + if (opts->port) + len += TCPOLEN_MPTCP_PORT_LEN; + if (opts->ahmac) { - put_unaligned_be64(opts->ahmac, ptr); - ptr += 2; + len += sizeof(opts->ahmac); + echo = 0; } - } + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + len, echo, opts->addr_id); + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + ptr += 1; + } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { - if (opts->ahmac) - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR6, 0, - opts->addr_id); - else - *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - TCPOLEN_MPTCP_ADD_ADDR6_BASE, - MPTCP_ADDR_ECHO, - opts->addr_id); - memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); - ptr += 4; - if (opts->ahmac) { - put_unaligned_be64(opts->ahmac, ptr); - ptr += 2; + else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { + memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + ptr += 4; } - } #endif + if (!opts->port) { + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } else { + if (opts->ahmac) { + u8 *bptr = (u8 *)ptr; + + put_unaligned_be16(opts->port, bptr); + bptr += 2; + put_unaligned_be64(opts->ahmac, bptr); + bptr += 8; + put_unaligned_be16(TCPOPT_NOP << 8 | + TCPOPT_NOP, bptr); + + ptr += 3; + } else { + put_unaligned_be32(opts->port << 16 | + TCPOPT_NOP << 8 | + TCPOPT_NOP, ptr); + ptr += 1; + } + } + } + if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE, @@ -1132,4 +1237,7 @@ mp_capable_done: TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } } + + if (tp) + mptcp_set_rwin(tp); } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e19e1525ecbb..da2ed576f289 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -14,22 +14,43 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo) + bool echo, bool port) { + u8 add_addr = READ_ONCE(msk->pm.addr_signal); + pr_debug("msk=%p, local_id=%d", msk, addr->id); + if (add_addr) { + pr_warn("addr_signal error, add_addr=%d", add_addr); + return -EINVAL; + } + msk->pm.local = *addr; - WRITE_ONCE(msk->pm.add_addr_echo, echo); - WRITE_ONCE(msk->pm.add_addr_signal, true); + add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); + if (echo) + add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); + if (addr->family == AF_INET6) + add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); + if (port) + add_addr |= BIT(MPTCP_ADD_ADDR_PORT); + WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) { + u8 rm_addr = READ_ONCE(msk->pm.addr_signal); + pr_debug("msk=%p, local_id=%d", msk, local_id); + if (rm_addr) { + pr_warn("addr_signal error, rm_addr=%d", rm_addr); + return -EINVAL; + } + msk->pm.rm_id = local_id; - WRITE_ONCE(msk->pm.rm_addr_signal, true); + rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); + WRITE_ONCE(msk->pm.addr_signal, rm_addr); return 0; } @@ -89,8 +110,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return false; msk->pm.status |= BIT(new_status); - if (schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + mptcp_schedule_work((struct sock *)msk); return true; } @@ -106,8 +126,14 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); - if (READ_ONCE(pm->work_pending)) + /* mptcp_pm_fully_established() can be invoked by multiple + * racing paths - accept() and check_fully_established() + * be sure to serve this event only once. + */ + if (READ_ONCE(pm->work_pending) && + !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); + msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); } @@ -150,14 +176,25 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_lock_bh(&pm->lock); - if (!READ_ONCE(pm->accept_addr)) - mptcp_pm_announce_addr(msk, addr, true); - else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) + if (!READ_ONCE(pm->accept_addr)) { + mptcp_pm_announce_addr(msk, addr, true, addr->port); + mptcp_pm_add_addr_send_ack(msk); + } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; + } spin_unlock_bh(&pm->lock); } +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) +{ + if (!mptcp_pm_should_add_signal_ipv6(msk) && + !mptcp_pm_should_add_signal_port(msk)) + return; + + mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); +} + void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) { struct mptcp_pm_data *pm = &msk->pm; @@ -173,7 +210,7 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) /* path manager helpers */ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo) + struct mptcp_addr_info *saddr, bool *echo, bool *port) { int ret = false; @@ -183,13 +220,14 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; - *echo = READ_ONCE(msk->pm.add_addr_echo); + *echo = mptcp_pm_should_add_signal_echo(msk); + *port = mptcp_pm_should_add_signal_port(msk); - if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo)) + if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo, *port)) goto out_unlock; *saddr = msk->pm.local; - WRITE_ONCE(msk->pm.add_addr_signal, false); + WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; out_unlock: @@ -212,7 +250,7 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, goto out_unlock; *rm_id = msk->pm.rm_id; - WRITE_ONCE(msk->pm.rm_addr_signal, false); + WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; out_unlock: @@ -233,11 +271,9 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.subflows = 0; msk->pm.rm_id = 0; WRITE_ONCE(msk->pm.work_pending, false); - WRITE_ONCE(msk->pm.add_addr_signal, false); - WRITE_ONCE(msk->pm.rm_addr_signal, false); + WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); WRITE_ONCE(msk->pm.accept_subflow, false); - WRITE_ONCE(msk->pm.add_addr_echo, false); msk->pm.status = 0; spin_lock_init(&msk->pm.lock); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 0d6f3d912891..a6d983d80576 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -135,7 +135,7 @@ select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry, *ret = NULL; rcu_read_lock(); - spin_lock_bh(&msk->join_list_lock); + __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -144,13 +144,11 @@ select_local_address(const struct pm_nl_pernet *pernet, * pending join */ if (entry->addr.family == ((struct sock *)msk)->sk_family && - !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && - !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { ret = entry; break; } } - spin_unlock_bh(&msk->join_list_lock); rcu_read_unlock(); return ret; } @@ -227,12 +225,14 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!mptcp_pm_should_add_signal(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_announce_addr(msk, &entry->addr, false, entry->addr.port); + mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); spin_unlock_bh(&msk->pm.lock); @@ -264,6 +264,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); if (lookup_anno_list_by_saddr(msk, &entry->addr)) return false; @@ -279,7 +280,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, jiffies + TCP_RTO_MAX); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); return true; } @@ -309,7 +311,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct mptcp_pm_addr_entry *local; struct pm_nl_pernet *pernet; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + pernet = net_generic(sock_net(sk), pm_nl_pernet_id); pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, msk->pm.local_addr_max, @@ -324,7 +326,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_announce_addr(msk, &local->addr, false, local->addr.port); + mptcp_pm_nl_add_addr_send_ack(msk); } } else { /* pick failed, avoid fourther attempts later */ @@ -371,6 +374,7 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk; struct mptcp_addr_info remote; struct mptcp_addr_info local; + bool use_port = false; pr_debug("accepted %d:%d remote family %d", msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max, @@ -387,14 +391,51 @@ void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; if (!remote.port) remote.port = sk->sk_dport; + else + use_port = true; memset(&local, 0, sizeof(local)); local.family = remote.family; spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect((struct sock *)msk, &local, &remote); + __mptcp_subflow_connect(sk, &local, &remote); spin_lock_bh(&msk->pm.lock); - mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_announce_addr(msk, &remote, true, use_port); + mptcp_pm_nl_add_addr_send_ack(msk); +} + +void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!mptcp_pm_should_add_signal_ipv6(msk) && + !mptcp_pm_should_add_signal_port(msk)) + return; + + __mptcp_flush_join_list(msk); + subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); + if (subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 add_addr; + + spin_unlock_bh(&msk->pm.lock); + if (mptcp_pm_should_add_signal_ipv6(msk)) + pr_debug("send ack for add_addr6"); + if (mptcp_pm_should_add_signal_port(msk)) + pr_debug("send ack for add_addr_port"); + + lock_sock(ssk); + tcp_send_ack(ssk); + release_sock(ssk); + spin_lock_bh(&msk->pm.lock); + + add_addr = READ_ONCE(msk->pm.addr_signal); + if (mptcp_pm_should_add_signal_ipv6(msk)) + add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); + if (mptcp_pm_should_add_signal_port(msk)) + add_addr &= ~BIT(MPTCP_ADD_ADDR_PORT); + WRITE_ONCE(msk->pm.addr_signal, add_addr); + } } void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) @@ -413,14 +454,13 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (msk->pm.rm_id != subflow->remote_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.add_addr_accepted--; @@ -449,14 +489,13 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - long timeout = 0; if (rm_id != subflow->local_id) continue; spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); msk->pm.local_addr_used--; @@ -826,13 +865,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void __flush_addrs(struct pm_nl_pernet *pernet) +static void __flush_addrs(struct net *net, struct list_head *list) { - while (!list_empty(&pernet->local_addr_list)) { + while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; - cur = list_entry(pernet->local_addr_list.next, + cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); + mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); list_del_rcu(&cur->list); kfree_rcu(cur, rcu); } @@ -849,11 +889,13 @@ static void __reset_counters(struct pm_nl_pernet *pernet) static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - __flush_addrs(pernet); + list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); spin_unlock_bh(&pernet->lock); + __flush_addrs(sock_net(skb->sk), &free_list); return 0; } @@ -1115,10 +1157,12 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net_generic(net, pm_nl_pernet_id)); + __flush_addrs(net, &pernet->local_addr_list); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 88f2a7a0ccb8..b812aaae8044 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #include <net/transp_v6.h> #endif #include <net/mptcp.h> +#include <net/xfrm.h> #include "protocol.h" #include "mib.h" @@ -41,6 +42,9 @@ struct mptcp_skb_cb { static struct percpu_counter mptcp_sockets_allocated; +static void __mptcp_destroy_sock(struct sock *sk); +static void __mptcp_check_send_data_fin(struct sock *sk); + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. @@ -53,6 +57,12 @@ static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) return msk->subflow; } +/* Returns end sequence number of the receiver's advertised window */ +static u64 mptcp_wnd_end(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->wnd_end); +} + static bool mptcp_is_tcpsk(struct sock *sk) { struct socket *sock = sk->sk_socket; @@ -102,6 +112,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) msk->subflow = ssock; subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); + sock_hold(ssock->sk); subflow->request_mptcp = 1; /* accept() will wait on first subflow sk_wq, and we always wakes up @@ -157,18 +168,19 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) struct rb_node **p, *parent; u64 seq, end_seq, max_seq; struct sk_buff *skb1; - int space; seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - space = tcp_space(sk); - max_seq = space > 0 ? space + msk->ack_seq : msk->ack_seq; + max_seq = READ_ONCE(msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); - if (after64(seq, max_seq)) { + if (after64(end_seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); + pr_debug("oow by %lld, rcv_wnd_sent %llu\n", + (unsigned long long)end_seq - (unsigned long)max_seq, + (unsigned long long)msk->rcv_wnd_sent); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -323,17 +335,35 @@ static void mptcp_stop_timer(struct sock *sk) mptcp_sk(sk)->timer_ival = 0; } -static void mptcp_check_data_fin_ack(struct sock *sk) +static void mptcp_close_wake_up(struct sock *sk) +{ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); +} + +static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (__mptcp_check_fallback(msk)) - return; + return !__mptcp_check_fallback(msk) && + ((1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && + msk->write_seq == READ_ONCE(msk->snd_una); +} + +static void mptcp_check_data_fin_ack(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ - if (((1 << sk->sk_state) & - (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && - msk->write_seq == atomic64_read(&msk->snd_una)) { + if (mptcp_pending_data_fin_ack(sk)) { mptcp_stop_timer(sk); WRITE_ONCE(msk->snd_data_fin_enable, 0); @@ -341,20 +371,14 @@ static void mptcp_check_data_fin_ack(struct sock *sk) switch (sk->sk_state) { case TCP_FIN_WAIT1: inet_sk_state_store(sk, TCP_FIN_WAIT2); - sk->sk_state_change(sk); break; case TCP_CLOSING: case TCP_LAST_ACK: inet_sk_state_store(sk, TCP_CLOSE); - sk->sk_state_change(sk); break; } - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_close_wake_up(sk); } } @@ -388,13 +412,79 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static void mptcp_check_data_fin(struct sock *sk) +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} + +static bool tcp_can_send_ack(const struct sock *ssk) +{ + return !((1 << inet_sk_state_load(ssk)) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE)); +} + +static void mptcp_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); + release_sock(ssk); + } +} + +static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk) +{ + int ret; + + lock_sock(ssk); + ret = tcp_can_send_ack(ssk); + if (ret) + tcp_cleanup_rbuf(ssk, 1); + release_sock(ssk); + return ret; +} + +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +{ + struct sock *ack_hint = READ_ONCE(msk->ack_hint); + struct mptcp_subflow_context *subflow; + + /* if the hinted ssk is still active, try to use it */ + if (likely(ack_hint)) { + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk)) + return; + } + } + + /* otherwise pick the first active subflow */ + mptcp_for_each_subflow(msk, subflow) + if (mptcp_subflow_cleanup_rbuf(mptcp_subflow_tcp_sock(subflow))) + return; +} + +static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; + bool ret = false; if (__mptcp_check_fallback(msk) || !msk->first) - return; + return ret; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. @@ -410,8 +500,6 @@ static void mptcp_check_data_fin(struct sock *sk) */ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { - struct mptcp_subflow_context *subflow; - WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); @@ -428,7 +516,6 @@ static void mptcp_check_data_fin(struct sock *sk) break; case TCP_FIN_WAIT2: inet_sk_state_store(sk, TCP_CLOSE); - // @@ Close subflows now? break; default: /* Other states not expected */ @@ -436,23 +523,12 @@ static void mptcp_check_data_fin(struct sock *sk) break; } + ret = true; mptcp_set_timeout(sk, NULL); - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - lock_sock(ssk); - tcp_send_ack(ssk); - release_sock(ssk); - } - - sk->sk_state_change(sk); - - if (sk->sk_shutdown == SHUTDOWN_MASK || - sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + mptcp_send_ack(msk); + mptcp_close_wake_up(sk); } + return ret; } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, @@ -464,12 +540,22 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; - u32 old_copied_seq; bool done = false; + int sk_rbuf; + + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + + if (unlikely(ssk_rbuf > sk_rbuf)) { + WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); + sk_rbuf = ssk_rbuf; + } + } pr_debug("msk=%p ssk=%p", msk, ssk); tp = tcp_sk(ssk); - old_copied_seq = tp->copied_seq; do { u32 map_remaining, offset; u32 seq = tp->copied_seq; @@ -528,20 +614,18 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) { + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } } while (more_data_avail); + WRITE_ONCE(msk->ack_hint, ssk); *bytes += moved; - if (tp->copied_seq != old_copied_seq) - tcp_cleanup_rbuf(ssk, 1); - return done; } -static bool mptcp_ofo_queue(struct mptcp_sock *msk) +static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; @@ -587,43 +671,43 @@ static bool mptcp_ofo_queue(struct mptcp_sock *msk) /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ -static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) +static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; - if (READ_ONCE(sk->sk_lock.owned)) - return false; - - if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock))) - return false; - - /* must re-check after taking the lock */ - if (!READ_ONCE(sk->sk_lock.owned)) { - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - mptcp_ofo_queue(msk); + if (inet_sk_state_load(sk) == TCP_CLOSE) + return; - /* If the moves have caught up with the DATA_FIN sequence number - * it's time to ack the DATA_FIN and change socket state, but - * this is not a good place to change state. Let the workqueue - * do it. - */ - if (mptcp_pending_data_fin(sk, NULL) && - schedule_work(&msk->work)) - sock_hold(sk); - } + mptcp_data_lock(sk); - spin_unlock_bh(&sk->sk_lock.slock); + __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + __mptcp_ofo_queue(msk); - return moved > 0; + /* If the moves have caught up with the DATA_FIN sequence number + * it's time to ack the DATA_FIN and change socket state, but + * this is not a good place to change state. Let the workqueue + * do it. + */ + if (mptcp_pending_data_fin(sk, NULL)) + mptcp_schedule_work(sk); + mptcp_data_unlock(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); + int sk_rbuf, ssk_rbuf; bool wake; + /* The peer can send data while we are shutting down this + * subflow at msk destruction time, but we must avoid enqueuing + * more data to the msk receive queue + */ + if (unlikely(subflow->disposable)) + return; + /* move_skbs_to_msk below can legitly clear the data_avail flag, * but we will need later to properly woke the reader, cache its * value @@ -632,30 +716,23 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (wake) set_bit(MPTCP_DATA_READY, &msk->flags); - if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) && - move_skbs_to_msk(msk, ssk)) - goto wake; + ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); + sk_rbuf = READ_ONCE(sk->sk_rcvbuf); + if (unlikely(ssk_rbuf > sk_rbuf)) + sk_rbuf = ssk_rbuf; - /* don't schedule if mptcp sk is (still) over limit */ - if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) + /* over limit? can't append more skbs to msk */ + if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) goto wake; - /* mptcp socket is owned, release_cb should retry */ - if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, - &sk->sk_tsq_flags)) { - sock_hold(sk); + move_skbs_to_msk(msk, ssk); - /* need to try again, its possible release_cb() has already - * been called after the test_and_set_bit() above. - */ - move_skbs_to_msk(msk, ssk); - } wake: if (wake) sk->sk_data_ready(sk); } -static void __mptcp_flush_join_list(struct mptcp_sock *msk) +void __mptcp_flush_join_list(struct mptcp_sock *msk) { if (likely(list_empty(&msk->join_list))) return; @@ -675,6 +752,10 @@ static void mptcp_reset_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; + /* prevent rescheduling on close */ + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + return; + /* should never be called with mptcp level timer cleared */ tout = READ_ONCE(mptcp_sk(sk)->timer_ival); if (WARN_ON_ONCE(!tout)) @@ -682,23 +763,23 @@ static void mptcp_reset_timer(struct sock *sk) sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } -void mptcp_data_acked(struct sock *sk) +bool mptcp_schedule_work(struct sock *sk) { - mptcp_reset_timer(sk); - - if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) || - (inet_sk_state_load(sk) != TCP_ESTABLISHED)) && - schedule_work(&mptcp_sk(sk)->work)) + if (inet_sk_state_load(sk) != TCP_CLOSE && + schedule_work(&mptcp_sk(sk)->work)) { + /* each subflow already holds a reference to the sk, and the + * workqueue is invoked by a subflow, so sk can't go away here. + */ sock_hold(sk); + return true; + } + return false; } void mptcp_subflow_eof(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - - if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && - schedule_work(&msk->work)) - sock_hold(sk); + if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); } static void mptcp_check_for_eof(struct mptcp_sock *msk) @@ -709,8 +790,10 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) receivers += !subflow->rx_eof; + if (receivers) + return; - if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + if (!(sk->sk_shutdown & RCV_SHUTDOWN)) { /* hopefully temporary hack: propagate shutdown status * to msk, when all subflows agree on it */ @@ -720,16 +803,21 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } -} - -static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) -{ - const struct sock *sk = (const struct sock *)msk; - if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); - - return !!msk->cached_ext; + switch (sk->sk_state) { + case TCP_ESTABLISHED: + inet_sk_state_store(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + inet_sk_state_store(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + inet_sk_state_store(sk, TCP_CLOSE); + break; + default: + return; + } + mptcp_close_wake_up(sk); } static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) @@ -754,8 +842,11 @@ static bool mptcp_skb_can_collapse_to(u64 write_seq, if (!tcp_skb_can_collapse_to(skb)) return false; - /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == write_seq; + /* can collapse only if MPTCP level sequence is in order and this + * mapping has not been xmitted yet + */ + return mpext && mpext->data_seq + mpext->data_len == write_seq && + !mpext->frozen; } static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, @@ -763,9 +854,125 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && + pfrag->size - pfrag->offset > 0 && df->data_seq + df->data_len == msk->write_seq; } +static int mptcp_wmem_with_overhead(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int ret, skbs; + + ret = size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); + skbs = (msk->tx_pending_data + size) / msk->size_goal_cache; + if (skbs < msk->skb_tx_cache.qlen) + return ret; + + return ret + (skbs - msk->skb_tx_cache.qlen) * SKB_TRUESIZE(MAX_TCP_HEADER); +} + +static void __mptcp_wmem_reserve(struct sock *sk, int size) +{ + int amount = mptcp_wmem_with_overhead(sk, size); + struct mptcp_sock *msk = mptcp_sk(sk); + + WARN_ON_ONCE(msk->wmem_reserved); + if (amount <= sk->sk_forward_alloc) + goto reserve; + + /* under memory pressure try to reserve at most a single page + * otherwise try to reserve the full estimate and fallback + * to a single page before entering the error path + */ + if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) || + !sk_wmem_schedule(sk, amount)) { + if (amount <= PAGE_SIZE) + goto nomem; + + amount = PAGE_SIZE; + if (!sk_wmem_schedule(sk, amount)) + goto nomem; + } + +reserve: + msk->wmem_reserved = amount; + sk->sk_forward_alloc -= amount; + return; + +nomem: + /* we will wait for memory on next allocation */ + msk->wmem_reserved = -1; +} + +static void __mptcp_update_wmem(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->wmem_reserved) + return; + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + if (msk->wmem_reserved > 0) { + sk->sk_forward_alloc += msk->wmem_reserved; + msk->wmem_reserved = 0; + } +} + +static bool mptcp_wmem_alloc(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* check for pre-existing error condition */ + if (msk->wmem_reserved < 0) + return false; + + if (msk->wmem_reserved >= size) + goto account; + + mptcp_data_lock(sk); + if (!sk_wmem_schedule(sk, size)) { + mptcp_data_unlock(sk); + return false; + } + + sk->sk_forward_alloc -= size; + msk->wmem_reserved += size; + mptcp_data_unlock(sk); + +account: + msk->wmem_reserved -= size; + return true; +} + +static void mptcp_wmem_uncharge(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (msk->wmem_reserved < 0) + msk->wmem_reserved = 0; + msk->wmem_reserved += size; +} + +static void mptcp_mem_reclaim_partial(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + /* if we are experiencing a transint allocation error, + * the forward allocation memory has been already + * released + */ + if (msk->wmem_reserved < 0) + return; + + mptcp_data_lock(sk); + sk->sk_forward_alloc += msk->wmem_reserved; + sk_mem_reclaim_partial(sk); + msk->wmem_reserved = sk->sk_forward_alloc; + sk->sk_forward_alloc = 0; + mptcp_data_unlock(sk); +} + static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); @@ -781,21 +988,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } -static bool mptcp_is_writeable(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - if (!sk_stream_is_writeable((struct sock *)msk)) - return false; - - mptcp_for_each_subflow(msk, subflow) { - if (sk_stream_is_writeable(subflow->tcp_sock)) - return true; - } - return false; -} - -static void mptcp_clean_una(struct sock *sk) +static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; @@ -806,13 +999,15 @@ static void mptcp_clean_una(struct sock *sk) * plain TCP */ if (__mptcp_check_fallback(msk)) - atomic64_set(&msk->snd_una, msk->write_seq); - snd_una = atomic64_read(&msk->snd_una); + msk->snd_una = READ_ONCE(msk->snd_nxt); + snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; + if (WARN_ON_ONCE(dfrag == msk->first_pending)) + break; dfrag_clear(sk, dfrag); cleaned = true; } @@ -821,12 +1016,13 @@ static void mptcp_clean_una(struct sock *sk) if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; - if (WARN_ON_ONCE(delta > dfrag->data_len)) + if (WARN_ON_ONCE(delta > dfrag->already_sent)) goto out; dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; + dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); cleaned = true; @@ -834,19 +1030,42 @@ static void mptcp_clean_una(struct sock *sk) out: if (cleaned) { - sk_mem_reclaim_partial(sk); - - /* Only wake up writers if a subflow is ready */ - if (mptcp_is_writeable(msk)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags); - smp_mb__after_atomic(); + if (tcp_under_memory_pressure(sk)) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } - /* set SEND_SPACE before sk_stream_write_space clears - * NOSPACE - */ - sk_stream_write_space(sk); + if (sk_stream_is_writeable(sk)) { + /* pairs with memory barrier in mptcp_poll */ + smp_mb(); + if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags)) + sk_stream_write_space(sk); } } + + if (snd_una == READ_ONCE(msk->snd_nxt)) { + if (msk->timer_ival) + mptcp_stop_timer(sk); + } else { + mptcp_reset_timer(sk); + } +} + +static void mptcp_enter_memory_pressure(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + bool first = true; + + sk_stream_moderate_sndbuf(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (first) + tcp_enter_memory_pressure(ssk); + sk_stream_moderate_sndbuf(ssk); + first = false; + } } /* ensure we get enough memory for the frag hdr, beyond some minimal amount of @@ -858,8 +1077,7 @@ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) pfrag, sk->sk_allocation))) return true; - sk->sk_prot->enter_memory_pressure(sk); - sk_stream_moderate_sndbuf(sk); + mptcp_enter_memory_pressure(sk); return false; } @@ -875,149 +1093,241 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); + dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } -static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, struct mptcp_data_frag *dfrag, - long *timeo, int *pmss_now, - int *ps_goal) +struct mptcp_sendmsg_info { + int mss_now; + int size_goal; + u16 limit; + u16 sent; + unsigned int flags; +}; + +static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, + int avail_size) +{ + u64 window_end = mptcp_wnd_end(msk); + + if (__mptcp_check_fallback(msk)) + return avail_size; + + if (!before64(data_seq + avail_size, window_end)) { + u64 allowed_size = window_end - data_seq; + + return min_t(unsigned int, allowed_size, avail_size); + } + + return avail_size; +} + +static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) +{ + struct skb_ext *mpext = __skb_ext_alloc(gfp); + + if (!mpext) + return false; + __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); + return true; +} + +static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) +{ + struct sk_buff *skb; + + skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); + if (likely(skb)) { + if (likely(__mptcp_add_ext(skb, gfp))) { + skb_reserve(skb, MAX_TCP_HEADER); + skb->reserved_tailroom = skb->end - skb->tail; + return skb; + } + __kfree_skb(skb); + } else { + mptcp_enter_memory_pressure(sk); + } + return NULL; +} + +static bool mptcp_tx_cache_refill(struct sock *sk, int size, + struct sk_buff_head *skbs, int *total_ts) { - int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; - bool dfrag_collapsed, can_collapse = false; struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_ext *mpext = NULL; - bool retransmission = !!dfrag; - struct sk_buff *skb, *tail; - struct page_frag *pfrag; - struct page *page; - u64 *write_seq; - size_t psize; - - /* use the mptcp page cache so that we can easily move the data - * from one substream to another, but do per subflow memory accounting - * Note: pfrag is used only !retransmission, but the compiler if - * fooled into a warning if we don't init here - */ - pfrag = sk_page_frag(sk); - if (!retransmission) { - write_seq = &msk->write_seq; - page = pfrag->page; + struct sk_buff *skb; + int space_needed; + + if (unlikely(tcp_under_memory_pressure(sk))) { + mptcp_mem_reclaim_partial(sk); + + /* under pressure pre-allocate at most a single skb */ + if (msk->skb_tx_cache.qlen) + return true; + space_needed = msk->size_goal_cache; } else { - write_seq = &dfrag->data_seq; - page = dfrag->page; + space_needed = msk->tx_pending_data + size - + msk->skb_tx_cache.qlen * msk->size_goal_cache; } - /* compute copy limit */ - mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); - *pmss_now = mss_now; - *ps_goal = size_goal; - avail_size = size_goal; - skb = tcp_write_queue_tail(ssk); + while (space_needed > 0) { + skb = __mptcp_do_alloc_tx_skb(sk, sk->sk_allocation); + if (unlikely(!skb)) { + /* under memory pressure, try to pass the caller a + * single skb to allow forward progress + */ + while (skbs->qlen > 1) { + skb = __skb_dequeue_tail(skbs); + __kfree_skb(skb); + } + return skbs->qlen > 0; + } + + *total_ts += skb->truesize; + __skb_queue_tail(skbs, skb); + space_needed -= msk->size_goal_cache; + } + return true; +} + +static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + + if (ssk->sk_tx_skb_cache) { + skb = ssk->sk_tx_skb_cache; + if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && + !__mptcp_add_ext(skb, gfp))) + return false; + return true; + } + + skb = skb_peek(&msk->skb_tx_cache); if (skb) { - mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + skb = __skb_dequeue(&msk->skb_tx_cache); + if (WARN_ON_ONCE(!skb)) + return false; + + mptcp_wmem_uncharge(sk, skb->truesize); + ssk->sk_tx_skb_cache = skb; + return true; + } + + /* over memory limit, no point to try to allocate a new skb */ + return false; + } + + skb = __mptcp_do_alloc_tx_skb(sk, gfp); + if (!skb) + return false; + + if (likely(sk_wmem_schedule(ssk, skb->truesize))) { + ssk->sk_tx_skb_cache = skb; + return true; + } + kfree_skb(skb); + return false; +} + +static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) +{ + return !ssk->sk_tx_skb_cache && + !skb_peek(&mptcp_sk(sk)->skb_tx_cache) && + tcp_under_memory_pressure(sk); +} + +static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) +{ + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) + mptcp_mem_reclaim_partial(sk); + return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); +} +static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, + struct mptcp_data_frag *dfrag, + struct mptcp_sendmsg_info *info) +{ + u64 data_seq = dfrag->data_seq + info->sent; + struct mptcp_sock *msk = mptcp_sk(sk); + bool zero_window_probe = false; + struct mptcp_ext *mpext = NULL; + struct sk_buff *skb, *tail; + bool can_collapse = false; + int size_bias = 0; + int avail_size; + size_t ret = 0; + + pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", + msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); + + /* compute send limit */ + info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); + avail_size = info->size_goal; + msk->size_goal_cache = info->size_goal; + skb = tcp_write_queue_tail(ssk); + if (skb) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ - can_collapse = (size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(*write_seq, skb, mpext); - if (!can_collapse) + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); + can_collapse = (info->size_goal - skb->len > 0) && + mptcp_skb_can_collapse_to(data_seq, skb, mpext); + if (!can_collapse) { TCP_SKB_CB(skb)->eor = 1; - else - avail_size = size_goal - skb->len; - } - - if (!retransmission) { - /* reuse tail pfrag, if possible, or carve a new one from the - * page allocator - */ - dfrag = mptcp_rtx_tail(sk); - offset = pfrag->offset; - dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); - if (!dfrag_collapsed) { - dfrag = mptcp_carve_data_frag(msk, pfrag, offset); - offset = dfrag->offset; - frag_truesize = dfrag->overhead; - } - psize = min_t(size_t, pfrag->size - offset, avail_size); - - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, offset, - min_t(size_t, msg_data_left(msg), - psize), - &msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; - - if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) { - iov_iter_revert(&msg->msg_iter, psize); - return -ENOMEM; + } else { + size_bias = skb->len; + avail_size = info->size_goal - skb->len; } - } else { - offset = dfrag->offset; - psize = min_t(size_t, dfrag->data_len, avail_size); } - /* tell the TCP stack to delay the push so that we can safely - * access the skb after the sendpages call - */ - ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); - if (ret <= 0) { - if (!retransmission) - iov_iter_revert(&msg->msg_iter, psize); - return ret; - } + /* Zero window and all data acked? Probe. */ + avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); + if (avail_size == 0) { + u64 snd_una = READ_ONCE(msk->snd_una); - frag_truesize += ret; - if (!retransmission) { - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); + if (skb || snd_una != msk->snd_nxt) + return 0; + zero_window_probe = true; + data_seq = snd_una - 1; + avail_size = 1; + } - /* send successful, keep track of sent data for mptcp-level - * retransmission - */ - dfrag->data_len += ret; - if (!dfrag_collapsed) { - get_page(dfrag->page); - list_add_tail(&dfrag->list, &msk->rtx_queue); - sk_wmem_queued_add(sk, frag_truesize); - } else { - sk_wmem_queued_add(sk, ret); - } + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; - /* charge data on mptcp rtx queue to the master socket - * Note: we charge such data both to sk and ssk - */ - sk->sk_forward_alloc -= frag_truesize; + ret = info->limit - info->sent; + tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, + dfrag->page, dfrag->offset + info->sent, &ret); + if (!tail) { + tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + return -ENOMEM; } - /* if the tail skb extension is still the cached one, collapsing - * really happened. Note: we can't check for 'same skb' as the sk_buff - * hdr on tail can be transmitted, freed and re-allocated by the - * do_tcp_sendpages() call + /* if the tail skb is still the cached one, collapsing really happened. */ - tail = tcp_write_queue_tail(ssk); - if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) { - WARN_ON_ONCE(!can_collapse); + if (skb == tail) { + TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; + WARN_ON_ONCE(!can_collapse); + WARN_ON_ONCE(zero_window_probe); goto out; } - skb = tcp_write_queue_tail(ssk); - mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext); - msk->cached_ext = NULL; + mpext = skb_ext_find(tail, SKB_EXT_MPTCP); + if (WARN_ON_ONCE(!mpext)) { + /* should never reach here, stream corrupted */ + return -EINVAL; + } memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = *write_seq; + mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -1027,44 +1337,17 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); + if (zero_window_probe) { + mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mpext->frozen = 1; + ret = 0; + tcp_push_pending_frames(ssk); + } out: - if (!retransmission) - pfrag->offset += frag_truesize; - WRITE_ONCE(*write_seq, *write_seq + ret); mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; } -static void mptcp_nospace(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct socket *sock = READ_ONCE(ssk->sk_socket); - - /* enables ssk->write_space() callbacks */ - if (sock) - set_bit(SOCK_NOSPACE, &sock->flags); - } -} - -static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) - return false; - - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); -} - #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ @@ -1089,9 +1372,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, sock_owned_by_me((struct sock *)msk); *sndbuf = 0; - if (!mptcp_ext_cache_refill(msk)) - return NULL; - if (__mptcp_check_fallback(msk)) { if (!msk->first) return NULL; @@ -1154,27 +1434,160 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk, return NULL; } -static void ssk_check_wmem(struct mptcp_sock *msk) +static void mptcp_push_release(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) +{ + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); + release_sock(ssk); +} + +static void mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + u32 sndbuf; + + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + prev_ssk = ssk; + __mptcp_flush_join_list(msk); + ssk = mptcp_subflow_get_send(msk, &sndbuf); + + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, sndbuf); + + /* try to keep the subflow socket lock across + * consecutive xmit on the same socket + */ + if (ssk != prev_ssk && prev_ssk) + mptcp_push_release(sk, prev_ssk, &info); + if (!ssk) + goto out; + + if (ssk != prev_ssk || !prev_ssk) + lock_sock(ssk); + + /* keep it simple and always provide a new skb for the + * subflow, even if we will not use it when collapsing + * on the pending one + */ + if (!mptcp_alloc_tx_skb(sk, ssk)) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) { + mptcp_push_release(sk, ssk, &info); + goto out; + } + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + msk->tx_pending_data -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + + /* at this point we held the socket lock for the last subflow we used */ + if (ssk) + mptcp_push_release(sk, ssk, &info); + +out: + if (copied) { + /* start the timer, if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + __mptcp_check_send_data_fin(sk); + } +} + +static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) { - if (unlikely(!mptcp_is_writeable(msk))) - mptcp_nospace(msk); + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info; + struct mptcp_data_frag *dfrag; + int len, copied = 0; + + info.flags = 0; + while ((dfrag = mptcp_send_head(sk))) { + info.sent = dfrag->already_sent; + info.limit = dfrag->data_len; + len = dfrag->data_len - dfrag->already_sent; + while (len > 0) { + int ret = 0; + + /* do auto tuning */ + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf)) + WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf); + + if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); + } + if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC)) + goto out; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + goto out; + + info.sent += ret; + dfrag->already_sent += ret; + msk->snd_nxt += ret; + msk->snd_burst -= ret; + msk->tx_pending_data -= ret; + copied += ret; + len -= ret; + } + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + +out: + /* __mptcp_alloc_tx_skb could have released some wmem and we are + * not going to flush it via release_sock() + */ + __mptcp_update_wmem(sk); + if (copied) { + mptcp_set_timeout(sk, ssk); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + if (msk->snd_data_fin_enable && + msk->snd_nxt + 1 == msk->write_seq) + mptcp_schedule_work(sk); + } } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { - int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; - struct sock *ssk; - u32 sndbuf; - bool tx_ok; + int ret = 0; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len)); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1185,130 +1598,97 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } pfrag = sk_page_frag(sk); -restart: - mptcp_clean_una(sk); - - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - __mptcp_flush_join_list(msk); - ssk = mptcp_subflow_get_send(msk, &sndbuf); - while (!sk_stream_memory_free(sk) || - !ssk || - !mptcp_page_frag_refill(ssk, pfrag)) { - if (ssk) { - /* make sure retransmit timer is - * running before we wait for memory. - * - * The retransmit timer might be needed - * to make the peer send an up-to-date - * MPTCP Ack. - */ - mptcp_set_timeout(sk, ssk); - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); - } + while (msg_data_left(msg)) { + int total_ts, frag_truesize = 0; + struct mptcp_data_frag *dfrag; + struct sk_buff_head skbs; + bool dfrag_collapsed; + size_t psize, offset; - mptcp_nospace(msk); - ret = sk_stream_wait_memory(sk, &timeo); - if (ret) - goto out; - - mptcp_clean_una(sk); - - ssk = mptcp_subflow_get_send(msk, &sndbuf); - if (list_empty(&msk->conn_list)) { - ret = -ENOTCONN; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + ret = -EPIPE; goto out; } - } - /* do auto tuning */ - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - sndbuf > READ_ONCE(sk->sk_sndbuf)) - WRITE_ONCE(sk->sk_sndbuf, sndbuf); + /* reuse tail pfrag, if possible, or carve a new one from the + * page allocator + */ + dfrag = mptcp_pending_tail(sk); + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + if (!dfrag_collapsed) { + if (!sk_stream_memory_free(sk)) + goto wait_for_memory; - pr_debug("conn_list->subflow=%p", ssk); + if (!mptcp_page_frag_refill(sk, pfrag)) + goto wait_for_memory; - lock_sock(ssk); - tx_ok = msg_data_left(msg); - while (tx_ok) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, - &size_goal); - if (ret < 0) { - if (ret == -EAGAIN && timeo > 0) { - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; - } - break; + dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); + frag_truesize = dfrag->overhead; } - /* burst can be negative, we will try move to the next subflow - * at selection time, if possible. + /* we do not bound vs wspace, to allow a single packet. + * memory accounting will prevent execessive memory usage + * anyway */ - msk->snd_burst -= ret; - copied += ret; - - tx_ok = msg_data_left(msg); - if (!tx_ok) - break; + offset = dfrag->offset + dfrag->data_len; + psize = pfrag->size - offset; + psize = min_t(size_t, psize, msg_data_left(msg)); + total_ts = psize + frag_truesize; + __skb_queue_head_init(&skbs); + if (!mptcp_tx_cache_refill(sk, psize, &skbs, &total_ts)) + goto wait_for_memory; + + if (!mptcp_wmem_alloc(sk, total_ts)) { + __skb_queue_purge(&skbs); + goto wait_for_memory; + } - if (!sk_stream_memory_free(ssk) || - !mptcp_page_frag_refill(ssk, pfrag) || - !mptcp_ext_cache_refill(msk)) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; + skb_queue_splice_tail(&skbs, &msk->skb_tx_cache); + if (copy_page_from_iter(dfrag->page, offset, psize, + &msg->msg_iter) != psize) { + mptcp_wmem_uncharge(sk, psize + frag_truesize); + ret = -EFAULT; + goto out; } - /* memory is charged to mptcp level socket as well, i.e. - * if msg is very large, mptcp socket may run out of buffer - * space. mptcp_clean_una() will release data that has - * been acked at mptcp level in the mean time, so there is - * a good chance we can continue sending data right away. - * - * Normally, when the tcp subflow can accept more data, then - * so can the MPTCP socket. However, we need to cope with - * peers that might lag behind in their MPTCP-level - * acknowledgements, i.e. data might have been acked at - * tcp level only. So, we must also check the MPTCP socket - * limits before we send more data. + /* data successfully copied into the write queue */ + copied += psize; + dfrag->data_len += psize; + frag_truesize += psize; + pfrag->offset += frag_truesize; + WRITE_ONCE(msk->write_seq, msk->write_seq + psize); + + /* charge data on mptcp pending queue to the msk socket + * Note: we charge such data both to sk and ssk */ - if (unlikely(!sk_stream_memory_free(sk))) { - tcp_push(ssk, msg->msg_flags, mss_now, - tcp_sk(ssk)->nonagle, size_goal); - mptcp_clean_una(sk); - if (!sk_stream_memory_free(sk)) { - /* can't send more for now, need to wait for - * MPTCP-level ACKs from peer. - * - * Wakeup will happen via mptcp_clean_una(). - */ - mptcp_set_timeout(sk, ssk); - release_sock(ssk); - goto restart; - } + sk_wmem_queued_add(sk, frag_truesize); + if (!dfrag_collapsed) { + get_page(dfrag->page); + list_add_tail(&dfrag->list, &msk->rtx_queue); + if (!msk->first_pending) + WRITE_ONCE(msk->first_pending, dfrag); } + pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk, + dfrag->data_seq, dfrag->data_len, dfrag->already_sent, + !dfrag_collapsed); + + continue; + +wait_for_memory: + set_bit(MPTCP_NOSPACE, &msk->flags); + mptcp_push_pending(sk, msg->msg_flags); + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) + goto out; } - mptcp_set_timeout(sk, ssk); if (copied) { - tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); - - /* start the timer, if it's not pending */ - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + msk->tx_pending_data += copied; + mptcp_push_pending(sk, msg->msg_flags); } - release_sock(ssk); out: - ssk_check_wmem(msk); release_sock(sk); return copied ? : ret; } @@ -1332,11 +1712,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len) { - struct sock *sk = (struct sock *)msk; struct sk_buff *skb; int copied = 0; - while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + while ((skb = skb_peek(&msk->receive_queue)) != NULL) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); @@ -1356,7 +1735,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } - __skb_unlink(skb, &sk->sk_receive_queue); + /* we will bulk release the skb memory later */ + skb->destructor = NULL; + msk->rmem_released += skb->truesize; + __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); if (copied >= len) @@ -1464,32 +1846,68 @@ new_measure: msk->rcvq_space.time = mstamp; } -static bool __mptcp_move_skbs(struct mptcp_sock *msk) +static void __mptcp_update_rmem(struct sock *sk) { - unsigned int moved = 0; - bool done; + struct mptcp_sock *msk = mptcp_sk(sk); - /* avoid looping forever below on racing close */ - if (((struct sock *)msk)->sk_state == TCP_CLOSE) - return false; + if (!msk->rmem_released) + return; + + atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, msk->rmem_released); + msk->rmem_released = 0; +} + +static void __mptcp_splice_receive_queue(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); +} + +static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv) +{ + struct sock *sk = (struct sock *)msk; + unsigned int moved = 0; + bool ret, done; __mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); + bool slowpath; - if (!ssk) + /* we can have data pending in the subflows only if the msk + * receive buffer was full at subflow_data_ready() time, + * that is an unlikely slow path. + */ + if (likely(!ssk)) break; - lock_sock(ssk); + slowpath = lock_sock_fast(ssk); + mptcp_data_lock(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - release_sock(ssk); + mptcp_data_unlock(sk); + if (moved && rcv) { + WRITE_ONCE(msk->rmem_pending, min(rcv, moved)); + tcp_cleanup_rbuf(ssk, 1); + WRITE_ONCE(msk->rmem_pending, 0); + } + unlock_sock_fast(ssk, slowpath); } while (!done); - if (mptcp_ofo_queue(msk) || moved > 0) { - mptcp_check_data_fin((struct sock *)msk); - return true; + /* acquire the data lock only if some input data is pending */ + ret = moved > 0; + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || + !skb_queue_empty_lockless(&sk->sk_receive_queue)) { + mptcp_data_lock(sk); + __mptcp_update_rmem(sk); + ret |= __mptcp_ofo_queue(msk); + __mptcp_splice_receive_queue(sk); + mptcp_data_unlock(sk); } - return false; + if (ret) + mptcp_check_data_fin((struct sock *)msk); + return !skb_queue_empty(&msk->receive_queue); } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -1503,15 +1921,19 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) return -EOPNOTSUPP; - lock_sock(sk); + mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); + if (unlikely(sk->sk_state == TCP_LISTEN)) { + copied = -ENOTCONN; + goto out_err; + } + timeo = sock_rcvtimeo(sk, nonblock); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - __mptcp_flush_join_list(msk); - while (len > (size_t)copied) { - int bytes_read; + while (copied < len) { + int bytes_read, old_space; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); if (unlikely(bytes_read < 0)) { @@ -1522,10 +1944,15 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&sk->sk_receive_queue) && - __mptcp_move_skbs(msk)) + if (skb_queue_empty(&msk->receive_queue) && + __mptcp_move_skbs(msk, len - copied)) continue; + /* be sure to advertise window change */ + old_space = READ_ONCE(msk->old_wspace); + if ((tcp_space(sk) - old_space) >= old_space) + mptcp_cleanup_rbuf(msk); + /* only the master socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ @@ -1548,8 +1975,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (sk->sk_shutdown & RCV_SHUTDOWN) { + /* race breaker: the shutdown could be after the + * previous receive queue check + */ + if (__mptcp_move_skbs(msk, len - copied)) + continue; break; + } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; @@ -1571,14 +2004,15 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, mptcp_wait_data(sk, &timeo); } - if (skb_queue_empty(&sk->sk_receive_queue)) { + if (skb_queue_empty_lockless(&sk->sk_receive_queue) && + skb_queue_empty(&msk->receive_queue)) { /* entire backlog drained, clear DATA_READY. */ clear_bit(MPTCP_DATA_READY, &msk->flags); /* .. race-breaker: ssk might have gotten new data * after last __mptcp_move_skbs() returned false. */ - if (unlikely(__mptcp_move_skbs(msk))) + if (unlikely(__mptcp_move_skbs(msk, 0))) set_bit(MPTCP_DATA_READY, &msk->flags); } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) { /* data to read but mptcp_wait_data() cleared DATA_READY */ @@ -1587,7 +2021,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, out_err: pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty(&sk->sk_receive_queue), copied); + skb_queue_empty_lockless(&sk->sk_receive_queue), copied); mptcp_rcv_space_adjust(msk, copied); release_sock(sk); @@ -1598,13 +2032,8 @@ static void mptcp_retransmit_handler(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq)) { - mptcp_stop_timer(sk); - } else { - set_bit(MPTCP_WORK_RTX, &msk->flags); - if (schedule_work(&msk->work)) - sock_hold(sk); - } + set_bit(MPTCP_WORK_RTX, &msk->flags); + mptcp_schedule_work(sk); } static void mptcp_retransmit_timer(struct timer_list *t) @@ -1626,6 +2055,14 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } +static void mptcp_timeout_timer(struct timer_list *t) +{ + struct sock *sk = from_timer(sk, t, sk_timer); + + mptcp_schedule_work(sk); + sock_put(sk); +} + /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * @@ -1639,7 +2076,7 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); if (__mptcp_check_fallback(msk)) - return msk->first; + return NULL; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -1648,8 +2085,11 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) continue; /* still data outstanding at TCP level? Don't retransmit. */ - if (!tcp_write_queue_empty(ssk)) + if (!tcp_write_queue_empty(ssk)) { + if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss) + continue; return NULL; + } if (subflow->backup) { if (!backup) @@ -1672,20 +2112,44 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) * parent socket. */ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout) + struct mptcp_subflow_context *subflow) { - struct socket *sock = READ_ONCE(ssk->sk_socket); + bool dispose_socket = false; + struct socket *sock; list_del(&subflow->node); - if (sock && sock != sk->sk_socket) { - /* outgoing subflow */ - sock_release(sock); + lock_sock(ssk); + + /* if we are invoked by the msk cleanup code, the subflow is + * already orphaned + */ + sock = ssk->sk_socket; + if (sock) { + dispose_socket = sock != sk->sk_socket; + sock_orphan(ssk); + } + + subflow->disposable = 1; + + /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops + * the ssk has been already destroyed, we just need to release the + * reference owned by msk; + */ + if (!inet_csk(ssk)->icsk_ulp_ops) { + kfree_rcu(subflow, rcu); } else { - /* incoming subflow */ - tcp_close(ssk, timeout); + /* otherwise tcp will dispose of the ssk and subflow ctx */ + __tcp_close(ssk, 0); + + /* close acquired an extra ref */ + __sock_put(ssk); } + release_sock(ssk); + if (dispose_socket) + iput(SOCK_INODE(sock)); + + sock_put(ssk); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) @@ -1704,6 +2168,10 @@ static void pm_work(struct mptcp_sock *msk) pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); + mptcp_pm_nl_add_addr_send_ack(msk); + } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); mptcp_pm_nl_rm_addr_received(msk); @@ -1730,40 +2198,102 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) if (inet_sk_state_load(ssk) != TCP_CLOSE) continue; - __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + __mptcp_close_ssk((struct sock *)msk, ssk, subflow); + } +} + +static bool mptcp_check_close_timeout(const struct sock *sk) +{ + s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; + struct mptcp_subflow_context *subflow; + + if (delta >= TCP_TIMEWAIT_LEN) + return true; + + /* if all subflows are in closed status don't bother with additional + * timeout + */ + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) != + TCP_CLOSE) + return false; + } + return true; +} + +static void mptcp_check_fastclose(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = &msk->sk.icsk_inet.sk; + + if (likely(!READ_ONCE(msk->rcv_fastclose))) + return; + + mptcp_token_destroy(msk); + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(tcp_sk); + if (tcp_sk->sk_state != TCP_CLOSE) { + tcp_send_active_reset(tcp_sk, GFP_ATOMIC); + tcp_set_state(tcp_sk, TCP_CLOSE); + } + release_sock(tcp_sk); } + + inet_sk_state_store(sk, TCP_CLOSE); + sk->sk_shutdown = SHUTDOWN_MASK; + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); + + mptcp_close_wake_up(sk); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, mss_now = 0, size_goal = 0; + struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - u64 orig_write_seq; size_t copied = 0; - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT, - }; - long timeo = 0; + int state, ret; lock_sock(sk); - mptcp_clean_una(sk); + state = sk->sk_state; + if (unlikely(state == TCP_CLOSE)) + goto unlock; + mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + + mptcp_check_fastclose(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(msk); - __mptcp_move_skbs(msk); - if (msk->pm.status) pm_work(msk); if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) mptcp_check_for_eof(msk); + __mptcp_check_send_data_fin(sk); mptcp_check_data_fin(sk); + /* if the msk data is completely acked, or the socket timedout, + * there is no point in keeping around an orphaned sk + */ + if (sock_flag(sk, SOCK_DEAD) && + (mptcp_check_close_timeout(sk) || + (state != sk->sk_state && + ((1 << inet_sk_state_load(sk)) & (TCPF_CLOSE | TCPF_FIN_WAIT2))))) { + inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_destroy_sock(sk); + goto unlock; + } + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; @@ -1771,39 +2301,30 @@ static void mptcp_worker(struct work_struct *work) if (!dfrag) goto unlock; - if (!mptcp_ext_cache_refill(msk)) - goto reset_unlock; - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; lock_sock(ssk); - orig_len = dfrag->data_len; - orig_offset = dfrag->offset; - orig_write_seq = dfrag->data_seq; - while (dfrag->data_len > 0) { - int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, - &mss_now, &size_goal); - if (ret < 0) + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = dfrag->already_sent; + while (info.sent < dfrag->already_sent) { + if (!mptcp_alloc_tx_skb(sk, ssk)) + break; + + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; - dfrag->data_len -= ret; - dfrag->offset += ret; - - if (!mptcp_ext_cache_refill(msk)) - break; + info.sent += ret; } if (copied) - tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, - size_goal); - - dfrag->data_seq = orig_write_seq; - dfrag->offset = orig_offset; - dfrag->data_len = orig_len; + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); mptcp_set_timeout(sk, ssk); release_sock(ssk); @@ -1826,10 +2347,17 @@ static int __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); - __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); + __skb_queue_head_init(&msk->receive_queue); + __skb_queue_head_init(&msk->skb_tx_cache); msk->out_of_order_queue = RB_ROOT; + msk->first_pending = NULL; + msk->wmem_reserved = 0; + msk->rmem_released = 0; + msk->tx_pending_data = 0; + msk->size_goal_cache = TCP_BASE_MSS; + msk->ack_hint = NULL; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -1837,7 +2365,7 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); - + timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); return 0; } @@ -1871,11 +2399,15 @@ static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; + struct sk_buff *skb; - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); - + WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); + while ((skb = __skb_dequeue(&msk->skb_tx_cache)) != NULL) { + sk->sk_forward_alloc += skb->truesize; + kfree_skb(skb); + } } static void mptcp_cancel_work(struct sock *sk) @@ -1883,7 +2415,7 @@ static void mptcp_cancel_work(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) - sock_put(sk); + __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) @@ -1941,42 +2473,67 @@ static int mptcp_close_state(struct sock *sk) return next & TCP_ACTION_FIN; } -static void mptcp_close(struct sock *sk, long timeout) +static void __mptcp_check_send_data_fin(struct sock *sk) { - struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - LIST_HEAD(conn_list); - lock_sock(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", + msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), + msk->snd_nxt, msk->write_seq); - if (sk->sk_state == TCP_LISTEN) { - inet_sk_state_store(sk, TCP_CLOSE); - goto cleanup; - } else if (sk->sk_state == TCP_CLOSE) { - goto cleanup; - } + /* we still need to enqueue subflows or not really shutting down, + * skip this + */ + if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || + mptcp_send_head(sk)) + return; + + WRITE_ONCE(msk->snd_nxt, msk->write_seq); + /* fallback socket will not get data_fin/ack, can move to the next + * state now + */ if (__mptcp_check_fallback(msk)) { - goto update_state; - } else if (mptcp_close_state(sk)) { - pr_debug("Sending DATA_FIN sk=%p", sk); - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); + if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_close_wake_up(sk); + } else if (sk->sk_state == TCP_FIN_WAIT1) { + inet_sk_state_store(sk, TCP_FIN_WAIT2); + } + } - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); + __mptcp_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - mptcp_subflow_shutdown(sk, tcp_sk, SHUTDOWN_MASK); - } + mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } +} - sk_stream_wait_close(sk, timeout); +static void __mptcp_wr_shutdown(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); -update_state: - inet_sk_state_store(sk, TCP_CLOSE); + pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", + msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, + !!mptcp_send_head(sk)); + + /* will be ignored by fallback sockets */ + WRITE_ONCE(msk->write_seq, msk->write_seq + 1); + WRITE_ONCE(msk->snd_data_fin_enable, 1); + + __mptcp_check_send_data_fin(sk); +} + +static void __mptcp_destroy_sock(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + LIST_HEAD(conn_list); + + pr_debug("msk=%p", msk); -cleanup: /* be sure to always acquire the join list lock, to sync vs * mptcp_finish_join(). */ @@ -1985,20 +2542,77 @@ cleanup: spin_unlock_bh(&msk->join_list_lock); list_splice_init(&msk->conn_list, &conn_list); - __mptcp_clear_xmit(sk); - - release_sock(sk); + sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + sk_stop_timer(sk, &sk->sk_timer); + msk->pm.status = 0; list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - __mptcp_close_ssk(sk, ssk, subflow, timeout); + __mptcp_close_ssk(sk, ssk, subflow); } - mptcp_cancel_work(sk); + sk->sk_prot->destroy(sk); + + WARN_ON_ONCE(msk->wmem_reserved); + WARN_ON_ONCE(msk->rmem_released); + sk_stream_kill_queues(sk); + xfrm_sk_free_policy(sk); + sk_refcnt_debug_release(sk); + sock_put(sk); +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + struct mptcp_subflow_context *subflow; + bool do_cancel_work = false; - __skb_queue_purge(&sk->sk_receive_queue); + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; - sk_common_release(sk); + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { + inet_sk_state_store(sk, TCP_CLOSE); + goto cleanup; + } + + if (mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); + + sk_stream_wait_close(sk, timeout); + +cleanup: + /* orphan all the subflows */ + inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; + list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow, dispose_socket; + struct socket *sock; + + slow = lock_sock_fast(ssk); + sock = ssk->sk_socket; + dispose_socket = sock && sock != sk->sk_socket; + sock_orphan(ssk); + unlock_sock_fast(ssk, slow); + + /* for the outgoing subflows we additionally need to free + * the associated socket + */ + if (dispose_socket) + iput(SOCK_INODE(sock)); + } + sock_orphan(sk); + + sock_hold(sk); + pr_debug("msk=%p state=%d", sk, sk->sk_state); + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + do_cancel_work = true; + } else { + sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + } + release_sock(sk); + if (do_cancel_work) + mptcp_cancel_work(sk); + sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) @@ -2069,13 +2683,17 @@ struct sock *mptcp_sk_clone(const struct sock *sk, WRITE_ONCE(msk->fully_established, false); msk->write_seq = subflow_req->idsn + 1; - atomic64_set(&msk->snd_una, msk->write_seq); + msk->snd_nxt = msk->write_seq; + msk->snd_una = msk->write_seq; + msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + if (mp_opt->mp_capable) { msk->can_ack = true; msk->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -2102,6 +2720,8 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; + + WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, @@ -2126,7 +2746,6 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; - struct sock *ssk = newsk; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; @@ -2141,21 +2760,8 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, /* acquire the 2nd reference for the owning socket */ sock_hold(new_mptcp_sock); - - local_bh_disable(); - bh_lock_sock(new_mptcp_sock); - msk = mptcp_sk(new_mptcp_sock); - msk->first = newsk; - newsk = new_mptcp_sock; - mptcp_copy_inaddrs(newsk, ssk); - list_add(&subflow->node, &msk->conn_list); - - mptcp_rcv_space_init(msk, ssk); - bh_unlock_sock(new_mptcp_sock); - - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - local_bh_enable(); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); @@ -2166,6 +2772,13 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, void mptcp_destroy_common(struct mptcp_sock *msk) { + struct sock *sk = (struct sock *)msk; + + __mptcp_clear_xmit(sk); + + /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ + skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); + skb_rbtree_purge(&msk->out_of_order_queue); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); @@ -2175,9 +2788,6 @@ static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (msk->cached_ext) - __skb_ext_put(msk->cached_ext); - mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } @@ -2292,16 +2902,58 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; } -#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ - TCPF_WRITE_TIMER_DEFERRED) +void __mptcp_data_acked(struct sock *sk) +{ + if (!sock_owned_by_user(sk)) + __mptcp_clean_una(sk); + else + set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags); -/* this is very alike tcp_release_cb() but we must handle differently a - * different set of events - */ + if (mptcp_pending_data_fin_ack(sk)) + mptcp_schedule_work(sk); +} + +void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk) +{ + if (!mptcp_send_head(sk)) + return; + + if (!sock_owned_by_user(sk)) + __mptcp_subflow_push_pending(sk, ssk); + else + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +} + +#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) + +/* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) { unsigned long flags, nflags; + /* push_pending may touch wmem_reserved, do it before the later + * cleanup + */ + if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) + __mptcp_clean_una(sk); + if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) { + /* mptcp_push_pending() acquires the subflow socket lock + * + * 1) can't be invoked in atomic scope + * 2) must avoid ABBA deadlock with msk socket spinlock: the RX + * datapath acquires the msk socket spinlock while helding + * the subflow socket lock + */ + + spin_unlock_bh(&sk->sk_lock.slock); + mptcp_push_pending(sk, 0); + spin_lock_bh(&sk->sk_lock.slock); + } + + /* clear any wmem reservation and errors */ + __mptcp_update_wmem(sk); + __mptcp_update_rmem(sk); + do { flags = sk->sk_tsq_flags; if (!(flags & MPTCP_DEFERRED_ALL)) @@ -2311,15 +2963,6 @@ static void mptcp_release_cb(struct sock *sk) sock_release_ownership(sk); - if (flags & TCPF_DELACK_TIMER_DEFERRED) { - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - ssk = mptcp_subflow_recv_lookup(msk); - if (!ssk || !schedule_work(&msk->work)) - __sock_put(sk); - } - if (flags & TCPF_WRITE_TIMER_DEFERRED) { mptcp_retransmit_handler(sk); __sock_put(sk); @@ -2377,9 +3020,11 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); + WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); - atomic64_set(&msk->snd_una, msk->write_seq); + WRITE_ONCE(msk->snd_una, msk->write_seq); mptcp_pm_new_connection(msk, 0); @@ -2395,9 +3040,9 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } -bool mptcp_finish_join(struct sock *sk) +bool mptcp_finish_join(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; struct socket *parent_sock; @@ -2418,12 +3063,14 @@ bool mptcp_finish_join(struct sock *sk) /* active connections are already on conn_list, and we can't acquire * msk lock here. * use the join list lock as synchronization point and double-check - * msk status to avoid racing with mptcp_close() + * msk status to avoid racing with __mptcp_destroy_sock() */ spin_lock_bh(&msk->join_list_lock); ret = inet_sk_state_load(parent) == TCP_ESTABLISHED; - if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) + if (ret && !WARN_ON_ONCE(!list_empty(&subflow->node))) { list_add_tail(&subflow->node, &msk->join_list); + sock_hold(ssk); + } spin_unlock_bh(&msk->join_list_lock); if (!ret) return false; @@ -2432,19 +3079,12 @@ bool mptcp_finish_join(struct sock *sk) * at close time */ parent_sock = READ_ONCE(parent->sk_socket); - if (parent_sock && !sk->sk_socket) - mptcp_sock_graft(sk, parent_sock); + if (parent_sock && !ssk->sk_socket) + mptcp_sock_graft(ssk, parent_sock); subflow->map_seq = READ_ONCE(msk->ack_seq); return true; } -static bool mptcp_memory_free(const struct sock *sk, int wake) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true; -} - static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -2465,7 +3105,6 @@ static struct proto mptcp_prot = { .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, - .stream_memory_free = mptcp_memory_free, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, @@ -2610,6 +3249,23 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (err == 0 && !mptcp_is_tcpsk(newsock->sk)) { struct mptcp_sock *msk = mptcp_sk(newsock->sk); struct mptcp_subflow_context *subflow; + struct sock *newsk = newsock->sk; + bool slowpath; + + slowpath = lock_sock_fast(newsk); + + /* PM/worker can now acquire the first subflow socket + * lock without racing with listener queue cleanup, + * we can notify it, if needed. + */ + subflow = mptcp_subflow_ctx(msk->first); + list_add(&subflow->node, &msk->conn_list); + sock_hold(msk->first); + if (mptcp_is_fully_established(newsk)) + mptcp_pm_fully_established(msk); + + mptcp_copy_inaddrs(newsk, msk->first); + mptcp_rcv_space_init(msk, msk->first); /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. @@ -2621,6 +3277,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + unlock_sock_fast(newsk, slowpath); } if (inet_csk_listen_poll(ssock->sk)) @@ -2639,6 +3296,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk) 0; } +static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + + if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN)) + return 0; + + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; + + set_bit(MPTCP_NOSPACE, &msk->flags); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + if (sk_stream_is_writeable(sk)) + return EPOLLOUT | EPOLLWRNORM; + + return 0; +} + static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { @@ -2657,8 +3332,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); - if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) - mask |= EPOLLOUT | EPOLLWRNORM; + mask |= mptcp_check_writeable(msk); } if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; @@ -2669,12 +3343,12 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct mptcp_subflow_context *subflow; + struct sock *sk = sock->sk; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); - lock_sock(sock->sk); + lock_sock(sk); how++; if ((how & ~SHUTDOWN_MASK) || !how) { @@ -2683,45 +3357,22 @@ static int mptcp_shutdown(struct socket *sock, int how) } if (sock->state == SS_CONNECTING) { - if ((1 << sock->sk->sk_state) & + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) sock->state = SS_DISCONNECTING; else sock->state = SS_CONNECTED; } - /* If we've already sent a FIN, or it's a closed state, skip this. */ - if (__mptcp_check_fallback(msk)) { - if (how == SHUT_WR || how == SHUT_RDWR) - inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } - } else if ((how & SEND_SHUTDOWN) && - ((1 << sock->sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_SYN_SENT | - TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) && - mptcp_close_state(sock->sk)) { - __mptcp_flush_join_list(msk); - - WRITE_ONCE(msk->write_seq, msk->write_seq + 1); - WRITE_ONCE(msk->snd_data_fin_enable, 1); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); - - mptcp_subflow_shutdown(sock->sk, tcp_sk, how); - } - } + sk->sk_shutdown |= how; + if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); /* Wake up anyone sleeping in poll. */ - sock->sk->sk_state_change(sock->sk); + sk->sk_state_change(sk); out_unlock: - release_sock(sock->sk); + release_sock(sk); return ret; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 13ab89dc1914..7cf9d110b85f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -23,6 +23,7 @@ #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) +#define OPTION_MPTCP_FASTCLOSE BIT(9) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -49,15 +50,16 @@ #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 -#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 20 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 -#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 12 #define TCPOLEN_MPTCP_ADD_ADDR6 28 -#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 32 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 -#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 -#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24 +#define TCPOLEN_MPTCP_PORT_LEN 4 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_FASTCLOSE 12 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -86,11 +88,20 @@ /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 -#define MPTCP_SEND_SPACE 1 +#define MPTCP_NOSPACE 1 #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 #define MPTCP_WORK_CLOSE_SUBFLOW 5 +#define MPTCP_PUSH_PENDING 6 +#define MPTCP_CLEAN_UNA 7 + +static inline bool before64(__u64 seq1, __u64 seq2) +{ + return (__s64)(seq1 - seq2) < 0; +} + +#define after64(seq2, seq1) before64(seq1, seq2) struct mptcp_options_received { u64 sndr_key; @@ -101,6 +112,7 @@ struct mptcp_options_received { u16 data_len; u16 mp_capable : 1, mp_join : 1, + fastclose : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -110,7 +122,7 @@ struct mptcp_options_received { u32 token; u32 nonce; u64 thmac; - u8 hmac[20]; + u8 hmac[MPTCPOPT_HMAC_LEN]; u8 join_id; u8 use_map:1, dsn64:1, @@ -153,11 +165,21 @@ struct mptcp_addr_info { enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, + MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, + MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_SUBFLOW_ESTABLISHED, }; +enum mptcp_addr_signal_status { + MPTCP_ADD_ADDR_SIGNAL, + MPTCP_ADD_ADDR_ECHO, + MPTCP_ADD_ADDR_IPV6, + MPTCP_ADD_ADDR_PORT, + MPTCP_RM_ADDR_SIGNAL, +}; + struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; @@ -165,13 +187,11 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ - bool add_addr_signal; - bool rm_addr_signal; + u8 addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; - bool add_addr_echo; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; @@ -187,9 +207,10 @@ struct mptcp_pm_data { struct mptcp_data_frag { struct list_head list; u64 data_seq; - int data_len; - int offset; - int overhead; + u16 data_len; + u16 offset; + u16 overhead; + u16 already_sent; struct page *page; }; @@ -200,27 +221,40 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 snd_nxt; u64 ack_seq; + u64 rcv_wnd_sent; u64 rcv_data_fin_seq; + int wmem_reserved; struct sock *last_snd; int snd_burst; - atomic64_t snd_una; + int old_wspace; + u64 snd_una; + u64 wnd_end; unsigned long timer_ival; u32 token; + int rmem_pending; + int rmem_released; unsigned long flags; bool can_ack; bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; + struct sock *ack_hint; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; + struct sk_buff_head receive_queue; + struct sk_buff_head skb_tx_cache; /* this is wmem accounted */ + int tx_pending_data; + int size_goal_cache; struct list_head conn_list; struct list_head rtx_queue; + struct mptcp_data_frag *first_pending; struct list_head join_list; - struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; struct mptcp_pm_data pm; @@ -232,6 +266,22 @@ struct mptcp_sock { } rcvq_space; }; +#define mptcp_lock_sock(___sk, cb) do { \ + struct sock *__sk = (___sk); /* silence macro reuse warning */ \ + might_sleep(); \ + spin_lock_bh(&__sk->sk_lock.slock); \ + if (__sk->sk_lock.owned) \ + __lock_sock(__sk); \ + cb; \ + __sk->sk_lock.owned = 1; \ + spin_unlock(&__sk->sk_lock.slock); \ + mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \ + local_bh_enable(); \ +} while (0) + +#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) +#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) + #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) @@ -240,11 +290,46 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +static inline int __mptcp_space(const struct sock *sk) +{ + return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending); +} + +static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) +{ + const struct mptcp_sock *msk = mptcp_sk(sk); + + return READ_ONCE(msk->first_pending); +} + +static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *cur; + + cur = msk->first_pending; + return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : + list_next_entry(cur, list); +} + +static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!msk->first_pending) + return NULL; + + if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) + return NULL; + + return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); +} + static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (list_empty(&msk->rtx_queue)) + if (!before64(msk->snd_nxt, READ_ONCE(msk->snd_una))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); @@ -312,7 +397,8 @@ struct mptcp_subflow_context { mpc_map : 1, backup : 1, rx_eof : 1, - can_ack : 1; /* only after processing the remote a key */ + can_ack : 1, /* only after processing the remote a key */ + disposable : 1; /* ctx can be free at ulp release time */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -361,15 +447,24 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } +static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + sock_hold(mptcp_subflow_tcp_sock(subflow)); + spin_lock_bh(&msk->join_list_lock); + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); +} + int mptcp_is_enabled(struct net *net); +unsigned int mptcp_get_add_addr_timeout(struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, - struct mptcp_subflow_context *subflow, - long timeout); + struct mptcp_subflow_context *subflow); void mptcp_subflow_reset(struct sock *ssk); /* called with sk socket lock held */ @@ -407,9 +502,18 @@ static inline bool mptcp_is_fully_established(struct sock *sk) void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); -void mptcp_data_acked(struct sock *sk); +bool mptcp_schedule_work(struct sock *sk); +void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk); +void __mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); +void __mptcp_flush_join_list(struct mptcp_sock *msk); +static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->snd_data_fin_enable) && + READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); +} + void mptcp_destroy_common(struct mptcp_sock *msk); void __init mptcp_token_init(void); @@ -444,6 +548,7 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk, void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); struct mptcp_pm_add_entry * @@ -452,30 +557,51 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo); + bool echo, bool port); int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.add_addr_signal); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); +} + +static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); +} + +static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); +} + +static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_PORT); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.rm_addr_signal); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } -static inline unsigned int mptcp_add_addr_len(int family, bool echo) +static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { - if (family == AF_INET) - return echo ? TCPOLEN_MPTCP_ADD_ADDR_BASE - : TCPOLEN_MPTCP_ADD_ADDR; - return echo ? TCPOLEN_MPTCP_ADD_ADDR6_BASE : TCPOLEN_MPTCP_ADD_ADDR6; + u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; + + if (family == AF_INET6) + len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; + if (!echo) + len += MPTCPOPT_THMAC_LEN; + if (port) + len += TCPOLEN_MPTCP_PORT_LEN; + + return len; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo); + struct mptcp_addr_info *saddr, bool *echo, bool *port); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, u8 *rm_id); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); @@ -485,6 +611,7 @@ void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); +void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); @@ -494,13 +621,6 @@ static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } -static inline bool before64(__u64 seq1, __u64 seq2) -{ - return (__s64)(seq1 - seq2) < 0; -} - -#define after64(seq2, seq1) before64(seq1, seq2) - void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c21a852a6ffa..278cbe3e539e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -112,9 +112,14 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li return 0; } -static void subflow_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +/* Init mptcp request socket. + * + * Returns an error code if a JOIN has failed and a TCP reset + * should be sent. + */ +static int subflow_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -125,7 +130,7 @@ static void subflow_init_req(struct request_sock *req, ret = __subflow_init_req(req, sk_listener); if (ret) - return; + return 0; mptcp_get_options(skb, &mp_opt); @@ -133,7 +138,7 @@ static void subflow_init_req(struct request_sock *req, SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (mp_opt.mp_join) - return; + return 0; } else if (mp_opt.mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } @@ -157,7 +162,7 @@ again: } else { subflow_req->mp_capable = 1; } - return; + return 0; } err = mptcp_token_new_request(req); @@ -175,7 +180,11 @@ again: subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req, skb); - if (unlikely(req->syncookie) && subflow_req->msk) { + /* Can't fall back to TCP in this case. */ + if (!subflow_req->msk) + return -EPERM; + + if (unlikely(req->syncookie)) { if (mptcp_can_accept_new_subflow(subflow_req->msk)) subflow_init_req_cookie_join_save(subflow_req, skb); } @@ -183,6 +192,8 @@ again: pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } + + return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, @@ -228,27 +239,53 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); -static void subflow_v4_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v4_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; + + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; - subflow_init_req(req, sk_listener, skb); + dst_release(dst); + if (!req->syncookie) + tcp_request_sock_ops.send_reset(sk, skb); + return NULL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) -static void subflow_v6_init_req(struct request_sock *req, - const struct sock *sk_listener, - struct sk_buff *skb) +static struct dst_entry *subflow_v6_route_req(const struct sock *sk, + struct sk_buff *skb, + struct flowi *fl, + struct request_sock *req) { + struct dst_entry *dst; + int err; + tcp_rsk(req)->is_mptcp = 1; - tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb); + dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); + if (!dst) + return NULL; - subflow_init_req(req, sk_listener, skb); + err = subflow_init_req(req, sk, skb); + if (err == 0) + return dst; + + dst_release(dst); + if (!req->syncookie) + tcp6_request_sock_ops.send_reset(sk, skb); + return NULL; } #endif @@ -276,12 +313,17 @@ void mptcp_subflow_reset(struct sock *ssk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; + /* must hold: tcp_done() could drop last reference on parent */ + sock_hold(sk); + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && schedule_work(&mptcp_sk(sk)->work)) - sock_hold(sk); + return; /* worker will put sk for us */ + + sock_put(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) @@ -577,6 +619,11 @@ create_child: */ inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + /* record the newly created socket as the first msk + * subflow, but don't link it yet into conn_list + */ + WRITE_ONCE(mptcp_sk(new_msk)->first, child); + /* new mpc subflow takes ownership of the newly * created mptcp socket */ @@ -845,8 +892,6 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; - if (incr) - tcp_cleanup_rbuf(ssk, incr); } static bool subflow_check_data_avail(struct sock *ssk) @@ -968,7 +1013,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; - *space = tcp_space(sk); + *space = __mptcp_space(sk); *full_space = tcp_full_space(sk); } @@ -993,20 +1038,9 @@ static void subflow_data_ready(struct sock *sk) mptcp_data_ready(parent, sk); } -static void subflow_write_space(struct sock *sk) +static void subflow_write_space(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *parent = subflow->conn; - - if (!sk_stream_is_writeable(sk)) - return; - - if (sk_stream_is_writeable(parent)) { - set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags); - smp_mb__after_atomic(); - /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */ - sk_stream_write_space(parent); - } + /* we take action in __mptcp_clean_una() */ } static struct inet_connection_sock_af_ops * @@ -1120,21 +1154,48 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr); + mptcp_add_pending_subflow(msk, subflow); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) - goto failed; + goto failed_unlink; + return err; + +failed_unlink: spin_lock_bh(&msk->join_list_lock); - list_add_tail(&subflow->node, &msk->join_list); + list_del(&subflow->node); spin_unlock_bh(&msk->join_list_lock); - return err; - failed: + subflow->disposable = 1; sock_release(sf); return err; } +static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ +#ifdef CONFIG_SOCK_CGROUP_DATA + struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, + *child_skcd = &child->sk_cgrp_data; + + /* only the additional subflows created by kworkers have to be modified */ + if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != + cgroup_id(sock_cgroup_ptr(child_skcd))) { +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = parent->sk_memcg; + + mem_cgroup_sk_free(child); + if (memcg && css_tryget(&memcg->css)) + child->sk_memcg = memcg; +#endif /* CONFIG_MEMCG */ + + cgroup_sk_free(child_skcd); + *child_skcd = *parent_skcd; + cgroup_sk_clone(child_skcd); + } +#endif /* CONFIG_SOCK_CGROUP_DATA */ +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -1155,6 +1216,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) lock_sock(sf->sk); + /* the newly created socket has to be in the same cgroup as its parent */ + mptcp_attach_cgroup(sk, sf->sk); + /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. */ @@ -1253,7 +1317,6 @@ static void subflow_state_change(struct sock *sk) mptcp_data_ready(parent, sk); if (__mptcp_check_fallback(mptcp_sk(parent)) && - !(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; mptcp_subflow_eof(parent); @@ -1296,17 +1359,27 @@ out: return err; } -static void subflow_ulp_release(struct sock *sk) +static void subflow_ulp_release(struct sock *ssk) { - struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + bool release = true; + struct sock *sk; if (!ctx) return; - if (ctx->conn) - sock_put(ctx->conn); + sk = ctx->conn; + if (sk) { + /* if the msk has been orphaned, keep the ctx + * alive, will be freed by __mptcp_close_ssk(), + * when the subflow is still unaccepted + */ + release = ctx->disposable || list_empty(&ctx->node); + sock_put(sk); + } - kfree_rcu(ctx, rcu); + if (release) + kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, @@ -1391,7 +1464,7 @@ void __init mptcp_subflow_init(void) panic("MPTCP: failed to init subflow request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; - subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req; + subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; @@ -1400,7 +1473,7 @@ void __init mptcp_subflow_init(void) #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; - subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req; + subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; |