summaryrefslogtreecommitdiffstats
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
authorPaolo Abeni <pabeni@redhat.com>2023-10-23 13:44:38 -0700
committerJakub Kicinski <kuba@kernel.org>2023-10-25 12:23:34 -0700
commit5684ab1a0effbfeb706f47d85785f653005b97b1 (patch)
tree545dd36c49a503cee46fc07f4e686801b3829d05 /net/mptcp/protocol.c
parent849ee75a38b297187c760bb1d23d8f2a7b1fc73e (diff)
downloadlinux-5684ab1a0effbfeb706f47d85785f653005b97b1.tar.gz
linux-5684ab1a0effbfeb706f47d85785f653005b97b1.tar.bz2
linux-5684ab1a0effbfeb706f47d85785f653005b97b1.zip
mptcp: give rcvlowat some love
The MPTCP protocol allow setting sk_rcvlowat, but the value there is currently ignored. Additionally, the default subflows sk_rcvlowat basically disables per subflow delayed ack: the MPTCP protocol move the incoming data from the subflows into the msk socket as soon as the TCP stacks invokes the subflow data_ready callback. Later, when __tcp_ack_snd_check() takes action, the subflow-level copied_seq matches rcv_nxt, and that mandate for an immediate ack. Let the mptcp receive path be aware of such threshold, explicitly tracking the amount of data available to be ready and checking vs sk_rcvlowat in mptcp_poll() and before waking-up readers. Additionally implement the set_rcvlowat() callback, to properly handle the rcvbuf auto-tuning on sk_rcvlowat changes. Finally to properly handle delayed ack, force the subflow level threshold to 0 and instead explicitly ask for an immediate ack when the msk level th is not reached. Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <martineau@kernel.org> Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-5-9dc60939d371@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c24
1 files changed, 11 insertions, 13 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index a21f8ed26343..7036e30c449f 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -863,9 +863,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
/* Wake-up the reader only for in-sequence data */
mptcp_data_lock(sk);
- if (move_skbs_to_msk(msk, ssk))
+ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
sk->sk_data_ready(sk);
-
mptcp_data_unlock(sk);
}
@@ -1922,6 +1921,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
if (!(flags & MSG_PEEK)) {
MPTCP_SKB_CB(skb)->offset += count;
MPTCP_SKB_CB(skb)->map_seq += count;
+ msk->bytes_consumed += count;
}
break;
}
@@ -1932,6 +1932,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
__skb_unlink(skb, &msk->receive_queue);
__kfree_skb(skb);
+ msk->bytes_consumed += count;
}
if (copied >= len)
@@ -2755,6 +2756,7 @@ static void __mptcp_init_sock(struct sock *sk)
msk->rmem_fwd_alloc = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
+ msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
WRITE_ONCE(msk->first, NULL);
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -2964,16 +2966,9 @@ void __mptcp_unaccepted_force_close(struct sock *sk)
__mptcp_destroy_sock(sk);
}
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+static __poll_t mptcp_check_readable(struct sock *sk)
{
- /* Concurrent splices from sk_receive_queue into receive_queue will
- * always show at least one non-empty queue when checked in this order.
- */
- if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
- skb_queue_empty_lockless(&msk->receive_queue))
- return 0;
-
- return EPOLLIN | EPOLLRDNORM;
+ return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0;
}
static void mptcp_check_listen_stop(struct sock *sk)
@@ -3011,7 +3006,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
goto cleanup;
}
- if (mptcp_check_readable(msk) || timeout < 0) {
+ if (mptcp_data_avail(msk) || timeout < 0) {
/* If the msk has read data, or the caller explicitly ask it,
* do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
*/
@@ -3138,6 +3133,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
msk->snd_data_fin_enable = false;
msk->rcv_fastclose = false;
msk->use_64bit_ack = false;
+ msk->bytes_consumed = 0;
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
mptcp_pm_data_reset(msk);
mptcp_ca_reset(sk);
@@ -3909,7 +3905,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
- mask |= mptcp_check_readable(msk);
+ mask |= mptcp_check_readable(sk);
if (shutdown & SEND_SHUTDOWN)
mask |= EPOLLOUT | EPOLLWRNORM;
else
@@ -3947,6 +3943,7 @@ static const struct proto_ops mptcp_stream_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
+ .set_rcvlowat = mptcp_set_rcvlowat,
};
static struct inet_protosw mptcp_protosw = {
@@ -4048,6 +4045,7 @@ static const struct proto_ops mptcp_v6_stream_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = inet6_compat_ioctl,
#endif
+ .set_rcvlowat = mptcp_set_rcvlowat,
};
static struct proto mptcp_v6_prot;