summaryrefslogtreecommitdiffstats
path: root/net/mptcp
diff options
context:
space:
mode:
authorFlorian Westphal <fw@strlen.de>2020-02-26 10:14:48 +0100
committerDavid S. Miller <davem@davemloft.net>2020-02-26 20:46:26 -0800
commit6771bfd9ee2460c13e38c0cd46a3afb5404ae716 (patch)
treeed9e4ecc52cac5cfd22025c7a4d6d9584cbeaa0f /net/mptcp
parent80992017150b4e37fe01c0aa2e3c9d02de66c11e (diff)
downloadlinux-stable-6771bfd9ee2460c13e38c0cd46a3afb5404ae716.tar.gz
linux-stable-6771bfd9ee2460c13e38c0cd46a3afb5404ae716.tar.bz2
linux-stable-6771bfd9ee2460c13e38c0cd46a3afb5404ae716.zip
mptcp: update mptcp ack sequence from work queue
If userspace is not reading data, all the mptcp-level acks contain the ack_seq from the last time userspace read data rather than the most recent in-sequence value. This causes pointless retransmissions for data that is already queued. The reason for this is that all the mptcp protocol level processing happens at mptcp_recv time. This adds work queue to move skbs from the subflow sockets receive queue on the mptcp socket receive queue (which was not used so far). This allows us to announce the correct mptcp ack sequence in a timely fashion, even when the application does not call recv() on the mptcp socket for some time. We still wake userspace tasks waiting for POLLIN immediately: If the mptcp level receive queue is empty (because the work queue is still pending) it can be filled from in-sequence subflow sockets at recv time without a need to wait for the worker. The skb_orphan when moving skbs from subflow to mptcp level is needed, because the destructor (sock_rfree) relies on skb->sk (ssk!) lock being taken. A followup patch will add needed rmem accouting for the moved skbs. Other problem: In case application behaves as expected, and calls recv() as soon as mptcp socket becomes readable, the work queue will only waste cpu cycles. This will also be addressed in followup patches. Signed-off-by: Florian Westphal <fw@strlen.de> Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/protocol.c234
1 files changed, 165 insertions, 69 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index cbf184a71ed7..b4a8517d8eac 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -31,6 +31,12 @@ struct mptcp6_sock {
};
#endif
+struct mptcp_skb_cb {
+ u32 offset;
+};
+
+#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
@@ -111,11 +117,88 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
return NULL;
}
+static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb,
+ unsigned int offset, size_t copy_len)
+{
+ struct sock *sk = (struct sock *)msk;
+
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+ skb_orphan(skb);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+
+ msk->ack_seq += copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+}
+
+static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
+ struct sock *ssk,
+ unsigned int *bytes)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned int moved = 0;
+ bool more_data_avail;
+ struct tcp_sock *tp;
+ bool done = false;
+
+ tp = tcp_sk(ssk);
+ do {
+ u32 map_remaining, offset;
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ bool fin;
+
+ /* try to move as much data as available */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb)
+ break;
+
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ if (fin) {
+ done = true;
+ seq++;
+ }
+
+ if (offset < skb->len) {
+ size_t len = skb->len - offset;
+
+ if (tp->urg_data)
+ done = true;
+
+ __mptcp_move_skb(msk, ssk, skb, offset, len);
+ seq += len;
+ moved += len;
+
+ if (WARN_ON_ONCE(map_remaining < len))
+ break;
+ } else {
+ WARN_ON_ONCE(!fin);
+ sk_eat_skb(ssk, skb);
+ done = true;
+ }
+
+ WRITE_ONCE(tp->copied_seq, seq);
+ more_data_avail = mptcp_subflow_data_available(ssk);
+ } while (more_data_avail);
+
+ *bytes = moved;
+
+ return done;
+}
+
void mptcp_data_ready(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
set_bit(MPTCP_DATA_READY, &msk->flags);
+
+ if (schedule_work(&msk->work))
+ sock_hold((struct sock *)msk);
+
sk->sk_data_ready(sk);
}
@@ -373,19 +456,68 @@ static void mptcp_wait_data(struct sock *sk, long *timeo)
remove_wait_queue(sk_sleep(sk), &wait);
}
+static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb;
+ int copied = 0;
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 offset = MPTCP_SKB_CB(skb)->offset;
+ u32 data_len = skb->len - offset;
+ u32 count = min_t(size_t, len - copied, data_len);
+ int err;
+
+ err = skb_copy_datagram_msg(skb, offset, msg, count);
+ if (unlikely(err < 0)) {
+ if (!copied)
+ return err;
+ break;
+ }
+
+ copied += count;
+
+ if (count < data_len) {
+ MPTCP_SKB_CB(skb)->offset += count;
+ break;
+ }
+
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ __kfree_skb(skb);
+
+ if (copied >= len)
+ break;
+ }
+
+ return copied;
+}
+
+static bool __mptcp_move_skbs(struct mptcp_sock *msk)
+{
+ unsigned int moved = 0;
+ bool done;
+
+ do {
+ struct sock *ssk = mptcp_subflow_recv_lookup(msk);
+
+ if (!ssk)
+ break;
+
+ lock_sock(ssk);
+ done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ release_sock(ssk);
+ } while (!done);
+
+ return moved > 0;
+}
+
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_subflow_context *subflow;
- bool more_data_avail = false;
- struct mptcp_read_arg arg;
- read_descriptor_t desc;
- bool wait_data = false;
struct socket *ssock;
- struct tcp_sock *tp;
- bool done = false;
- struct sock *ssk;
int copied = 0;
int target;
long timeo;
@@ -403,65 +535,26 @@ fallback:
return copied;
}
- arg.msg = msg;
- desc.arg.data = &arg;
- desc.error = 0;
-
timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
- while (!done) {
- u32 map_remaining;
+ while (len > (size_t)copied) {
int bytes_read;
- ssk = mptcp_subflow_recv_lookup(msk);
- pr_debug("msk=%p ssk=%p", msk, ssk);
- if (!ssk)
- goto wait_for_data;
+ bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
+ if (unlikely(bytes_read < 0)) {
+ if (!copied)
+ copied = bytes_read;
+ goto out_err;
+ }
- subflow = mptcp_subflow_ctx(ssk);
- tp = tcp_sk(ssk);
+ copied += bytes_read;
- lock_sock(ssk);
- do {
- /* try to read as much data as available */
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- desc.count = min_t(size_t, len - copied, map_remaining);
- pr_debug("reading %zu bytes, copied %d", desc.count,
- copied);
- bytes_read = tcp_read_sock(ssk, &desc,
- mptcp_read_actor);
- if (bytes_read < 0) {
- if (!copied)
- copied = bytes_read;
- done = true;
- goto next;
- }
-
- pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
- msk->ack_seq + bytes_read);
- msk->ack_seq += bytes_read;
- copied += bytes_read;
- if (copied >= len) {
- done = true;
- goto next;
- }
- if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
- pr_err("Urgent data present, cannot proceed");
- done = true;
- goto next;
- }
-next:
- more_data_avail = mptcp_subflow_data_available(ssk);
- } while (more_data_avail && !done);
- release_sock(ssk);
- continue;
-
-wait_for_data:
- more_data_avail = false;
+ if (skb_queue_empty(&sk->sk_receive_queue) &&
+ __mptcp_move_skbs(msk))
+ continue;
/* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
@@ -502,26 +595,25 @@ wait_for_data:
}
pr_debug("block timeout %ld", timeo);
- wait_data = true;
mptcp_wait_data(sk, &timeo);
if (unlikely(__mptcp_tcp_fallback(msk)))
goto fallback;
}
- if (more_data_avail) {
- if (!test_bit(MPTCP_DATA_READY, &msk->flags))
- set_bit(MPTCP_DATA_READY, &msk->flags);
- } else if (!wait_data) {
+ if (skb_queue_empty(&sk->sk_receive_queue)) {
+ /* entire backlog drained, clear DATA_READY. */
clear_bit(MPTCP_DATA_READY, &msk->flags);
- /* .. race-breaker: ssk might get new data after last
- * data_available() returns false.
+ /* .. race-breaker: ssk might have gotten new data
+ * after last __mptcp_move_skbs() returned false.
*/
- ssk = mptcp_subflow_recv_lookup(msk);
- if (unlikely(ssk))
+ if (unlikely(__mptcp_move_skbs(msk)))
set_bit(MPTCP_DATA_READY, &msk->flags);
+ } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
+ /* data to read but mptcp_wait_data() cleared DATA_READY */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
}
-
+out_err:
release_sock(sk);
return copied;
}
@@ -557,7 +649,7 @@ static void mptcp_worker(struct work_struct *work)
struct sock *sk = &msk->sk.icsk_inet.sk;
lock_sock(sk);
-
+ __mptcp_move_skbs(msk);
release_sock(sk);
sock_put(sk);
}
@@ -638,6 +730,8 @@ static void mptcp_close(struct sock *sk, long timeout)
mptcp_cancel_work(sk);
+ __skb_queue_purge(&sk->sk_receive_queue);
+
sk_common_release(sk);
}
@@ -1204,6 +1298,8 @@ void mptcp_proto_init(void)
panic("Failed to register MPTCP proto.\n");
inet_register_protosw(&mptcp_protosw);
+
+ BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)