From d4131f09770d9b7471c9da65e6ecd2477746ac5c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 27 Feb 2018 14:15:01 -0800 Subject: tcp: revert F-RTO middle-box workaround This reverts commit cc663f4d4c97b7297fb45135ab23cfd508b35a77. While fixing some broken middle-boxes that modifies receive window fields, it does not address middle-boxes that strip off SACK options. The best solution is to fully revert this patch and the root F-RTO enhancement. Fixes: cc663f4d4c97 ("tcp: restrict F-RTO to work-around broken middle-boxes") Reported-by: Teodor Milkov Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 575d3c1fb6e8..cd8ea972dc65 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1909,7 +1909,6 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sk_buff *skb; - bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ bool mark_lost; @@ -1968,17 +1967,15 @@ void tcp_enter_loss(struct sock *sk) tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); - /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous - * loss recovery is underway except recurring timeout(s) on - * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing - * - * In theory F-RTO can be used repeatedly during loss recovery. - * In practice this interacts badly with broken middle-boxes that - * falsely raise the receive window, which results in repeated - * timeouts and stop-and-go behavior. + /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO + * if a previous recovery is underway, otherwise it may incorrectly + * call a timeout spurious if some previously retransmitted packets + * are s/acked (sec 3.2). We do not apply that retriction since + * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS + * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO + * on PTMU discovery to avoid sending new data. */ tp->frto = net->ipv4.sysctl_tcp_frto && - (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } -- cgit v1.2.3 From fc68e171d376c322e6777a3d7ac2f0278b68b17f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 27 Feb 2018 14:15:02 -0800 Subject: tcp: revert F-RTO extension to detect more spurious timeouts This reverts commit 89fe18e44f7ee5ab1c90d0dff5835acee7751427. While the patch could detect more spurious timeouts, it could cause poor TCP performance on broken middle-boxes that modifies TCP packets (e.g. receive window, SACK options). Since the performance gain is much smaller compared to the potential loss. The best solution is to fully revert the change. Fixes: 89fe18e44f7e ("tcp: extend F-RTO to catch more spurious timeouts") Reported-by: Teodor Milkov Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cd8ea972dc65..8d480542aa07 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1909,6 +1909,7 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sk_buff *skb; + bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ bool mark_lost; @@ -1967,15 +1968,12 @@ void tcp_enter_loss(struct sock *sk) tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); - /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO - * if a previous recovery is underway, otherwise it may incorrectly - * call a timeout spurious if some previously retransmitted packets - * are s/acked (sec 3.2). We do not apply that retriction since - * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS - * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO - * on PTMU discovery to avoid sending new data. + /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous + * loss recovery is underway except recurring timeout(s) on + * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing */ tp->frto = net->ipv4.sysctl_tcp_frto && + (new_recovery || icsk->icsk_retransmits) && !inet_csk(sk)->icsk_mtup.probe_size; } @@ -2628,18 +2626,14 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, tcp_try_undo_loss(sk, false)) return; - /* The ACK (s)acks some never-retransmitted data meaning not all - * the data packets before the timeout were lost. Therefore we - * undo the congestion window and state. This is essentially - * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since - * a retransmitted skb is permantly marked, we can apply such an - * operation even if F-RTO was not used. - */ - if ((flag & FLAG_ORIG_SACK_ACKED) && - tcp_try_undo_loss(sk, tp->undo_marker)) - return; - if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, true)) + return; + if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ -- cgit v1.2.3 From a27fd7a8ed3856faaf5a2ff1c8c5f00c0667aaa0 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Tue, 27 Feb 2018 18:32:18 -0500 Subject: tcp: purge write queue upon RST When the connection is reset, there is no point in keeping the packets on the write queue until the connection is closed. RFC 793 (page 70) and RFC 793-bis (page 64) both suggest purging the write queue upon RST: https://tools.ietf.org/html/draft-ietf-tcpm-rfc793bis-07 Moreover, this is essential for a correct MSG_ZEROCOPY implementation, because userspace cannot call close(fd) before receiving zerocopy signals even when the connection is reset. Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY") Signed-off-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8d480542aa07..9a1b3c1c1c14 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3992,6 +3992,7 @@ void tcp_reset(struct sock *sk) /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); + tcp_write_queue_purge(sk); tcp_done(sk); if (!sock_flag(sk, SOCK_DEAD)) -- cgit v1.2.3