summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYuchung Cheng <ycheng@google.com>2018-07-18 13:56:36 -0700
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2018-07-28 07:57:14 +0200
commitafaf0f83261da12bddc779c15741cb3435ac16df (patch)
treec27a5a61d7a21b0ed74570390fe9a88300b2ed0f
parent5a2ebffa878dabd21e2b4769e00b7dba10207192 (diff)
downloadlinux-stable-afaf0f83261da12bddc779c15741cb3435ac16df.tar.gz
linux-stable-afaf0f83261da12bddc779c15741cb3435ac16df.tar.bz2
linux-stable-afaf0f83261da12bddc779c15741cb3435ac16df.zip
tcp: do not delay ACK in DCTCP upon CE status change
[ Upstream commit a0496ef2c23b3b180902dd185d0d63ccbc624cf8 ] Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change has to be sent immediately so the sender can respond quickly: """ When receiving packets, the CE codepoint MUST be processed as follows: 1. If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to true and send an immediate ACK. 2. If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE to false and send an immediate ACK. """ Previously DCTCP implementation may continue to delay the ACK. This patch fixes that to implement the RFC by forcing an immediate ACK. Tested with this packetdrill script provided by Larry Brakmo 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 0.000 bind(3, ..., ...) = 0 0.000 listen(3, 1) = 0 0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7> 0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8> 0.110 < [ect0] . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0 0.200 < [ect0] . 1:1001(1000) ack 1 win 257 0.200 > [ect01] . 1:1(0) ack 1001 0.200 write(4, ..., 1) = 1 0.200 > [ect01] P. 1:2(1) ack 1001 0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 +0.005 < [ce] . 2001:3001(1000) ack 2 win 257 +0.000 > [ect01] . 2:2(0) ack 2001 // Previously the ACK below would be delayed by 40ms +0.000 > [ect01] E. 2:2(0) ack 3001 +0.500 < F. 9501:9501(0) ack 4 win 257 Signed-off-by: Yuchung Cheng <ycheng@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--include/net/tcp.h1
-rw-r--r--net/ipv4/tcp_dctcp.c30
-rw-r--r--net/ipv4/tcp_input.c3
3 files changed, 21 insertions, 13 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d4ea41d76946..c1ba68e45f20 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
+void tcp_enter_quickack_mode(struct sock *sk);
static inline void tcp_dec_quickack_mode(struct sock *sk,
const unsigned int pkts)
{
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 39d96d27ff94..c78fb53988a1 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -131,12 +131,15 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* State has changed from CE=0 to CE=1 and delayed
- * ACK has not sent yet.
- */
- if (!ca->ce_state &&
- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ if (!ca->ce_state) {
+ /* State has changed from CE=0 to CE=1, force an immediate
+ * ACK to reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ tcp_enter_quickack_mode(sk);
+ }
ca->prior_rcv_nxt = tp->rcv_nxt;
ca->ce_state = 1;
@@ -149,12 +152,15 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* State has changed from CE=1 to CE=0 and delayed
- * ACK has not sent yet.
- */
- if (ca->ce_state &&
- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ if (ca->ce_state) {
+ /* State has changed from CE=1 to CE=0, force an immediate
+ * ACK to reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ tcp_enter_quickack_mode(sk);
+ }
ca->prior_rcv_nxt = tp->rcv_nxt;
ca->ce_state = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1f25ebab25d2..ba358b7369c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -195,13 +195,14 @@ static void tcp_incr_quickack(struct sock *sk)
icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-static void tcp_enter_quickack_mode(struct sock *sk)
+void tcp_enter_quickack_mode(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk);
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
+EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.