From fc6415bcb0f58f03adb910e56d7e1df6368794e0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:17:45 -0700 Subject: [TCP]: Fix quick-ack decrementing with TSO. On each packet output, we call tcp_dec_quickack_mode() if the ACK flag is set. It drops tp->ack.quick until it hits zero, at which time we deflate the ATO value. When doing TSO, we are emitting multiple packets with ACK set, so we should decrement tp->ack.quick that many segments. Note that, unlike this case, tcp_enter_cwr() should not take the tcp_skb_pcount(skb) into consideration. That function, one time, readjusts tp->snd_cwnd and moves into TCP_CA_CWR state. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e17c244875c..389deeb2a457 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, tp->ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk) +static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { struct tcp_sock *tp = tcp_sk(sk); - tcp_dec_quickack_mode(tp); + tcp_dec_quickack_mode(tp, pkts); tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } @@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) tp->af_specific->send_check(sk, th, skb->len, skb); if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); -- cgit v1.2.3 From f6302d1d78f77c2d4c8bd32b0afc2df7fdf5f281 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:03 -0700 Subject: [TCP]: Move send test logic out of net/tcp.h This just moves the code into tcp_output.c, no code logic changes are made by this patch. Using this as a baseline, we can begin to untangle the mess of comparisons for the Nagle test et al. We will also be able to reduce all of the redundant computation that occurs when outputting data packets. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 150 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 129 insertions(+), 21 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 389deeb2a457..2cbe879ee16a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -413,6 +413,135 @@ static inline void tcp_tso_set_push(struct sk_buff *skb) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; } +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (skb->len <= tp->mss_cache_std || + !(sk->sk_route_caps & NETIF_F_TSO)) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + } else { + unsigned int factor; + + factor = skb->len + (tp->mss_cache_std - 1); + factor /= tp->mss_cache_std; + skb_shinfo(skb)->tso_segs = factor; + skb_shinfo(skb)->tso_size = tp->mss_cache_std; + } +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. + */ +static int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + int pkts = tcp_skb_pcount(skb); + + if (!pkts) { + tcp_set_skb_tso_segs(sk, skb); + pkts = tcp_skb_pcount(skb); + } + + /* RFC 1122 - section 4.2.3.4 + * + * We must queue if + * + * a) The right edge of this frame exceeds the window + * b) There are packets in flight and we have a small segment + * [SWS avoidance and Nagle algorithm] + * (part of SWS is done on packetization) + * Minshall version sounds: there are no _small_ + * segments in flight. (tcp_nagle_check) + * c) We have too many packets 'in flight' + * + * Don't use the nagle rule for urgent data (or + * for the final FIN -DaveM). + * + * Also, Nagle rule does not apply to frames, which + * sit in the middle of queue (they have no chances + * to get new data) and if room at tail of skb is + * not enough to save something seriously (<32 for now). + */ + + /* Don't be strict about the congestion window for the + * final FIN frame. -DaveM + */ + return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode + || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && + (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && + !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } + tcp_cwnd_validate(sk, tp); +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + + /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ @@ -434,27 +563,6 @@ void tcp_push_one(struct sock *sk, unsigned cur_mss) } } -void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (skb->len <= tp->mss_cache_std || - !(sk->sk_route_caps & NETIF_F_TSO)) { - /* Avoid the costly divide in the normal - * non-TSO case. - */ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; - } else { - unsigned int factor; - - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. -- cgit v1.2.3 From 84d3e7b9573291a1ea845bdd51b74bb484597661 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:18 -0700 Subject: [TCP]: Move __tcp_data_snd_check into tcp_output.c It reimplements portions of tcp_snd_check(), so it we move it to tcp_output.c we can consolidate it's logic much easier in a later change. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2cbe879ee16a..362b811a2460 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -530,6 +530,16 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, tcp_cwnd_validate(sk, tp); } +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; -- cgit v1.2.3 From f44b527177d57ed382bfd93e1b55232465f6d058 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:34 -0700 Subject: [TCP]: Add missing skb_header_release() call to tcp_fragment(). When we add any new packet to the TCP socket write queue, we must call skb_header_release() on it in order for the TSO sharing checks in the drivers to work. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 362b811a2460..5e63ed09658d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -655,6 +655,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) } /* Link BUFF into the send queue. */ + skb_header_release(buff); __skb_append(skb, buff); return 0; -- cgit v1.2.3 From a762a9800752f05fa8768bb0ac35d0e7f1bcfe7f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:18:51 -0700 Subject: [TCP]: Kill extra cwnd validate in __tcp_push_pending_frames(). The tcp_cwnd_validate() function should only be invoked if we actually send some frames, yet __tcp_push_pending_frames() will always invoke it. tcp_write_xmit() does the call for us, so the call here can simply be removed. Also, tcp_write_xmit() can be marked static. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 79 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 30 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5e63ed09658d..a6375ca2a59e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -511,35 +511,6 @@ static inline int tcp_skb_is_last(const struct sock *sk, return skb->next == (struct sk_buff *)&sk->sk_write_queue; } -/* Push out any pending frames which were held back due to - * TCP_CORK or attempt at coalescing tiny packets. - * The socket must be locked by the caller. - */ -void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) -{ - struct sk_buff *skb = sk->sk_send_head; - - if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) - tcp_check_probe_timer(sk, tp); - } - tcp_cwnd_validate(sk, tp); -} - -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) { struct sk_buff *skb = sk->sk_send_head; @@ -841,6 +812,26 @@ unsigned int tcp_current_mss(struct sock *sk, int large) return mss_now; } +/* Congestion window validation. (RFC2861) */ + +static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out = tp->packets_out; + + if (packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. */ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; + + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + tcp_cwnd_application_limited(sk); + } +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -848,7 +839,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); unsigned int mss_now; @@ -901,6 +892,34 @@ int tcp_write_xmit(struct sock *sk, int nonagle) return 0; } +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = TCP_NAGLE_PUSH; + if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } +} + +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit v1.2.3 From 92df7b518dcb113de8bc2494e3cd275ad887f12b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:06 -0700 Subject: [TCP]: tcp_write_xmit() tabbing cleanup Put the main basic block of work at the top-level of tabbing, and mark the TCP_CLOSE test with unlikely(). Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 68 +++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a6375ca2a59e..2a8409c3af1a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -842,54 +842,54 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static int tcp_write_xmit(struct sock *sk, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; unsigned int mss_now; + int sent_pkts; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all * will be happy. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *skb; - int sent_pkts = 0; + if (unlikely(sk->sk_state == TCP_CLOSE)) + return 0; - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); - - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) - break; - } - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk, 1); + sent_pkts = 0; + while ((skb = sk->sk_send_head) && + tcp_snd_test(sk, skb, mss_now, + tcp_skb_is_last(sk, skb) ? nonagle : + TCP_NAGLE_PUSH)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) break; + } - /* Advance the send_head. This one is sent out. - * This call will increment packets_out. - */ - update_send_head(sk, tp, skb); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; - tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; - } + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + update_send_head(sk, tp, skb); - if (sent_pkts) { - tcp_cwnd_validate(sk, tp); - return 0; - } + tcp_minshall_update(tp, mss_now, skb); + sent_pkts = 1; + } - return !tp->packets_out && sk->sk_send_head; + if (sent_pkts) { + tcp_cwnd_validate(sk, tp); + return 0; } - return 0; + + return !tp->packets_out && sk->sk_send_head; } /* Push out any pending frames which were held back due to -- cgit v1.2.3 From a2e2a59c93cc8ba39caa9011c2573f429e40ccd9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:23 -0700 Subject: [TCP]: Fix redundant calculations of tcp_current_mss() tcp_write_xmit() uses tcp_current_mss(), but some of it's callers, namely __tcp_push_pending_frames(), already has this value available already. While we're here, fix the "cur_mss" argument to be "unsigned int" instead of plain "unsigned". Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2a8409c3af1a..e292e11c7319 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -839,11 +839,10 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -static int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int mss_now; int sent_pkts; /* If we are closed, the bytes will have to remain here. @@ -853,13 +852,6 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; - - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); sent_pkts = 0; while ((skb = sk->sk_send_head) && tcp_snd_test(sk, skb, mss_now, @@ -897,7 +889,7 @@ static int tcp_write_xmit(struct sock *sk, int nonagle) * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, - unsigned cur_mss, int nonagle) + unsigned int cur_mss, int nonagle) { struct sk_buff *skb = sk->sk_send_head; @@ -905,7 +897,7 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, if (!tcp_skb_is_last(sk, skb)) nonagle = TCP_NAGLE_PUSH; if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, nonagle)) + tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } @@ -916,7 +908,7 @@ void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) + tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) tcp_check_probe_timer(sk, tp); } -- cgit v1.2.3 From 55c97f3e990c1ff63957c64f6cb10711a09fd70e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:38 -0700 Subject: [TCP]: Fix __tcp_push_pending_frames() 'nonagle' handling. 'nonagle' should be passed to the tcp_snd_test() function as 'TCP_NAGLE_PUSH' if we are checking an SKB not at the tail of the write_queue. This is because Nagle does not apply to such frames since we cannot possibly tack more data onto them. However, while doing this __tcp_push_pending_frames() makes all of the packets in the write_queue use this modified 'nonagle' value. Fix the bug and simplify this function by just calling tcp_write_xmit() directly if sk_send_head is non-NULL. As a result, we can now make tcp_data_snd_check() just call tcp_push_pending_frames() instead of the specialized __tcp_data_snd_check(). Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e292e11c7319..ce1d7cfbecfc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -894,24 +894,11 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb = sk->sk_send_head; if (skb) { - if (!tcp_skb_is_last(sk, skb)) - nonagle = TCP_NAGLE_PUSH; - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) || - tcp_write_xmit(sk, cur_mss, nonagle)) + if (tcp_write_xmit(sk, cur_mss, nonagle)) tcp_check_probe_timer(sk, tp); } } -void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tcp_current_mss(sk, 1), tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - /* This function returns the amount that we can raise the * usable window based on the following constraints * -- cgit v1.2.3 From 7f4dd0a9438c73cbb1c240ece31390cf2c57294e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:19:54 -0700 Subject: [TCP]: Break out tcp_snd_test() into it's constituent parts. tcp_snd_test() does several different things, use inline functions to express this more clearly. 1) It initializes the TSO count of SKB, if necessary. 2) It performs the Nagle test. 3) It makes sure the congestion window is adhered to. 4) It makes sure SKB fits into the send window. This cleanup also sets things up so that things like the available packets in the congestion window does not need to be calculated multiple times by packet sending loops such as tcp_write_xmit(). Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 120 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 38 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ce1d7cfbecfc..8327e5e86d15 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -434,6 +434,33 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } } +/* Does SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + static inline int tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml,tp->snd_una) && @@ -442,7 +469,7 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) /* Return 0, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. - * 2. Or it contains FIN. + * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_NODELAY was set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. @@ -453,56 +480,73 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, unsigned mss_now, int nonagle) { return (skb->len < mss_now && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && ((nonagle&TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. +/* Return non-zero if the Nagle test allows this packet to be + * sent now. */ -static int tcp_snd_test(struct sock *sk, struct sk_buff *skb, - unsigned cur_mss, int nonagle) +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); - int pkts = tcp_skb_pcount(skb); + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; - if (!pkts) { + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { tcp_set_skb_tso_segs(sk, skb); - pkts = tcp_skb_pcount(skb); + tso_segs = tcp_skb_pcount(skb); } + return tso_segs; +} - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * Minshall version sounds: there are no _small_ - * segments in flight. (tcp_nagle_check) - * c) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data (or - * for the final FIN -DaveM). - * - * Also, Nagle rule does not apply to frames, which - * sit in the middle of queue (they have no chances - * to get new data) and if room at tail of skb is - * not enough to save something seriously (<32 for now). - */ +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; - /* Don't be strict about the congestion window for the - * final FIN frame. -DaveM - */ - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && - (((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; } static inline int tcp_skb_is_last(const struct sock *sk, -- cgit v1.2.3 From aa93466bdfd901b926e033801f0b82b3eaa67be2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:20:09 -0700 Subject: [TCP]: Eliminate redundant computations in tcp_write_xmit(). tcp_snd_test() is run for every packet output by a single call to tcp_write_xmit(), but this is not necessary. For one, the congestion window space needs to only be calculated one time, then used throughout the duration of the loop. This cleanup also makes experimenting with different TSO packetization schemes much easier. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8327e5e86d15..0a4cd24b6578 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -887,6 +887,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + unsigned int tso_segs, cwnd_quota; int sent_pkts; /* If we are closed, the bytes will have to remain here. @@ -896,19 +897,31 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) if (unlikely(sk->sk_state == TCP_CLOSE)) return 0; + skb = sk->sk_send_head; + if (unlikely(!skb)) + return 0; + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_cwnd_test(tp, skb); sent_pkts = 0; - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) + + while (cwnd_quota >= tso_segs) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + break; + + if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) break; } TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) break; /* Advance the send_head. This one is sent out. @@ -917,10 +930,19 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) update_send_head(sk, tp, skb); tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; + sent_pkts++; + + /* Do not optimize this to use tso_segs. If we chopped up + * the packet above, tso_segs will no longer be valid. + */ + cwnd_quota -= tcp_skb_pcount(skb); + skb = sk->sk_send_head; + if (!skb) + break; + tso_segs = tcp_init_tso_segs(sk, skb); } - if (sent_pkts) { + if (likely(sent_pkts)) { tcp_cwnd_validate(sk, tp); return 0; } -- cgit v1.2.3 From c1b4a7e69576d65efc31a8cea0714173c2841244 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:24:38 -0700 Subject: [TCP]: Move to new TSO segmenting scheme. Make TSO segment transmit size decisions at send time not earlier. The basic scheme is that we try to build as large a TSO frame as possible when pulling in the user data, but the size of the TSO frame output to the card is determined at transmit time. This is guided by tp->xmit_size_goal. It is always set to a multiple of MSS and tells sendmsg/sendpage how large an SKB to try and build. Later, tcp_write_xmit() and tcp_push_one() chop up the packet if necessary and conditions warrant. These routines can also decide to "defer" in order to wait for more ACKs to arrive and thus allow larger TSO frames to be emitted. A general observation is that TSO elongates the pipe, thus requiring a larger congestion window and larger buffering especially at the sender side. Therefore, it is important that applications 1) get a large enough socket send buffer (this is accomplished by our dynamic send buffer expansion code) 2) do large enough writes. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 578 +++++++++++++++++++++++++++++++------------------- 1 file changed, 359 insertions(+), 219 deletions(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a4cd24b6578..fd3ce38184ae 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1; * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ -int sysctl_tcp_tso_win_divisor = 8; +int sysctl_tcp_tso_win_divisor = 3; static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) @@ -403,21 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk->sk_send_head = skb; } -static inline void tcp_tso_set_push(struct sk_buff *skb) -{ - /* Force push to be on for any TSO frames to workaround - * problems with busted implementations like Mac OS-X that - * hold off socket receive wakeups until push is seen. - */ - if (tcp_skb_pcount(skb) > 1) - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; -} - static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - if (skb->len <= tp->mss_cache_std || + if (skb->len <= tp->mss_cache || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. @@ -427,164 +417,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } else { unsigned int factor; - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; + factor = skb->len + (tp->mss_cache - 1); + factor /= tp->mss_cache; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; - } -} - -/* Does SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) -{ - u32 end_seq = TCP_SKB_CB(skb)->end_seq; - - return !after(end_seq, tp->snd_una + tp->snd_wnd); -} - -/* Can at least one segment of SKB be sent right now, according to the - * congestion window rules? If so, return how many segments are allowed. - */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) -{ - u32 in_flight, cwnd; - - /* Don't be strict about the congestion window for the final FIN. */ - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) - return 1; - - in_flight = tcp_packets_in_flight(tp); - cwnd = tp->snd_cwnd; - if (in_flight < cwnd) - return (cwnd - in_flight); - - return 0; -} - -static inline int tcp_minshall_check(const struct tcp_sock *tp) -{ - return after(tp->snd_sml,tp->snd_una) && - !after(tp->snd_sml, tp->snd_nxt); -} - -/* Return 0, if packet can be sent now without violation Nagle's rules: - * 1. It is full sized. - * 2. Or it contains FIN. (already checked by caller) - * 3. Or TCP_NODELAY was set. - * 4. Or TCP_CORK is not set, and all sent packets are ACKed. - * With Minshall's modification: all sent small packets are ACKed. - */ - -static inline int tcp_nagle_check(const struct tcp_sock *tp, - const struct sk_buff *skb, - unsigned mss_now, int nonagle) -{ - return (skb->len < mss_now && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); -} - -/* Return non-zero if the Nagle test allows this packet to be - * sent now. - */ -static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - /* Nagle rule does not apply to frames, which sit in the middle of the - * write_queue (they have no chances to get new data). - * - * This is implemented in the callers, where they modify the 'nonagle' - * argument based upon the location of SKB in the send queue. - */ - if (nonagle & TCP_NAGLE_PUSH) - return 1; - - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) - return 1; - - if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) - return 1; - - return 0; -} - -/* This must be invoked the first time we consider transmitting - * SKB onto the wire. - */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - int tso_segs = tcp_skb_pcount(skb); - - if (!tso_segs) { - tcp_set_skb_tso_segs(sk, skb); - tso_segs = tcp_skb_pcount(skb); - } - return tso_segs; -} - -/* This checks if the data bearing packet SKB (usually sk->sk_send_head) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(sk, skb); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && - !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -static inline int tcp_skb_is_last(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb->next == (struct sk_buff *)&sk->sk_write_queue; -} - -int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) -{ - struct sk_buff *skb = sk->sk_send_head; - - return (skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), - (tcp_skb_is_last(sk, skb) ? - TCP_NAGLE_PUSH : - tp->nonagle))); -} - - -/* Send _single_ skb sitting at the send head. This function requires - * true push pending frames to setup probe timer etc. - */ -void tcp_push_one(struct sock *sk, unsigned cur_mss) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; - - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - sk->sk_send_head = NULL; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); - return; - } + skb_shinfo(skb)->tso_size = tp->mss_cache; } } @@ -791,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ tp->pmtu_cookie = pmtu; - tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_cache = mss_now; return mss_now; } @@ -803,56 +639,47 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ - -unsigned int tcp_current_mss(struct sock *sk, int large) +unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - unsigned int do_large, mss_now; + u32 mss_now; + u16 xmit_size_goal; + int doing_tso = 0; + + mss_now = tp->mss_cache; + + if (large_allowed && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode) + doing_tso = 1; - mss_now = tp->mss_cache_std; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } - do_large = (large && - (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode); + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - if (do_large) { - unsigned int large_mss, factor, limit; + xmit_size_goal = mss_now; - large_mss = 65535 - tp->af_specific->net_header_len - + if (doing_tso) { + xmit_size_goal = 65535 - + tp->af_specific->net_header_len - tp->ext_header_len - tp->tcp_header_len; - if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), - 68U - tp->tcp_header_len); - - factor = large_mss / mss_now; + if (tp->max_window && + (xmit_size_goal > (tp->max_window >> 1))) + xmit_size_goal = max((tp->max_window >> 1), + 68U - tp->tcp_header_len); - /* Always keep large mss multiple of real mss, but - * do not exceed 1/tso_win_divisor of the congestion window - * so we can keep the ACK clock ticking and minimize - * bursting. - */ - limit = tp->snd_cwnd; - if (sysctl_tcp_tso_win_divisor) - limit /= sysctl_tcp_tso_win_divisor; - limit = max(1U, limit); - if (factor > limit) - factor = limit; - - tp->mss_cache = mss_now * factor; - - mss_now = tp->mss_cache; + xmit_size_goal -= (xmit_size_goal % mss_now); } + tp->xmit_size_goal = xmit_size_goal; - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); return mss_now; } @@ -876,6 +703,251 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) } } +static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +{ + u32 window, cwnd_len; + + window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + cwnd_len = mss_now * cwnd; + return min(window, cwnd_len); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { + tcp_set_skb_tso_segs(sk, skb); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u16 flags; + + /* All of a TSO frame must be composed of paged data. */ + BUG_ON(skb->len != skb->data_len); + + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + if (unlikely(buff == NULL)) + return -ENOMEM; + + buff->truesize = nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, buff); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + __skb_append(skb, buff); + + return 0; +} + +/* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + */ +static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 send_win, cong_win, limit, in_flight; + + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 0; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || + (tp->snd_cwnd <= in_flight)); + + send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that cwnd > in_flight. */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If sk_send_head can be sent fully now, just do it. */ + if (skb->len <= limit) + return 0; + + if (sysctl_tcp_tso_win_divisor) { + u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk /= sysctl_tcp_tso_win_divisor; + if (limit >= chunk) + return 0; + } else { + /* Different approach, try not to defer past a single + * ACK. Receiver should ACK every other full sized + * frame, so if we have space for more than 3 frames + * then send now. + */ + if (limit > tcp_max_burst(tp) * tp->mss_cache) + return 0; + } + + /* Ok, it looks like it is advisable to defer. */ + return 1; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -887,8 +959,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - unsigned int tso_segs, cwnd_quota; - int sent_pkts; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all @@ -903,24 +975,44 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tso_segs = tcp_init_tso_segs(sk, skb); cwnd_quota = tcp_cwnd_test(tp, skb); + if (unlikely(!cwnd_quota)) + goto out; + sent_pkts = 0; + while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) { + BUG_ON(!tso_segs); - while (cwnd_quota >= tso_segs) { - if (unlikely(!tcp_nagle_test(tp, skb, mss_now, - (tcp_skb_is_last(sk, skb) ? - nonagle : TCP_NAGLE_PUSH)))) - break; + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + if (tcp_tso_should_defer(sk, tp, skb)) + break; + } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) - break; + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; - if (unlikely(skb->len > mss_now)) { + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) + break; + } + } else if (unlikely(skb->len > mss_now)) { if (unlikely(tcp_fragment(sk, skb, mss_now))) break; } TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) break; @@ -936,6 +1028,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) * the packet above, tso_segs will no longer be valid. */ cwnd_quota -= tcp_skb_pcount(skb); + + BUG_ON(cwnd_quota < 0); + if (!cwnd_quota) + break; + skb = sk->sk_send_head; if (!skb) break; @@ -946,7 +1043,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) tcp_cwnd_validate(sk, tp); return 0; } - +out: return !tp->packets_out && sk->sk_send_head; } @@ -965,6 +1062,53 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, } } +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + unsigned int tso_segs, cwnd_quota; + + BUG_ON(!skb || skb->len < mss_now); + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); + + if (likely(cwnd_quota)) { + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (unlikely(tso_fragment(sk, skb, limit))) + return; + } + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) + return; + } + + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + update_send_head(sk, tp, skb); + tcp_cwnd_validate(sk, tp); + return; + } + } +} + /* This function returns the amount that we can raise the * usable window based on the following constraints * @@ -1222,7 +1366,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -1284,7 +1427,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -1853,14 +1995,12 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_route_caps & NETIF_F_TSO) { sock_set_flag(sk, SOCK_NO_LARGESEND); sk->sk_route_caps &= ~NETIF_F_TSO; - tp->mss_cache = tp->mss_cache_std; } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); -- cgit v1.2.3 From 908a75c17a9e5a888347c2c1d3572203d1b1c7db Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Jul 2005 15:43:58 -0700 Subject: [TCP]: Never TSO defer under periods of congestion. Congestion window recover after loss depends upon the fact that if we have a full MSS sized frame at the head of the send queue, we will send it. TSO deferral can defeat the ACK clocking necessary to exit cleanly from recovery. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fd3ce38184ae..e041d057ec86 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -909,6 +909,9 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) return 0; + if (tp->ca_state != TCP_CA_Open) + return 0; + in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1 || -- cgit v1.2.3 From 86a76caf8705e3524e15f343f3c4806939a06dc8 Mon Sep 17 00:00:00 2001 From: Victor Fusco Date: Fri, 8 Jul 2005 14:57:47 -0700 Subject: [NET]: Fix sparse warnings From: Victor Fusco Fix the sparse warning "implicit cast to nocast type" Signed-off-by: Victor Fusco Signed-off-by: Domen Puncer Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_output.c') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e041d057ec86..e3f8ea1bfa9c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1613,7 +1613,7 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM */ -void tcp_send_active_reset(struct sock *sk, int priority) +void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; -- cgit v1.2.3