diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-02 13:38:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-02 13:38:27 -0700 |
commit | aecdc33e111b2c447b622e287c6003726daa1426 (patch) | |
tree | 3e7657eae4b785e1a1fb5dfb225dbae0b2f0cfc6 /net/ipv4/tcp_input.c | |
parent | a20acf99f75e49271381d65db097c9763060a1e8 (diff) | |
parent | a3a6cab5ea10cca64d036851fe0d932448f2fe4f (diff) | |
download | linux-stable-aecdc33e111b2c447b622e287c6003726daa1426.tar.gz linux-stable-aecdc33e111b2c447b622e287c6003726daa1426.tar.bz2 linux-stable-aecdc33e111b2c447b622e287c6003726daa1426.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David Miller:
1) GRE now works over ipv6, from Dmitry Kozlov.
2) Make SCTP more network namespace aware, from Eric Biederman.
3) TEAM driver now works with non-ethernet devices, from Jiri Pirko.
4) Make openvswitch network namespace aware, from Pravin B Shelar.
5) IPV6 NAT implementation, from Patrick McHardy.
6) Server side support for TCP Fast Open, from Jerry Chu and others.
7) Packet BPF filter supports MOD and XOR, from Eric Dumazet and Daniel
Borkmann.
8) Increate the loopback default MTU to 64K, from Eric Dumazet.
9) Use a per-task rather than per-socket page fragment allocator for
outgoing networking traffic. This benefits processes that have very
many mostly idle sockets, which is quite common.
From Eric Dumazet.
10) Use up to 32K for page fragment allocations, with fallbacks to
smaller sizes when higher order page allocations fail. Benefits are
a) less segments for driver to process b) less calls to page
allocator c) less waste of space.
From Eric Dumazet.
11) Allow GRO to be used on GRE tunnels, from Eric Dumazet.
12) VXLAN device driver, one way to handle VLAN issues such as the
limitation of 4096 VLAN IDs yet still have some level of isolation.
From Stephen Hemminger.
13) As usual there is a large boatload of driver changes, with the scale
perhaps tilted towards the wireless side this time around.
Fix up various fairly trivial conflicts, mostly caused by the user
namespace changes.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1012 commits)
hyperv: Add buffer for extended info after the RNDIS response message.
hyperv: Report actual status in receive completion packet
hyperv: Remove extra allocated space for recv_pkt_list elements
hyperv: Fix page buffer handling in rndis_filter_send_request()
hyperv: Fix the missing return value in rndis_filter_set_packet_filter()
hyperv: Fix the max_xfer_size in RNDIS initialization
vxlan: put UDP socket in correct namespace
vxlan: Depend on CONFIG_INET
sfc: Fix the reported priorities of different filter types
sfc: Remove EFX_FILTER_FLAG_RX_OVERRIDE_IP
sfc: Fix loopback self-test with separate_tx_channels=1
sfc: Fix MCDI structure field lookup
sfc: Add parentheses around use of bitfield macro arguments
sfc: Fix null function pointer in efx_sriov_channel_type
vxlan: virtual extensible lan
igmp: export symbol ip_mc_leave_group
netlink: add attributes to fdb interface
tg3: unconditionally select HWMON support when tg3 is enabled.
Revert "net: ti cpsw ethernet: allow reading phy interface mode from DT"
gre: fix sparse warning
...
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 281 |
1 files changed, 148 insertions, 133 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d377f4854cb8..432c36649db3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -237,7 +237,11 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + /* Better not delay acks, sender can have a very low cwnd */ + tcp_enter_quickack_mode((struct sock *)tp); + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + } /* fallinto */ default: tp->ecn_flags |= TCP_ECN_SEEN; @@ -374,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) /* 4. Try to fixup all. It is made immediately after connection enters * established state. */ -static void tcp_init_buffer_space(struct sock *sk) +void tcp_init_buffer_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int maxwin; @@ -739,29 +743,6 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -/* Set slow start threshold and cwnd not falling to slow start */ -void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) -{ - struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - - tp->prior_ssthresh = 0; - tp->bytes_acked = 0; - if (icsk->icsk_ca_state < TCP_CA_CWR) { - tp->undo_marker = 0; - if (set_ssthresh) - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + 1U); - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - tp->snd_cwnd_stamp = tcp_time_stamp; - TCP_ECN_queue_cwr(tp); - - tcp_set_ca_state(sk, TCP_CA_CWR); - } -} - /* * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected @@ -2489,35 +2470,6 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } -/* Lower bound on congestion window is slow start threshold - * unless congestion avoidance choice decides to overide it. - */ -static inline u32 tcp_cwnd_min(const struct sock *sk) -{ - const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - - return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; -} - -/* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - int decr = tp->snd_cwnd_cnt + 1; - - if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || - (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { - tp->snd_cwnd_cnt = decr & 1; - decr >>= 1; - - if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) - tp->snd_cwnd -= decr; - - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); - tp->snd_cwnd_stamp = tcp_time_stamp; - } -} - /* Nothing was retransmitted or returned timestamp is less * than timestamp of the first retransmission. */ @@ -2719,24 +2671,80 @@ static bool tcp_try_undo_loss(struct sock *sk) return false; } -static inline void tcp_complete_cwr(struct sock *sk) +/* The cwnd reduction in CWR and Recovery use the PRR algorithm + * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ + * It computes the number of packets to send (sndcnt) based on packets newly + * delivered: + * 1) If the packets in flight is larger than ssthresh, PRR spreads the + * cwnd reductions across a full RTT. + * 2) If packets in flight is lower than ssthresh (such as due to excess + * losses and/or application stalls), do not perform any further cwnd + * reductions, but instead slow start up to ssthresh. + */ +static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) { struct tcp_sock *tp = tcp_sk(sk); - /* Do not moderate cwnd if it's already undone in cwr or recovery. */ - if (tp->undo_marker) { - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) { - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_stamp = tcp_time_stamp; - } else if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { - /* PRR algorithm. */ - tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_stamp = tcp_time_stamp; - } + tp->high_seq = tp->snd_nxt; + tp->bytes_acked = 0; + tp->snd_cwnd_cnt = 0; + tp->prior_cwnd = tp->snd_cwnd; + tp->prr_delivered = 0; + tp->prr_out = 0; + if (set_ssthresh) + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + TCP_ECN_queue_cwr(tp); +} + +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int fast_rexmit) +{ + struct tcp_sock *tp = tcp_sk(sk); + int sndcnt = 0; + int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); + + tp->prr_delivered += newly_acked_sacked; + if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { + u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + + tp->prior_cwnd - 1; + sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; + } else { + sndcnt = min_t(int, delta, + max_t(int, tp->prr_delivered - tp->prr_out, + newly_acked_sacked) + 1); + } + + sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); + tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; +} + +static inline void tcp_end_cwnd_reduction(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ + if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || + (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_stamp = tcp_time_stamp; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } +/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ +void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->prior_ssthresh = 0; + tp->bytes_acked = 0; + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + tcp_init_cwnd_reduction(sk, set_ssthresh); + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + static void tcp_try_keep_open(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2751,7 +2759,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag) +static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2768,7 +2776,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(sk, flag); + tcp_cwnd_reduction(sk, newly_acked_sacked, 0); } } @@ -2850,38 +2858,6 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); -/* This function implements the PRR algorithm, specifcally the PRR-SSRB - * (proportional rate reduction with slow start reduction bound) as described in - * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt. - * It computes the number of packets to send (sndcnt) based on packets newly - * delivered: - * 1) If the packets in flight is larger than ssthresh, PRR spreads the - * cwnd reductions across a full RTT. - * 2) If packets in flight is lower than ssthresh (such as due to excess - * losses and/or application stalls), do not perform any further cwnd - * reductions, but instead slow start up to ssthresh. - */ -static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked, - int fast_rexmit, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - int sndcnt = 0; - int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); - - if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { - u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + - tp->prior_cwnd - 1; - sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else { - sndcnt = min_t(int, delta, - max_t(int, tp->prr_delivered - tp->prr_out, - newly_acked_sacked) + 1); - } - - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); - tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; -} - static void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); @@ -2894,7 +2870,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); - tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; tp->undo_marker = tp->snd_una; tp->undo_retrans = tp->retrans_out; @@ -2902,15 +2877,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); + tcp_init_cwnd_reduction(sk, true); } - - tp->bytes_acked = 0; - tp->snd_cwnd_cnt = 0; - tp->prior_cwnd = tp->snd_cwnd; - tp->prr_delivered = 0; - tp->prr_out = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -2970,7 +2938,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { - tcp_complete_cwr(sk); + tcp_end_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -2980,7 +2948,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; - tcp_complete_cwr(sk); + tcp_end_cwnd_reduction(sk); break; } } @@ -3021,7 +2989,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag); + tcp_try_to_open(sk, flag, newly_acked_sacked); return; } @@ -3043,8 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) tcp_update_scoreboard(sk, fast_rexmit); - tp->prr_delivered += newly_acked_sacked; - tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag); + tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); tcp_xmit_retransmit_queue(sk); } @@ -3123,6 +3090,12 @@ void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + /* If the retrans timer is currently being used by Fast Open + * for SYN-ACK retrans purpose, stay put. + */ + if (tp->fastopen_rsk) + return; + if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { @@ -3384,7 +3357,7 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); + !tcp_in_cwnd_reduction(sk); } /* Check that window update is acceptable. @@ -3452,9 +3425,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) } /* A conservative spurious RTO response algorithm: reduce cwnd using - * rate halving and continue in congestion avoidance. + * PRR and continue in congestion avoidance. */ -static void tcp_ratehalving_spur_to_response(struct sock *sk) +static void tcp_cwr_spur_to_response(struct sock *sk) { tcp_enter_cwr(sk, 0); } @@ -3462,7 +3435,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk) static void tcp_undo_spur_to_response(struct sock *sk, int flag) { if (flag & FLAG_ECE) - tcp_ratehalving_spur_to_response(sk); + tcp_cwr_spur_to_response(sk); else tcp_undo_cwr(sk, true); } @@ -3569,7 +3542,7 @@ static bool tcp_process_frto(struct sock *sk, int flag) tcp_conservative_spur_to_response(tp); break; default: - tcp_ratehalving_spur_to_response(sk); + tcp_cwr_spur_to_response(sk); break; } tp->frto_counter = 0; @@ -4034,7 +4007,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. */ -static void tcp_reset(struct sock *sk) +void tcp_reset(struct sock *sk) { /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { @@ -5740,7 +5713,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, TCP_ECN_rcv_synack(tp, th); - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and @@ -5753,7 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * never scaled. */ tp->snd_wnd = ntohs(th->window); - tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); if (!tp->rx_opt.wscale_ok) { tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; @@ -5891,7 +5863,9 @@ discard: tcp_send_synack(sk); #if 0 /* Note, we could accept data and URG from this segment. - * There are no obstacles to make this. + * There are no obstacles to make this (except that we must + * either change tcp_recvmsg() to prevent it from returning data + * before 3WHS completes per RFC793, or employ TCP Fast Open). * * However, if we ignore data in ACKless segments sometimes, * we have no reasons to accept it sometimes. @@ -5931,6 +5905,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *req; int queued = 0; tp->rx_opt.saw_tstamp = 0; @@ -5986,6 +5961,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 0; } + req = tp->fastopen_rsk; + if (req != NULL) { + BUG_ON(sk->sk_state != TCP_SYN_RECV && + sk->sk_state != TCP_FIN_WAIT1); + + if (tcp_check_req(sk, skb, req, NULL, true) == NULL) + goto discard; + } if (!tcp_validate_incoming(sk, skb, th, 0)) return 0; @@ -5996,7 +5979,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, switch (sk->sk_state) { case TCP_SYN_RECV: if (acceptable) { - tp->copied_seq = tp->rcv_nxt; + /* Once we leave TCP_SYN_RECV, we no longer + * need req so release it. + */ + if (req) { + tcp_synack_rtt_meas(sk, req); + tp->total_retrans = req->retrans; + + reqsk_fastopen_remove(sk, req, false); + } else { + /* Make sure socket is routed, for + * correct metrics. + */ + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_congestion_control(sk); + + tcp_mtup_init(sk); + tcp_init_buffer_space(sk); + tp->copied_seq = tp->rcv_nxt; + } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); @@ -6018,23 +6019,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - /* Make sure socket is routed, for - * correct metrics. - */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - - tcp_init_congestion_control(sk); + if (req) { + /* Re-arm the timer because data may + * have been sent out. This is similar + * to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more + * aggressive and retranmitting any data + * sooner based on when they were sent + * out. + */ + tcp_rearm_rto(sk); + } else + tcp_init_metrics(sk); /* Prevent spurious tcp_cwnd_restart() on * first data packet. */ tp->lsndtime = tcp_time_stamp; - tcp_mtup_init(sk); tcp_initialize_rcv_mss(sk); - tcp_init_buffer_space(sk); tcp_fast_path_on(tp); } else { return 1; @@ -6042,6 +6047,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_FIN_WAIT1: + /* If we enter the TCP_FIN_WAIT1 state and we are a + * Fast Open socket and this is the first acceptable + * ACK we have received, this would have acknowledged + * our SYNACK so stop the SYNACK timer. + */ + if (acceptable && req != NULL) { + /* We no longer need the request sock. */ + reqsk_fastopen_remove(sk, req, false); + tcp_rearm_rto(sk); + } if (tp->snd_una == tp->write_seq) { struct dst_entry *dst; |