From 6f73601efb35c7003f5c58c2bc6fd08f3652169c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 19 Oct 2012 15:14:44 +0000 Subject: tcp: add SYN/data info to TCP_INFO Add a bit TCPI_OPT_SYN_DATA (32) to the socket option TCP_INFO:tcpi_options. It's set if the data in SYN (sent or received) is acked by SYN-ACK. Server or client application can use this information to check Fast Open success rate. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 1 + 4 files changed, 5 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b7c2f439b54f..197c0008503c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2764,6 +2764,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 432c36649db3..036f85738141 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5646,6 +5646,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_rearm_rto(sk); return true; } + tp->syn_data_acked = tp->syn_data; return false; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ef998b008a57..0c4a64355603 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1461,6 +1461,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, skb_set_owner_r(skb, child); __skb_queue_tail(&child->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tp->syn_data_acked = 1; } sk->sk_data_ready(sk, 0); bh_unlock_sock(child); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 27536ba16c9d..a7302d974f32 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -510,6 +510,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } -- cgit v1.2.3 From 37561f68bd527ec39076e32effdc7b1dcdfb17ea Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Mon, 22 Oct 2012 11:26:36 +0000 Subject: tcp: Reject invalid ack_seq to Fast Open sockets A packet with an invalid ack_seq may cause a TCP Fast Open socket to switch to the unexpected TCP_CLOSING state, triggering a BUG_ON kernel panic. When a FIN packet with an invalid ack_seq# arrives at a socket in the TCP_FIN_WAIT1 state, rather than discarding the packet, the current code will accept the FIN, causing state transition to TCP_CLOSING. This may be a small deviation from RFC793, which seems to say that the packet should be dropped. Unfortunately I did not expect this case for Fast Open hence it will trigger a BUG_ON panic. It turns out there is really nothing bad about a TFO socket going into TCP_CLOSING state so I could just remove the BUG_ON statements. But after some thought I think it's better to treat this case like TCP_SYN_RECV and return a RST to the confused peer who caused the unacceptable ack_seq to be generated in the first place. Signed-off-by: H.K. Jerry Chu Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 ++++++++++-- net/ipv4/tcp_timer.c | 4 ++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 036f85738141..1db663983587 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5964,7 +5964,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, req = tp->fastopen_rsk; if (req != NULL) { - BUG_ON(sk->sk_state != TCP_SYN_RECV && + WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); if (tcp_check_req(sk, skb, req, NULL, true) == NULL) @@ -6053,7 +6053,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * ACK we have received, this would have acknowledged * our SYNACK so stop the SYNACK timer. */ - if (acceptable && req != NULL) { + if (req != NULL) { + /* Return RST if ack_seq is invalid. + * Note that RFC793 only says to generate a + * DUPACK for it but for TCP Fast Open it seems + * better to treat this case like TCP_SYN_RECV + * above. + */ + if (!acceptable) + return 1; /* We no longer need the request sock. */ reqsk_fastopen_remove(sk, req, false); tcp_rearm_rto(sk); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index fc04711e80c8..d47c1b4421a3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -347,8 +347,8 @@ void tcp_retransmit_timer(struct sock *sk) return; } if (tp->fastopen_rsk) { - BUG_ON(sk->sk_state != TCP_SYN_RECV && - sk->sk_state != TCP_FIN_WAIT1); + WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && + sk->sk_state != TCP_FIN_WAIT1); tcp_fastopen_synack_timer(sk); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). -- cgit v1.2.3 From 38fe36a248ec3228f8e6507955d7ceb0432d2000 Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Thu, 25 Oct 2012 05:34:45 +0000 Subject: netfilter: nf_nat: don't check for port change on ICMP tuples ICMP tuples have id in src and type/code in dst. So comparing src.u.all with dst.u.all will always fail here and ip_xfrm_me_harder() is called for every ICMP packet, even if there was no NAT. Signed-off-by: Ulrich Weber Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/iptable_nat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 9e0ffaf1d942..a82047282dbb 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -184,7 +184,8 @@ nf_nat_ipv4_out(unsigned int hooknum, if ((ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) || - (ct->tuplehash[dir].tuple.src.u.all != + (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && + ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) if (nf_xfrm_me_harder(skb, AF_INET) < 0) ret = NF_DROP; @@ -221,6 +222,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum, } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && + ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) if (nf_xfrm_me_harder(skb, AF_INET) < 0) -- cgit v1.2.3 From 8f363b77ee4fbf7c3bbcf5ec2c5ca482d396d664 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 31 Oct 2012 02:45:32 +0000 Subject: net: fix divide by zero in tcp algorithm illinois Reading TCP stats when using TCP Illinois congestion control algorithm can cause a divide by zero kernel oops. The division by zero occur in tcp_illinois_info() at: do_div(t, ca->cnt_rtt); where ca->cnt_rtt can become zero (when rtt_reset is called) Steps to Reproduce: 1. Register tcp_illinois: # sysctl -w net.ipv4.tcp_congestion_control=illinois 2. Monitor internal TCP information via command "ss -i" # watch -d ss -i 3. Establish new TCP conn to machine Either it fails at the initial conn, or else it needs to wait for a loss or a reset. This is only related to reading stats. The function avg_delay() also performs the same divide, but is guarded with a (ca->cnt_rtt > 0) at its calling point in update_params(). Thus, simply fix tcp_illinois_info(). Function tcp_illinois_info() / get_info() is called without socket lock. Thus, eliminate any race condition on ca->cnt_rtt by using a local stack variable. Simply reuse info.tcpv_rttcnt, as its already set to ca->cnt_rtt. Function avg_delay() is not affected by this race condition, as its called with the socket lock. Cc: Petr Matousek Signed-off-by: Jesper Dangaard Brouer Acked-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_illinois.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 813b43a76fec..834857f3c871 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -313,11 +313,13 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, .tcpv_rttcnt = ca->cnt_rtt, .tcpv_minrtt = ca->base_rtt, }; - u64 t = ca->sum_rtt; - do_div(t, ca->cnt_rtt); - info.tcpv_rtt = t; + if (info.tcpv_rttcnt > 0) { + u64 t = ca->sum_rtt; + do_div(t, info.tcpv_rttcnt); + info.tcpv_rtt = t; + } nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); } } -- cgit v1.2.3 From 2c42a3fb30845867bfcaf0747ff50c1375884ff2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 30 Oct 2012 12:03:09 +0000 Subject: tcp: Fix double sizeof in new tcp_metrics code Fix double sizeof when parsing IPv6 address from user space because it breaks get/del by specific IPv6 address. Problem noticed by David Binderman: https://bugzilla.kernel.org/show_bug.cgi?id=49171 Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4c752a6e0bcd..53bc5847bfa8 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -864,7 +864,7 @@ static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, } a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; if (a) { - if (nla_len(a) != sizeof(sizeof(struct in6_addr))) + if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; addr->family = AF_INET6; memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); -- cgit v1.2.3 From c454e6111d1ef4268fe98e87087216e51c2718c3 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 29 Oct 2012 05:05:33 +0000 Subject: tcp-repair: Handle zero-length data put in rcv queue When sending data into a tcp socket in repair state we should check for the amount of data being 0 explicitly. Otherwise we'll have an skb with seq == end_seq in rcv queue, but tcp doesn't expect this to happen (in particular a warn_on in tcp_recvmsg shoots). Signed-off-by: Pavel Emelyanov Reported-by: Giorgos Mavrikas Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1db663983587..2c2b13a999ea 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4529,6 +4529,9 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) struct tcphdr *th; bool fragstolen; + if (size == 0) + return 0; + skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); if (!skb) goto err; -- cgit v1.2.3 From cacb6ba0f36ab14a507f4ee7697e8332899015d2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 3 Nov 2012 09:30:34 +0000 Subject: net: inet_diag -- Return error code if protocol handler is missed We've observed that in case if UDP diag module is not supported in kernel the netlink returns NLMSG_DONE without notifying a caller that handler is missed. This patch makes __inet_diag_dump to return error code instead. So as example it become possible to detect such situation and handle it gracefully on userspace level. Signed-off-by: Cyrill Gorcunov CC: David Miller CC: Eric Dumazet CC: Pavel Emelyanov Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 535584c00f91..0c34bfabc11f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -892,13 +892,16 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc) { const struct inet_diag_handler *handler; + int err = 0; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r, bc); + else + err = PTR_ERR(handler); inet_diag_unlock_handler(handler); - return skb->len; + return err ? : skb->len; } static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) -- cgit v1.2.3