From 6f73601efb35c7003f5c58c2bc6fd08f3652169c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Fri, 19 Oct 2012 15:14:44 +0000
Subject: tcp: add SYN/data info to TCP_INFO

Add a bit TCPI_OPT_SYN_DATA (32) to the socket option TCP_INFO:tcpi_options.
It's set if the data in SYN (sent or received) is acked by SYN-ACK. Server or
client application can use this information to check Fast Open success rate.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c           | 2 ++
 net/ipv4/tcp_input.c     | 1 +
 net/ipv4/tcp_ipv4.c      | 1 +
 net/ipv4/tcp_minisocks.c | 1 +
 4 files changed, 5 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b7c2f439b54f..197c0008503c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2764,6 +2764,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_ECN;
 	if (tp->ecn_flags & TCP_ECN_SEEN)
 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+	if (tp->syn_data_acked)
+		info->tcpi_options |= TCPI_OPT_SYN_DATA;
 
 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
 	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 432c36649db3..036f85738141 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5646,6 +5646,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		tcp_rearm_rto(sk);
 		return true;
 	}
+	tp->syn_data_acked = tp->syn_data;
 	return false;
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ef998b008a57..0c4a64355603 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1461,6 +1461,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
 		skb_set_owner_r(skb, child);
 		__skb_queue_tail(&child->sk_receive_queue, skb);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		tp->syn_data_acked = 1;
 	}
 	sk->sk_data_ready(sk, 0);
 	bh_unlock_sock(child);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 27536ba16c9d..a7302d974f32 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -510,6 +510,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
 		newtp->fastopen_rsk = NULL;
+		newtp->syn_data_acked = 0;
 
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
-- 
cgit v1.2.3


From 37561f68bd527ec39076e32effdc7b1dcdfb17ea Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Mon, 22 Oct 2012 11:26:36 +0000
Subject: tcp: Reject invalid ack_seq to Fast Open sockets

A packet with an invalid ack_seq may cause a TCP Fast Open socket to switch
to the unexpected TCP_CLOSING state, triggering a BUG_ON kernel panic.

When a FIN packet with an invalid ack_seq# arrives at a socket in
the TCP_FIN_WAIT1 state, rather than discarding the packet, the current
code will accept the FIN, causing state transition to TCP_CLOSING.

This may be a small deviation from RFC793, which seems to say that the
packet should be dropped. Unfortunately I did not expect this case for
Fast Open hence it will trigger a BUG_ON panic.

It turns out there is really nothing bad about a TFO socket going into
TCP_CLOSING state so I could just remove the BUG_ON statements. But after
some thought I think it's better to treat this case like TCP_SYN_RECV
and return a RST to the confused peer who caused the unacceptable ack_seq
to be generated in the first place.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 12 ++++++++++--
 net/ipv4/tcp_timer.c |  4 ++--
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 036f85738141..1db663983587 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5964,7 +5964,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 	req = tp->fastopen_rsk;
 	if (req != NULL) {
-		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
 		    sk->sk_state != TCP_FIN_WAIT1);
 
 		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
@@ -6053,7 +6053,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			 * ACK we have received, this would have acknowledged
 			 * our SYNACK so stop the SYNACK timer.
 			 */
-			if (acceptable && req != NULL) {
+			if (req != NULL) {
+				/* Return RST if ack_seq is invalid.
+				 * Note that RFC793 only says to generate a
+				 * DUPACK for it but for TCP Fast Open it seems
+				 * better to treat this case like TCP_SYN_RECV
+				 * above.
+				 */
+				if (!acceptable)
+					return 1;
 				/* We no longer need the request sock. */
 				reqsk_fastopen_remove(sk, req, false);
 				tcp_rearm_rto(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index fc04711e80c8..d47c1b4421a3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -347,8 +347,8 @@ void tcp_retransmit_timer(struct sock *sk)
 		return;
 	}
 	if (tp->fastopen_rsk) {
-		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
-		    sk->sk_state != TCP_FIN_WAIT1);
+		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
+			     sk->sk_state != TCP_FIN_WAIT1);
 		tcp_fastopen_synack_timer(sk);
 		/* Before we receive ACK to our SYN-ACK don't retransmit
 		 * anything else (e.g., data or FIN segments).
-- 
cgit v1.2.3


From 38fe36a248ec3228f8e6507955d7ceb0432d2000 Mon Sep 17 00:00:00 2001
From: Ulrich Weber <ulrich.weber@sophos.com>
Date: Thu, 25 Oct 2012 05:34:45 +0000
Subject: netfilter: nf_nat: don't check for port change on ICMP tuples

ICMP tuples have id in src and type/code in dst.
So comparing src.u.all with dst.u.all will always fail here
and ip_xfrm_me_harder() is called for every ICMP packet,
even if there was no NAT.

Signed-off-by: Ulrich Weber <ulrich.weber@sophos.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/iptable_nat.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 9e0ffaf1d942..a82047282dbb 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -184,7 +184,8 @@ nf_nat_ipv4_out(unsigned int hooknum,
 
 		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
 		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
-		    (ct->tuplehash[dir].tuple.src.u.all !=
+		    (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+		     ct->tuplehash[dir].tuple.src.u.all !=
 		     ct->tuplehash[!dir].tuple.dst.u.all))
 			if (nf_xfrm_me_harder(skb, AF_INET) < 0)
 				ret = NF_DROP;
@@ -221,6 +222,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum,
 		}
 #ifdef CONFIG_XFRM
 		else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+			 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
 			 ct->tuplehash[dir].tuple.dst.u.all !=
 			 ct->tuplehash[!dir].tuple.src.u.all)
 			if (nf_xfrm_me_harder(skb, AF_INET) < 0)
-- 
cgit v1.2.3


From 8f363b77ee4fbf7c3bbcf5ec2c5ca482d396d664 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 31 Oct 2012 02:45:32 +0000
Subject: net: fix divide by zero in tcp algorithm illinois

Reading TCP stats when using TCP Illinois congestion control algorithm
can cause a divide by zero kernel oops.

The division by zero occur in tcp_illinois_info() at:
 do_div(t, ca->cnt_rtt);
where ca->cnt_rtt can become zero (when rtt_reset is called)

Steps to Reproduce:
 1. Register tcp_illinois:
     # sysctl -w net.ipv4.tcp_congestion_control=illinois
 2. Monitor internal TCP information via command "ss -i"
     # watch -d ss -i
 3. Establish new TCP conn to machine

Either it fails at the initial conn, or else it needs to wait
for a loss or a reset.

This is only related to reading stats.  The function avg_delay() also
performs the same divide, but is guarded with a (ca->cnt_rtt > 0) at its
calling point in update_params().  Thus, simply fix tcp_illinois_info().

Function tcp_illinois_info() / get_info() is called without
socket lock.  Thus, eliminate any race condition on ca->cnt_rtt
by using a local stack variable.  Simply reuse info.tcpv_rttcnt,
as its already set to ca->cnt_rtt.
Function avg_delay() is not affected by this race condition, as
its called with the socket lock.

Cc: Petr Matousek <pmatouse@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_illinois.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 813b43a76fec..834857f3c871 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -313,11 +313,13 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
 			.tcpv_rttcnt = ca->cnt_rtt,
 			.tcpv_minrtt = ca->base_rtt,
 		};
-		u64 t = ca->sum_rtt;
 
-		do_div(t, ca->cnt_rtt);
-		info.tcpv_rtt = t;
+		if (info.tcpv_rttcnt > 0) {
+			u64 t = ca->sum_rtt;
 
+			do_div(t, info.tcpv_rttcnt);
+			info.tcpv_rtt = t;
+		}
 		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
 	}
 }
-- 
cgit v1.2.3


From 2c42a3fb30845867bfcaf0747ff50c1375884ff2 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Tue, 30 Oct 2012 12:03:09 +0000
Subject: tcp: Fix double sizeof in new tcp_metrics code

Fix double sizeof when parsing IPv6 address from
user space because it breaks get/del by specific IPv6 address.

	Problem noticed by David Binderman:

https://bugzilla.kernel.org/show_bug.cgi?id=49171

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4c752a6e0bcd..53bc5847bfa8 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -864,7 +864,7 @@ static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
 	}
 	a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
 	if (a) {
-		if (nla_len(a) != sizeof(sizeof(struct in6_addr)))
+		if (nla_len(a) != sizeof(struct in6_addr))
 			return -EINVAL;
 		addr->family = AF_INET6;
 		memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
-- 
cgit v1.2.3


From c454e6111d1ef4268fe98e87087216e51c2718c3 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Mon, 29 Oct 2012 05:05:33 +0000
Subject: tcp-repair: Handle zero-length data put in rcv queue

When sending data into a tcp socket in repair state we should check
for the amount of data being 0 explicitly. Otherwise we'll have an skb
with seq == end_seq in rcv queue, but tcp doesn't expect this to happen
(in particular a warn_on in tcp_recvmsg shoots).

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Reported-by: Giorgos Mavrikas <gmavrikas@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1db663983587..2c2b13a999ea 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4529,6 +4529,9 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	struct tcphdr *th;
 	bool fragstolen;
 
+	if (size == 0)
+		return 0;
+
 	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
 	if (!skb)
 		goto err;
-- 
cgit v1.2.3


From cacb6ba0f36ab14a507f4ee7697e8332899015d2 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 3 Nov 2012 09:30:34 +0000
Subject: net: inet_diag -- Return error code if protocol handler is missed

We've observed that in case if UDP diag module is not
supported in kernel the netlink returns NLMSG_DONE without
notifying a caller that handler is missed.

This patch makes __inet_diag_dump to return error code instead.

So as example it become possible to detect such situation
and handle it gracefully on userspace level.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: David Miller <davem@davemloft.net>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_diag.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 535584c00f91..0c34bfabc11f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -892,13 +892,16 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
 	const struct inet_diag_handler *handler;
+	int err = 0;
 
 	handler = inet_diag_lock_handler(r->sdiag_protocol);
 	if (!IS_ERR(handler))
 		handler->dump(skb, cb, r, bc);
+	else
+		err = PTR_ERR(handler);
 	inet_diag_unlock_handler(handler);
 
-	return skb->len;
+	return err ? : skb->len;
 }
 
 static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
-- 
cgit v1.2.3