From 9cd3e072b0be17446e37d7414eac8a3499e0601e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 29 Nov 2015 20:03:10 -0800
Subject: net: rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA

This patch is a cleanup to make following patch easier to
review.

Goal is to move SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA
from (struct socket)->flags to a (struct socket_wq)->flags
to benefit from RCU protection in sock_wake_async()

To ease backports, we rename both constants.

Two new helpers, sk_set_bit(int nr, struct sock *sk)
and sk_clear_bit(int net, struct sock *sk) are added so that
following patch can change their implementation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 7f89e4ba18d1..c155d09d8af4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2005,6 +2005,16 @@ static inline unsigned long sock_wspace(struct sock *sk)
 	return amt;
 }
 
+static inline void sk_set_bit(int nr, struct sock *sk)
+{
+	set_bit(nr, &sk->sk_socket->flags);
+}
+
+static inline void sk_clear_bit(int nr, struct sock *sk)
+{
+	clear_bit(nr, &sk->sk_socket->flags);
+}
+
 static inline void sk_wake_async(struct sock *sk, int how, int band)
 {
 	if (sock_flag(sk, SOCK_FASYNC))
-- 
cgit v1.2.3


From ceb5d58b217098a657f3850b7a2640f995032e62 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 29 Nov 2015 20:03:11 -0800
Subject: net: fix sock_wake_async() rcu protection

Dmitry provided a syzkaller (http://github.com/google/syzkaller)
triggering a fault in sock_wake_async() when async IO is requested.

Said program stressed af_unix sockets, but the issue is generic
and should be addressed in core networking stack.

The problem is that by the time sock_wake_async() is called,
we should not access the @flags field of 'struct socket',
as the inode containing this socket might be freed without
further notice, and without RCU grace period.

We already maintain an RCU protected structure, "struct socket_wq"
so moving SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA into it
is the safe route.

It also reduces number of cache lines needing dirtying, so might
provide a performance improvement anyway.

In followup patches, we might move remaining flags (SOCK_NOSPACE,
SOCK_PASSCRED, SOCK_PASSSEC) to save 8 bytes and let 'struct socket'
being mostly read and let it being shared between cpus.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h |  7 ++++++-
 include/net/sock.h  | 23 ++++++++++++++++-------
 net/core/stream.c   |  2 +-
 net/sctp/socket.c   | 24 ++++++++++++++----------
 net/socket.c        | 21 +++++++--------------
 5 files changed, 44 insertions(+), 33 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/linux/net.h b/include/linux/net.h
index f514e4dd5521..0b4ac7da583a 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -34,6 +34,10 @@ struct inode;
 struct file;
 struct net;
 
+/* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located
+ * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected.
+ * Eventually all flags will be in sk->sk_wq_flags.
+ */
 #define SOCKWQ_ASYNC_NOSPACE	0
 #define SOCKWQ_ASYNC_WAITDATA	1
 #define SOCK_NOSPACE		2
@@ -89,6 +93,7 @@ struct socket_wq {
 	/* Note: wait MUST be first field of socket_wq */
 	wait_queue_head_t	wait;
 	struct fasync_struct	*fasync_list;
+	unsigned long		flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */
 	struct rcu_head		rcu;
 } ____cacheline_aligned_in_smp;
 
@@ -202,7 +207,7 @@ enum {
 	SOCK_WAKE_URG,
 };
 
-int sock_wake_async(struct socket *sk, int how, int band);
+int sock_wake_async(struct socket_wq *sk_wq, int how, int band);
 int sock_register(const struct net_proto_family *fam);
 void sock_unregister(int family);
 int __sock_create(struct net *net, int family, int type, int proto,
diff --git a/include/net/sock.h b/include/net/sock.h
index c155d09d8af4..0434138c5f95 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -384,8 +384,10 @@ struct sock {
 	int			sk_rcvbuf;
 
 	struct sk_filter __rcu	*sk_filter;
-	struct socket_wq __rcu	*sk_wq;
-
+	union {
+		struct socket_wq __rcu	*sk_wq;
+		struct socket_wq	*sk_wq_raw;
+	};
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
 #endif
@@ -2005,20 +2007,27 @@ static inline unsigned long sock_wspace(struct sock *sk)
 	return amt;
 }
 
+/* Note:
+ *  We use sk->sk_wq_raw, from contexts knowing this
+ *  pointer is not NULL and cannot disappear/change.
+ */
 static inline void sk_set_bit(int nr, struct sock *sk)
 {
-	set_bit(nr, &sk->sk_socket->flags);
+	set_bit(nr, &sk->sk_wq_raw->flags);
 }
 
 static inline void sk_clear_bit(int nr, struct sock *sk)
 {
-	clear_bit(nr, &sk->sk_socket->flags);
+	clear_bit(nr, &sk->sk_wq_raw->flags);
 }
 
-static inline void sk_wake_async(struct sock *sk, int how, int band)
+static inline void sk_wake_async(const struct sock *sk, int how, int band)
 {
-	if (sock_flag(sk, SOCK_FASYNC))
-		sock_wake_async(sk->sk_socket, how, band);
+	if (sock_flag(sk, SOCK_FASYNC)) {
+		rcu_read_lock();
+		sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
+		rcu_read_unlock();
+	}
 }
 
 /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
diff --git a/net/core/stream.c b/net/core/stream.c
index 43309428644d..b96f7a79e544 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk)
 			wake_up_interruptible_poll(&wq->wait, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
-			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
 		rcu_read_unlock();
 	}
 }
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 2353985d689c..5e35ef34008b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6801,26 +6801,30 @@ no_packet:
 static void __sctp_write_space(struct sctp_association *asoc)
 {
 	struct sock *sk = asoc->base.sk;
-	struct socket *sock = sk->sk_socket;
 
-	if ((sctp_wspace(asoc) > 0) && sock) {
-		if (waitqueue_active(&asoc->wait))
-			wake_up_interruptible(&asoc->wait);
+	if (sctp_wspace(asoc) <= 0)
+		return;
+
+	if (waitqueue_active(&asoc->wait))
+		wake_up_interruptible(&asoc->wait);
 
-		if (sctp_writeable(sk)) {
-			wait_queue_head_t *wq = sk_sleep(sk);
+	if (sctp_writeable(sk)) {
+		struct socket_wq *wq;
 
-			if (wq && waitqueue_active(wq))
-				wake_up_interruptible(wq);
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq) {
+			if (waitqueue_active(&wq->wait))
+				wake_up_interruptible(&wq->wait);
 
 			/* Note that we try to include the Async I/O support
 			 * here by modeling from the current TCP/UDP code.
 			 * We have not tested with it yet.
 			 */
 			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
-				sock_wake_async(sock,
-						SOCK_WAKE_SPACE, POLL_OUT);
+				sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
 		}
+		rcu_read_unlock();
 	}
 }
 
diff --git a/net/socket.c b/net/socket.c
index 16be908205fc..456fadb3d819 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1056,27 +1056,20 @@ static int sock_fasync(int fd, struct file *filp, int on)
 	return 0;
 }
 
-/* This function may be called only under socket lock or callback_lock or rcu_lock */
+/* This function may be called only under rcu_lock */
 
-int sock_wake_async(struct socket *sock, int how, int band)
+int sock_wake_async(struct socket_wq *wq, int how, int band)
 {
-	struct socket_wq *wq;
-
-	if (!sock)
-		return -1;
-	rcu_read_lock();
-	wq = rcu_dereference(sock->wq);
-	if (!wq || !wq->fasync_list) {
-		rcu_read_unlock();
+	if (!wq || !wq->fasync_list)
 		return -1;
-	}
+
 	switch (how) {
 	case SOCK_WAKE_WAITD:
-		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags))
+		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
 			break;
 		goto call_kill;
 	case SOCK_WAKE_SPACE:
-		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags))
+		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
 			break;
 		/* fall through */
 	case SOCK_WAKE_IO:
@@ -1086,7 +1079,7 @@ call_kill:
 	case SOCK_WAKE_URG:
 		kill_fasync(&wq->fasync_list, SIGURG, band);
 	}
-	rcu_read_unlock();
+
 	return 0;
 }
 EXPORT_SYMBOL(sock_wake_async);
-- 
cgit v1.2.3


From 6bd4f355df2eae80b8a5c7b097371cd1e05f20d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 Dec 2015 21:53:57 -0800
Subject: ipv6: kill sk_dst_lock

While testing the np->opt RCU conversion, I found that UDP/IPv6 was
using a mixture of xchg() and sk_dst_lock to protect concurrent changes
to sk->sk_dst_cache, leading to possible corruptions and crashes.

ip6_sk_dst_lookup_flow() uses sk_dst_check() anyway, so the simplest
way to fix the mess is to remove sk_dst_lock completely, as we did for
IPv4.

__ip6_dst_store() and ip6_dst_store() share same implementation.

sk_setup_caps() being called with socket lock being held or not,
we have to use sk_dst_set() instead of __sk_dst_set()

Note that I had to move the "np->dst_cookie = rt6_get_cookie(rt);"
in ip6_dst_store() before the sk_setup_caps(sk, dst) call.

This is because ip6_dst_store() can be called from process context,
without any lock held.

As soon as the dst is installed in sk->sk_dst_cache, dst can be freed
from another cpu doing a concurrent ip6_dst_store()

Doing the dst dereference before doing the install is needed to make
sure no use after free would trigger.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h          | 17 ++++-------------
 include/net/sock.h               |  3 +--
 net/core/sock.c                  |  4 +---
 net/dccp/ipv6.c                  |  4 ++--
 net/ipv6/af_inet6.c              |  2 +-
 net/ipv6/icmp.c                  | 14 --------------
 net/ipv6/inet6_connection_sock.c | 10 +---------
 net/ipv6/tcp_ipv6.c              |  4 ++--
 8 files changed, 12 insertions(+), 46 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 2bfb2ad2fab1..877f682989b8 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -133,27 +133,18 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 /*
  *	Store a destination cache entry in a socket
  */
-static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-				   const struct in6_addr *daddr,
-				   const struct in6_addr *saddr)
+static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *saddr)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct rt6_info *rt = (struct rt6_info *) dst;
 
+	np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst);
 	sk_setup_caps(sk, dst);
 	np->daddr_cache = daddr;
 #ifdef CONFIG_IPV6_SUBTREES
 	np->saddr_cache = saddr;
 #endif
-	np->dst_cookie = rt6_get_cookie(rt);
-}
-
-static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-				 struct in6_addr *daddr, struct in6_addr *saddr)
-{
-	spin_lock(&sk->sk_dst_lock);
-	__ip6_dst_store(sk, dst, daddr, saddr);
-	spin_unlock(&sk->sk_dst_lock);
 }
 
 static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
diff --git a/include/net/sock.h b/include/net/sock.h
index 0434138c5f95..52d27ee924f4 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -254,7 +254,6 @@ struct cg_proto;
   *	@sk_wq: sock wait queue and async head
   *	@sk_rx_dst: receive input route used by early demux
   *	@sk_dst_cache: destination cache
-  *	@sk_dst_lock: destination cache lock
   *	@sk_policy: flow policy
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
@@ -393,7 +392,7 @@ struct sock {
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
-	spinlock_t		sk_dst_lock;
+	/* Note: 32bit hole on 64bit arches */
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
diff --git a/net/core/sock.c b/net/core/sock.c
index 9d79569935a3..e31dfcee1729 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1530,7 +1530,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		skb_queue_head_init(&newsk->sk_receive_queue);
 		skb_queue_head_init(&newsk->sk_write_queue);
 
-		spin_lock_init(&newsk->sk_dst_lock);
 		rwlock_init(&newsk->sk_callback_lock);
 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
 				af_callback_keys + newsk->sk_family,
@@ -1607,7 +1606,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
 	u32 max_segs = 1;
 
-	__sk_dst_set(sk, dst);
+	sk_dst_set(sk, dst);
 	sk->sk_route_caps = dst->dev->features;
 	if (sk->sk_route_caps & NETIF_F_GSO)
 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
@@ -2388,7 +2387,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	} else
 		sk->sk_wq	=	NULL;
 
-	spin_lock_init(&sk->sk_dst_lock);
 	rwlock_init(&sk->sk_callback_lock);
 	lockdep_set_class_and_name(&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index e7e0b9bc2a43..9c6d0508e63a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -459,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 	 * comment in that function for the gory details. -acme
 	 */
 
-	__ip6_dst_store(newsk, dst, NULL, NULL);
+	ip6_dst_store(newsk, dst, NULL, NULL);
 	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
 						      NETIF_F_TSO);
 	newdp6 = (struct dccp6_sock *)newsk;
@@ -883,7 +883,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	np->saddr = *saddr;
 	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
-	__ip6_dst_store(sk, dst, NULL, NULL);
+	ip6_dst_store(sk, dst, NULL, NULL);
 
 	icsk->icsk_ext_hdr_len = 0;
 	if (opt)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 38d66ddfb937..8ec0df75f1c4 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -673,7 +673,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 			return PTR_ERR(dst);
 		}
 
-		__ip6_dst_store(sk, dst, NULL, NULL);
+		ip6_dst_store(sk, dst, NULL, NULL);
 	}
 
 	return 0;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 36c5a98b0472..0a37ddc7af51 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -834,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
 	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
 }
 
-/*
- * Special lock-class for __icmpv6_sk:
- */
-static struct lock_class_key icmpv6_socket_sk_dst_lock_key;
-
 static int __net_init icmpv6_sk_init(struct net *net)
 {
 	struct sock *sk;
@@ -860,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net)
 
 		net->ipv6.icmp_sk[i] = sk;
 
-		/*
-		 * Split off their lock-class, because sk->sk_dst_lock
-		 * gets used from softirqs, which is safe for
-		 * __icmpv6_sk (because those never get directly used
-		 * via userspace syscalls), but unsafe for normal sockets.
-		 */
-		lockdep_set_class(&sk->sk_dst_lock,
-				  &icmpv6_socket_sk_dst_lock_key);
-
 		/* Enough space for 2 64K ICMP packets, including
 		 * sk_buff struct overhead.
 		 */
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 3ff5208772bb..a7ca2cde2ecb 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -110,14 +110,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 }
 EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
 
-static inline
-void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
-			   const struct in6_addr *daddr,
-			   const struct in6_addr *saddr)
-{
-	__ip6_dst_store(sk, dst, daddr, saddr);
-}
-
 static inline
 struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
 {
@@ -153,7 +145,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
 		dst = ip6_dst_lookup_flow(sk, fl6, final_p);
 
 		if (!IS_ERR(dst))
-			__inet6_csk_dst_store(sk, dst, NULL, NULL);
+			ip6_dst_store(sk, dst, NULL, NULL);
 	}
 	return dst;
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6a50bb4a0dae..e7aab561b7b4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -257,7 +257,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
 	sk->sk_gso_type = SKB_GSO_TCPV6;
-	__ip6_dst_store(sk, dst, NULL, NULL);
+	ip6_dst_store(sk, dst, NULL, NULL);
 
 	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp &&
@@ -1060,7 +1060,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 	 */
 
 	newsk->sk_gso_type = SKB_GSO_TCPV6;
-	__ip6_dst_store(newsk, dst, NULL, NULL);
+	ip6_dst_store(newsk, dst, NULL, NULL);
 	inet6_sk_rx_dst_set(newsk, skb);
 
 	newtcp6sk = (struct tcp6_sock *)newsk;
-- 
cgit v1.2.3


From 01ce63c90170283a9855d1db4fe81934dddce648 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Fri, 4 Dec 2015 15:14:04 -0200
Subject: sctp: update the netstamp_needed counter when copying sockets

Dmitry Vyukov reported that SCTP was triggering a WARN on socket destroy
related to disabling sock timestamp.

When SCTP accepts an association or peel one off, it copies sock flags
but forgot to call net_enable_timestamp() if a packet timestamping flag
was copied, leading to extra calls to net_disable_timestamp() whenever
such clones were closed.

The fix is to call net_enable_timestamp() whenever we copy a sock with
that flag on, like tcp does.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 ++
 net/core/sock.c    | 2 --
 net/sctp/socket.c  | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 52d27ee924f4..b1d475b5db68 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -740,6 +740,8 @@ enum sock_flags {
 	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
 };
 
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
 {
 	nsk->sk_flags = osk->sk_flags;
diff --git a/net/core/sock.c b/net/core/sock.c
index e31dfcee1729..d01c8f42dbb2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -433,8 +433,6 @@ static bool sock_needs_netstamp(const struct sock *sk)
 	}
 }
 
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-
 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 {
 	if (sk->sk_flags & flags) {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 03c8256063ec..4c9282bdd067 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7199,6 +7199,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_ttl = 1;
 	newinet->mc_index = 0;
 	newinet->mc_list = NULL;
+
+	if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+		net_enable_timestamp();
 }
 
 static inline void sctp_copy_descendant(struct sock *sk_to,
-- 
cgit v1.2.3


From d188ba86dd07a72ebebfa22fe9cb0b0572e57740 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Dec 2015 07:22:02 -0800
Subject: xfrm: add rcu protection to sk->sk_policy[]

XFRM can deal with SYNACK messages, sent while listener socket
is not locked. We add proper rcu protection to __xfrm_sk_clone_policy()
and xfrm_sk_policy_lookup()

This might serve as the first step to remove xfrm.xfrm_policy_lock
use in fast path.

Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     |  2 +-
 include/net/xfrm.h     | 24 +++++++++++++++---------
 net/core/sock.c        |  2 +-
 net/xfrm/xfrm_policy.c | 37 +++++++++++++++++++++++++------------
 4 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index b1d475b5db68..eaef41433d7a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -388,7 +388,7 @@ struct sock {
 		struct socket_wq	*sk_wq_raw;
 	};
 #ifdef CONFIG_XFRM
-	struct xfrm_policy	*sk_policy[2];
+	struct xfrm_policy __rcu *sk_policy[2];
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 8bae1ef647cd..d6f6e5006ee9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1142,12 +1142,14 @@ static inline int xfrm6_route_forward(struct sk_buff *skb)
 	return xfrm_route_forward(skb, AF_INET6);
 }
 
-int __xfrm_sk_clone_policy(struct sock *sk);
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);
 
-static inline int xfrm_sk_clone_policy(struct sock *sk)
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 {
-	if (unlikely(sk->sk_policy[0] || sk->sk_policy[1]))
-		return __xfrm_sk_clone_policy(sk);
+	sk->sk_policy[0] = NULL;
+	sk->sk_policy[1] = NULL;
+	if (unlikely(osk->sk_policy[0] || osk->sk_policy[1]))
+		return __xfrm_sk_clone_policy(sk, osk);
 	return 0;
 }
 
@@ -1155,12 +1157,16 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir);
 
 static inline void xfrm_sk_free_policy(struct sock *sk)
 {
-	if (unlikely(sk->sk_policy[0] != NULL)) {
-		xfrm_policy_delete(sk->sk_policy[0], XFRM_POLICY_MAX);
+	struct xfrm_policy *pol;
+
+	pol = rcu_dereference_protected(sk->sk_policy[0], 1);
+	if (unlikely(pol != NULL)) {
+		xfrm_policy_delete(pol, XFRM_POLICY_MAX);
 		sk->sk_policy[0] = NULL;
 	}
-	if (unlikely(sk->sk_policy[1] != NULL)) {
-		xfrm_policy_delete(sk->sk_policy[1], XFRM_POLICY_MAX+1);
+	pol = rcu_dereference_protected(sk->sk_policy[1], 1);
+	if (unlikely(pol != NULL)) {
+		xfrm_policy_delete(pol, XFRM_POLICY_MAX+1);
 		sk->sk_policy[1] = NULL;
 	}
 }
@@ -1170,7 +1176,7 @@ void xfrm_garbage_collect(struct net *net);
 #else
 
 static inline void xfrm_sk_free_policy(struct sock *sk) {}
-static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; }
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; }
 static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }  
 static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } 
 static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
diff --git a/net/core/sock.c b/net/core/sock.c
index d01c8f42dbb2..765be835b06c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1550,7 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 			 */
 			is_charged = sk_filter_charge(newsk, filter);
 
-		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
 			/* It is still raw copy of parent, so invalidate
 			 * destructor and make plain sk_free() */
 			newsk->sk_destruct = NULL;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f57a5712cedd..948fa5560de5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1221,8 +1221,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 	struct xfrm_policy *pol;
 	struct net *net = sock_net(sk);
 
+	rcu_read_lock();
 	read_lock_bh(&net->xfrm.xfrm_policy_lock);
-	if ((pol = sk->sk_policy[dir]) != NULL) {
+	pol = rcu_dereference(sk->sk_policy[dir]);
+	if (pol != NULL) {
 		bool match = xfrm_selector_match(&pol->selector, fl,
 						 sk->sk_family);
 		int err = 0;
@@ -1246,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 	}
 out:
 	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	rcu_read_unlock();
 	return pol;
 }
 
@@ -1314,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 #endif
 
 	write_lock_bh(&net->xfrm.xfrm_policy_lock);
-	old_pol = sk->sk_policy[dir];
-	sk->sk_policy[dir] = pol;
+	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
+				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 	if (pol) {
 		pol->curlft.add_time = get_seconds();
 		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
 		xfrm_sk_policy_link(pol, dir);
 	}
+	rcu_assign_pointer(sk->sk_policy[dir], pol);
 	if (old_pol) {
 		if (pol)
 			xfrm_policy_requeue(old_pol, pol);
@@ -1368,17 +1372,26 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
 	return newp;
 }
 
-int __xfrm_sk_clone_policy(struct sock *sk)
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
 {
-	struct xfrm_policy *p0 = sk->sk_policy[0],
-			   *p1 = sk->sk_policy[1];
+	const struct xfrm_policy *p;
+	struct xfrm_policy *np;
+	int i, ret = 0;
 
-	sk->sk_policy[0] = sk->sk_policy[1] = NULL;
-	if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
-		return -ENOMEM;
-	if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
-		return -ENOMEM;
-	return 0;
+	rcu_read_lock();
+	for (i = 0; i < 2; i++) {
+		p = rcu_dereference(osk->sk_policy[i]);
+		if (p) {
+			np = clone_policy(p, i);
+			if (unlikely(!np)) {
+				ret = -ENOMEM;
+				break;
+			}
+			rcu_assign_pointer(sk->sk_policy[i], np);
+		}
+	}
+	rcu_read_unlock();
+	return ret;
 }
 
 static int
-- 
cgit v1.2.3


From 79462ad02e861803b3840cc782248c7359451cd9 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 14 Dec 2015 22:03:39 +0100
Subject: net: add validation for the socket syscall protocol argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

郭永刚 reported that one could simply crash the kernel as root by
using a simple program:

	int socket_fd;
	struct sockaddr_in addr;
	addr.sin_port = 0;
	addr.sin_addr.s_addr = INADDR_ANY;
	addr.sin_family = 10;

	socket_fd = socket(10,3,0x40000000);
	connect(socket_fd , &addr,16);

AF_INET, AF_INET6 sockets actually only support 8-bit protocol
identifiers. inet_sock's skc_protocol field thus is sized accordingly,
thus larger protocol identifiers simply cut off the higher bits and
store a zero in the protocol fields.

This could lead to e.g. NULL function pointer because as a result of
the cut off inet_num is zero and we call down to inet_autobind, which
is NULL for raw sockets.

kernel: Call Trace:
kernel:  [<ffffffff816db90e>] ? inet_autobind+0x2e/0x70
kernel:  [<ffffffff816db9a4>] inet_dgram_connect+0x54/0x80
kernel:  [<ffffffff81645069>] SYSC_connect+0xd9/0x110
kernel:  [<ffffffff810ac51b>] ? ptrace_notify+0x5b/0x80
kernel:  [<ffffffff810236d8>] ? syscall_trace_enter_phase2+0x108/0x200
kernel:  [<ffffffff81645e0e>] SyS_connect+0xe/0x10
kernel:  [<ffffffff81779515>] tracesys_phase2+0x84/0x89

I found no particular commit which introduced this problem.

CVE: CVE-2015-8543
Cc: Cong Wang <cwang@twopensource.com>
Reported-by: 郭永刚 <guoyonggang@360.cn>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h     | 1 +
 net/ax25/af_ax25.c     | 3 +++
 net/decnet/af_decnet.c | 3 +++
 net/ipv4/af_inet.c     | 3 +++
 net/ipv6/af_inet6.c    | 3 +++
 net/irda/af_irda.c     | 3 +++
 6 files changed, 16 insertions(+)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index eaef41433d7a..c4205e0a3a2d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -403,6 +403,7 @@ struct sock {
 				sk_no_check_rx : 1,
 				sk_userlocks : 4,
 				sk_protocol  : 8,
+#define SK_PROTOCOL_MAX U8_MAX
 				sk_type      : 16;
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ae3a47f9d1d5..fbd0acf80b13 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -805,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
 	struct sock *sk;
 	ax25_cb *ax25;
 
+	if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+		return -EINVAL;
+
 	if (!net_eq(net, &init_net))
 		return -EAFNOSUPPORT;
 
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index eebf5ac8ce18..13d6b1a6e0fc 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
 {
 	struct sock *sk;
 
+	if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+		return -EINVAL;
+
 	if (!net_eq(net, &init_net))
 		return -EAFNOSUPPORT;
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 11c4ca13ec3b..5c5db6636704 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -257,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
 	int try_loading_module = 0;
 	int err;
 
+	if (protocol < 0 || protocol >= IPPROTO_MAX)
+		return -EINVAL;
+
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 8ec0df75f1c4..9f5137cd604e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
 	int try_loading_module = 0;
 	int err;
 
+	if (protocol < 0 || protocol >= IPPROTO_MAX)
+		return -EINVAL;
+
 	/* Look for the requested type/protocol pair. */
 lookup_protocol:
 	err = -ESOCKTNOSUPPORT;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index e6aa48b5395c..923abd6b3064 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
 	struct sock *sk;
 	struct irda_sock *self;
 
+	if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+		return -EINVAL;
+
 	if (net != &init_net)
 		return -EAFNOSUPPORT;
 
-- 
cgit v1.2.3


From 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 14 Dec 2015 14:08:53 -0800
Subject: net: fix IP early demux races

David Wilder reported crashes caused by dst reuse.

<quote David>
  I am seeing a crash on a distro V4.2.3 kernel caused by a double
  release of a dst_entry.  In ipv4_dst_destroy() the call to
  list_empty() finds a poisoned next pointer, indicating the dst_entry
  has already been removed from the list and freed. The crash occurs
  18 to 24 hours into a run of a network stress exerciser.
</quote>

Thanks to his detailed report and analysis, we were able to understand
the core issue.

IP early demux can associate a dst to skb, after a lookup in TCP/UDP
sockets.

When socket cache is not properly set, we want to store into
sk->sk_dst_cache the dst for future IP early demux lookups,
by acquiring a stable refcount on the dst.

Problem is this acquisition is simply using an atomic_inc(),
which works well, unless the dst was queued for destruction from
dst_release() noticing dst refcount went to zero, if DST_NOCACHE
was set on dst.

We need to make sure current refcount is not zero before incrementing
it, or risk double free as David reported.

This patch, being a stable candidate, adds two new helpers, and use
them only from IP early demux problematic paths.

It might be possible to merge in net-next skb_dst_force() and
skb_dst_force_safe(), but I prefer having the smallest patch for stable
kernels : Maybe some skb_dst_force() callers do not expect skb->dst
can suddenly be cleared.

Can probably be backported back to linux-3.6 kernels

Reported-by: David J. Wilder <dwilder@us.ibm.com>
Tested-by: David J. Wilder <dwilder@us.ibm.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h   | 33 +++++++++++++++++++++++++++++++++
 include/net/sock.h  |  2 +-
 net/ipv4/tcp_ipv4.c |  5 ++---
 net/ipv6/tcp_ipv6.c |  3 +--
 4 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/dst.h b/include/net/dst.h
index 1279f9b09791..c7329dcd90cc 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -322,6 +322,39 @@ static inline void skb_dst_force(struct sk_buff *skb)
 	}
 }
 
+/**
+ * dst_hold_safe - Take a reference on a dst if possible
+ * @dst: pointer to dst entry
+ *
+ * This helper returns false if it could not safely
+ * take a reference on a dst.
+ */
+static inline bool dst_hold_safe(struct dst_entry *dst)
+{
+	if (dst->flags & DST_NOCACHE)
+		return atomic_inc_not_zero(&dst->__refcnt);
+	dst_hold(dst);
+	return true;
+}
+
+/**
+ * skb_dst_force_safe - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted and not destroyed, grab a ref on it.
+ */
+static inline void skb_dst_force_safe(struct sk_buff *skb)
+{
+	if (skb_dst_is_noref(skb)) {
+		struct dst_entry *dst = skb_dst(skb);
+
+		if (!dst_hold_safe(dst))
+			dst = NULL;
+
+		skb->_skb_refdst = (unsigned long)dst;
+	}
+}
+
 
 /**
  *	__skb_tunnel_rx - prepare skb for rx reinsert
diff --git a/include/net/sock.h b/include/net/sock.h
index c4205e0a3a2d..28790fe18206 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -817,7 +817,7 @@ void sk_stream_write_space(struct sock *sk);
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	/* dont let skb dst not refcounted, we are going to leave rcu lock */
-	skb_dst_force(skb);
+	skb_dst_force_safe(skb);
 
 	if (!sk->sk_backlog.tail)
 		sk->sk_backlog.head = skb;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index db003438aaf5..d8841a2f1569 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1493,7 +1493,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 	if (likely(sk->sk_rx_dst))
 		skb_dst_drop(skb);
 	else
-		skb_dst_force(skb);
+		skb_dst_force_safe(skb);
 
 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
 	tp->ucopy.memory += skb->truesize;
@@ -1721,8 +1721,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 
-	if (dst) {
-		dst_hold(dst);
+	if (dst && dst_hold_safe(dst)) {
 		sk->sk_rx_dst = dst;
 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e7aab561b7b4..6b8a8a9091fa 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 
-	if (dst) {
+	if (dst && dst_hold_safe(dst)) {
 		const struct rt6_info *rt = (const struct rt6_info *)dst;
 
-		dst_hold(dst);
 		sk->sk_rx_dst = dst;
 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 		inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
-- 
cgit v1.2.3


From 7bbadd2d1009575dad675afc16650ebb5aa10612 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Mon, 14 Dec 2015 23:30:43 +0100
Subject: net: fix warnings in 'make htmldocs' by moving macro definition out
 of field declaration

Docbook does not like the definition of macros inside a field declaration
and adds a warning. Move the definition out.

Fixes: 79462ad02e86180 ("net: add validation for the socket syscall protocol argument")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 28790fe18206..14d3c0734007 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -403,8 +403,8 @@ struct sock {
 				sk_no_check_rx : 1,
 				sk_userlocks : 4,
 				sk_protocol  : 8,
-#define SK_PROTOCOL_MAX U8_MAX
 				sk_type      : 16;
+#define SK_PROTOCOL_MAX U8_MAX
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
-- 
cgit v1.2.3