From 9cd3e072b0be17446e37d7414eac8a3499e0601e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:10 -0800 Subject: net: rename SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA This patch is a cleanup to make following patch easier to review. Goal is to move SOCK_ASYNC_NOSPACE and SOCK_ASYNC_WAITDATA from (struct socket)->flags to a (struct socket_wq)->flags to benefit from RCU protection in sock_wake_async() To ease backports, we rename both constants. Two new helpers, sk_set_bit(int nr, struct sock *sk) and sk_clear_bit(int net, struct sock *sk) are added so that following patch can change their implementation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..c155d09d8af4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2005,6 +2005,16 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +static inline void sk_set_bit(int nr, struct sock *sk) +{ + set_bit(nr, &sk->sk_socket->flags); +} + +static inline void sk_clear_bit(int nr, struct sock *sk) +{ + clear_bit(nr, &sk->sk_socket->flags); +} + static inline void sk_wake_async(struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) -- cgit v1.2.3 From ceb5d58b217098a657f3850b7a2640f995032e62 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2015 20:03:11 -0800 Subject: net: fix sock_wake_async() rcu protection Dmitry provided a syzkaller (http://github.com/google/syzkaller) triggering a fault in sock_wake_async() when async IO is requested. Said program stressed af_unix sockets, but the issue is generic and should be addressed in core networking stack. The problem is that by the time sock_wake_async() is called, we should not access the @flags field of 'struct socket', as the inode containing this socket might be freed without further notice, and without RCU grace period. We already maintain an RCU protected structure, "struct socket_wq" so moving SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA into it is the safe route. It also reduces number of cache lines needing dirtying, so might provide a performance improvement anyway. In followup patches, we might move remaining flags (SOCK_NOSPACE, SOCK_PASSCRED, SOCK_PASSSEC) to save 8 bytes and let 'struct socket' being mostly read and let it being shared between cpus. Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 7 ++++++- include/net/sock.h | 23 ++++++++++++++++------- net/core/stream.c | 2 +- net/sctp/socket.c | 24 ++++++++++++++---------- net/socket.c | 21 +++++++-------------- 5 files changed, 44 insertions(+), 33 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/linux/net.h b/include/linux/net.h index f514e4dd5521..0b4ac7da583a 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -34,6 +34,10 @@ struct inode; struct file; struct net; +/* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located + * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. + * Eventually all flags will be in sk->sk_wq_flags. + */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 @@ -89,6 +93,7 @@ struct socket_wq { /* Note: wait MUST be first field of socket_wq */ wait_queue_head_t wait; struct fasync_struct *fasync_list; + unsigned long flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */ struct rcu_head rcu; } ____cacheline_aligned_in_smp; @@ -202,7 +207,7 @@ enum { SOCK_WAKE_URG, }; -int sock_wake_async(struct socket *sk, int how, int band); +int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); int __sock_create(struct net *net, int family, int type, int proto, diff --git a/include/net/sock.h b/include/net/sock.h index c155d09d8af4..0434138c5f95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -384,8 +384,10 @@ struct sock { int sk_rcvbuf; struct sk_filter __rcu *sk_filter; - struct socket_wq __rcu *sk_wq; - + union { + struct socket_wq __rcu *sk_wq; + struct socket_wq *sk_wq_raw; + }; #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2005,20 +2007,27 @@ static inline unsigned long sock_wspace(struct sock *sk) return amt; } +/* Note: + * We use sk->sk_wq_raw, from contexts knowing this + * pointer is not NULL and cannot disappear/change. + */ static inline void sk_set_bit(int nr, struct sock *sk) { - set_bit(nr, &sk->sk_socket->flags); + set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { - clear_bit(nr, &sk->sk_socket->flags); + clear_bit(nr, &sk->sk_wq_raw->flags); } -static inline void sk_wake_async(struct sock *sk, int how, int band) +static inline void sk_wake_async(const struct sock *sk, int how, int band) { - if (sock_flag(sk, SOCK_FASYNC)) - sock_wake_async(sk->sk_socket, how, band); + if (sock_flag(sk, SOCK_FASYNC)) { + rcu_read_lock(); + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); + rcu_read_unlock(); + } } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might diff --git a/net/core/stream.c b/net/core/stream.c index 43309428644d..b96f7a79e544 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -39,7 +39,7 @@ void sk_stream_write_space(struct sock *sk) wake_up_interruptible_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 2353985d689c..5e35ef34008b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6801,26 +6801,30 @@ no_packet: static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; - struct socket *sock = sk->sk_socket; - if ((sctp_wspace(asoc) > 0) && sock) { - if (waitqueue_active(&asoc->wait)) - wake_up_interruptible(&asoc->wait); + if (sctp_wspace(asoc) <= 0) + return; + + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); - if (sctp_writeable(sk)) { - wait_queue_head_t *wq = sk_sleep(sk); + if (sctp_writeable(sk)) { + struct socket_wq *wq; - if (wq && waitqueue_active(wq)) - wake_up_interruptible(wq); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq) { + if (waitqueue_active(&wq->wait)) + wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. */ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, - SOCK_WAKE_SPACE, POLL_OUT); + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } + rcu_read_unlock(); } } diff --git a/net/socket.c b/net/socket.c index 16be908205fc..456fadb3d819 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1056,27 +1056,20 @@ static int sock_fasync(int fd, struct file *filp, int on) return 0; } -/* This function may be called only under socket lock or callback_lock or rcu_lock */ +/* This function may be called only under rcu_lock */ -int sock_wake_async(struct socket *sock, int how, int band) +int sock_wake_async(struct socket_wq *wq, int how, int band) { - struct socket_wq *wq; - - if (!sock) - return -1; - rcu_read_lock(); - wq = rcu_dereference(sock->wq); - if (!wq || !wq->fasync_list) { - rcu_read_unlock(); + if (!wq || !wq->fasync_list) return -1; - } + switch (how) { case SOCK_WAKE_WAITD: - if (test_bit(SOCKWQ_ASYNC_WAITDATA, &sock->flags)) + if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags)) break; goto call_kill; case SOCK_WAKE_SPACE: - if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags)) + if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags)) break; /* fall through */ case SOCK_WAKE_IO: @@ -1086,7 +1079,7 @@ call_kill: case SOCK_WAKE_URG: kill_fasync(&wq->fasync_list, SIGURG, band); } - rcu_read_unlock(); + return 0; } EXPORT_SYMBOL(sock_wake_async); -- cgit v1.2.3 From 6bd4f355df2eae80b8a5c7b097371cd1e05f20d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Dec 2015 21:53:57 -0800 Subject: ipv6: kill sk_dst_lock While testing the np->opt RCU conversion, I found that UDP/IPv6 was using a mixture of xchg() and sk_dst_lock to protect concurrent changes to sk->sk_dst_cache, leading to possible corruptions and crashes. ip6_sk_dst_lookup_flow() uses sk_dst_check() anyway, so the simplest way to fix the mess is to remove sk_dst_lock completely, as we did for IPv4. __ip6_dst_store() and ip6_dst_store() share same implementation. sk_setup_caps() being called with socket lock being held or not, we have to use sk_dst_set() instead of __sk_dst_set() Note that I had to move the "np->dst_cookie = rt6_get_cookie(rt);" in ip6_dst_store() before the sk_setup_caps(sk, dst) call. This is because ip6_dst_store() can be called from process context, without any lock held. As soon as the dst is installed in sk->sk_dst_cache, dst can be freed from another cpu doing a concurrent ip6_dst_store() Doing the dst dereference before doing the install is needed to make sure no use after free would trigger. Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- include/net/ip6_route.h | 17 ++++------------- include/net/sock.h | 3 +-- net/core/sock.c | 4 +--- net/dccp/ipv6.c | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/icmp.c | 14 -------------- net/ipv6/inet6_connection_sock.c | 10 +--------- net/ipv6/tcp_ipv6.c | 4 ++-- 8 files changed, 12 insertions(+), 46 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 2bfb2ad2fab1..877f682989b8 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -133,27 +133,18 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); /* * Store a destination cache entry in a socket */ -static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct ipv6_pinfo *np = inet6_sk(sk); - struct rt6_info *rt = (struct rt6_info *) dst; + np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr; #endif - np->dst_cookie = rt6_get_cookie(rt); -} - -static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - struct in6_addr *daddr, struct in6_addr *saddr) -{ - spin_lock(&sk->sk_dst_lock); - __ip6_dst_store(sk, dst, daddr, saddr); - spin_unlock(&sk->sk_dst_lock); } static inline bool ipv6_unicast_destination(const struct sk_buff *skb) diff --git a/include/net/sock.h b/include/net/sock.h index 0434138c5f95..52d27ee924f4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -254,7 +254,6 @@ struct cg_proto; * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_dst_cache: destination cache - * @sk_dst_lock: destination cache lock * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed @@ -393,7 +392,7 @@ struct sock { #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; - spinlock_t sk_dst_lock; + /* Note: 32bit hole on 64bit arches */ atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; diff --git a/net/core/sock.c b/net/core/sock.c index 9d79569935a3..e31dfcee1729 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1530,7 +1530,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); - spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); lockdep_set_class_and_name(&newsk->sk_callback_lock, af_callback_keys + newsk->sk_family, @@ -1607,7 +1606,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - __sk_dst_set(sk, dst); + sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; @@ -2388,7 +2387,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sk_wq = NULL; - spin_lock_init(&sk->sk_dst_lock); rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e7e0b9bc2a43..9c6d0508e63a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -459,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * comment in that function for the gory details. -acme */ - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; @@ -883,7 +883,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 38d66ddfb937..8ec0df75f1c4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -673,7 +673,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return 0; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 36c5a98b0472..0a37ddc7af51 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -834,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); } -/* - * Special lock-class for __icmpv6_sk: - */ -static struct lock_class_key icmpv6_socket_sk_dst_lock_key; - static int __net_init icmpv6_sk_init(struct net *net) { struct sock *sk; @@ -860,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net) net->ipv6.icmp_sk[i] = sk; - /* - * Split off their lock-class, because sk->sk_dst_lock - * gets used from softirqs, which is safe for - * __icmpv6_sk (because those never get directly used - * via userspace syscalls), but unsafe for normal sockets. - */ - lockdep_set_class(&sk->sk_dst_lock, - &icmpv6_socket_sk_dst_lock_key); - /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. */ diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 3ff5208772bb..a7ca2cde2ecb 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -110,14 +110,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) } EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); -static inline -void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) -{ - __ip6_dst_store(sk, dst, daddr, saddr); -} - static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { @@ -153,7 +145,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sk, fl6, final_p); if (!IS_ERR(dst)) - __inet6_csk_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); } return dst; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6a50bb4a0dae..e7aab561b7b4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -257,7 +257,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, NULL); if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && @@ -1060,7 +1060,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * */ newsk->sk_gso_type = SKB_GSO_TCPV6; - __ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; -- cgit v1.2.3 From 01ce63c90170283a9855d1db4fe81934dddce648 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 4 Dec 2015 15:14:04 -0200 Subject: sctp: update the netstamp_needed counter when copying sockets Dmitry Vyukov reported that SCTP was triggering a WARN on socket destroy related to disabling sock timestamp. When SCTP accepts an association or peel one off, it copies sock flags but forgot to call net_enable_timestamp() if a packet timestamping flag was copied, leading to extra calls to net_disable_timestamp() whenever such clones were closed. The fix is to call net_enable_timestamp() whenever we copy a sock with that flag on, like tcp does. Reported-by: Dmitry Vyukov Signed-off-by: Marcelo Ricardo Leitner Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 2 -- net/sctp/socket.c | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 52d27ee924f4..b1d475b5db68 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -740,6 +740,8 @@ enum sock_flags { SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ }; +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) { nsk->sk_flags = osk->sk_flags; diff --git a/net/core/sock.c b/net/core/sock.c index e31dfcee1729..d01c8f42dbb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -433,8 +433,6 @@ static bool sock_needs_netstamp(const struct sock *sk) } } -#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) - static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { if (sk->sk_flags & flags) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 03c8256063ec..4c9282bdd067 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7199,6 +7199,9 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; + + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); } static inline void sctp_copy_descendant(struct sock *sk_to, -- cgit v1.2.3 From d188ba86dd07a72ebebfa22fe9cb0b0572e57740 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Dec 2015 07:22:02 -0800 Subject: xfrm: add rcu protection to sk->sk_policy[] XFRM can deal with SYNACK messages, sent while listener socket is not locked. We add proper rcu protection to __xfrm_sk_clone_policy() and xfrm_sk_policy_lookup() This might serve as the first step to remove xfrm.xfrm_policy_lock use in fast path. Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: Eric Dumazet Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- include/net/xfrm.h | 24 +++++++++++++++--------- net/core/sock.c | 2 +- net/xfrm/xfrm_policy.c | 37 +++++++++++++++++++++++++------------ 4 files changed, 42 insertions(+), 23 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index b1d475b5db68..eaef41433d7a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -388,7 +388,7 @@ struct sock { struct socket_wq *sk_wq_raw; }; #ifdef CONFIG_XFRM - struct xfrm_policy *sk_policy[2]; + struct xfrm_policy __rcu *sk_policy[2]; #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8bae1ef647cd..d6f6e5006ee9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1142,12 +1142,14 @@ static inline int xfrm6_route_forward(struct sk_buff *skb) return xfrm_route_forward(skb, AF_INET6); } -int __xfrm_sk_clone_policy(struct sock *sk); +int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk); -static inline int xfrm_sk_clone_policy(struct sock *sk) +static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { - if (unlikely(sk->sk_policy[0] || sk->sk_policy[1])) - return __xfrm_sk_clone_policy(sk); + sk->sk_policy[0] = NULL; + sk->sk_policy[1] = NULL; + if (unlikely(osk->sk_policy[0] || osk->sk_policy[1])) + return __xfrm_sk_clone_policy(sk, osk); return 0; } @@ -1155,12 +1157,16 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir); static inline void xfrm_sk_free_policy(struct sock *sk) { - if (unlikely(sk->sk_policy[0] != NULL)) { - xfrm_policy_delete(sk->sk_policy[0], XFRM_POLICY_MAX); + struct xfrm_policy *pol; + + pol = rcu_dereference_protected(sk->sk_policy[0], 1); + if (unlikely(pol != NULL)) { + xfrm_policy_delete(pol, XFRM_POLICY_MAX); sk->sk_policy[0] = NULL; } - if (unlikely(sk->sk_policy[1] != NULL)) { - xfrm_policy_delete(sk->sk_policy[1], XFRM_POLICY_MAX+1); + pol = rcu_dereference_protected(sk->sk_policy[1], 1); + if (unlikely(pol != NULL)) { + xfrm_policy_delete(pol, XFRM_POLICY_MAX+1); sk->sk_policy[1] = NULL; } } @@ -1170,7 +1176,7 @@ void xfrm_garbage_collect(struct net *net); #else static inline void xfrm_sk_free_policy(struct sock *sk) {} -static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; } +static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; } static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) diff --git a/net/core/sock.c b/net/core/sock.c index d01c8f42dbb2..765be835b06c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1550,7 +1550,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ is_charged = sk_filter_charge(newsk, filter); - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) { + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f57a5712cedd..948fa5560de5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1221,8 +1221,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, struct xfrm_policy *pol; struct net *net = sock_net(sk); + rcu_read_lock(); read_lock_bh(&net->xfrm.xfrm_policy_lock); - if ((pol = sk->sk_policy[dir]) != NULL) { + pol = rcu_dereference(sk->sk_policy[dir]); + if (pol != NULL) { bool match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); int err = 0; @@ -1246,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, } out: read_unlock_bh(&net->xfrm.xfrm_policy_lock); + rcu_read_unlock(); return pol; } @@ -1314,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) #endif write_lock_bh(&net->xfrm.xfrm_policy_lock); - old_pol = sk->sk_policy[dir]; - sk->sk_policy[dir] = pol; + old_pol = rcu_dereference_protected(sk->sk_policy[dir], + lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = get_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } + rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); @@ -1368,17 +1372,26 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) return newp; } -int __xfrm_sk_clone_policy(struct sock *sk) +int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { - struct xfrm_policy *p0 = sk->sk_policy[0], - *p1 = sk->sk_policy[1]; + const struct xfrm_policy *p; + struct xfrm_policy *np; + int i, ret = 0; - sk->sk_policy[0] = sk->sk_policy[1] = NULL; - if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL) - return -ENOMEM; - if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL) - return -ENOMEM; - return 0; + rcu_read_lock(); + for (i = 0; i < 2; i++) { + p = rcu_dereference(osk->sk_policy[i]); + if (p) { + np = clone_policy(p, i); + if (unlikely(!np)) { + ret = -ENOMEM; + break; + } + rcu_assign_pointer(sk->sk_policy[i], np); + } + } + rcu_read_unlock(); + return ret; } static int -- cgit v1.2.3 From 79462ad02e861803b3840cc782248c7359451cd9 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 14 Dec 2015 22:03:39 +0100 Subject: net: add validation for the socket syscall protocol argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 郭永刚 reported that one could simply crash the kernel as root by using a simple program: int socket_fd; struct sockaddr_in addr; addr.sin_port = 0; addr.sin_addr.s_addr = INADDR_ANY; addr.sin_family = 10; socket_fd = socket(10,3,0x40000000); connect(socket_fd , &addr,16); AF_INET, AF_INET6 sockets actually only support 8-bit protocol identifiers. inet_sock's skc_protocol field thus is sized accordingly, thus larger protocol identifiers simply cut off the higher bits and store a zero in the protocol fields. This could lead to e.g. NULL function pointer because as a result of the cut off inet_num is zero and we call down to inet_autobind, which is NULL for raw sockets. kernel: Call Trace: kernel: [] ? inet_autobind+0x2e/0x70 kernel: [] inet_dgram_connect+0x54/0x80 kernel: [] SYSC_connect+0xd9/0x110 kernel: [] ? ptrace_notify+0x5b/0x80 kernel: [] ? syscall_trace_enter_phase2+0x108/0x200 kernel: [] SyS_connect+0xe/0x10 kernel: [] tracesys_phase2+0x84/0x89 I found no particular commit which introduced this problem. CVE: CVE-2015-8543 Cc: Cong Wang Reported-by: 郭永刚 Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/ax25/af_ax25.c | 3 +++ net/decnet/af_decnet.c | 3 +++ net/ipv4/af_inet.c | 3 +++ net/ipv6/af_inet6.c | 3 +++ net/irda/af_irda.c | 3 +++ 6 files changed, 16 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index eaef41433d7a..c4205e0a3a2d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -403,6 +403,7 @@ struct sock { sk_no_check_rx : 1, sk_userlocks : 4, sk_protocol : 8, +#define SK_PROTOCOL_MAX U8_MAX sk_type : 16; kmemcheck_bitfield_end(flags); int sk_wmem_queued; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index ae3a47f9d1d5..fbd0acf80b13 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -805,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index eebf5ac8ce18..13d6b1a6e0fc 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 11c4ca13ec3b..5c5db6636704 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -257,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8ec0df75f1c4..9f5137cd604e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, int try_loading_module = 0; int err; + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index e6aa48b5395c..923abd6b3064 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; struct irda_sock *self; + if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + return -EINVAL; + if (net != &init_net) return -EAFNOSUPPORT; -- cgit v1.2.3 From 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Dec 2015 14:08:53 -0800 Subject: net: fix IP early demux races David Wilder reported crashes caused by dst reuse. I am seeing a crash on a distro V4.2.3 kernel caused by a double release of a dst_entry. In ipv4_dst_destroy() the call to list_empty() finds a poisoned next pointer, indicating the dst_entry has already been removed from the list and freed. The crash occurs 18 to 24 hours into a run of a network stress exerciser. Thanks to his detailed report and analysis, we were able to understand the core issue. IP early demux can associate a dst to skb, after a lookup in TCP/UDP sockets. When socket cache is not properly set, we want to store into sk->sk_dst_cache the dst for future IP early demux lookups, by acquiring a stable refcount on the dst. Problem is this acquisition is simply using an atomic_inc(), which works well, unless the dst was queued for destruction from dst_release() noticing dst refcount went to zero, if DST_NOCACHE was set on dst. We need to make sure current refcount is not zero before incrementing it, or risk double free as David reported. This patch, being a stable candidate, adds two new helpers, and use them only from IP early demux problematic paths. It might be possible to merge in net-next skb_dst_force() and skb_dst_force_safe(), but I prefer having the smallest patch for stable kernels : Maybe some skb_dst_force() callers do not expect skb->dst can suddenly be cleared. Can probably be backported back to linux-3.6 kernels Reported-by: David J. Wilder Tested-by: David J. Wilder Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 33 +++++++++++++++++++++++++++++++++ include/net/sock.h | 2 +- net/ipv4/tcp_ipv4.c | 5 ++--- net/ipv6/tcp_ipv6.c | 3 +-- 4 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/dst.h b/include/net/dst.h index 1279f9b09791..c7329dcd90cc 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -322,6 +322,39 @@ static inline void skb_dst_force(struct sk_buff *skb) } } +/** + * dst_hold_safe - Take a reference on a dst if possible + * @dst: pointer to dst entry + * + * This helper returns false if it could not safely + * take a reference on a dst. + */ +static inline bool dst_hold_safe(struct dst_entry *dst) +{ + if (dst->flags & DST_NOCACHE) + return atomic_inc_not_zero(&dst->__refcnt); + dst_hold(dst); + return true; +} + +/** + * skb_dst_force_safe - makes sure skb dst is refcounted + * @skb: buffer + * + * If dst is not yet refcounted and not destroyed, grab a ref on it. + */ +static inline void skb_dst_force_safe(struct sk_buff *skb) +{ + if (skb_dst_is_noref(skb)) { + struct dst_entry *dst = skb_dst(skb); + + if (!dst_hold_safe(dst)) + dst = NULL; + + skb->_skb_refdst = (unsigned long)dst; + } +} + /** * __skb_tunnel_rx - prepare skb for rx reinsert diff --git a/include/net/sock.h b/include/net/sock.h index c4205e0a3a2d..28790fe18206 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -817,7 +817,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force(skb); + skb_dst_force_safe(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index db003438aaf5..d8841a2f1569 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1493,7 +1493,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (likely(sk->sk_rx_dst)) skb_dst_drop(skb); else - skb_dst_force(skb); + skb_dst_force_safe(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; @@ -1721,8 +1721,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { - dst_hold(dst); + if (dst && dst_hold_safe(dst)) { sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e7aab561b7b4..6b8a8a9091fa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { + if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); -- cgit v1.2.3 From 7bbadd2d1009575dad675afc16650ebb5aa10612 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 14 Dec 2015 23:30:43 +0100 Subject: net: fix warnings in 'make htmldocs' by moving macro definition out of field declaration Docbook does not like the definition of macros inside a field declaration and adds a warning. Move the definition out. Fixes: 79462ad02e86180 ("net: add validation for the socket syscall protocol argument") Reported-by: kbuild test robot Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 28790fe18206..14d3c0734007 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -403,8 +403,8 @@ struct sock { sk_no_check_rx : 1, sk_userlocks : 4, sk_protocol : 8, -#define SK_PROTOCOL_MAX U8_MAX sk_type : 16; +#define SK_PROTOCOL_MAX U8_MAX kmemcheck_bitfield_end(flags); int sk_wmem_queued; gfp_t sk_allocation; -- cgit v1.2.3