From 9f1c2674b328a69ab5a9b5a1c52405795ee4163f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 8 Oct 2017 21:44:51 -0700
Subject: net: memcontrol: defer call to mem_cgroup_sk_alloc()

Instead of calling mem_cgroup_sk_alloc() from BH context,
it is better to call it from inet_csk_accept() in process context.

Not only this removes code in mem_cgroup_sk_alloc(), but it also
fixes a bug since listener might have been dismantled and css_get()
might cause a use-after-free.

Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c                 | 5 ++++-
 net/ipv4/inet_connection_sock.c | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 23953b741a41..70c6ccbdf49f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1677,6 +1677,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_dst_pending_confirm = 0;
 		newsk->sk_wmem_queued	= 0;
 		newsk->sk_forward_alloc = 0;
+
+		/* sk->sk_memcg will be populated at accept() time */
+		newsk->sk_memcg = NULL;
+
 		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
@@ -1714,7 +1718,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
-		mem_cgroup_sk_alloc(newsk);
 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
 
 		/*
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c039c937ba90..67aec7a10686 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -475,6 +475,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
 		}
 		spin_unlock_bh(&queue->fastopenq.lock);
 	}
+	mem_cgroup_sk_alloc(newsk);
 out:
 	release_sock(sk);
 	if (req)
-- 
cgit v1.2.3


From fbb1fb4ad415cb31ce944f65a5ca700aaf73a227 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 8 Oct 2017 21:44:52 -0700
Subject: net: defer call to cgroup_sk_alloc()

sk_clone_lock() might run while TCP/DCCP listener already vanished.

In order to prevent use after free, it is better to defer cgroup_sk_alloc()
to the point we know both parent and child exist, and from process context.

Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c                 | 3 +--
 net/ipv4/inet_connection_sock.c | 5 +++++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 70c6ccbdf49f..4499e3153813 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1680,6 +1680,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		/* sk->sk_memcg will be populated at accept() time */
 		newsk->sk_memcg = NULL;
+		memset(&newsk->sk_cgrp_data, 0, sizeof(newsk->sk_cgrp_data));
 
 		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
@@ -1718,8 +1719,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
-		cgroup_sk_alloc(&newsk->sk_cgrp_data);
-
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 67aec7a10686..d32c74507314 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -26,6 +26,8 @@
 #include <net/tcp.h>
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
+#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -476,6 +478,9 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
 		spin_unlock_bh(&queue->fastopenq.lock);
 	}
 	mem_cgroup_sk_alloc(newsk);
+	cgroup_sk_alloc(&newsk->sk_cgrp_data);
+	sock_update_classid(&newsk->sk_cgrp_data);
+	sock_update_netprioidx(&newsk->sk_cgrp_data);
 out:
 	release_sock(sk);
 	if (req)
-- 
cgit v1.2.3


From 75cb070960ade40fba5de32138390f3c85c90941 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 10 Oct 2017 19:12:32 -0700
Subject: Revert "net: defer call to cgroup_sk_alloc()"

This reverts commit fbb1fb4ad415cb31ce944f65a5ca700aaf73a227.

This was not the proper fix, lets cleanly revert it, so that
following patch can be carried to stable versions.

sock_cgroup_ptr() callers do not expect a NULL return value.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c                 | 3 ++-
 net/ipv4/inet_connection_sock.c | 5 -----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 4499e3153813..70c6ccbdf49f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1680,7 +1680,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		/* sk->sk_memcg will be populated at accept() time */
 		newsk->sk_memcg = NULL;
-		memset(&newsk->sk_cgrp_data, 0, sizeof(newsk->sk_cgrp_data));
 
 		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
@@ -1719,6 +1718,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
+		cgroup_sk_alloc(&newsk->sk_cgrp_data);
+
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d32c74507314..67aec7a10686 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -26,8 +26,6 @@
 #include <net/tcp.h>
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
-#include <net/cls_cgroup.h>
-#include <net/netprio_cgroup.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -478,9 +476,6 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
 		spin_unlock_bh(&queue->fastopenq.lock);
 	}
 	mem_cgroup_sk_alloc(newsk);
-	cgroup_sk_alloc(&newsk->sk_cgrp_data);
-	sock_update_classid(&newsk->sk_cgrp_data);
-	sock_update_netprioidx(&newsk->sk_cgrp_data);
 out:
 	release_sock(sk);
 	if (req)
-- 
cgit v1.2.3


From c0576e3975084d4699b7bfef578613fb8e1144f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 10 Oct 2017 19:12:33 -0700
Subject: net: call cgroup_sk_alloc() earlier in sk_clone_lock()

If for some reason, the newly allocated child need to be freed,
we will call cgroup_put() (via sk_free_unlock_clone()) while the
corresponding cgroup_get() was not yet done, and we will free memory
too soon.

Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 70c6ccbdf49f..415f441c63b9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1687,6 +1687,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		atomic_set(&newsk->sk_zckey, 0);
 
 		sock_reset_flag(newsk, SOCK_DONE);
+		cgroup_sk_alloc(&newsk->sk_cgrp_data);
 
 		rcu_read_lock();
 		filter = rcu_dereference(sk->sk_filter);
@@ -1718,8 +1719,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
-		cgroup_sk_alloc(&newsk->sk_cgrp_data);
-
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
-- 
cgit v1.2.3


From 6e9c0075409d4ec1bc63558ee5a93916a6d7d16f Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Wed, 11 Oct 2017 16:54:27 +1100
Subject: net/ncsi: Don't limit vids based on hot_channel

Currently we drop any new VLAN ids if there are more than the current
(or last used) channel can support. Most importantly this is a problem
if no channel has been selected yet, resulting in a segfault.

Secondly this does not necessarily reflect the capabilities of any other
channels. Instead only drop a new VLAN id if we are already tracking the
maximum allowed by the NCSI specification. Per-channel limits are
already handled by ncsi_add_filter(), but add a message to set_one_vid()
to make it obvious that the channel can not support any more VLAN ids.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/internal.h    |  1 +
 net/ncsi/ncsi-manage.c | 17 +++++++++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index af3d636534ef..d30f7bd741d0 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -286,6 +286,7 @@ struct ncsi_dev_priv {
 	struct work_struct  work;            /* For channel management     */
 	struct packet_type  ptype;           /* NCSI packet Rx handler     */
 	struct list_head    node;            /* Form NCSI device list      */
+#define NCSI_MAX_VLAN_VIDS	15
 	struct list_head    vlan_vids;       /* List of active VLAN IDs */
 };
 
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 3fd3c39e6278..b6a449aa9d4b 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -732,6 +732,10 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 	if (index < 0) {
 		netdev_err(ndp->ndev.dev,
 			   "Failed to add new VLAN tag, error %d\n", index);
+		if (index == -ENOSPC)
+			netdev_err(ndp->ndev.dev,
+				   "Channel %u already has all VLAN filters set\n",
+				   nc->id);
 		return -1;
 	}
 
@@ -1403,7 +1407,6 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp)
 
 int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 {
-	struct ncsi_channel_filter *ncf;
 	struct ncsi_dev_priv *ndp;
 	unsigned int n_vids = 0;
 	struct vlan_vid *vlan;
@@ -1420,7 +1423,6 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 	}
 
 	ndp = TO_NCSI_DEV_PRIV(nd);
-	ncf = ndp->hot_channel->filters[NCSI_FILTER_VLAN];
 
 	/* Add the VLAN id to our internal list */
 	list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) {
@@ -1431,12 +1433,11 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 			return 0;
 		}
 	}
-
-	if (n_vids >= ncf->total) {
-		netdev_info(dev,
-			    "NCSI Channel supports up to %u VLAN tags but %u are already set\n",
-			    ncf->total, n_vids);
-		return -EINVAL;
+	if (n_vids >= NCSI_MAX_VLAN_VIDS) {
+		netdev_warn(dev,
+			    "tried to add vlan id %u but NCSI max already registered (%u)\n",
+			    vid, NCSI_MAX_VLAN_VIDS);
+		return -ENOSPC;
 	}
 
 	vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
-- 
cgit v1.2.3


From 12ed3772b7e11b53a58ecbd8ce258271fb148cc6 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 11 Oct 2017 20:10:31 -0700
Subject: ip: update policy routing config help

The kernel config help for policy routing was still pointing at
an ancient document from 2000 that refers to Linux 2.1. Update it
to point to something that is at least occasionally updated.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 91a2557942fa..f48fe6fc7e8c 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -70,11 +70,9 @@ config IP_MULTIPLE_TABLES
 	  address into account. Furthermore, the TOS (Type-Of-Service) field
 	  of the packet can be used for routing decisions as well.
 
-	  If you are interested in this, please see the preliminary
-	  documentation at <http://www.compendium.com.ar/policy-routing.txt>
-	  and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
-	  You will need supporting software from
-	  <ftp://ftp.tux.org/pub/net/ip-routing/>.
+	  If you need more information, see the Linux Advanced
+	  Routing and Traffic Control documentation at
+	  <http://lartc.org/howto/lartc.rpdb.html>
 
 	  If unsure, say N.
 
-- 
cgit v1.2.3


From 09001b03f722be96827bf8df5ba4d48b7ec0cc30 Mon Sep 17 00:00:00 2001
From: Wenhua Shi <march511@gmail.com>
Date: Sat, 14 Oct 2017 18:51:36 +0200
Subject: net: fix typo in skbuff.c

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 16982de649b9..e62476beee95 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1896,7 +1896,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	}
 
 	/* If we need update frag list, we are in troubles.
-	 * Certainly, it possible to add an offset to skb data,
+	 * Certainly, it is possible to add an offset to skb data,
 	 * but taking into account that pulling is expected to
 	 * be very rare operation, it is worth to fight against
 	 * further bloating skb head and crucify ourselves here instead.
-- 
cgit v1.2.3


From 5903f594935a3841137c86b9d5b75143a5b7121c Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Fri, 13 Oct 2017 19:22:35 +0200
Subject: l2tp: check ps->sock before running pppol2tp_session_ioctl()

When pppol2tp_session_ioctl() is called by pppol2tp_tunnel_ioctl(),
the session may be unconnected. That is, it was created by
pppol2tp_session_create() and hasn't been connected with
pppol2tp_connect(). In this case, ps->sock is NULL, so we need to check
for this case in order to avoid dereferencing a NULL pointer.

Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index bc6e8bfc5be4..f50452b919d5 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -988,6 +988,9 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
 		 session->name, cmd, arg);
 
 	sk = ps->sock;
+	if (!sk)
+		return -EBADR;
+
 	sock_hold(sk);
 
 	switch (cmd) {
-- 
cgit v1.2.3


From fdf7cb4185b60c68e1a75e61691c4afdc15dea0e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 5 Sep 2017 14:54:54 +0200
Subject: mac80211: accept key reinstall without changing anything

When a key is reinstalled we can reset the replay counters
etc. which can lead to nonce reuse and/or replay detection
being impossible, breaking security properties, as described
in the "KRACK attacks".

In particular, CVE-2017-13080 applies to GTK rekeying that
happened in firmware while the host is in D3, with the second
part of the attack being done after the host wakes up. In
this case, the wpa_supplicant mitigation isn't sufficient
since wpa_supplicant doesn't know the GTK material.

In case this happens, simply silently accept the new key
coming from userspace but don't take any action on it since
it's the same key; this keeps the PN replay counters intact.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/key.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index a98fc2b5e0dc..ae995c8480db 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -4,7 +4,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007-2008	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2017	Intel Deutschland GmbH
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -620,9 +620,6 @@ int ieee80211_key_link(struct ieee80211_key *key,
 
 	pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
 	idx = key->conf.keyidx;
-	key->local = sdata->local;
-	key->sdata = sdata;
-	key->sta = sta;
 
 	mutex_lock(&sdata->local->key_mtx);
 
@@ -633,6 +630,21 @@ int ieee80211_key_link(struct ieee80211_key *key,
 	else
 		old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]);
 
+	/*
+	 * Silently accept key re-installation without really installing the
+	 * new version of the key to avoid nonce reuse or replay issues.
+	 */
+	if (old_key && key->conf.keylen == old_key->conf.keylen &&
+	    !memcmp(key->conf.key, old_key->conf.key, key->conf.keylen)) {
+		ieee80211_key_free_unused(key);
+		ret = 0;
+		goto out;
+	}
+
+	key->local = sdata->local;
+	key->sdata = sdata;
+	key->sta = sta;
+
 	increment_tailroom_need_count(sdata);
 
 	ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
@@ -648,6 +660,7 @@ int ieee80211_key_link(struct ieee80211_key *key,
 		ret = 0;
 	}
 
+ out:
 	mutex_unlock(&sdata->local->key_mtx);
 
 	return ret;
-- 
cgit v1.2.3


From 8a212589fe0e45f26c549dfa271a157ca8eea1ac Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:41 +0800
Subject: rtnetlink: bring NETDEV_CHANGEMTU event process back in
 rtnetlink_event

Commit 085e1a65f04f ("rtnetlink: Do not generate notifications for MTU
events") tried to fix the redundant notifications issue when ip link
set mtu by removing NETDEV_CHANGEMTU event process in rtnetlink_event.

But it also resulted in no notification generated when dev's mtu is
changed via other methods, like:
  'ifconfig eth1 mtu 1400' or 'echo 1400 > /sys/class/net/eth1/mtu'
It would cause users not to be notified by this change.

This patch is to fix it by bringing NETDEV_CHANGEMTU event back into
rtnetlink_event, and the redundant notifications issue will be fixed
in the later patch 'rtnetlink: check DO_SETLINK_NOTIFY correctly in
do_setlink'.

Fixes: 085e1a65f04f ("rtnetlink: Do not generate notifications for MTU events")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d4bcdcc68e92..72053ed7c891 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4279,6 +4279,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 
 	switch (event) {
 	case NETDEV_REBOOT:
+	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGEADDR:
 	case NETDEV_CHANGENAME:
 	case NETDEV_FEAT_CHANGE:
-- 
cgit v1.2.3


From ebdcf0450b020748c2dab6bfe44a5ac3c5159fb0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:42 +0800
Subject: rtnetlink: bring NETDEV_CHANGE_TX_QUEUE_LEN event process back in
 rtnetlink_event

The same fix for changing mtu in the patch 'rtnetlink: bring
NETDEV_CHANGEMTU event process back in rtnetlink_event' is
needed for changing tx_queue_len.

Note that the redundant notifications issue for tx_queue_len
will be fixed in the later patch 'rtnetlink: do not send
notification for tx_queue_len in do_setlink'.

Fixes: 27b3b551d8a7 ("rtnetlink: Do not generate notifications for NETDEV_CHANGE_TX_QUEUE_LEN event")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 72053ed7c891..bf473604f33d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4287,6 +4287,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
+	case NETDEV_CHANGE_TX_QUEUE_LEN:
 		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
 				   GFP_KERNEL);
 		break;
-- 
cgit v1.2.3


From e6e6659446c87057aede26a39d9f16b19001716f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:43 +0800
Subject: rtnetlink: bring NETDEV_POST_TYPE_CHANGE event process back in
 rtnetlink_event

As I said in patch 'rtnetlink: bring NETDEV_CHANGEMTU event process back
in rtnetlink_event', removing NETDEV_POST_TYPE_CHANGE event was not the
right fix for the redundant notifications issue.

So bring this event process back to rtnetlink_event and the old redundant
notifications issue would be fixed in the later patch 'rtnetlink: check
DO_SETLINK_NOTIFY correctly in do_setlink'.

Fixes: aef091ae58aa ("rtnetlink: Do not generate notifications for POST_TYPE_CHANGE event")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bf473604f33d..8e44fd597f46 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4284,6 +4284,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_CHANGENAME:
 	case NETDEV_FEAT_CHANGE:
 	case NETDEV_BONDING_FAILOVER:
+	case NETDEV_POST_TYPE_CHANGE:
 	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
-- 
cgit v1.2.3


From dc709f375743ebf5c9326cc9b946f6f09a34ac44 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:44 +0800
Subject: rtnetlink: bring NETDEV_CHANGEUPPER event process back in
 rtnetlink_event

libteam needs this event notification in userspace when dev's master
dev has been changed. After this, the redundant notifications issue
would be fixed in the later patch 'rtnetlink: check DO_SETLINK_NOTIFY
correctly in do_setlink'.

Fixes: b6b36eb23a46 ("rtnetlink: Do not generate notifications for NETDEV_CHANGEUPPER event")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8e44fd597f46..ab98c1c8b6f3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4286,6 +4286,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_BONDING_FAILOVER:
 	case NETDEV_POST_TYPE_CHANGE:
 	case NETDEV_NOTIFY_PEERS:
+	case NETDEV_CHANGEUPPER:
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
-- 
cgit v1.2.3


From 64ff90cc2e6f42596d7a0c37e41dc95292bb63b1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:45 +0800
Subject: rtnetlink: check DO_SETLINK_NOTIFY correctly in do_setlink

The check 'status & DO_SETLINK_NOTIFY' in do_setlink doesn't really
work after status & DO_SETLINK_MODIFIED, as:

  DO_SETLINK_MODIFIED 0x1
  DO_SETLINK_NOTIFY 0x3

Considering that notifications are suppposed to be sent only when
status have the flag DO_SETLINK_NOTIFY, the right check would be:

  (status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY

This would avoid lots of duplicated notifications when setting some
properties of a link.

Fixes: ba9989069f4e ("rtnl/do_setlink(): notify when a netdev is modified")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ab98c1c8b6f3..3e98fb557598 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2248,7 +2248,7 @@ static int do_setlink(const struct sk_buff *skb,
 
 errout:
 	if (status & DO_SETLINK_MODIFIED) {
-		if (status & DO_SETLINK_NOTIFY)
+		if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
 			netdev_state_change(dev);
 
 		if (err < 0)
-- 
cgit v1.2.3


From 2d7f669b42a97022c8c2b6cd86f3990be5fcd1bc Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:46 +0800
Subject: rtnetlink: do not set notification for tx_queue_len in do_setlink

NETDEV_CHANGE_TX_QUEUE_LEN event process in rtnetlink_event would
send a notification for userspace and tx_queue_len's setting in
do_setlink would trigger NETDEV_CHANGE_TX_QUEUE_LEN.

So it shouldn't set DO_SETLINK_NOTIFY status for this change to
send a notification any more.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3e98fb557598..a6bcf86ce471 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2093,7 +2093,7 @@ static int do_setlink(const struct sk_buff *skb,
 				dev->tx_queue_len = orig_len;
 				goto errout;
 			}
-			status |= DO_SETLINK_NOTIFY;
+			status |= DO_SETLINK_MODIFIED;
 		}
 	}
 
-- 
cgit v1.2.3


From 2459b4c635858094df78abb9ca87d99f89fe8ca5 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 11 Oct 2017 16:24:48 +0200
Subject: net: enable interface alias removal via rtnl

IFLA_IFALIAS is defined as NLA_STRING. It means that the minimal length of
the attribute is 1 ("\0"). However, to remove an alias, the attribute
length must be 0 (see dev_set_alias()).

Let's define the type to NLA_BINARY to allow 0-length string, so that the
alias can be removed.

Example:
$ ip l s dummy0 alias foo
$ ip l l dev dummy0
5: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether ae:20:30:4f:a7:f3 brd ff:ff:ff:ff:ff:ff
    alias foo

Before the patch:
$ ip l s dummy0 alias ""
RTNETLINK answers: Numerical result out of range

After the patch:
$ ip l s dummy0 alias ""
$ ip l l dev dummy0
5: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether ae:20:30:4f:a7:f3 brd ff:ff:ff:ff:ff:ff

CC: Oliver Hartkopp <oliver@hartkopp.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
Fixes: 96ca4a2cc145 ("net: remove ifalias on empty given alias")
Reported-by: Julien FLoret <julien.floret@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a6bcf86ce471..5ace48926b19 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1483,7 +1483,10 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_NET_NS_FD]	= { .type = NLA_U32 },
-	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	/* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to
+	 * allow 0-length string (needed to remove an alias).
+	 */
+	[IFLA_IFALIAS]	        = { .type = NLA_BINARY, .len = IFALIASZ - 1 },
 	[IFLA_VFINFO_LIST]	= {. type = NLA_NESTED },
 	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },
 	[IFLA_PORT_SELF]	= { .type = NLA_NESTED },
-- 
cgit v1.2.3


From 0ad646c81b2182f7fa67ec0c8c825e0ee165696d Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 13 Oct 2017 11:58:53 -0700
Subject: tun: call dev_get_valid_name() before register_netdevice()

register_netdevice() could fail early when we have an invalid
dev name, in which case ->ndo_uninit() is not called. For tun
device, this is a problem because a timer etc. are already
initialized and it expects ->ndo_uninit() to clean them up.

We could move these initializations into a ->ndo_init() so
that register_netdevice() knows better, however this is still
complicated due to the logic in tun_detach().

Therefore, I choose to just call dev_get_valid_name() before
register_netdevice(), which is quicker and much easier to audit.
And for this specific case, it is already enough.

Fixes: 96442e42429e ("tuntap: choose the txq based on rxq")
Reported-by: Dmitry Alexeev <avekceeb@gmail.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 588b473194a8..11596a302a26 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1147,9 +1147,8 @@ static int dev_alloc_name_ns(struct net *net,
 	return ret;
 }
 
-static int dev_get_valid_name(struct net *net,
-			      struct net_device *dev,
-			      const char *name)
+int dev_get_valid_name(struct net *net, struct net_device *dev,
+		       const char *name)
 {
 	BUG_ON(!net);
 
@@ -1165,6 +1164,7 @@ static int dev_get_valid_name(struct net *net,
 
 	return 0;
 }
+EXPORT_SYMBOL(dev_get_valid_name);
 
 /**
  *	dev_change_name - change name of a device
-- 
cgit v1.2.3


From c019b5166e11faaf9ed3b64316ed338eaa19de60 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Mon, 16 Oct 2017 12:19:48 +0300
Subject: net/sched: cls_flower: Set egress_dev mark when calling into the HW
 driver

Commit 7091d8c '(net/sched: cls_flower: Add offload support using egress
Hardware device') made sure (when fl_hw_replace_filter is called) to put
the egress_dev mark on persisent structure instance. Hence, following calls
into the HW driver for stats and deletion will note it and act accordingly.

With commit de4784ca030f this property is lost and hence when called,
the HW driver failes to operate (stats, delete) on the offloaded flow.

Fix it by setting the egress_dev flag whenever the ingress device is
different from the hw device since this is exactly the condition under
which we're calling into the HW driver through the egress port net-device.

Fixes: de4784ca030f ('net: sched: get rid of struct tc_to_netdev')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roi Dayan <roid@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index d230cb4c8094..b480d7c792ba 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -234,6 +234,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 	tc_cls_common_offload_init(&cls_flower.common, tp);
 	cls_flower.command = TC_CLSFLOWER_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
+	cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev;
 
 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower);
 }
@@ -289,6 +290,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	cls_flower.command = TC_CLSFLOWER_STATS;
 	cls_flower.cookie = (unsigned long) f;
 	cls_flower.exts = &f->exts;
+	cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev;
 
 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER,
 				      &cls_flower);
-- 
cgit v1.2.3


From 823038ca030e9f8283518b1e6a5a6879edcbe057 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 16 Oct 2017 19:43:15 +0800
Subject: dev_ioctl: add missing NETDEV_CHANGE_TX_QUEUE_LEN event notification

When changing dev tx_queue_len via netlink or net-sysfs,
a NETDEV_CHANGE_TX_QUEUE_LEN event notification will be
called.

But dev_ioctl missed this event notification, which could
cause no userspace notification would be sent.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev_ioctl.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 709a4e6fb447..f9c7a88cd981 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -303,7 +303,18 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 	case SIOCSIFTXQLEN:
 		if (ifr->ifr_qlen < 0)
 			return -EINVAL;
-		dev->tx_queue_len = ifr->ifr_qlen;
+		if (dev->tx_queue_len ^ ifr->ifr_qlen) {
+			unsigned int orig_len = dev->tx_queue_len;
+
+			dev->tx_queue_len = ifr->ifr_qlen;
+			err = call_netdevice_notifiers(
+					NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+			err = notifier_to_errno(err);
+			if (err) {
+				dev->tx_queue_len = orig_len;
+				return err;
+			}
+		}
 		return 0;
 
 	case SIOCSIFNAME:
-- 
cgit v1.2.3


From 363b02dab09b3226f3bd1420dad9c72b79a42a76 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 4 Oct 2017 16:43:25 +0100
Subject: KEYS: Fix race between updating and finding a negative key

Consolidate KEY_FLAG_INSTANTIATED, KEY_FLAG_NEGATIVE and the rejection
error into one field such that:

 (1) The instantiation state can be modified/read atomically.

 (2) The error can be accessed atomically with the state.

 (3) The error isn't stored unioned with the payload pointers.

This deals with the problem that the state is spread over three different
objects (two bits and a separate variable) and reading or updating them
atomically isn't practical, given that not only can uninstantiated keys
change into instantiated or rejected keys, but rejected keys can also turn
into instantiated keys - and someone accessing the key might not be using
any locking.

The main side effect of this problem is that what was held in the payload
may change, depending on the state.  For instance, you might observe the
key to be in the rejected state.  You then read the cached error, but if
the key semaphore wasn't locked, the key might've become instantiated
between the two reads - and you might now have something in hand that isn't
actually an error code.

The state is now KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE or a negative error
code if the key is negatively instantiated.  The key_is_instantiated()
function is replaced with key_is_positive() to avoid confusion as negative
keys are also 'instantiated'.

Additionally, barriering is included:

 (1) Order payload-set before state-set during instantiation.

 (2) Order state-read before payload-read when using the key.

Further separate barriering is necessary if RCU is being used to access the
payload content after reading the payload pointers.

Fixes: 146aa8b1453b ("KEYS: Merge the type-specific data with the payload data")
Cc: stable@vger.kernel.org # v4.4+
Reported-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
---
 net/dns_resolver/dns_key.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 8737412c7b27..e1d4d898a007 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -224,7 +224,7 @@ static int dns_resolver_match_preparse(struct key_match_data *match_data)
 static void dns_resolver_describe(const struct key *key, struct seq_file *m)
 {
 	seq_puts(m, key->description);
-	if (key_is_instantiated(key)) {
+	if (key_is_positive(key)) {
 		int err = PTR_ERR(key->payload.data[dns_key_error]);
 
 		if (err)
-- 
cgit v1.2.3


From 48044eb490be71c203e14dd89e8bae87209eab52 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 16 Oct 2017 17:09:53 +0200
Subject: netlink: fix netlink_ack() extack race

It seems that it's possible to toggle NETLINK_F_EXT_ACK
through setsockopt() while another thread/CPU is building
a message inside netlink_ack(), which could then trigger
the WARN_ON()s I added since if it goes from being turned
off to being turned on between allocating and filling the
message, the skb could end up being too small.

Avoid this whole situation by storing the value of this
flag in a separate variable and using that throughout the
function instead.

Fixes: 2d4bc93368f5 ("netlink: extended ACK reporting")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f34750691c5c..b93148e8e9fb 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2307,6 +2307,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
 	size_t tlvlen = 0;
 	struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
 	unsigned int flags = 0;
+	bool nlk_has_extack = nlk->flags & NETLINK_F_EXT_ACK;
 
 	/* Error messages get the original request appened, unless the user
 	 * requests to cap the error message, and get extra error data if
@@ -2317,7 +2318,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
 			payload += nlmsg_len(nlh);
 		else
 			flags |= NLM_F_CAPPED;
-		if (nlk->flags & NETLINK_F_EXT_ACK && extack) {
+		if (nlk_has_extack && extack) {
 			if (extack->_msg)
 				tlvlen += nla_total_size(strlen(extack->_msg) + 1);
 			if (extack->bad_attr)
@@ -2326,8 +2327,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
 	} else {
 		flags |= NLM_F_CAPPED;
 
-		if (nlk->flags & NETLINK_F_EXT_ACK &&
-		    extack && extack->cookie_len)
+		if (nlk_has_extack && extack && extack->cookie_len)
 			tlvlen += nla_total_size(extack->cookie_len);
 	}
 
@@ -2355,7 +2355,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
 	errmsg->error = err;
 	memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh));
 
-	if (nlk->flags & NETLINK_F_EXT_ACK && extack) {
+	if (nlk_has_extack && extack) {
 		if (err) {
 			if (extack->_msg)
 				WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG,
-- 
cgit v1.2.3


From 62c04647c6f44fa3d5d0c077133da0aa1cbbc34c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 8 Sep 2017 16:02:35 +0100
Subject: can: bcm: check for null sk before deferencing it via the call to
 sock_net

The assignment of net via call sock_net will dereference sk. This
is performed before a sanity null check on sk, so there could be
a potential null dereference on the sock_net call if sk is null.
Fix this by assigning net after the sk null check. Also replace
the sk == NULL with the more usual !sk idiom.

Detected by CoverityScan CID#1431862 ("Dereference before null check")

Fixes: 384317ef4187 ("can: network namespace support for CAN_BCM protocol")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 net/can/bcm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/can/bcm.c b/net/can/bcm.c
index 47a8748d953a..13690334efa3 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1493,13 +1493,14 @@ static int bcm_init(struct sock *sk)
 static int bcm_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
-	struct net *net = sock_net(sk);
+	struct net *net;
 	struct bcm_sock *bo;
 	struct bcm_op *op, *next;
 
-	if (sk == NULL)
+	if (!sk)
 		return 0;
 
+	net = sock_net(sk);
 	bo = bcm_sk(sk);
 
 	/* remove bcm_ops, timer, rx_unregister(), etc. */
-- 
cgit v1.2.3


From cae1d5b78fb4874086170ad07921bca59ea2e893 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Tue, 17 Oct 2017 07:18:35 +0200
Subject: can: af_can: do not access proto_tab directly use rcu_access_pointer
 instead

"proto_tab" is a RCU protected array, when directly accessing the array,
sparse throws these warnings:

  CHECK   /srv/work/frogger/socketcan/linux/net/can/af_can.c
net/can/af_can.c:115:14: error: incompatible types in comparison expression (different address spaces)
net/can/af_can.c:795:17: error: incompatible types in comparison expression (different address spaces)
net/can/af_can.c:816:9: error: incompatible types in comparison expression (different address spaces)

This patch fixes the problem by using rcu_access_pointer() and
annotating "proto_tab" array as __rcu.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 net/can/af_can.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/can/af_can.c b/net/can/af_can.c
index 88edac0f3e36..eb1ad74b40f4 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -78,7 +78,7 @@ MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)");
 static struct kmem_cache *rcv_cache __read_mostly;
 
 /* table of registered CAN protocols */
-static const struct can_proto *proto_tab[CAN_NPROTO] __read_mostly;
+static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly;
 static DEFINE_MUTEX(proto_tab_lock);
 
 static atomic_t skbcounter = ATOMIC_INIT(0);
@@ -788,7 +788,7 @@ int can_proto_register(const struct can_proto *cp)
 
 	mutex_lock(&proto_tab_lock);
 
-	if (proto_tab[proto]) {
+	if (rcu_access_pointer(proto_tab[proto])) {
 		pr_err("can: protocol %d already registered\n", proto);
 		err = -EBUSY;
 	} else
@@ -812,7 +812,7 @@ void can_proto_unregister(const struct can_proto *cp)
 	int proto = cp->protocol;
 
 	mutex_lock(&proto_tab_lock);
-	BUG_ON(proto_tab[proto] != cp);
+	BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp);
 	RCU_INIT_POINTER(proto_tab[proto], NULL);
 	mutex_unlock(&proto_tab_lock);
 
-- 
cgit v1.2.3


From 5a606223c6b5b7560da253ed52e62c67fa18e29b Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Sat, 29 Jul 2017 11:51:01 +0200
Subject: can: af_can: can_pernet_init(): add missing error handling for
 kzalloc returning NULL

This patch adds the missing check and error handling for out-of-memory
situations, when kzalloc cannot allocate memory.

Fixes: cb5635a36776 ("can: complete initial namespace support")
Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 net/can/af_can.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/can/af_can.c b/net/can/af_can.c
index eb1ad74b40f4..ecd5c703d11e 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -875,9 +875,14 @@ static int can_pernet_init(struct net *net)
 	spin_lock_init(&net->can.can_rcvlists_lock);
 	net->can.can_rx_alldev_list =
 		kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL);
-
+	if (!net->can.can_rx_alldev_list)
+		goto out;
 	net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL);
+	if (!net->can.can_stats)
+		goto out_free_alldev_list;
 	net->can.can_pstats = kzalloc(sizeof(struct s_pstats), GFP_KERNEL);
+	if (!net->can.can_pstats)
+		goto out_free_can_stats;
 
 	if (IS_ENABLED(CONFIG_PROC_FS)) {
 		/* the statistics are updated every second (timer triggered) */
@@ -892,6 +897,13 @@ static int can_pernet_init(struct net *net)
 	}
 
 	return 0;
+
+ out_free_can_stats:
+	kfree(net->can.can_stats);
+ out_free_alldev_list:
+	kfree(net->can.can_rx_alldev_list);
+ out:
+	return -ENOMEM;
 }
 
 static void can_pernet_exit(struct net *net)
-- 
cgit v1.2.3


From df80cd9b28b9ebaa284a41df611dbf3a2d05ca74 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 17 Oct 2017 23:26:10 +0800
Subject: sctp: do not peel off an assoc from one netns to another one

Now when peeling off an association to the sock in another netns, all
transports in this assoc are not to be rehashed and keep use the old
key in hashtable.

As a transport uses sk->net as the hash key to insert into hashtable,
it would miss removing these transports from hashtable due to the new
netns when closing the sock and all transports are being freeed, then
later an use-after-free issue could be caused when looking up an asoc
and dereferencing those transports.

This is a very old issue since very beginning, ChunYu found it with
syzkaller fuzz testing with this series:

  socket$inet6_sctp()
  bind$inet6()
  sendto$inet6()
  unshare(0x40000000)
  getsockopt$inet_sctp6_SCTP_GET_ASSOC_ID_LIST()
  getsockopt$inet_sctp6_SCTP_SOCKOPT_PEELOFF()

This patch is to block this call when peeling one assoc off from one
netns to another one, so that the netns of all transport would not
go out-sync with the key in hashtable.

Note that this patch didn't fix it by rehashing transports, as it's
difficult to handle the situation when the tuple is already in use
in the new netns. Besides, no one would like to peel off one assoc
to another netns, considering ipaddrs, ifaces, etc. are usually
different.

Reported-by: ChunYu Wang <chunwang@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d4730ada7f32..17841ab30798 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4906,6 +4906,10 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
 	struct socket *sock;
 	int err = 0;
 
+	/* Do not peel off from one netns to another one. */
+	if (!net_eq(current->nsproxy->net_ns, sock_net(sk)))
+		return -EINVAL;
+
 	if (!asoc)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 1cc276cec9ec574d41cf47dfc0f51406b6f26ab4 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 18 Oct 2017 21:37:49 +0800
Subject: sctp: add the missing sock_owned_by_user check in sctp_icmp_redirect

Now sctp processes icmp redirect packet in sctp_icmp_redirect where
it calls sctp_transport_dst_check in which tp->dst can be released.

The problem is before calling sctp_transport_dst_check, it doesn't
check sock_owned_by_user, which means tp->dst could be freed while
a process is accessing it with owning the socket.

An use-after-free issue could be triggered by this.

This patch is to fix it by checking sock_owned_by_user before calling
sctp_transport_dst_check in sctp_icmp_redirect, so that it would not
release tp->dst if users still hold sock lock.

Besides, the same issue fixed in commit 45caeaa5ac0b ("dccp/tcp: fix
routing redirect race") on sctp also needs this check.

Fixes: 55be7a9c6074 ("ipv4: Add redirect support to all protocol icmp error handlers")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 92a07141fd07..34f10e75f3b9 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -421,7 +421,7 @@ void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t,
 {
 	struct dst_entry *dst;
 
-	if (!t)
+	if (sock_owned_by_user(sk) || !t)
 		return;
 	dst = sctp_transport_dst_check(t);
 	if (dst)
-- 
cgit v1.2.3


From 34f79502bbcfab659b8729da68b5e387f96eb4c1 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 18 Oct 2017 07:10:36 -0700
Subject: bpf: avoid preempt enable/disable in sockmap using tcp_skb_cb region

SK_SKB BPF programs are run from the socket/tcp context but early in
the stack before much of the TCP metadata is needed in tcp_skb_cb. So
we can use some unused fields to place BPF metadata needed for SK_SKB
programs when implementing the redirect function.

This allows us to drop the preempt disable logic. It does however
require an API change so sk_redirect_map() has been updated to
additionally provide ctx_ptr to skb. Note, we do however continue to
disable/enable preemption around actual BPF program running to account
for map updates.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 74b8c91fb5f4..ca1ba0bbfbc2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1839,31 +1839,31 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+	   struct bpf_map *, map, u32, key, u64, flags)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 	if (unlikely(flags))
 		return SK_ABORTED;
 
-	ri->ifindex = key;
-	ri->flags = flags;
-	ri->map = map;
+	tcb->bpf.key = key;
+	tcb->bpf.flags = flags;
+	tcb->bpf.map = map;
 
 	return SK_REDIRECT;
 }
 
-struct sock *do_sk_redirect_map(void)
+struct sock *do_sk_redirect_map(struct sk_buff *skb)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 	struct sock *sk = NULL;
 
-	if (ri->map) {
-		sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
+	if (tcb->bpf.map) {
+		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
 
-		ri->ifindex = 0;
-		ri->map = NULL;
-		/* we do not clear flags for future lookup */
+		tcb->bpf.key = 0;
+		tcb->bpf.map = NULL;
 	}
 
 	return sk;
@@ -1873,9 +1873,10 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
 	.func           = bpf_sk_redirect_map,
 	.gpl_only       = false,
 	.ret_type       = RET_INTEGER,
-	.arg1_type      = ARG_CONST_MAP_PTR,
-	.arg2_type      = ARG_ANYTHING,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
 	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
 };
 
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
-- 
cgit v1.2.3


From f7e9cb1ecb6d922584abff16db07930162c57155 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 18 Oct 2017 07:10:58 -0700
Subject: bpf: remove mark access for SK_SKB program types

The skb->mark field is a union with reserved_tailroom which is used
in the TCP code paths from stream memory allocation. Allowing SK_SKB
programs to set this field creates a conflict with future code
optimizations, such as "gifting" the skb to the egress path instead
of creating a new skb and doing a memcpy.

Because we do not have a released version of SK_SKB yet lets just
remove it for now. A more appropriate scratch pad to use at the
socket layer is dev_scratch, but lets add that in future kernels
when needed.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index ca1ba0bbfbc2..aa0265997f93 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3684,7 +3684,6 @@ static bool sk_skb_is_valid_access(int off, int size,
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case bpf_ctx_range(struct __sk_buff, mark):
 		case bpf_ctx_range(struct __sk_buff, tc_index):
 		case bpf_ctx_range(struct __sk_buff, priority):
 			break;
@@ -3694,6 +3693,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	}
 
 	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, mark):
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
-- 
cgit v1.2.3


From c92e8c02fe664155ac4234516e32544bec0f113d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Oct 2017 09:04:13 -0700
Subject: tcp/dccp: fix ireq->opt races

syzkaller found another bug in DCCP/TCP stacks [1]

For the reasons explained in commit ce1050089c96 ("tcp/dccp: fix
ireq->pktopts race"), we need to make sure we do not access
ireq->opt unless we own the request sock.

Note the opt field is renamed to ireq_opt to ease grep games.

[1]
BUG: KASAN: use-after-free in ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474
Read of size 1 at addr ffff8801c951039c by task syz-executor5/3295

CPU: 1 PID: 3295 Comm: syz-executor5 Not tainted 4.14.0-rc4+ #80
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x25b/0x340 mm/kasan/report.c:409
 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:427
 ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474
 tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1135
 tcp_send_ack.part.37+0x3bb/0x650 net/ipv4/tcp_output.c:3587
 tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3557
 __tcp_ack_snd_check+0x2c6/0x4b0 net/ipv4/tcp_input.c:5072
 tcp_ack_snd_check net/ipv4/tcp_input.c:5085 [inline]
 tcp_rcv_state_process+0x2eff/0x4850 net/ipv4/tcp_input.c:6071
 tcp_child_process+0x342/0x990 net/ipv4/tcp_minisocks.c:816
 tcp_v4_rcv+0x1827/0x2f80 net/ipv4/tcp_ipv4.c:1682
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:249 [inline]
 ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:464 [inline]
 ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:249 [inline]
 ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
 netif_receive_skb+0xae/0x390 net/core/dev.c:4611
 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
 call_write_iter include/linux/fs.h:1770 [inline]
 new_sync_write fs/read_write.c:468 [inline]
 __vfs_write+0x68a/0x970 fs/read_write.c:481
 vfs_write+0x18f/0x510 fs/read_write.c:543
 SYSC_write fs/read_write.c:588 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:580
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x40c341
RSP: 002b:00007f469523ec10 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 000000000040c341
RDX: 0000000000000037 RSI: 0000000020004000 RDI: 0000000000000015
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 00000000000f4240 R11: 0000000000000293 R12: 00000000004b7fd1
R13: 00000000ffffffff R14: 0000000020000000 R15: 0000000000025000

Allocated by task 3295:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 __do_kmalloc mm/slab.c:3725 [inline]
 __kmalloc+0x162/0x760 mm/slab.c:3734
 kmalloc include/linux/slab.h:498 [inline]
 tcp_v4_save_options include/net/tcp.h:1962 [inline]
 tcp_v4_init_req+0x2d3/0x3e0 net/ipv4/tcp_ipv4.c:1271
 tcp_conn_request+0xf6d/0x3410 net/ipv4/tcp_input.c:6283
 tcp_v4_conn_request+0x157/0x210 net/ipv4/tcp_ipv4.c:1313
 tcp_rcv_state_process+0x8ea/0x4850 net/ipv4/tcp_input.c:5857
 tcp_v4_do_rcv+0x55c/0x7d0 net/ipv4/tcp_ipv4.c:1482
 tcp_v4_rcv+0x2d10/0x2f80 net/ipv4/tcp_ipv4.c:1711
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:249 [inline]
 ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:464 [inline]
 ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:249 [inline]
 ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
 netif_receive_skb+0xae/0x390 net/core/dev.c:4611
 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
 call_write_iter include/linux/fs.h:1770 [inline]
 new_sync_write fs/read_write.c:468 [inline]
 __vfs_write+0x68a/0x970 fs/read_write.c:481
 vfs_write+0x18f/0x510 fs/read_write.c:543
 SYSC_write fs/read_write.c:588 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:580
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 3306:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xca/0x250 mm/slab.c:3820
 inet_sock_destruct+0x59d/0x950 net/ipv4/af_inet.c:157
 __sk_destruct+0xfd/0x910 net/core/sock.c:1560
 sk_destruct+0x47/0x80 net/core/sock.c:1595
 __sk_free+0x57/0x230 net/core/sock.c:1603
 sk_free+0x2a/0x40 net/core/sock.c:1614
 sock_put include/net/sock.h:1652 [inline]
 inet_csk_complete_hashdance+0xd5/0xf0 net/ipv4/inet_connection_sock.c:959
 tcp_check_req+0xf4d/0x1620 net/ipv4/tcp_minisocks.c:765
 tcp_v4_rcv+0x17f6/0x2f80 net/ipv4/tcp_ipv4.c:1675
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:249 [inline]
 ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:464 [inline]
 ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:249 [inline]
 ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
 __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
 netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
 netif_receive_skb+0xae/0x390 net/core/dev.c:4611
 tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
 tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
 tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
 call_write_iter include/linux/fs.h:1770 [inline]
 new_sync_write fs/read_write.c:468 [inline]
 __vfs_write+0x68a/0x970 fs/read_write.c:481
 vfs_write+0x18f/0x510 fs/read_write.c:543
 SYSC_write fs/read_write.c:588 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:580
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c                 | 13 ++++++++-----
 net/ipv4/cipso_ipv4.c           | 24 +++++++-----------------
 net/ipv4/inet_connection_sock.c |  8 +++-----
 net/ipv4/syncookies.c           |  2 +-
 net/ipv4/tcp_input.c            |  2 +-
 net/ipv4/tcp_ipv4.c             | 22 +++++++++++++---------
 6 files changed, 33 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 001c08696334..0490916864f9 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -414,8 +414,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
 	newinet->inet_saddr	= ireq->ir_loc_addr;
-	newinet->inet_opt	= ireq->opt;
-	ireq->opt	   = NULL;
+	RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
 	newinet->mc_index  = inet_iif(skb);
 	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
 	newinet->inet_id   = jiffies;
@@ -430,7 +429,10 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
 	if (__inet_inherit_port(sk, newsk) < 0)
 		goto put_and_exit;
 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
-
+	if (*own_req)
+		ireq->ireq_opt = NULL;
+	else
+		newinet->inet_opt = NULL;
 	return newsk;
 
 exit_overflow:
@@ -441,6 +443,7 @@ exit:
 	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
 	return NULL;
 put_and_exit:
+	newinet->inet_opt = NULL;
 	inet_csk_prepare_forced_close(newsk);
 	dccp_done(newsk);
 	goto exit;
@@ -492,7 +495,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
 							      ireq->ir_rmt_addr);
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
-					    ireq->opt);
+					    rcu_dereference(ireq->ireq_opt));
 		err = net_xmit_eval(err);
 	}
 
@@ -548,7 +551,7 @@ out:
 static void dccp_v4_reqsk_destructor(struct request_sock *req)
 {
 	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
-	kfree(inet_rsk(req)->opt);
+	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 }
 
 void dccp_syn_ack_timeout(const struct request_sock *req)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 2ae8f54cb321..82178cc69c96 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1951,7 +1951,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
 	buf = NULL;
 
 	req_inet = inet_rsk(req);
-	opt = xchg(&req_inet->opt, opt);
+	opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt);
 	if (opt)
 		kfree_rcu(opt, rcu);
 
@@ -1973,11 +1973,13 @@ req_setattr_failure:
  * values on failure.
  *
  */
-static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
+static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
 {
+	struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1);
 	int hdr_delta = 0;
-	struct ip_options_rcu *opt = *opt_ptr;
 
+	if (!opt || opt->opt.cipso == 0)
+		return 0;
 	if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
 		u8 cipso_len;
 		u8 cipso_off;
@@ -2039,14 +2041,10 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
  */
 void cipso_v4_sock_delattr(struct sock *sk)
 {
-	int hdr_delta;
-	struct ip_options_rcu *opt;
 	struct inet_sock *sk_inet;
+	int hdr_delta;
 
 	sk_inet = inet_sk(sk);
-	opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
-	if (!opt || opt->opt.cipso == 0)
-		return;
 
 	hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
 	if (sk_inet->is_icsk && hdr_delta > 0) {
@@ -2066,15 +2064,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
  */
 void cipso_v4_req_delattr(struct request_sock *req)
 {
-	struct ip_options_rcu *opt;
-	struct inet_request_sock *req_inet;
-
-	req_inet = inet_rsk(req);
-	opt = req_inet->opt;
-	if (!opt || opt->opt.cipso == 0)
-		return;
-
-	cipso_v4_delopt(&req_inet->opt);
+	cipso_v4_delopt(&inet_rsk(req)->ireq_opt);
 }
 
 /**
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 67aec7a10686..5ec9136a7c36 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -540,9 +540,10 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct net *net = read_pnet(&ireq->ireq_net);
-	struct ip_options_rcu *opt = ireq->opt;
+	struct ip_options_rcu *opt;
 	struct rtable *rt;
 
+	opt = rcu_dereference(ireq->ireq_opt);
 	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -576,10 +577,9 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 	struct flowi4 *fl4;
 	struct rtable *rt;
 
+	opt = rcu_dereference(ireq->ireq_opt);
 	fl4 = &newinet->cork.fl.u.ip4;
 
-	rcu_read_lock();
-	opt = rcu_dereference(newinet->inet_opt);
 	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -592,13 +592,11 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 		goto no_route;
 	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto route_err;
-	rcu_read_unlock();
 	return &rt->dst;
 
 route_err:
 	ip_rt_put(rt);
 no_route:
-	rcu_read_unlock();
 	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
 	return NULL;
 }
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b1bb1b3a1082..77cf32a80952 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
 	 */
-	ireq->opt = tcp_v4_save_options(sock_net(sk), skb);
+	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb));
 
 	if (security_inet_conn_request(sk, skb, req)) {
 		reqsk_free(req);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5d7656beeee..7eec3383702b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6196,7 +6196,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
 		struct inet_request_sock *ireq = inet_rsk(req);
 
 		kmemcheck_annotate_bitfield(ireq, flags);
-		ireq->opt = NULL;
+		ireq->ireq_opt = NULL;
 #if IS_ENABLED(CONFIG_IPV6)
 		ireq->pktopts = NULL;
 #endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 85164d4d3e53..4c43365c374c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -877,7 +877,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
-					    ireq->opt);
+					    rcu_dereference(ireq->ireq_opt));
 		err = net_xmit_eval(err);
 	}
 
@@ -889,7 +889,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
  */
 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 {
-	kfree(inet_rsk(req)->opt);
+	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 }
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1265,10 +1265,11 @@ static void tcp_v4_init_req(struct request_sock *req,
 			    struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
+	struct net *net = sock_net(sk_listener);
 
 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
-	ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb);
+	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
 }
 
 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
@@ -1355,10 +1356,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
 	newsk->sk_bound_dev_if = ireq->ir_iif;
-	newinet->inet_saddr	      = ireq->ir_loc_addr;
-	inet_opt	      = ireq->opt;
-	rcu_assign_pointer(newinet->inet_opt, inet_opt);
-	ireq->opt	      = NULL;
+	newinet->inet_saddr   = ireq->ir_loc_addr;
+	inet_opt	      = rcu_dereference(ireq->ireq_opt);
+	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
 	newinet->mc_index     = inet_iif(skb);
 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
 	newinet->rcv_tos      = ip_hdr(skb)->tos;
@@ -1403,9 +1403,12 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	if (__inet_inherit_port(sk, newsk) < 0)
 		goto put_and_exit;
 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
-	if (*own_req)
+	if (likely(*own_req)) {
 		tcp_move_syn(newtp, req);
-
+		ireq->ireq_opt = NULL;
+	} else {
+		newinet->inet_opt = NULL;
+	}
 	return newsk;
 
 exit_overflow:
@@ -1416,6 +1419,7 @@ exit:
 	tcp_listendrop(sk);
 	return NULL;
 put_and_exit:
+	newinet->inet_opt = NULL;
 	inet_csk_prepare_forced_close(newsk);
 	tcp_done(newsk);
 	goto exit;
-- 
cgit v1.2.3


From 509c7a1ecc8601f94ffba8a00889fefb239c00c6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Oct 2017 16:14:52 -0700
Subject: packet: avoid panic in packet_getsockopt()

syzkaller got crashes in packet_getsockopt() processing
PACKET_ROLLOVER_STATS command while another thread was managing
to change po->rollover

Using RCU will fix this bug. We might later add proper RCU annotations
for sparse sake.

In v2: I replaced kfree(rollover) in fanout_add() to kfree_rcu()
variant, as spotted by John.

Fixes: a9b6391814d5 ("packet: rollover statistics")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: John Sperbeck <jsperbeck@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bec01a3daf5b..2986941164b1 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1769,7 +1769,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 
 out:
 	if (err && rollover) {
-		kfree(rollover);
+		kfree_rcu(rollover, rcu);
 		po->rollover = NULL;
 	}
 	mutex_unlock(&fanout_mutex);
@@ -1796,8 +1796,10 @@ static struct packet_fanout *fanout_release(struct sock *sk)
 		else
 			f = NULL;
 
-		if (po->rollover)
+		if (po->rollover) {
 			kfree_rcu(po->rollover, rcu);
+			po->rollover = NULL;
+		}
 	}
 	mutex_unlock(&fanout_mutex);
 
@@ -3851,6 +3853,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	void *data = &val;
 	union tpacket_stats_u st;
 	struct tpacket_rollover_stats rstats;
+	struct packet_rollover *rollover;
 
 	if (level != SOL_PACKET)
 		return -ENOPROTOOPT;
@@ -3929,13 +3932,18 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		       0);
 		break;
 	case PACKET_ROLLOVER_STATS:
-		if (!po->rollover)
+		rcu_read_lock();
+		rollover = rcu_dereference(po->rollover);
+		if (rollover) {
+			rstats.tp_all = atomic_long_read(&rollover->num);
+			rstats.tp_huge = atomic_long_read(&rollover->num_huge);
+			rstats.tp_failed = atomic_long_read(&rollover->num_failed);
+			data = &rstats;
+			lv = sizeof(rstats);
+		}
+		rcu_read_unlock();
+		if (!rollover)
 			return -EINVAL;
-		rstats.tp_all = atomic_long_read(&po->rollover->num);
-		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
-		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
-		data = &rstats;
-		lv = sizeof(rstats);
 		break;
 	case PACKET_TX_HAS_OFF:
 		val = po->tp_tx_has_off;
-- 
cgit v1.2.3


From 6850d0f8b2542112629061808ed950b35eb982e4 Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Thu, 19 Oct 2017 13:43:05 +1100
Subject: net/ncsi: Fix AEN HNCDSC packet length

Correct the value of the HNCDSC AEN packet.
Fixes: 7a82ecf4cfb85 "net/ncsi: NCSI AEN packet handler"

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-aen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 6898e7229285..f135938bf781 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -187,7 +187,7 @@ static struct ncsi_aen_handler {
 } ncsi_aen_handlers[] = {
 	{ NCSI_PKT_AEN_LSC,    12, ncsi_aen_handler_lsc    },
 	{ NCSI_PKT_AEN_CR,      4, ncsi_aen_handler_cr     },
-	{ NCSI_PKT_AEN_HNCDSC,  4, ncsi_aen_handler_hncdsc }
+	{ NCSI_PKT_AEN_HNCDSC,  8, ncsi_aen_handler_hncdsc }
 };
 
 int ncsi_aen_handler(struct ncsi_dev_priv *ndp, struct sk_buff *skb)
-- 
cgit v1.2.3


From 0795fb2021f07969949f523ea33c39785bfae9d6 Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Thu, 19 Oct 2017 13:43:06 +1100
Subject: net/ncsi: Stop monitor if channel times out or is inactive

ncsi_channel_monitor() misses stopping the channel monitor in several
places that it should, causing a WARN_ON_ONCE() to trigger when the
monitor is re-started later, eg:

[  459.040000] WARNING: CPU: 0 PID: 1093 at net/ncsi/ncsi-manage.c:269 ncsi_start_channel_monitor+0x7c/0x90
[  459.040000] CPU: 0 PID: 1093 Comm: kworker/0:3 Not tainted 4.10.17-gaca2fdd #140
[  459.040000] Hardware name: ASpeed SoC
[  459.040000] Workqueue: events ncsi_dev_work
[  459.040000] [<80010094>] (unwind_backtrace) from [<8000d950>] (show_stack+0x20/0x24)
[  459.040000] [<8000d950>] (show_stack) from [<801dbf70>] (dump_stack+0x20/0x28)
[  459.040000] [<801dbf70>] (dump_stack) from [<80018d7c>] (__warn+0xe0/0x108)
[  459.040000] [<80018d7c>] (__warn) from [<80018e70>] (warn_slowpath_null+0x30/0x38)
[  459.040000] [<80018e70>] (warn_slowpath_null) from [<803f6a08>] (ncsi_start_channel_monitor+0x7c/0x90)
[  459.040000] [<803f6a08>] (ncsi_start_channel_monitor) from [<803f7664>] (ncsi_configure_channel+0xdc/0x5fc)
[  459.040000] [<803f7664>] (ncsi_configure_channel) from [<803f8160>] (ncsi_dev_work+0xac/0x474)
[  459.040000] [<803f8160>] (ncsi_dev_work) from [<8002d244>] (process_one_work+0x1e0/0x450)
[  459.040000] [<8002d244>] (process_one_work) from [<8002d510>] (worker_thread+0x5c/0x570)
[  459.040000] [<8002d510>] (worker_thread) from [<80033614>] (kthread+0x124/0x164)
[  459.040000] [<80033614>] (kthread) from [<8000a5e8>] (ret_from_fork+0x14/0x2c)

This also updates the monitor instead of just returning if
ncsi_xmit_cmd() fails to send the get-link-status command so that the
monitor properly times out.

Fixes: e6f44ed6d04d3 "net/ncsi: Package and channel management"

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-manage.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index b6a449aa9d4b..b022deb39d31 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -202,11 +202,15 @@ static void ncsi_channel_monitor(unsigned long data)
 	monitor_state = nc->monitor.state;
 	spin_unlock_irqrestore(&nc->lock, flags);
 
-	if (!enabled || chained)
+	if (!enabled || chained) {
+		ncsi_stop_channel_monitor(nc);
 		return;
+	}
 	if (state != NCSI_CHANNEL_INACTIVE &&
-	    state != NCSI_CHANNEL_ACTIVE)
+	    state != NCSI_CHANNEL_ACTIVE) {
+		ncsi_stop_channel_monitor(nc);
 		return;
+	}
 
 	switch (monitor_state) {
 	case NCSI_CHANNEL_MONITOR_START:
@@ -217,12 +221,9 @@ static void ncsi_channel_monitor(unsigned long data)
 		nca.type = NCSI_PKT_CMD_GLS;
 		nca.req_flags = 0;
 		ret = ncsi_xmit_cmd(&nca);
-		if (ret) {
+		if (ret)
 			netdev_err(ndp->ndev.dev, "Error %d sending GLS\n",
 				   ret);
-			return;
-		}
-
 		break;
 	case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX:
 		break;
@@ -233,6 +234,8 @@ static void ncsi_channel_monitor(unsigned long data)
 			ndp->flags |= NCSI_DEV_RESHUFFLE;
 		}
 
+		ncsi_stop_channel_monitor(nc);
+
 		spin_lock_irqsave(&nc->lock, flags);
 		nc->state = NCSI_CHANNEL_INVISIBLE;
 		spin_unlock_irqrestore(&nc->lock, flags);
-- 
cgit v1.2.3


From 100ef01f3ea4badbee6479290a41f74abd0e523f Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 19 Oct 2017 13:43:07 +1100
Subject: net/ncsi: Disable HWA mode when no channels are found

When there are no NCSI channels probed, HWA (Hardware Arbitration)
mode is enabled. It's not correct because HWA depends on the fact:
NCSI channels exist and all of them support HWA mode. This disables
HWA when no channels are probed.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-manage.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index b022deb39d31..0966eff48ce7 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -1005,12 +1005,15 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp)
 	struct ncsi_package *np;
 	struct ncsi_channel *nc;
 	unsigned int cap;
+	bool has_channel = false;
 
 	/* The hardware arbitration is disabled if any one channel
 	 * doesn't support explicitly.
 	 */
 	NCSI_FOR_EACH_PACKAGE(ndp, np) {
 		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			has_channel = true;
+
 			cap = nc->caps[NCSI_CAP_GENERIC].cap;
 			if (!(cap & NCSI_CAP_GENERIC_HWA) ||
 			    (cap & NCSI_CAP_GENERIC_HWA_MASK) !=
@@ -1021,8 +1024,13 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp)
 		}
 	}
 
-	ndp->flags |= NCSI_DEV_HWA;
-	return true;
+	if (has_channel) {
+		ndp->flags |= NCSI_DEV_HWA;
+		return true;
+	}
+
+	ndp->flags &= ~NCSI_DEV_HWA;
+	return false;
 }
 
 static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp)
-- 
cgit v1.2.3


From 52b4c8627f9f0d882e969967a207a27a80c9c753 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 19 Oct 2017 13:43:08 +1100
Subject: net/ncsi: Enforce failover on link monitor timeout

The NCSI channel has been configured to provide service if its link
monitor timer is enabled, regardless of its state (inactive or active).
So the timeout event on the link monitor indicates the out-of-service
on that channel, for which a failover is needed.

This sets NCSI_DEV_RESHUFFLE flag to enforce failover on link monitor
timeout, regardless the channel's original state (inactive or active).
Also, the link is put into "down" state to give the failing channel
lowest priority when selecting for the active channel. The state of
failing channel should be set to active in order for deinitialization
and failover to be done.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-manage.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 0966eff48ce7..28c42b22b748 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -189,6 +189,7 @@ static void ncsi_channel_monitor(unsigned long data)
 	struct ncsi_channel *nc = (struct ncsi_channel *)data;
 	struct ncsi_package *np = nc->package;
 	struct ncsi_dev_priv *ndp = np->ndp;
+	struct ncsi_channel_mode *ncm;
 	struct ncsi_cmd_arg nca;
 	bool enabled, chained;
 	unsigned int monitor_state;
@@ -228,20 +229,21 @@ static void ncsi_channel_monitor(unsigned long data)
 	case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX:
 		break;
 	default:
-		if (!(ndp->flags & NCSI_DEV_HWA) &&
-		    state == NCSI_CHANNEL_ACTIVE) {
+		if (!(ndp->flags & NCSI_DEV_HWA)) {
 			ncsi_report_link(ndp, true);
 			ndp->flags |= NCSI_DEV_RESHUFFLE;
 		}
 
 		ncsi_stop_channel_monitor(nc);
 
+		ncm = &nc->modes[NCSI_MODE_LINK];
 		spin_lock_irqsave(&nc->lock, flags);
 		nc->state = NCSI_CHANNEL_INVISIBLE;
+		ncm->data[2] &= ~0x1;
 		spin_unlock_irqrestore(&nc->lock, flags);
 
 		spin_lock_irqsave(&ndp->lock, flags);
-		nc->state = NCSI_CHANNEL_INACTIVE;
+		nc->state = NCSI_CHANNEL_ACTIVE;
 		list_add_tail_rcu(&nc->link, &ndp->channel_queue);
 		spin_unlock_irqrestore(&ndp->lock, flags);
 		ncsi_process_next_channel(ndp);
-- 
cgit v1.2.3


From 0a90e251988ceedc528c8db98f25b051cf190f44 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 19 Oct 2017 13:43:09 +1100
Subject: net/ncsi: Fix length of GVI response packet

The length of GVI (GetVersionInfo) response packet should be 40 instead
of 36. This issue was found from /sys/kernel/debug/ncsi/eth0/stats.

 # ethtool --ncsi eth0 swstats
     :
 RESPONSE     OK       TIMEOUT  ERROR
 =======================================
 GVI          0        0        2

With this applied, no error reported on GVI response packets:

 # ethtool --ncsi eth0 swstats
     :
 RESPONSE     OK       TIMEOUT  ERROR
 =======================================
 GVI          2        0        0

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-rsp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 265b9a892d41..927dad4759d1 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -959,7 +959,7 @@ static struct ncsi_rsp_handler {
 	{ NCSI_PKT_RSP_EGMF,    4, ncsi_rsp_handler_egmf    },
 	{ NCSI_PKT_RSP_DGMF,    4, ncsi_rsp_handler_dgmf    },
 	{ NCSI_PKT_RSP_SNFC,    4, ncsi_rsp_handler_snfc    },
-	{ NCSI_PKT_RSP_GVI,    36, ncsi_rsp_handler_gvi     },
+	{ NCSI_PKT_RSP_GVI,    40, ncsi_rsp_handler_gvi     },
 	{ NCSI_PKT_RSP_GC,     32, ncsi_rsp_handler_gc      },
 	{ NCSI_PKT_RSP_GP,     -1, ncsi_rsp_handler_gp      },
 	{ NCSI_PKT_RSP_GCPS,  172, ncsi_rsp_handler_gcps    },
-- 
cgit v1.2.3


From b4562ca7925a3bedada87a3dd072dd5bad043288 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Thu, 19 Oct 2017 03:33:14 +0000
Subject: hv_sock: add locking in the open/close/release code paths

Without the patch, when hvs_open_connection() hasn't completely established
a connection (e.g. it has changed sk->sk_state to SS_CONNECTED, but hasn't
inserted the sock into the connected queue), vsock_stream_connect() may see
the sk_state change and return the connection to the userspace, and next
when the userspace closes the connection quickly, hvs_release() may not see
the connection in the connected queue; finally hvs_open_connection()
inserts the connection into the queue, but we won't be able to purge the
connection for ever.

Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Rolf Neugebauer <rolf.neugebauer@docker.com>
Cc: Marcelo Cerri <marcelo.cerri@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/hyperv_transport.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 14ed5a344cdf..e21991fe883a 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -310,11 +310,15 @@ static void hvs_close_connection(struct vmbus_channel *chan)
 	struct sock *sk = get_per_channel_state(chan);
 	struct vsock_sock *vsk = vsock_sk(sk);
 
+	lock_sock(sk);
+
 	sk->sk_state = SS_UNCONNECTED;
 	sock_set_flag(sk, SOCK_DONE);
 	vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN;
 
 	sk->sk_state_change(sk);
+
+	release_sock(sk);
 }
 
 static void hvs_open_connection(struct vmbus_channel *chan)
@@ -344,6 +348,8 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 	if (!sk)
 		return;
 
+	lock_sock(sk);
+
 	if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) ||
 	    (!conn_from_host && sk->sk_state != SS_CONNECTING))
 		goto out;
@@ -395,9 +401,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 
 		vsock_insert_connected(vnew);
 
-		lock_sock(sk);
 		vsock_enqueue_accept(sk, new);
-		release_sock(sk);
 	} else {
 		sk->sk_state = SS_CONNECTED;
 		sk->sk_socket->state = SS_CONNECTED;
@@ -410,6 +414,8 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 out:
 	/* Release refcnt obtained when we called vsock_find_bound_socket() */
 	sock_put(sk);
+
+	release_sock(sk);
 }
 
 static u32 hvs_get_local_cid(void)
@@ -476,13 +482,21 @@ out:
 
 static void hvs_release(struct vsock_sock *vsk)
 {
+	struct sock *sk = sk_vsock(vsk);
 	struct hvsock *hvs = vsk->trans;
-	struct vmbus_channel *chan = hvs->chan;
+	struct vmbus_channel *chan;
 
+	lock_sock(sk);
+
+	sk->sk_state = SS_DISCONNECTING;
+	vsock_remove_sock(vsk);
+
+	release_sock(sk);
+
+	chan = hvs->chan;
 	if (chan)
 		hvs_shutdown(vsk, RCV_SHUTDOWN | SEND_SHUTDOWN);
 
-	vsock_remove_sock(vsk);
 }
 
 static void hvs_destruct(struct vsock_sock *vsk)
-- 
cgit v1.2.3


From 197df02cb3d3e969fb1d6fc11f5a634b7bfc2124 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Thu, 19 Oct 2017 14:22:17 +0200
Subject: udp: make some messages more descriptive

In the UDP code there are two leftover error messages with very few meaning.
Replace them with a more descriptive error message as some users
reported them as "strange network error".

Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e45177ceb0ee..806b298a3bdd 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1061,7 +1061,7 @@ back_from_confirm:
 		/* ... which is an evident application bug. --ANK */
 		release_sock(sk);
 
-		net_dbg_ratelimited("cork app bug 2\n");
+		net_dbg_ratelimited("socket already corked\n");
 		err = -EINVAL;
 		goto out;
 	}
@@ -1144,7 +1144,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
 	if (unlikely(!up->pending)) {
 		release_sock(sk);
 
-		net_dbg_ratelimited("udp cork app bug 3\n");
+		net_dbg_ratelimited("cork failed\n");
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 54d431176429e9cf064461589e5174349a9f73da Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 19 Oct 2017 12:40:39 -0400
Subject: sock: correct sk_wmem_queued accounting on efault in tcp zerocopy

Syzkaller hits WARN_ON(sk->sk_wmem_queued) in sk_stream_kill_queues
after triggering an EFAULT in __zerocopy_sg_from_iter.

On this error, skb_zerocopy_stream_iter resets the skb to its state
before the operation with __pskb_trim. It cannot kfree_skb like
datagram callers, as the skb may have data from a previous send call.

__pskb_trim calls skb_condense for unowned skbs, which adjusts their
truesize. These tcp skbuffs are owned and their truesize must add up
to sk_wmem_queued. But they match because their skb->sk is NULL until
tcp_transmit_skb.

Temporarily set skb->sk when calling __pskb_trim to signal that the
skbuffs are owned and avoid the skb_condense path.

Fixes: 52267790ef52 ("sock: add MSG_ZEROCOPY")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e62476beee95..24656076906d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1124,9 +1124,13 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 
 	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
+		struct sock *save_sk = skb->sk;
+
 		/* Streams do not free skb on error. Reset to prev state. */
 		msg->msg_iter = orig_iter;
+		skb->sk = sk;
 		___pskb_trim(skb, orig_len);
+		skb->sk = save_sk;
 		return err;
 	}
 
-- 
cgit v1.2.3


From 66c54517540cedf5a22911c6b7f5c7d8b5d1e1be Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 19 Oct 2017 20:17:32 +0300
Subject: net: bridge: fix returning of vlan range op errors

When vlan tunnels were introduced, vlan range errors got silently
dropped and instead 0 was returned always. Restore the previous
behaviour and return errors to user-space.

Fixes: efa5356b0d97 ("bridge: per vlan dst_metadata netlink support")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 3bc890716c89..de2152730809 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -573,7 +573,7 @@ static int br_process_vlan_info(struct net_bridge *br,
 		}
 		*vinfo_last = NULL;
 
-		return 0;
+		return err;
 	}
 
 	return br_vlan_info(br, p, cmd, vinfo_curr);
-- 
cgit v1.2.3


From 1b5f962e71bfad6284574655c406597535c3ea7a Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Thu, 19 Oct 2017 15:00:29 -0400
Subject: soreuseport: fix initialization race

Syzkaller stumbled upon a way to trigger
WARNING: CPU: 1 PID: 13881 at net/core/sock_reuseport.c:41
reuseport_alloc+0x306/0x3b0 net/core/sock_reuseport.c:39

There are two initialization paths for the sock_reuseport structure in a
socket: Through the udp/tcp bind paths of SO_REUSEPORT sockets or through
SO_ATTACH_REUSEPORT_[CE]BPF before bind.  The existing implementation
assumedthat the socket lock protected both of these paths when it actually
only protects the SO_ATTACH_REUSEPORT path.  Syzkaller triggered this
double allocation by running these paths concurrently.

This patch moves the check for double allocation into the reuseport_alloc
function which is protected by a global spin lock.

Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection")
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_reuseport.c  | 12 +++++++++---
 net/ipv4/inet_hashtables.c |  5 +----
 net/ipv4/udp.c             |  5 +----
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index eed1ebf7f29d..b1e0dbea1e8c 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -36,9 +36,14 @@ int reuseport_alloc(struct sock *sk)
 	 * soft irq of receive path or setsockopt from process context
 	 */
 	spin_lock_bh(&reuseport_lock);
-	WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
-					    lockdep_is_held(&reuseport_lock)),
-		  "multiple allocations for the same socket");
+
+	/* Allocation attempts can occur concurrently via the setsockopt path
+	 * and the bind/hash path.  Nothing to do when we lose the race.
+	 */
+	if (rcu_dereference_protected(sk->sk_reuseport_cb,
+				      lockdep_is_held(&reuseport_lock)))
+		goto out;
+
 	reuse = __reuseport_alloc(INIT_SOCKS);
 	if (!reuse) {
 		spin_unlock_bh(&reuseport_lock);
@@ -49,6 +54,7 @@ int reuseport_alloc(struct sock *sk)
 	reuse->num_socks = 1;
 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
+out:
 	spin_unlock_bh(&reuseport_lock);
 
 	return 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 597bb4cfe805..e7d15fb0d94d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -456,10 +456,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 			return reuseport_add_sock(sk, sk2);
 	}
 
-	/* Initial allocation may have already happened via setsockopt */
-	if (!rcu_access_pointer(sk->sk_reuseport_cb))
-		return reuseport_alloc(sk);
-	return 0;
+	return reuseport_alloc(sk);
 }
 
 int __inet_hash(struct sock *sk, struct sock *osk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 806b298a3bdd..ebfbccae62fd 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -231,10 +231,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 		}
 	}
 
-	/* Initial allocation may have already happened via setsockopt */
-	if (!rcu_access_pointer(sk->sk_reuseport_cb))
-		return reuseport_alloc(sk);
-	return 0;
+	return reuseport_alloc(sk);
 }
 
 /**
-- 
cgit v1.2.3


From 95491e3cf37840c518d81e1a3a6a8ef554e03c54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20S=C3=B6derlund?=
 <niklas.soderlund+renesas@ragnatech.se>
Date: Fri, 20 Oct 2017 01:32:08 +0200
Subject: net: ethtool: remove error check for legacy setting transceiver type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 9cab88726929605 ("net: ethtool: Add back transceiver type")
restores the transceiver type to struct ethtool_link_settings and
convert_link_ksettings_to_legacy_settings() but forgets to remove the
error check for the same in convert_legacy_settings_to_link_ksettings().
This prevents older versions of ethtool to change link settings.

    # ethtool --version
    ethtool version 3.16

    # ethtool -s eth0 autoneg on speed 100 duplex full
    Cannot set new settings: Invalid argument
      not setting speed
      not setting duplex
      not setting autoneg

While newer versions of ethtool works.

    # ethtool --version
    ethtool version 4.10

    # ethtool -s eth0 autoneg on speed 100 duplex full
    [   57.703268] sh-eth ee700000.ethernet eth0: Link is Down
    [   59.618227] sh-eth ee700000.ethernet eth0: Link is Up - 100Mbps/Full - flow control rx/tx

Fixes: 19cab88726929605 ("net: ethtool: Add back transceiver type")
Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reported-by: Renjith R V <renjith.rv@quest-global.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 3228411ada0f..9a9a3d77e327 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -436,7 +436,7 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
 
 /* return false if legacy contained non-0 deprecated fields
- * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ * maxtxpkt/maxrxpkt. rest of ksettings always updated
  */
 static bool
 convert_legacy_settings_to_link_ksettings(
@@ -451,8 +451,7 @@ convert_legacy_settings_to_link_ksettings(
 	 * deprecated legacy fields, and they should not use
 	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
 	 */
-	if (legacy_settings->transceiver ||
-	    legacy_settings->maxtxpkt ||
+	if (legacy_settings->maxtxpkt ||
 	    legacy_settings->maxrxpkt)
 		retval = false;
 
-- 
cgit v1.2.3


From 6cb3ece9685f78f9b288dd2afea58c35784e40b8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 20 Oct 2017 17:01:22 +0100
Subject: rxrpc: Don't release call mutex on error pointer

Don't release call mutex at the end of rxrpc_kernel_begin_call() if the
call pointer actually holds an error value.

Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/af_rxrpc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index fb17552fd292..4b0a8288c98a 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -308,10 +308,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 	call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, tx_total_len,
 				     gfp);
 	/* The socket has been unlocked. */
-	if (!IS_ERR(call))
+	if (!IS_ERR(call)) {
 		call->notify_rx = notify_rx;
+		mutex_unlock(&call->user_mutex);
+	}
 
-	mutex_unlock(&call->user_mutex);
 	_leave(" = %p", call);
 	return call;
 }
-- 
cgit v1.2.3


From 864e2a1f8aac05effac6063ce316b480facb46ff Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 21 Oct 2017 12:26:23 -0700
Subject: ipv6: flowlabel: do not leave opt->tot_len with garbage

When syzkaller team brought us a C repro for the crash [1] that
had been reported many times in the past, I finally could find
the root cause.

If FlowLabel info is merged by fl6_merge_options(), we leave
part of the opt_space storage provided by udp/raw/l2tp with random value
in opt_space.tot_len, unless a control message was provided at sendmsg()
time.

Then ip6_setup_cork() would use this random value to perform a kzalloc()
call. Undefined behavior and crashes.

Fix is to properly set tot_len in fl6_merge_options()

At the same time, we can also avoid consuming memory and cpu cycles
to clear it, if every option is copied via a kmemdup(). This is the
change in ip6_setup_cork().

[1]
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 6613 Comm: syz-executor0 Not tainted 4.14.0-rc4+ #127
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801cb64a100 task.stack: ffff8801cc350000
RIP: 0010:ip6_setup_cork+0x274/0x15c0 net/ipv6/ip6_output.c:1168
RSP: 0018:ffff8801cc357550 EFLAGS: 00010203
RAX: dffffc0000000000 RBX: ffff8801cc357748 RCX: 0000000000000010
RDX: 0000000000000002 RSI: ffffffff842bd1d9 RDI: 0000000000000014
RBP: ffff8801cc357620 R08: ffff8801cb17f380 R09: ffff8801cc357b10
R10: ffff8801cb64a100 R11: 0000000000000000 R12: ffff8801cc357ab0
R13: ffff8801cc357b10 R14: 0000000000000000 R15: ffff8801c3bbf0c0
FS:  00007f9c5c459700(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020324000 CR3: 00000001d1cf2000 CR4: 00000000001406f0
DR0: 0000000020001010 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
Call Trace:
 ip6_make_skb+0x282/0x530 net/ipv6/ip6_output.c:1729
 udpv6_sendmsg+0x2769/0x3380 net/ipv6/udp.c:1340
 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 SYSC_sendto+0x358/0x5a0 net/socket.c:1750
 SyS_sendto+0x40/0x50 net/socket.c:1718
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4520a9
RSP: 002b:00007f9c5c458c08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004520a9
RDX: 0000000000000001 RSI: 0000000020fd1000 RDI: 0000000000000016
RBP: 0000000000000086 R08: 0000000020e0afe4 R09: 000000000000001c
R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004bb1ee
R13: 00000000ffffffff R14: 0000000000000016 R15: 0000000000000029
Code: e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 ea 0f 00 00 48 8d 79 04 48 b8 00 00 00 00 00 fc ff df 45 8b 74 24 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85
RIP: ip6_setup_cork+0x274/0x15c0 net/ipv6/ip6_output.c:1168 RSP: ffff8801cc357550

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_flowlabel.c | 1 +
 net/ipv6/ip6_output.c    | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 8081bafe441b..15535ee327c5 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -315,6 +315,7 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
 	}
 	opt_space->dst1opt = fopt->dst1opt;
 	opt_space->opt_flen = fopt->opt_flen;
+	opt_space->tot_len = fopt->tot_len;
 	return opt_space;
 }
 EXPORT_SYMBOL_GPL(fl6_merge_options);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 43ca864327c7..5110a418cc4d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1161,11 +1161,11 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 		if (WARN_ON(v6_cork->opt))
 			return -EINVAL;
 
-		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
+		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
 		if (unlikely(!v6_cork->opt))
 			return -ENOBUFS;
 
-		v6_cork->opt->tot_len = opt->tot_len;
+		v6_cork->opt->tot_len = sizeof(*opt);
 		v6_cork->opt->opt_flen = opt->opt_flen;
 		v6_cork->opt->opt_nflen = opt->opt_nflen;
 
-- 
cgit v1.2.3