From 9f1c2674b328a69ab5a9b5a1c52405795ee4163f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 8 Oct 2017 21:44:51 -0700
Subject: net: memcontrol: defer call to mem_cgroup_sk_alloc()

Instead of calling mem_cgroup_sk_alloc() from BH context,
it is better to call it from inet_csk_accept() in process context.

Not only this removes code in mem_cgroup_sk_alloc(), but it also
fixes a bug since listener might have been dismantled and css_get()
might cause a use-after-free.

Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 23953b741a41..70c6ccbdf49f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1677,6 +1677,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_dst_pending_confirm = 0;
 		newsk->sk_wmem_queued	= 0;
 		newsk->sk_forward_alloc = 0;
+
+		/* sk->sk_memcg will be populated at accept() time */
+		newsk->sk_memcg = NULL;
+
 		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
@@ -1714,7 +1718,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
-		mem_cgroup_sk_alloc(newsk);
 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
 
 		/*
-- 
cgit v1.2.3


From fbb1fb4ad415cb31ce944f65a5ca700aaf73a227 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 8 Oct 2017 21:44:52 -0700
Subject: net: defer call to cgroup_sk_alloc()

sk_clone_lock() might run while TCP/DCCP listener already vanished.

In order to prevent use after free, it is better to defer cgroup_sk_alloc()
to the point we know both parent and child exist, and from process context.

Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 70c6ccbdf49f..4499e3153813 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1680,6 +1680,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		/* sk->sk_memcg will be populated at accept() time */
 		newsk->sk_memcg = NULL;
+		memset(&newsk->sk_cgrp_data, 0, sizeof(newsk->sk_cgrp_data));
 
 		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
@@ -1718,8 +1719,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
-		cgroup_sk_alloc(&newsk->sk_cgrp_data);
-
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
-- 
cgit v1.2.3


From 75cb070960ade40fba5de32138390f3c85c90941 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 10 Oct 2017 19:12:32 -0700
Subject: Revert "net: defer call to cgroup_sk_alloc()"

This reverts commit fbb1fb4ad415cb31ce944f65a5ca700aaf73a227.

This was not the proper fix, lets cleanly revert it, so that
following patch can be carried to stable versions.

sock_cgroup_ptr() callers do not expect a NULL return value.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 4499e3153813..70c6ccbdf49f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1680,7 +1680,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		/* sk->sk_memcg will be populated at accept() time */
 		newsk->sk_memcg = NULL;
-		memset(&newsk->sk_cgrp_data, 0, sizeof(newsk->sk_cgrp_data));
 
 		atomic_set(&newsk->sk_drops, 0);
 		newsk->sk_send_head	= NULL;
@@ -1719,6 +1718,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
+		cgroup_sk_alloc(&newsk->sk_cgrp_data);
+
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
-- 
cgit v1.2.3


From c0576e3975084d4699b7bfef578613fb8e1144f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 10 Oct 2017 19:12:33 -0700
Subject: net: call cgroup_sk_alloc() earlier in sk_clone_lock()

If for some reason, the newly allocated child need to be freed,
we will call cgroup_put() (via sk_free_unlock_clone()) while the
corresponding cgroup_get() was not yet done, and we will free memory
too soon.

Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 70c6ccbdf49f..415f441c63b9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1687,6 +1687,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		atomic_set(&newsk->sk_zckey, 0);
 
 		sock_reset_flag(newsk, SOCK_DONE);
+		cgroup_sk_alloc(&newsk->sk_cgrp_data);
 
 		rcu_read_lock();
 		filter = rcu_dereference(sk->sk_filter);
@@ -1718,8 +1719,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		atomic64_set(&newsk->sk_cookie, 0);
 
-		cgroup_sk_alloc(&newsk->sk_cgrp_data);
-
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
-- 
cgit v1.2.3


From 09001b03f722be96827bf8df5ba4d48b7ec0cc30 Mon Sep 17 00:00:00 2001
From: Wenhua Shi <march511@gmail.com>
Date: Sat, 14 Oct 2017 18:51:36 +0200
Subject: net: fix typo in skbuff.c

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 16982de649b9..e62476beee95 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1896,7 +1896,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	}
 
 	/* If we need update frag list, we are in troubles.
-	 * Certainly, it possible to add an offset to skb data,
+	 * Certainly, it is possible to add an offset to skb data,
 	 * but taking into account that pulling is expected to
 	 * be very rare operation, it is worth to fight against
 	 * further bloating skb head and crucify ourselves here instead.
-- 
cgit v1.2.3


From 8a212589fe0e45f26c549dfa271a157ca8eea1ac Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:41 +0800
Subject: rtnetlink: bring NETDEV_CHANGEMTU event process back in
 rtnetlink_event

Commit 085e1a65f04f ("rtnetlink: Do not generate notifications for MTU
events") tried to fix the redundant notifications issue when ip link
set mtu by removing NETDEV_CHANGEMTU event process in rtnetlink_event.

But it also resulted in no notification generated when dev's mtu is
changed via other methods, like:
  'ifconfig eth1 mtu 1400' or 'echo 1400 > /sys/class/net/eth1/mtu'
It would cause users not to be notified by this change.

This patch is to fix it by bringing NETDEV_CHANGEMTU event back into
rtnetlink_event, and the redundant notifications issue will be fixed
in the later patch 'rtnetlink: check DO_SETLINK_NOTIFY correctly in
do_setlink'.

Fixes: 085e1a65f04f ("rtnetlink: Do not generate notifications for MTU events")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d4bcdcc68e92..72053ed7c891 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4279,6 +4279,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 
 	switch (event) {
 	case NETDEV_REBOOT:
+	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGEADDR:
 	case NETDEV_CHANGENAME:
 	case NETDEV_FEAT_CHANGE:
-- 
cgit v1.2.3


From ebdcf0450b020748c2dab6bfe44a5ac3c5159fb0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:42 +0800
Subject: rtnetlink: bring NETDEV_CHANGE_TX_QUEUE_LEN event process back in
 rtnetlink_event

The same fix for changing mtu in the patch 'rtnetlink: bring
NETDEV_CHANGEMTU event process back in rtnetlink_event' is
needed for changing tx_queue_len.

Note that the redundant notifications issue for tx_queue_len
will be fixed in the later patch 'rtnetlink: do not send
notification for tx_queue_len in do_setlink'.

Fixes: 27b3b551d8a7 ("rtnetlink: Do not generate notifications for NETDEV_CHANGE_TX_QUEUE_LEN event")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 72053ed7c891..bf473604f33d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4287,6 +4287,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
+	case NETDEV_CHANGE_TX_QUEUE_LEN:
 		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
 				   GFP_KERNEL);
 		break;
-- 
cgit v1.2.3


From e6e6659446c87057aede26a39d9f16b19001716f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:43 +0800
Subject: rtnetlink: bring NETDEV_POST_TYPE_CHANGE event process back in
 rtnetlink_event

As I said in patch 'rtnetlink: bring NETDEV_CHANGEMTU event process back
in rtnetlink_event', removing NETDEV_POST_TYPE_CHANGE event was not the
right fix for the redundant notifications issue.

So bring this event process back to rtnetlink_event and the old redundant
notifications issue would be fixed in the later patch 'rtnetlink: check
DO_SETLINK_NOTIFY correctly in do_setlink'.

Fixes: aef091ae58aa ("rtnetlink: Do not generate notifications for POST_TYPE_CHANGE event")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bf473604f33d..8e44fd597f46 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4284,6 +4284,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_CHANGENAME:
 	case NETDEV_FEAT_CHANGE:
 	case NETDEV_BONDING_FAILOVER:
+	case NETDEV_POST_TYPE_CHANGE:
 	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
-- 
cgit v1.2.3


From dc709f375743ebf5c9326cc9b946f6f09a34ac44 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:44 +0800
Subject: rtnetlink: bring NETDEV_CHANGEUPPER event process back in
 rtnetlink_event

libteam needs this event notification in userspace when dev's master
dev has been changed. After this, the redundant notifications issue
would be fixed in the later patch 'rtnetlink: check DO_SETLINK_NOTIFY
correctly in do_setlink'.

Fixes: b6b36eb23a46 ("rtnetlink: Do not generate notifications for NETDEV_CHANGEUPPER event")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8e44fd597f46..ab98c1c8b6f3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4286,6 +4286,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_BONDING_FAILOVER:
 	case NETDEV_POST_TYPE_CHANGE:
 	case NETDEV_NOTIFY_PEERS:
+	case NETDEV_CHANGEUPPER:
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
 	case NETDEV_CHANGE_TX_QUEUE_LEN:
-- 
cgit v1.2.3


From 64ff90cc2e6f42596d7a0c37e41dc95292bb63b1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:45 +0800
Subject: rtnetlink: check DO_SETLINK_NOTIFY correctly in do_setlink

The check 'status & DO_SETLINK_NOTIFY' in do_setlink doesn't really
work after status & DO_SETLINK_MODIFIED, as:

  DO_SETLINK_MODIFIED 0x1
  DO_SETLINK_NOTIFY 0x3

Considering that notifications are suppposed to be sent only when
status have the flag DO_SETLINK_NOTIFY, the right check would be:

  (status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY

This would avoid lots of duplicated notifications when setting some
properties of a link.

Fixes: ba9989069f4e ("rtnl/do_setlink(): notify when a netdev is modified")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ab98c1c8b6f3..3e98fb557598 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2248,7 +2248,7 @@ static int do_setlink(const struct sk_buff *skb,
 
 errout:
 	if (status & DO_SETLINK_MODIFIED) {
-		if (status & DO_SETLINK_NOTIFY)
+		if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
 			netdev_state_change(dev);
 
 		if (err < 0)
-- 
cgit v1.2.3


From 2d7f669b42a97022c8c2b6cd86f3990be5fcd1bc Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 15 Oct 2017 18:13:46 +0800
Subject: rtnetlink: do not set notification for tx_queue_len in do_setlink

NETDEV_CHANGE_TX_QUEUE_LEN event process in rtnetlink_event would
send a notification for userspace and tx_queue_len's setting in
do_setlink would trigger NETDEV_CHANGE_TX_QUEUE_LEN.

So it shouldn't set DO_SETLINK_NOTIFY status for this change to
send a notification any more.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3e98fb557598..a6bcf86ce471 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2093,7 +2093,7 @@ static int do_setlink(const struct sk_buff *skb,
 				dev->tx_queue_len = orig_len;
 				goto errout;
 			}
-			status |= DO_SETLINK_NOTIFY;
+			status |= DO_SETLINK_MODIFIED;
 		}
 	}
 
-- 
cgit v1.2.3


From 2459b4c635858094df78abb9ca87d99f89fe8ca5 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 11 Oct 2017 16:24:48 +0200
Subject: net: enable interface alias removal via rtnl

IFLA_IFALIAS is defined as NLA_STRING. It means that the minimal length of
the attribute is 1 ("\0"). However, to remove an alias, the attribute
length must be 0 (see dev_set_alias()).

Let's define the type to NLA_BINARY to allow 0-length string, so that the
alias can be removed.

Example:
$ ip l s dummy0 alias foo
$ ip l l dev dummy0
5: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether ae:20:30:4f:a7:f3 brd ff:ff:ff:ff:ff:ff
    alias foo

Before the patch:
$ ip l s dummy0 alias ""
RTNETLINK answers: Numerical result out of range

After the patch:
$ ip l s dummy0 alias ""
$ ip l l dev dummy0
5: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether ae:20:30:4f:a7:f3 brd ff:ff:ff:ff:ff:ff

CC: Oliver Hartkopp <oliver@hartkopp.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
Fixes: 96ca4a2cc145 ("net: remove ifalias on empty given alias")
Reported-by: Julien FLoret <julien.floret@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a6bcf86ce471..5ace48926b19 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1483,7 +1483,10 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
 	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
 	[IFLA_NET_NS_FD]	= { .type = NLA_U32 },
-	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	/* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to
+	 * allow 0-length string (needed to remove an alias).
+	 */
+	[IFLA_IFALIAS]	        = { .type = NLA_BINARY, .len = IFALIASZ - 1 },
 	[IFLA_VFINFO_LIST]	= {. type = NLA_NESTED },
 	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },
 	[IFLA_PORT_SELF]	= { .type = NLA_NESTED },
-- 
cgit v1.2.3


From 0ad646c81b2182f7fa67ec0c8c825e0ee165696d Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Fri, 13 Oct 2017 11:58:53 -0700
Subject: tun: call dev_get_valid_name() before register_netdevice()

register_netdevice() could fail early when we have an invalid
dev name, in which case ->ndo_uninit() is not called. For tun
device, this is a problem because a timer etc. are already
initialized and it expects ->ndo_uninit() to clean them up.

We could move these initializations into a ->ndo_init() so
that register_netdevice() knows better, however this is still
complicated due to the logic in tun_detach().

Therefore, I choose to just call dev_get_valid_name() before
register_netdevice(), which is quicker and much easier to audit.
And for this specific case, it is already enough.

Fixes: 96442e42429e ("tuntap: choose the txq based on rxq")
Reported-by: Dmitry Alexeev <avekceeb@gmail.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 588b473194a8..11596a302a26 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1147,9 +1147,8 @@ static int dev_alloc_name_ns(struct net *net,
 	return ret;
 }
 
-static int dev_get_valid_name(struct net *net,
-			      struct net_device *dev,
-			      const char *name)
+int dev_get_valid_name(struct net *net, struct net_device *dev,
+		       const char *name)
 {
 	BUG_ON(!net);
 
@@ -1165,6 +1164,7 @@ static int dev_get_valid_name(struct net *net,
 
 	return 0;
 }
+EXPORT_SYMBOL(dev_get_valid_name);
 
 /**
  *	dev_change_name - change name of a device
-- 
cgit v1.2.3


From 823038ca030e9f8283518b1e6a5a6879edcbe057 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 16 Oct 2017 19:43:15 +0800
Subject: dev_ioctl: add missing NETDEV_CHANGE_TX_QUEUE_LEN event notification

When changing dev tx_queue_len via netlink or net-sysfs,
a NETDEV_CHANGE_TX_QUEUE_LEN event notification will be
called.

But dev_ioctl missed this event notification, which could
cause no userspace notification would be sent.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev_ioctl.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 709a4e6fb447..f9c7a88cd981 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -303,7 +303,18 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 	case SIOCSIFTXQLEN:
 		if (ifr->ifr_qlen < 0)
 			return -EINVAL;
-		dev->tx_queue_len = ifr->ifr_qlen;
+		if (dev->tx_queue_len ^ ifr->ifr_qlen) {
+			unsigned int orig_len = dev->tx_queue_len;
+
+			dev->tx_queue_len = ifr->ifr_qlen;
+			err = call_netdevice_notifiers(
+					NETDEV_CHANGE_TX_QUEUE_LEN, dev);
+			err = notifier_to_errno(err);
+			if (err) {
+				dev->tx_queue_len = orig_len;
+				return err;
+			}
+		}
 		return 0;
 
 	case SIOCSIFNAME:
-- 
cgit v1.2.3


From 34f79502bbcfab659b8729da68b5e387f96eb4c1 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 18 Oct 2017 07:10:36 -0700
Subject: bpf: avoid preempt enable/disable in sockmap using tcp_skb_cb region

SK_SKB BPF programs are run from the socket/tcp context but early in
the stack before much of the TCP metadata is needed in tcp_skb_cb. So
we can use some unused fields to place BPF metadata needed for SK_SKB
programs when implementing the redirect function.

This allows us to drop the preempt disable logic. It does however
require an API change so sk_redirect_map() has been updated to
additionally provide ctx_ptr to skb. Note, we do however continue to
disable/enable preemption around actual BPF program running to account
for map updates.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 74b8c91fb5f4..ca1ba0bbfbc2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1839,31 +1839,31 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags)
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+	   struct bpf_map *, map, u32, key, u64, flags)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 	if (unlikely(flags))
 		return SK_ABORTED;
 
-	ri->ifindex = key;
-	ri->flags = flags;
-	ri->map = map;
+	tcb->bpf.key = key;
+	tcb->bpf.flags = flags;
+	tcb->bpf.map = map;
 
 	return SK_REDIRECT;
 }
 
-struct sock *do_sk_redirect_map(void)
+struct sock *do_sk_redirect_map(struct sk_buff *skb)
 {
-	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 	struct sock *sk = NULL;
 
-	if (ri->map) {
-		sk = __sock_map_lookup_elem(ri->map, ri->ifindex);
+	if (tcb->bpf.map) {
+		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
 
-		ri->ifindex = 0;
-		ri->map = NULL;
-		/* we do not clear flags for future lookup */
+		tcb->bpf.key = 0;
+		tcb->bpf.map = NULL;
 	}
 
 	return sk;
@@ -1873,9 +1873,10 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
 	.func           = bpf_sk_redirect_map,
 	.gpl_only       = false,
 	.ret_type       = RET_INTEGER,
-	.arg1_type      = ARG_CONST_MAP_PTR,
-	.arg2_type      = ARG_ANYTHING,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
 	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
 };
 
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
-- 
cgit v1.2.3


From f7e9cb1ecb6d922584abff16db07930162c57155 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 18 Oct 2017 07:10:58 -0700
Subject: bpf: remove mark access for SK_SKB program types

The skb->mark field is a union with reserved_tailroom which is used
in the TCP code paths from stream memory allocation. Allowing SK_SKB
programs to set this field creates a conflict with future code
optimizations, such as "gifting" the skb to the egress path instead
of creating a new skb and doing a memcpy.

Because we do not have a released version of SK_SKB yet lets just
remove it for now. A more appropriate scratch pad to use at the
socket layer is dev_scratch, but lets add that in future kernels
when needed.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index ca1ba0bbfbc2..aa0265997f93 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3684,7 +3684,6 @@ static bool sk_skb_is_valid_access(int off, int size,
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case bpf_ctx_range(struct __sk_buff, mark):
 		case bpf_ctx_range(struct __sk_buff, tc_index):
 		case bpf_ctx_range(struct __sk_buff, priority):
 			break;
@@ -3694,6 +3693,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	}
 
 	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, mark):
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
-- 
cgit v1.2.3


From 54d431176429e9cf064461589e5174349a9f73da Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 19 Oct 2017 12:40:39 -0400
Subject: sock: correct sk_wmem_queued accounting on efault in tcp zerocopy

Syzkaller hits WARN_ON(sk->sk_wmem_queued) in sk_stream_kill_queues
after triggering an EFAULT in __zerocopy_sg_from_iter.

On this error, skb_zerocopy_stream_iter resets the skb to its state
before the operation with __pskb_trim. It cannot kfree_skb like
datagram callers, as the skb may have data from a previous send call.

__pskb_trim calls skb_condense for unowned skbs, which adjusts their
truesize. These tcp skbuffs are owned and their truesize must add up
to sk_wmem_queued. But they match because their skb->sk is NULL until
tcp_transmit_skb.

Temporarily set skb->sk when calling __pskb_trim to signal that the
skbuffs are owned and avoid the skb_condense path.

Fixes: 52267790ef52 ("sock: add MSG_ZEROCOPY")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e62476beee95..24656076906d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1124,9 +1124,13 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 
 	err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
+		struct sock *save_sk = skb->sk;
+
 		/* Streams do not free skb on error. Reset to prev state. */
 		msg->msg_iter = orig_iter;
+		skb->sk = sk;
 		___pskb_trim(skb, orig_len);
+		skb->sk = save_sk;
 		return err;
 	}
 
-- 
cgit v1.2.3


From 1b5f962e71bfad6284574655c406597535c3ea7a Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Thu, 19 Oct 2017 15:00:29 -0400
Subject: soreuseport: fix initialization race

Syzkaller stumbled upon a way to trigger
WARNING: CPU: 1 PID: 13881 at net/core/sock_reuseport.c:41
reuseport_alloc+0x306/0x3b0 net/core/sock_reuseport.c:39

There are two initialization paths for the sock_reuseport structure in a
socket: Through the udp/tcp bind paths of SO_REUSEPORT sockets or through
SO_ATTACH_REUSEPORT_[CE]BPF before bind.  The existing implementation
assumedthat the socket lock protected both of these paths when it actually
only protects the SO_ATTACH_REUSEPORT path.  Syzkaller triggered this
double allocation by running these paths concurrently.

This patch moves the check for double allocation into the reuseport_alloc
function which is protected by a global spin lock.

Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection")
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_reuseport.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index eed1ebf7f29d..b1e0dbea1e8c 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -36,9 +36,14 @@ int reuseport_alloc(struct sock *sk)
 	 * soft irq of receive path or setsockopt from process context
 	 */
 	spin_lock_bh(&reuseport_lock);
-	WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
-					    lockdep_is_held(&reuseport_lock)),
-		  "multiple allocations for the same socket");
+
+	/* Allocation attempts can occur concurrently via the setsockopt path
+	 * and the bind/hash path.  Nothing to do when we lose the race.
+	 */
+	if (rcu_dereference_protected(sk->sk_reuseport_cb,
+				      lockdep_is_held(&reuseport_lock)))
+		goto out;
+
 	reuse = __reuseport_alloc(INIT_SOCKS);
 	if (!reuse) {
 		spin_unlock_bh(&reuseport_lock);
@@ -49,6 +54,7 @@ int reuseport_alloc(struct sock *sk)
 	reuse->num_socks = 1;
 	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
+out:
 	spin_unlock_bh(&reuseport_lock);
 
 	return 0;
-- 
cgit v1.2.3


From 95491e3cf37840c518d81e1a3a6a8ef554e03c54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niklas=20S=C3=B6derlund?=
 <niklas.soderlund+renesas@ragnatech.se>
Date: Fri, 20 Oct 2017 01:32:08 +0200
Subject: net: ethtool: remove error check for legacy setting transceiver type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 9cab88726929605 ("net: ethtool: Add back transceiver type")
restores the transceiver type to struct ethtool_link_settings and
convert_link_ksettings_to_legacy_settings() but forgets to remove the
error check for the same in convert_legacy_settings_to_link_ksettings().
This prevents older versions of ethtool to change link settings.

    # ethtool --version
    ethtool version 3.16

    # ethtool -s eth0 autoneg on speed 100 duplex full
    Cannot set new settings: Invalid argument
      not setting speed
      not setting duplex
      not setting autoneg

While newer versions of ethtool works.

    # ethtool --version
    ethtool version 4.10

    # ethtool -s eth0 autoneg on speed 100 duplex full
    [   57.703268] sh-eth ee700000.ethernet eth0: Link is Down
    [   59.618227] sh-eth ee700000.ethernet eth0: Link is Up - 100Mbps/Full - flow control rx/tx

Fixes: 19cab88726929605 ("net: ethtool: Add back transceiver type")
Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reported-by: Renjith R V <renjith.rv@quest-global.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 3228411ada0f..9a9a3d77e327 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -436,7 +436,7 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
 
 /* return false if legacy contained non-0 deprecated fields
- * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ * maxtxpkt/maxrxpkt. rest of ksettings always updated
  */
 static bool
 convert_legacy_settings_to_link_ksettings(
@@ -451,8 +451,7 @@ convert_legacy_settings_to_link_ksettings(
 	 * deprecated legacy fields, and they should not use
 	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
 	 */
-	if (legacy_settings->transceiver ||
-	    legacy_settings->maxtxpkt ||
+	if (legacy_settings->maxtxpkt ||
 	    legacy_settings->maxrxpkt)
 		retval = false;
 
-- 
cgit v1.2.3