From c78296665c3d81f040117432ab9e1cb125521b0c Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 26 Feb 2016 17:56:13 +0100
Subject: batman-adv: Check skb size before using encapsulated ETH+VLAN header

The encapsulated ethernet and VLAN header may be outside the received
ethernet frame. Thus the skb buffer size has to be checked before it can be
parsed to find out if it encapsulates another batman-adv packet.

Fixes: 420193573f11 ("batman-adv: softif bridge loop avoidance")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/soft-interface.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 0710379491bf..8a136b6a1ff0 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -408,11 +408,17 @@ void batadv_interface_rx(struct net_device *soft_iface,
 	 */
 	nf_reset(skb);
 
+	if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+		goto dropped;
+
 	vid = batadv_get_vid(skb, 0);
 	ethhdr = eth_hdr(skb);
 
 	switch (ntohs(ethhdr->h_proto)) {
 	case ETH_P_8021Q:
+		if (!pskb_may_pull(skb, VLAN_ETH_HLEN))
+			goto dropped;
+
 		vhdr = (struct vlan_ethhdr *)skb->data;
 
 		if (vhdr->h_vlan_encapsulated_proto != ethertype)
@@ -424,8 +430,6 @@ void batadv_interface_rx(struct net_device *soft_iface,
 	}
 
 	/* skb->dev & skb->pkt_type are set here */
-	if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
-		goto dropped;
 	skb->protocol = eth_type_trans(skb, soft_iface);
 
 	/* should not be necessary anymore as we use skb_pull_rcsum()
-- 
cgit v1.2.3


From e48474ed8a217b7f80f2a42bc05352406a06cb67 Mon Sep 17 00:00:00 2001
From: Marek Lindner <mareklindner@neomailbox.ch>
Date: Fri, 11 Mar 2016 16:01:09 +0100
Subject: batman-adv: init neigh node last seen field

Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
[sven@narfation.org: fix conflicts with current version]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index e4cbb0753e37..d52f67a0c057 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -663,6 +663,7 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	ether_addr_copy(neigh_node->addr, neigh_addr);
 	neigh_node->if_incoming = hard_iface;
 	neigh_node->orig_node = orig_node;
+	neigh_node->last_seen = jiffies;
 
 	/* extra reference for return */
 	kref_init(&neigh_node->refcount);
-- 
cgit v1.2.3


From f2d23861b818d08bcd15cc1612ae94aa33b3931c Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sat, 19 Mar 2016 13:55:21 +0100
Subject: batman-adv: Deactivate TO_BE_ACTIVATED hardif on shutdown

The shutdown of an batman-adv interface can happen with one of its slave
interfaces still being in the BATADV_IF_TO_BE_ACTIVATED state. A possible
reason for it is that the routing algorithm BATMAN_V was selected and
batadv_schedule_bat_ogm was not yet called for this interface. This slave
interface still has to be set to BATADV_IF_INACTIVE or the batman-adv
interface will never reduce its usage counter and thus never gets shutdown.

This problem can be simulated via:

    $ modprobe dummy
    $ modprobe batman-adv routing_algo=BATMAN_V
    $ ip link add bat0 type batadv
    $ ip link set dummy0 master bat0
    $ ip link set dummy0 up
    $ ip link del bat0
    unregister_netdevice: waiting for bat0 to become free. Usage count = 3

Reported-by: Matthias Schiffer <mschiffer@universe-factory.net>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/hard-interface.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index b22b2775a0a5..c61d5b0b24d2 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -572,8 +572,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
 	struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
 	struct batadv_hard_iface *primary_if = NULL;
 
-	if (hard_iface->if_status == BATADV_IF_ACTIVE)
-		batadv_hardif_deactivate_interface(hard_iface);
+	batadv_hardif_deactivate_interface(hard_iface);
 
 	if (hard_iface->if_status != BATADV_IF_INACTIVE)
 		goto out;
-- 
cgit v1.2.3


From d1a65f1741bfd9c69f9e4e2ad447a89b6810427d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 20 Mar 2016 12:27:53 +0100
Subject: batman-adv: Reduce refcnt of removed router when updating route

_batadv_update_route rcu_derefences orig_ifinfo->router outside of a
spinlock protected region to print some information messages to the debug
log. But this pointer is not checked again when the new pointer is assigned
in the spinlock protected region. Thus is can happen that the value of
orig_ifinfo->router changed in the meantime and thus the reference counter
of the wrong router gets reduced after the spinlock protected region.

Just rcu_dereferencing the value of orig_ifinfo->router inside the spinlock
protected region (which also set the new pointer) is enough to get the
correct old router object.

Fixes: e1a5382f978b ("batman-adv: Make orig_node->router an rcu protected pointer")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/routing.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 4dd646a52f1a..b781bf753250 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -105,6 +105,15 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
 		neigh_node = NULL;
 
 	spin_lock_bh(&orig_node->neigh_list_lock);
+	/* curr_router used earlier may not be the current orig_ifinfo->router
+	 * anymore because it was dereferenced outside of the neigh_list_lock
+	 * protected region. After the new best neighbor has replace the current
+	 * best neighbor the reference counter needs to decrease. Consequently,
+	 * the code needs to ensure the curr_router variable contains a pointer
+	 * to the replaced best neighbor.
+	 */
+	curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
+
 	rcu_assign_pointer(orig_ifinfo->router, neigh_node);
 	spin_unlock_bh(&orig_node->neigh_list_lock);
 	batadv_orig_ifinfo_put(orig_ifinfo);
-- 
cgit v1.2.3


From c4fdb6cff2aa0ae740c5f19b6f745cbbe786d42f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@c0d3.blue>
Date: Fri, 11 Mar 2016 14:04:49 +0100
Subject: batman-adv: Fix broadcast/ogm queue limit on a removed interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When removing a single interface while a broadcast or ogm packet is
still pending then we will free the forward packet without releasing the
queue slots again.

This patch is supposed to fix this issue.

Fixes: 6d5808d4ae1b ("batman-adv: Add missing hardif_free_ref in forw_packet_free")
Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
[sven@narfation.org: fix conflicts with current version]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/send.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 3ce06e0a91b1..76417850d3fc 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -675,6 +675,9 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
 
 		if (pending) {
 			hlist_del(&forw_packet->list);
+			if (!forw_packet->own)
+				atomic_inc(&bat_priv->bcast_queue_left);
+
 			batadv_forw_packet_free(forw_packet);
 		}
 	}
@@ -702,6 +705,9 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
 
 		if (pending) {
 			hlist_del(&forw_packet->list);
+			if (!forw_packet->own)
+				atomic_inc(&bat_priv->batman_queue_left);
+
 			batadv_forw_packet_free(forw_packet);
 		}
 	}
-- 
cgit v1.2.3


From 7ceb2afbd6aee4643056b47156baad6841db8e78 Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Thu, 21 Apr 2016 12:52:43 +0200
Subject: switchdev: Adding complete operation to deferred switchdev ops

When using switchdev deferred operation (SWITCHDEV_F_DEFER), the operation
is executed in different context and the application doesn't have any way
to get the operation real status.

Adding a completion callback fixes that. This patch adds fields to
switchdev_attr and switchdev_obj "complete_priv" field which is used by
the "complete" callback.

Application can set a complete function which will be called once the
operation executed.

Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/switchdev/switchdev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 2b9b98f1c2ff..b7e01d88bdc5 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -305,6 +305,8 @@ static void switchdev_port_attr_set_deferred(struct net_device *dev,
 	if (err && err != -EOPNOTSUPP)
 		netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n",
 			   err, attr->id);
+	if (attr->complete)
+		attr->complete(dev, err, attr->complete_priv);
 }
 
 static int switchdev_port_attr_set_defer(struct net_device *dev,
@@ -434,6 +436,8 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev,
 	if (err && err != -EOPNOTSUPP)
 		netdev_err(dev, "failed (err=%d) to add object (id=%d)\n",
 			   err, obj->id);
+	if (obj->complete)
+		obj->complete(dev, err, obj->complete_priv);
 }
 
 static int switchdev_port_obj_add_defer(struct net_device *dev,
@@ -502,6 +506,8 @@ static void switchdev_port_obj_del_deferred(struct net_device *dev,
 	if (err && err != -EOPNOTSUPP)
 		netdev_err(dev, "failed (err=%d) to del object (id=%d)\n",
 			   err, obj->id);
+	if (obj->complete)
+		obj->complete(dev, err, obj->complete_priv);
 }
 
 static int switchdev_port_obj_del_defer(struct net_device *dev,
-- 
cgit v1.2.3


From 6dd684c0feb207f30180570bad24264b922d9476 Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Thu, 21 Apr 2016 12:52:44 +0200
Subject: bridge: mdb: Common function for mdb entry translation

There is duplicate code that translates br_mdb_entry to br_ip let's wrap it
in a common function.

Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 253bc77eda3b..b258120e1145 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -61,6 +61,19 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
 		e->flags |= MDB_FLAGS_OFFLOAD;
 }
 
+static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
+{
+	memset(ip, 0, sizeof(struct br_ip));
+	ip->vid = entry->vid;
+	ip->proto = entry->addr.proto;
+	if (ip->proto == htons(ETH_P_IP))
+		ip->u.ip4 = entry->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		ip->u.ip6 = entry->addr.u.ip6;
+#endif
+}
+
 static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			    struct net_device *dev)
 {
@@ -509,15 +522,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
 	if (!p || p->br != br || p->state == BR_STATE_DISABLED)
 		return -EINVAL;
 
-	memset(&ip, 0, sizeof(ip));
-	ip.vid = entry->vid;
-	ip.proto = entry->addr.proto;
-	if (ip.proto == htons(ETH_P_IP))
-		ip.u.ip4 = entry->addr.u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
-	else
-		ip.u.ip6 = entry->addr.u.ip6;
-#endif
+	__mdb_entry_to_br_ip(entry, &ip);
 
 	spin_lock_bh(&br->multicast_lock);
 	ret = br_mdb_add_group(br, p, &ip, entry->state, pg);
@@ -584,15 +589,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
 	if (!netif_running(br->dev) || br->multicast_disabled)
 		return -EINVAL;
 
-	memset(&ip, 0, sizeof(ip));
-	ip.vid = entry->vid;
-	ip.proto = entry->addr.proto;
-	if (ip.proto == htons(ETH_P_IP))
-		ip.u.ip4 = entry->addr.u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
-	else
-		ip.u.ip6 = entry->addr.u.ip6;
-#endif
+	__mdb_entry_to_br_ip(entry, &ip);
 
 	spin_lock_bh(&br->multicast_lock);
 	mdb = mlock_dereference(br->mdb, br);
-- 
cgit v1.2.3


From 45ebcce56823d14d196dbdecd26783b3d5f464a6 Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Thu, 21 Apr 2016 12:52:45 +0200
Subject: bridge: mdb: Marking port-group as offloaded

There is a race-condition when updating the mdb offload flag without using
the mulicast_lock. This reverts commit 9e8430f8d60d98 ("bridge: mdb:
Passing the port-group pointer to br_mdb module").

This patch marks offloaded MDB entry as "offload" by changing the port-
group flags and marks it as MDB_PG_FLAGS_OFFLOAD.

When switchdev PORT_MDB succeeded and adds a multicast group, a completion
callback is been invoked "br_mdb_complete". The completion function
locks the multicast_lock and finds the right net_bridge_port_group and
marks it as offloaded.

Fixes: 9e8430f8d60d98 ("bridge: mdb: Passing the port-group pointer to br_mdb module")
Reported-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c       | 91 +++++++++++++++++++++++++++++++++--------------
 net/bridge/br_multicast.c |  8 +++--
 net/bridge/br_private.h   |  4 +--
 3 files changed, 71 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index b258120e1145..7dbc80d01eb0 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -256,9 +256,45 @@ static inline size_t rtnl_mdb_nlmsg_size(void)
 		+ nla_total_size(sizeof(struct br_mdb_entry));
 }
 
-static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry,
-			    int type, struct net_bridge_port_group *pg)
+struct br_mdb_complete_info {
+	struct net_bridge_port *port;
+	struct br_ip ip;
+};
+
+static void br_mdb_complete(struct net_device *dev, int err, void *priv)
 {
+	struct br_mdb_complete_info *data = priv;
+	struct net_bridge_port_group __rcu **pp;
+	struct net_bridge_port_group *p;
+	struct net_bridge_mdb_htable *mdb;
+	struct net_bridge_mdb_entry *mp;
+	struct net_bridge_port *port = data->port;
+	struct net_bridge *br = port->br;
+
+	if (err)
+		goto err;
+
+	spin_lock_bh(&br->multicast_lock);
+	mdb = mlock_dereference(br->mdb, br);
+	mp = br_mdb_ip_get(mdb, &data->ip);
+	if (!mp)
+		goto out;
+	for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
+		if (p->port != port)
+			continue;
+		p->flags |= MDB_PG_FLAGS_OFFLOAD;
+	}
+out:
+	spin_unlock_bh(&br->multicast_lock);
+err:
+	kfree(priv);
+}
+
+static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
+			    struct br_mdb_entry *entry, int type)
+{
+	struct br_mdb_complete_info *complete_info;
 	struct switchdev_obj_port_mdb mdb = {
 		.obj = {
 			.id = SWITCHDEV_OBJ_ID_PORT_MDB,
@@ -281,9 +317,14 @@ static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry,
 
 	mdb.obj.orig_dev = port_dev;
 	if (port_dev && type == RTM_NEWMDB) {
-		err = switchdev_port_obj_add(port_dev, &mdb.obj);
-		if (!err && pg)
-			pg->flags |= MDB_PG_FLAGS_OFFLOAD;
+		complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
+		if (complete_info) {
+			complete_info->port = p;
+			__mdb_entry_to_br_ip(entry, &complete_info->ip);
+			mdb.obj.complete_priv = complete_info;
+			mdb.obj.complete = br_mdb_complete;
+			switchdev_port_obj_add(port_dev, &mdb.obj);
+		}
 	} else if (port_dev && type == RTM_DELMDB) {
 		switchdev_port_obj_del(port_dev, &mdb.obj);
 	}
@@ -304,21 +345,21 @@ errout:
 	rtnl_set_sk_err(net, RTNLGRP_MDB, err);
 }
 
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg,
-		   int type)
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
+		   struct br_ip *group, int type, u8 flags)
 {
 	struct br_mdb_entry entry;
 
 	memset(&entry, 0, sizeof(entry));
-	entry.ifindex = pg->port->dev->ifindex;
-	entry.addr.proto = pg->addr.proto;
-	entry.addr.u.ip4 = pg->addr.u.ip4;
+	entry.ifindex = port->dev->ifindex;
+	entry.addr.proto = group->proto;
+	entry.addr.u.ip4 = group->u.ip4;
 #if IS_ENABLED(CONFIG_IPV6)
-	entry.addr.u.ip6 = pg->addr.u.ip6;
+	entry.addr.u.ip6 = group->u.ip6;
 #endif
-	entry.vid = pg->addr.vid;
-	__mdb_entry_fill_flags(&entry, pg->flags);
-	__br_mdb_notify(dev, &entry, type, pg);
+	entry.vid = group->vid;
+	__mdb_entry_fill_flags(&entry, flags);
+	__br_mdb_notify(dev, port, &entry, type);
 }
 
 static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
@@ -463,8 +504,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
-			    struct br_ip *group, unsigned char state,
-			    struct net_bridge_port_group **pg)
+			    struct br_ip *group, unsigned char state)
 {
 	struct net_bridge_mdb_entry *mp;
 	struct net_bridge_port_group *p;
@@ -495,7 +535,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 	if (unlikely(!p))
 		return -ENOMEM;
 	rcu_assign_pointer(*pp, p);
-	*pg = p;
 	if (state == MDB_TEMPORARY)
 		mod_timer(&p->timer, now + br->multicast_membership_interval);
 
@@ -503,8 +542,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 }
 
 static int __br_mdb_add(struct net *net, struct net_bridge *br,
-			struct br_mdb_entry *entry,
-			struct net_bridge_port_group **pg)
+			struct br_mdb_entry *entry)
 {
 	struct br_ip ip;
 	struct net_device *dev;
@@ -525,7 +563,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
 	__mdb_entry_to_br_ip(entry, &ip);
 
 	spin_lock_bh(&br->multicast_lock);
-	ret = br_mdb_add_group(br, p, &ip, entry->state, pg);
+	ret = br_mdb_add_group(br, p, &ip, entry->state);
 	spin_unlock_bh(&br->multicast_lock);
 	return ret;
 }
@@ -533,7 +571,6 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
 static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
-	struct net_bridge_port_group *pg;
 	struct net_bridge_vlan_group *vg;
 	struct net_device *dev, *pdev;
 	struct br_mdb_entry *entry;
@@ -563,15 +600,15 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (br_vlan_enabled(br) && vg && entry->vid == 0) {
 		list_for_each_entry(v, &vg->vlan_list, vlist) {
 			entry->vid = v->vid;
-			err = __br_mdb_add(net, br, entry, &pg);
+			err = __br_mdb_add(net, br, entry);
 			if (err)
 				break;
-			__br_mdb_notify(dev, entry, RTM_NEWMDB, pg);
+			__br_mdb_notify(dev, p, entry, RTM_NEWMDB);
 		}
 	} else {
-		err = __br_mdb_add(net, br, entry, &pg);
+		err = __br_mdb_add(net, br, entry);
 		if (!err)
-			__br_mdb_notify(dev, entry, RTM_NEWMDB, pg);
+			__br_mdb_notify(dev, p, entry, RTM_NEWMDB);
 	}
 
 	return err;
@@ -659,12 +696,12 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 			entry->vid = v->vid;
 			err = __br_mdb_del(br, entry);
 			if (!err)
-				__br_mdb_notify(dev, entry, RTM_DELMDB, NULL);
+				__br_mdb_notify(dev, p, entry, RTM_DELMDB);
 		}
 	} else {
 		err = __br_mdb_del(br, entry);
 		if (!err)
-			__br_mdb_notify(dev, entry, RTM_DELMDB, NULL);
+			__br_mdb_notify(dev, p, entry, RTM_DELMDB);
 	}
 
 	return err;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index a4c15df2b792..191ea66e4d92 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -283,7 +283,8 @@ static void br_multicast_del_pg(struct net_bridge *br,
 		rcu_assign_pointer(*pp, p->next);
 		hlist_del_init(&p->mglist);
 		del_timer(&p->timer);
-		br_mdb_notify(br->dev, p, RTM_DELMDB);
+		br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
+			      p->flags);
 		call_rcu_bh(&p->rcu, br_multicast_free_pg);
 
 		if (!mp->ports && !mp->mglist &&
@@ -705,7 +706,7 @@ static int br_multicast_add_group(struct net_bridge *br,
 	if (unlikely(!p))
 		goto err;
 	rcu_assign_pointer(*pp, p);
-	br_mdb_notify(br->dev, p, RTM_NEWMDB);
+	br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0);
 
 found:
 	mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -1461,7 +1462,8 @@ br_multicast_leave_group(struct net_bridge *br,
 			hlist_del_init(&p->mglist);
 			del_timer(&p->timer);
 			call_rcu_bh(&p->rcu, br_multicast_free_pg);
-			br_mdb_notify(br->dev, p, RTM_DELMDB);
+			br_mdb_notify(br->dev, port, group, RTM_DELMDB,
+				      p->flags);
 
 			if (!mp->ports && !mp->mglist &&
 			    netif_running(br->dev))
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1b5d145dfcbf..d9da857182ef 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -560,8 +560,8 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
 			    unsigned char flags);
 void br_mdb_init(void);
 void br_mdb_uninit(void);
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg,
-		   int type);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
+		   struct br_ip *group, int type, u8 flags);
 void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
 		   int type);
 
-- 
cgit v1.2.3


From 391a20333b8393ef2e13014e6e59d192c5594471 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 21 Apr 2016 22:23:31 +0200
Subject: ipv4/fib: don't warn when primary address is missing if in_dev is
 dead

After commit fbd40ea0180a ("ipv4: Don't do expensive useless work
during inetdev destroy.") when deleting an interface,
fib_del_ifaddr() can be executed without any primary address
present on the dead interface.

The above is safe, but triggers some "bug: prim == NULL" warnings.

This commit avoids warning if the in_dev is dead

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8a9246deccfe..63566ec54794 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -904,7 +904,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
 		if (!prim) {
-			pr_warn("%s: bug: prim == NULL\n", __func__);
+			/* if the device has been deleted, we don't perform
+			 * address promotion
+			 */
+			if (!in_dev->dead)
+				pr_warn("%s: bug: prim == NULL\n", __func__);
 			return;
 		}
 		if (iprim && iprim != prim) {
-- 
cgit v1.2.3


From 6c1ea260f89709e0021d2c59f8fd2a104b5b1123 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 11 Apr 2016 19:34:49 +0200
Subject: libceph: make authorizer destruction independent of ceph_auth_client

Starting the kernel client with cephx disabled and then enabling cephx
and restarting userspace daemons can result in a crash:

    [262671.478162] BUG: unable to handle kernel paging request at ffffebe000000000
    [262671.531460] IP: [<ffffffff811cd04a>] kfree+0x5a/0x130
    [262671.584334] PGD 0
    [262671.635847] Oops: 0000 [#1] SMP
    [262672.055841] CPU: 22 PID: 2961272 Comm: kworker/22:2 Not tainted 4.2.0-34-generic #39~14.04.1-Ubuntu
    [262672.162338] Hardware name: Dell Inc. PowerEdge R720/068CDY, BIOS 2.4.3 07/09/2014
    [262672.268937] Workqueue: ceph-msgr con_work [libceph]
    [262672.322290] task: ffff88081c2d0dc0 ti: ffff880149ae8000 task.ti: ffff880149ae8000
    [262672.428330] RIP: 0010:[<ffffffff811cd04a>]  [<ffffffff811cd04a>] kfree+0x5a/0x130
    [262672.535880] RSP: 0018:ffff880149aeba58  EFLAGS: 00010286
    [262672.589486] RAX: 000001e000000000 RBX: 0000000000000012 RCX: ffff8807e7461018
    [262672.695980] RDX: 000077ff80000000 RSI: ffff88081af2be04 RDI: 0000000000000012
    [262672.803668] RBP: ffff880149aeba78 R08: 0000000000000000 R09: 0000000000000000
    [262672.912299] R10: ffffebe000000000 R11: ffff880819a60e78 R12: ffff8800aec8df40
    [262673.021769] R13: ffffffffc035f70f R14: ffff8807e5b138e0 R15: ffff880da9785840
    [262673.131722] FS:  0000000000000000(0000) GS:ffff88081fac0000(0000) knlGS:0000000000000000
    [262673.245377] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [262673.303281] CR2: ffffebe000000000 CR3: 0000000001c0d000 CR4: 00000000001406e0
    [262673.417556] Stack:
    [262673.472943]  ffff880149aeba88 ffff88081af2be04 ffff8800aec8df40 ffff88081af2be04
    [262673.583767]  ffff880149aeba98 ffffffffc035f70f ffff880149aebac8 ffff8800aec8df00
    [262673.694546]  ffff880149aebac8 ffffffffc035c89e ffff8807e5b138e0 ffff8805b047f800
    [262673.805230] Call Trace:
    [262673.859116]  [<ffffffffc035f70f>] ceph_x_destroy_authorizer+0x1f/0x50 [libceph]
    [262673.968705]  [<ffffffffc035c89e>] ceph_auth_destroy_authorizer+0x3e/0x60 [libceph]
    [262674.078852]  [<ffffffffc0352805>] put_osd+0x45/0x80 [libceph]
    [262674.134249]  [<ffffffffc035290e>] remove_osd+0xae/0x140 [libceph]
    [262674.189124]  [<ffffffffc0352aa3>] __reset_osd+0x103/0x150 [libceph]
    [262674.243749]  [<ffffffffc0354703>] kick_requests+0x223/0x460 [libceph]
    [262674.297485]  [<ffffffffc03559e2>] ceph_osdc_handle_map+0x282/0x5e0 [libceph]
    [262674.350813]  [<ffffffffc035022e>] dispatch+0x4e/0x720 [libceph]
    [262674.403312]  [<ffffffffc034bd91>] try_read+0x3d1/0x1090 [libceph]
    [262674.454712]  [<ffffffff810ab7c2>] ? dequeue_entity+0x152/0x690
    [262674.505096]  [<ffffffffc034cb1b>] con_work+0xcb/0x1300 [libceph]
    [262674.555104]  [<ffffffff8108fb3e>] process_one_work+0x14e/0x3d0
    [262674.604072]  [<ffffffff810901ea>] worker_thread+0x11a/0x470
    [262674.652187]  [<ffffffff810900d0>] ? rescuer_thread+0x310/0x310
    [262674.699022]  [<ffffffff810957a2>] kthread+0xd2/0xf0
    [262674.744494]  [<ffffffff810956d0>] ? kthread_create_on_node+0x1c0/0x1c0
    [262674.789543]  [<ffffffff817bd81f>] ret_from_fork+0x3f/0x70
    [262674.834094]  [<ffffffff810956d0>] ? kthread_create_on_node+0x1c0/0x1c0

What happens is the following:

    (1) new MON session is established
    (2) old "none" ac is destroyed
    (3) new "cephx" ac is constructed
    ...
    (4) old OSD session (w/ "none" authorizer) is put
          ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer)

osd->o_auth.authorizer in the "none" case is just a bare pointer into
ac, which contains a single static copy for all services.  By the time
we get to (4), "none" ac, freed in (2), is long gone.  On top of that,
a new vtable installed in (3) points us at ceph_x_destroy_authorizer(),
so we end up trying to destroy a "none" authorizer with a "cephx"
destructor operating on invalid memory!

To fix this, decouple authorizer destruction from ac and do away with
a single static "none" authorizer by making a copy for each OSD or MDS
session.  Authorizers themselves are independent of ac and so there is
no reason for destroy_authorizer() to be an ac op.  Make it an op on
the authorizer itself by turning ceph_authorizer into a real struct.

Fixes: http://tracker.ceph.com/issues/15447

Reported-by: Alan Zhang <alan.zhang@linux.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 net/ceph/auth.c       |  8 ++----
 net/ceph/auth_none.c  | 71 ++++++++++++++++++++++++++++-----------------------
 net/ceph/auth_none.h  |  3 +--
 net/ceph/auth_x.c     | 21 ++++++++-------
 net/ceph/auth_x.h     |  1 +
 net/ceph/osd_client.c |  6 ++---
 6 files changed, 55 insertions(+), 55 deletions(-)

(limited to 'net')

diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 6b923bcaa2a4..2bc5965fdd1e 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -293,13 +293,9 @@ int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
 }
 EXPORT_SYMBOL(ceph_auth_create_authorizer);
 
-void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
-				  struct ceph_authorizer *a)
+void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
 {
-	mutex_lock(&ac->mutex);
-	if (ac->ops && ac->ops->destroy_authorizer)
-		ac->ops->destroy_authorizer(ac, a);
-	mutex_unlock(&ac->mutex);
+	a->destroy(a);
 }
 EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
 
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 8c93fa8d81bc..5f836f02ae36 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -16,7 +16,6 @@ static void reset(struct ceph_auth_client *ac)
 	struct ceph_auth_none_info *xi = ac->private;
 
 	xi->starting = true;
-	xi->built_authorizer = false;
 }
 
 static void destroy(struct ceph_auth_client *ac)
@@ -39,6 +38,27 @@ static int should_authenticate(struct ceph_auth_client *ac)
 	return xi->starting;
 }
 
+static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac,
+					   struct ceph_none_authorizer *au)
+{
+	void *p = au->buf;
+	void *const end = p + sizeof(au->buf);
+	int ret;
+
+	ceph_encode_8_safe(&p, end, 1, e_range);
+	ret = ceph_entity_name_encode(ac->name, &p, end);
+	if (ret < 0)
+		return ret;
+
+	ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+	au->buf_len = p - (void *)au->buf;
+	dout("%s built authorizer len %d\n", __func__, au->buf_len);
+	return 0;
+
+e_range:
+	return -ERANGE;
+}
+
 static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
 {
 	return 0;
@@ -57,32 +77,32 @@ static int handle_reply(struct ceph_auth_client *ac, int result,
 	return result;
 }
 
+static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a)
+{
+	kfree(a);
+}
+
 /*
- * build an 'authorizer' with our entity_name and global_id.  we can
- * reuse a single static copy since it is identical for all services
- * we connect to.
+ * build an 'authorizer' with our entity_name and global_id.  it is
+ * identical for all services we connect to.
  */
 static int ceph_auth_none_create_authorizer(
 	struct ceph_auth_client *ac, int peer_type,
 	struct ceph_auth_handshake *auth)
 {
-	struct ceph_auth_none_info *ai = ac->private;
-	struct ceph_none_authorizer *au = &ai->au;
-	void *p, *end;
+	struct ceph_none_authorizer *au;
 	int ret;
 
-	if (!ai->built_authorizer) {
-		p = au->buf;
-		end = p + sizeof(au->buf);
-		ceph_encode_8(&p, 1);
-		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
-		if (ret < 0)
-			goto bad;
-		ceph_decode_need(&p, end, sizeof(u64), bad2);
-		ceph_encode_64(&p, ac->global_id);
-		au->buf_len = p - (void *)au->buf;
-		ai->built_authorizer = true;
-		dout("built authorizer len %d\n", au->buf_len);
+	au = kmalloc(sizeof(*au), GFP_NOFS);
+	if (!au)
+		return -ENOMEM;
+
+	au->base.destroy = ceph_auth_none_destroy_authorizer;
+
+	ret = ceph_auth_none_build_authorizer(ac, au);
+	if (ret) {
+		kfree(au);
+		return ret;
 	}
 
 	auth->authorizer = (struct ceph_authorizer *) au;
@@ -92,17 +112,6 @@ static int ceph_auth_none_create_authorizer(
 	auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
 
 	return 0;
-
-bad2:
-	ret = -ERANGE;
-bad:
-	return ret;
-}
-
-static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	/* nothing to do */
 }
 
 static const struct ceph_auth_client_ops ceph_auth_none_ops = {
@@ -114,7 +123,6 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {
 	.build_request = build_request,
 	.handle_reply = handle_reply,
 	.create_authorizer = ceph_auth_none_create_authorizer,
-	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
 };
 
 int ceph_auth_none_init(struct ceph_auth_client *ac)
@@ -127,7 +135,6 @@ int ceph_auth_none_init(struct ceph_auth_client *ac)
 		return -ENOMEM;
 
 	xi->starting = true;
-	xi->built_authorizer = false;
 
 	ac->protocol = CEPH_AUTH_NONE;
 	ac->private = xi;
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
index 059a3ce4b53f..62021535ae4a 100644
--- a/net/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -12,6 +12,7 @@
  */
 
 struct ceph_none_authorizer {
+	struct ceph_authorizer base;
 	char buf[128];
 	int buf_len;
 	char reply_buf[0];
@@ -19,8 +20,6 @@ struct ceph_none_authorizer {
 
 struct ceph_auth_none_info {
 	bool starting;
-	bool built_authorizer;
-	struct ceph_none_authorizer au;   /* we only need one; it's static */
 };
 
 int ceph_auth_none_init(struct ceph_auth_client *ac);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 9e43a315e662..a0905f04bd13 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -565,6 +565,14 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 	return -EAGAIN;
 }
 
+static void ceph_x_destroy_authorizer(struct ceph_authorizer *a)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+
+	ceph_x_authorizer_cleanup(au);
+	kfree(au);
+}
+
 static int ceph_x_create_authorizer(
 	struct ceph_auth_client *ac, int peer_type,
 	struct ceph_auth_handshake *auth)
@@ -581,6 +589,8 @@ static int ceph_x_create_authorizer(
 	if (!au)
 		return -ENOMEM;
 
+	au->base.destroy = ceph_x_destroy_authorizer;
+
 	ret = ceph_x_build_authorizer(ac, th, au);
 	if (ret) {
 		kfree(au);
@@ -643,16 +653,6 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
 	return ret;
 }
 
-static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
-{
-	struct ceph_x_authorizer *au = (void *)a;
-
-	ceph_x_authorizer_cleanup(au);
-	kfree(au);
-}
-
-
 static void ceph_x_reset(struct ceph_auth_client *ac)
 {
 	struct ceph_x_info *xi = ac->private;
@@ -770,7 +770,6 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
 	.create_authorizer = ceph_x_create_authorizer,
 	.update_authorizer = ceph_x_update_authorizer,
 	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
-	.destroy_authorizer = ceph_x_destroy_authorizer,
 	.invalidate_authorizer = ceph_x_invalidate_authorizer,
 	.reset =  ceph_x_reset,
 	.destroy = ceph_x_destroy,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index 40b1a3cf7397..21a5af904bae 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -26,6 +26,7 @@ struct ceph_x_ticket_handler {
 
 
 struct ceph_x_authorizer {
+	struct ceph_authorizer base;
 	struct ceph_crypto_key session_key;
 	struct ceph_buffer *buf;
 	unsigned int service;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 32355d9d0103..40a53a70efdf 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1087,10 +1087,8 @@ static void put_osd(struct ceph_osd *osd)
 	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
 	     atomic_read(&osd->o_ref) - 1);
 	if (atomic_dec_and_test(&osd->o_ref)) {
-		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
-
 		if (osd->o_auth.authorizer)
-			ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
+			ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
 		kfree(osd);
 	}
 }
@@ -2984,7 +2982,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 	struct ceph_auth_handshake *auth = &o->o_auth;
 
 	if (force_new && auth->authorizer) {
-		ceph_auth_destroy_authorizer(ac, auth->authorizer);
+		ceph_auth_destroy_authorizer(auth->authorizer);
 		auth->authorizer = NULL;
 	}
 	if (!auth->authorizer) {
-- 
cgit v1.2.3


From 841645b5f2dfceac69b78fcd0c9050868d41ea61 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 25 Apr 2016 15:33:55 -0400
Subject: ipv6: Revert optional address flusing on ifdown.

This reverts the following three commits:

70af921db6f8835f4b11c65731116560adb00c14
799977d9aafbf0ca0b9c39b04cbfb16db71302c9
f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac

The feature was ill conceived, has terrible semantics, and has added
nothing but regressions to the already fragile ipv6 stack.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 162 ++++------------------------------------------------
 1 file changed, 12 insertions(+), 150 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 23cec53b568a..d77ba395d593 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -216,7 +216,6 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
-	.keep_addr_on_down	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -261,7 +260,6 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
-	.keep_addr_on_down	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -3176,81 +3174,6 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-/* If the host route is cached on the addr struct make sure it is associated
- * with the proper table. e.g., enslavement can change and if so the cached
- * host route needs to move to the new table.
- */
-static void l3mdev_check_host_rt(struct inet6_dev *idev,
-				  struct inet6_ifaddr *ifp)
-{
-	if (ifp->rt) {
-		u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
-
-		if (tb_id != ifp->rt->rt6i_table->tb6_id) {
-			ip6_del_rt(ifp->rt);
-			ifp->rt = NULL;
-		}
-	}
-}
-#else
-static void l3mdev_check_host_rt(struct inet6_dev *idev,
-				  struct inet6_ifaddr *ifp)
-{
-}
-#endif
-
-static int fixup_permanent_addr(struct inet6_dev *idev,
-				struct inet6_ifaddr *ifp)
-{
-	l3mdev_check_host_rt(idev, ifp);
-
-	if (!ifp->rt) {
-		struct rt6_info *rt;
-
-		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
-		if (unlikely(IS_ERR(rt)))
-			return PTR_ERR(rt);
-
-		ifp->rt = rt;
-	}
-
-	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
-		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
-				      idev->dev, 0, 0);
-	}
-
-	addrconf_dad_start(ifp);
-
-	return 0;
-}
-
-static void addrconf_permanent_addr(struct net_device *dev)
-{
-	struct inet6_ifaddr *ifp, *tmp;
-	struct inet6_dev *idev;
-
-	idev = __in6_dev_get(dev);
-	if (!idev)
-		return;
-
-	write_lock_bh(&idev->lock);
-
-	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
-		if ((ifp->flags & IFA_F_PERMANENT) &&
-		    fixup_permanent_addr(idev, ifp) < 0) {
-			write_unlock_bh(&idev->lock);
-			ipv6_del_addr(ifp);
-			write_lock_bh(&idev->lock);
-
-			net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
-					     idev->dev->name, &ifp->addr);
-		}
-	}
-
-	write_unlock_bh(&idev->lock);
-}
-
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -3337,9 +3260,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			run_pending = 1;
 		}
 
-		/* restore routes for permanent addresses */
-		addrconf_permanent_addr(dev);
-
 		switch (dev->type) {
 #if IS_ENABLED(CONFIG_IPV6_SIT)
 		case ARPHRD_SIT:
@@ -3448,20 +3368,11 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event)
 		ipv6_mc_unmap(idev);
 }
 
-static bool addr_is_local(const struct in6_addr *addr)
-{
-	return ipv6_addr_type(addr) &
-		(IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
-}
-
 static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifa, *tmp;
-	struct list_head del_list;
-	int _keep_addr;
-	bool keep_addr;
+	struct inet6_ifaddr *ifa;
 	int state, i;
 
 	ASSERT_RTNL();
@@ -3488,16 +3399,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	}
 
-	/* aggregate the system setting and interface setting */
-	_keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
-	if (!_keep_addr)
-		_keep_addr = idev->cnf.keep_addr_on_down;
-
-	/* combine the user config with event to determine if permanent
-	 * addresses are to be removed from address hash table
-	 */
-	keep_addr = !(how || _keep_addr <= 0);
-
 	/* Step 2: clear hash table */
 	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
 		struct hlist_head *h = &inet6_addr_lst[i];
@@ -3506,16 +3407,9 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 restart:
 		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
 			if (ifa->idev == idev) {
+				hlist_del_init_rcu(&ifa->addr_lst);
 				addrconf_del_dad_work(ifa);
-				/* combined flag + permanent flag decide if
-				 * address is retained on a down event
-				 */
-				if (!keep_addr ||
-				    !(ifa->flags & IFA_F_PERMANENT) ||
-				    addr_is_local(&ifa->addr)) {
-					hlist_del_init_rcu(&ifa->addr_lst);
-					goto restart;
-				}
+				goto restart;
 			}
 		}
 		spin_unlock_bh(&addrconf_hash_lock);
@@ -3549,54 +3443,31 @@ restart:
 		write_lock_bh(&idev->lock);
 	}
 
-	/* re-combine the user config with event to determine if permanent
-	 * addresses are to be removed from the interface list
-	 */
-	keep_addr = (!how && _keep_addr > 0);
-
-	INIT_LIST_HEAD(&del_list);
-	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+	while (!list_empty(&idev->addr_list)) {
+		ifa = list_first_entry(&idev->addr_list,
+				       struct inet6_ifaddr, if_list);
 		addrconf_del_dad_work(ifa);
 
-		write_unlock_bh(&idev->lock);
-		spin_lock_bh(&ifa->lock);
-
-		if (keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
-		    !addr_is_local(&ifa->addr)) {
-			/* set state to skip the notifier below */
-			state = INET6_IFADDR_STATE_DEAD;
-			ifa->state = 0;
-			if (!(ifa->flags & IFA_F_NODAD))
-				ifa->flags |= IFA_F_TENTATIVE;
-		} else {
-			state = ifa->state;
-			ifa->state = INET6_IFADDR_STATE_DEAD;
+		list_del(&ifa->if_list);
 
-			list_del(&ifa->if_list);
-			list_add(&ifa->if_list, &del_list);
-		}
+		write_unlock_bh(&idev->lock);
 
+		spin_lock_bh(&ifa->lock);
+		state = ifa->state;
+		ifa->state = INET6_IFADDR_STATE_DEAD;
 		spin_unlock_bh(&ifa->lock);
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
 		}
+		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);
 	}
 
 	write_unlock_bh(&idev->lock);
 
-	/* now clean up addresses to be removed */
-	while (!list_empty(&del_list)) {
-		ifa = list_first_entry(&del_list,
-				       struct inet6_ifaddr, if_list);
-		list_del(&ifa->if_list);
-
-		in6_ifa_put(ifa);
-	}
-
 	/* Step 5: Discard anycast and multicast list */
 	if (how) {
 		ipv6_ac_destroy_dev(idev);
@@ -4861,7 +4732,6 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
-	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5949,14 +5819,6 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
-			.procname       = "keep_addr_on_down",
-			.data           = &ipv6_devconf.keep_addr_on_down,
-			.maxlen         = sizeof(int),
-			.mode           = 0644,
-			.proc_handler   = proc_dointvec,
-
-		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3


From 6a923934c33c750a595868af6bef5f1a1fa90054 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 26 Apr 2016 11:47:41 -0400
Subject: Revert "ipv6: Revert optional address flusing on ifdown."

This reverts commit 841645b5f2dfceac69b78fcd0c9050868d41ea61.

Ok, this puts the feature back.  I've decided to apply David A.'s
bug fix and run with that rather than make everyone wait another
whole release for this feature.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 150 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d77ba395d593..23cec53b568a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -3174,6 +3176,81 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+/* If the host route is cached on the addr struct make sure it is associated
+ * with the proper table. e.g., enslavement can change and if so the cached
+ * host route needs to move to the new table.
+ */
+static void l3mdev_check_host_rt(struct inet6_dev *idev,
+				  struct inet6_ifaddr *ifp)
+{
+	if (ifp->rt) {
+		u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
+
+		if (tb_id != ifp->rt->rt6i_table->tb6_id) {
+			ip6_del_rt(ifp->rt);
+			ifp->rt = NULL;
+		}
+	}
+}
+#else
+static void l3mdev_check_host_rt(struct inet6_dev *idev,
+				  struct inet6_ifaddr *ifp)
+{
+}
+#endif
+
+static int fixup_permanent_addr(struct inet6_dev *idev,
+				struct inet6_ifaddr *ifp)
+{
+	l3mdev_check_host_rt(idev, ifp);
+
+	if (!ifp->rt) {
+		struct rt6_info *rt;
+
+		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
+		if (unlikely(IS_ERR(rt)))
+			return PTR_ERR(rt);
+
+		ifp->rt = rt;
+	}
+
+	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      idev->dev, 0, 0);
+	}
+
+	addrconf_dad_start(ifp);
+
+	return 0;
+}
+
+static void addrconf_permanent_addr(struct net_device *dev)
+{
+	struct inet6_ifaddr *ifp, *tmp;
+	struct inet6_dev *idev;
+
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return;
+
+	write_lock_bh(&idev->lock);
+
+	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+		if ((ifp->flags & IFA_F_PERMANENT) &&
+		    fixup_permanent_addr(idev, ifp) < 0) {
+			write_unlock_bh(&idev->lock);
+			ipv6_del_addr(ifp);
+			write_lock_bh(&idev->lock);
+
+			net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
+					     idev->dev->name, &ifp->addr);
+		}
+	}
+
+	write_unlock_bh(&idev->lock);
+}
+
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -3260,6 +3337,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			run_pending = 1;
 		}
 
+		/* restore routes for permanent addresses */
+		addrconf_permanent_addr(dev);
+
 		switch (dev->type) {
 #if IS_ENABLED(CONFIG_IPV6_SIT)
 		case ARPHRD_SIT:
@@ -3368,11 +3448,20 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event)
 		ipv6_mc_unmap(idev);
 }
 
+static bool addr_is_local(const struct in6_addr *addr)
+{
+	return ipv6_addr_type(addr) &
+		(IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
+}
+
 static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifa;
+	struct inet6_ifaddr *ifa, *tmp;
+	struct list_head del_list;
+	int _keep_addr;
+	bool keep_addr;
 	int state, i;
 
 	ASSERT_RTNL();
@@ -3399,6 +3488,16 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	}
 
+	/* aggregate the system setting and interface setting */
+	_keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+	if (!_keep_addr)
+		_keep_addr = idev->cnf.keep_addr_on_down;
+
+	/* combine the user config with event to determine if permanent
+	 * addresses are to be removed from address hash table
+	 */
+	keep_addr = !(how || _keep_addr <= 0);
+
 	/* Step 2: clear hash table */
 	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
 		struct hlist_head *h = &inet6_addr_lst[i];
@@ -3407,9 +3506,16 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 restart:
 		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
 			if (ifa->idev == idev) {
-				hlist_del_init_rcu(&ifa->addr_lst);
 				addrconf_del_dad_work(ifa);
-				goto restart;
+				/* combined flag + permanent flag decide if
+				 * address is retained on a down event
+				 */
+				if (!keep_addr ||
+				    !(ifa->flags & IFA_F_PERMANENT) ||
+				    addr_is_local(&ifa->addr)) {
+					hlist_del_init_rcu(&ifa->addr_lst);
+					goto restart;
+				}
 			}
 		}
 		spin_unlock_bh(&addrconf_hash_lock);
@@ -3443,31 +3549,54 @@ restart:
 		write_lock_bh(&idev->lock);
 	}
 
-	while (!list_empty(&idev->addr_list)) {
-		ifa = list_first_entry(&idev->addr_list,
-				       struct inet6_ifaddr, if_list);
-		addrconf_del_dad_work(ifa);
+	/* re-combine the user config with event to determine if permanent
+	 * addresses are to be removed from the interface list
+	 */
+	keep_addr = (!how && _keep_addr > 0);
 
-		list_del(&ifa->if_list);
+	INIT_LIST_HEAD(&del_list);
+	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+		addrconf_del_dad_work(ifa);
 
 		write_unlock_bh(&idev->lock);
-
 		spin_lock_bh(&ifa->lock);
-		state = ifa->state;
-		ifa->state = INET6_IFADDR_STATE_DEAD;
+
+		if (keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
+		    !addr_is_local(&ifa->addr)) {
+			/* set state to skip the notifier below */
+			state = INET6_IFADDR_STATE_DEAD;
+			ifa->state = 0;
+			if (!(ifa->flags & IFA_F_NODAD))
+				ifa->flags |= IFA_F_TENTATIVE;
+		} else {
+			state = ifa->state;
+			ifa->state = INET6_IFADDR_STATE_DEAD;
+
+			list_del(&ifa->if_list);
+			list_add(&ifa->if_list, &del_list);
+		}
+
 		spin_unlock_bh(&ifa->lock);
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
 		}
-		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);
 	}
 
 	write_unlock_bh(&idev->lock);
 
+	/* now clean up addresses to be removed */
+	while (!list_empty(&del_list)) {
+		ifa = list_first_entry(&del_list,
+				       struct inet6_ifaddr, if_list);
+		list_del(&ifa->if_list);
+
+		in6_ifa_put(ifa);
+	}
+
 	/* Step 5: Discard anycast and multicast list */
 	if (how) {
 		ipv6_ac_destroy_dev(idev);
@@ -4732,6 +4861,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
+	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5819,6 +5949,14 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname       = "keep_addr_on_down",
+			.data           = &ipv6_devconf.keep_addr_on_down,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3


From 38bd10c447f8e8980753149a8a65108159871df5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 21 Apr 2016 20:56:12 -0700
Subject: net: ipv6: Delete host routes on an ifdown

It was a simple idea -- save IPv6 configured addresses on a link down
so that IPv6 behaves similar to IPv4. As always the devil is in the
details and the IPv6 stack as too many behavioral differences from IPv4
making the simple idea more complicated than it needs to be.

The current implementation for keeping IPv6 addresses can panic or spit
out a warning in one of many paths:

1. IPv6 route gets an IPv4 route as its 'next' which causes a panic in
   rt6_fill_node while handling a route dump request.

2. rt->dst.obsolete is set to DST_OBSOLETE_DEAD hitting the WARN_ON in
   fib6_del

3. Panic in fib6_purge_rt because rt6i_ref count is not 1.

The root cause of all these is references related to the host route for
an address that is retained.

So, this patch deletes the host route every time the ifdown loop runs.
Since the host route is deleted and will be re-generated an up there is
no longer a need for the l3mdev fix up. On the 'admin up' side move
addrconf_permanent_addr into the NETDEV_UP event handling so that it
runs only once versus on UP and CHANGE events.

All of the current panics and warnings appear to be related to
addresses on the loopback device, but given the catastrophic nature when
a bug is triggered this patch takes the conservative approach and evicts
all host routes rather than trying to determine when it can be re-used
and when it can not. That can be a later optimizaton if desired.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 48 +++++++++++++++---------------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 23cec53b568a..8ec4b3089e20 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3176,35 +3176,9 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
-/* If the host route is cached on the addr struct make sure it is associated
- * with the proper table. e.g., enslavement can change and if so the cached
- * host route needs to move to the new table.
- */
-static void l3mdev_check_host_rt(struct inet6_dev *idev,
-				  struct inet6_ifaddr *ifp)
-{
-	if (ifp->rt) {
-		u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
-
-		if (tb_id != ifp->rt->rt6i_table->tb6_id) {
-			ip6_del_rt(ifp->rt);
-			ifp->rt = NULL;
-		}
-	}
-}
-#else
-static void l3mdev_check_host_rt(struct inet6_dev *idev,
-				  struct inet6_ifaddr *ifp)
-{
-}
-#endif
-
 static int fixup_permanent_addr(struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
-	l3mdev_check_host_rt(idev, ifp);
-
 	if (!ifp->rt) {
 		struct rt6_info *rt;
 
@@ -3304,6 +3278,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			break;
 
 		if (event == NETDEV_UP) {
+			/* restore routes for permanent addresses */
+			addrconf_permanent_addr(dev);
+
 			if (!addrconf_qdisc_ok(dev)) {
 				/* device is not ready yet. */
 				pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
@@ -3337,9 +3314,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			run_pending = 1;
 		}
 
-		/* restore routes for permanent addresses */
-		addrconf_permanent_addr(dev);
-
 		switch (dev->type) {
 #if IS_ENABLED(CONFIG_IPV6_SIT)
 		case ARPHRD_SIT:
@@ -3556,6 +3530,8 @@ restart:
 
 	INIT_LIST_HEAD(&del_list);
 	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+		struct rt6_info *rt = NULL;
+
 		addrconf_del_dad_work(ifa);
 
 		write_unlock_bh(&idev->lock);
@@ -3568,6 +3544,9 @@ restart:
 			ifa->state = 0;
 			if (!(ifa->flags & IFA_F_NODAD))
 				ifa->flags |= IFA_F_TENTATIVE;
+
+			rt = ifa->rt;
+			ifa->rt = NULL;
 		} else {
 			state = ifa->state;
 			ifa->state = INET6_IFADDR_STATE_DEAD;
@@ -3578,6 +3557,9 @@ restart:
 
 		spin_unlock_bh(&ifa->lock);
 
+		if (rt)
+			ip6_del_rt(rt);
+
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
@@ -5343,10 +5325,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			if (rt)
 				ip6_del_rt(rt);
 		}
-		dst_hold(&ifp->rt->dst);
-
-		ip6_del_rt(ifp->rt);
-
+		if (ifp->rt) {
+			dst_hold(&ifp->rt->dst);
+			ip6_del_rt(ifp->rt);
+		}
 		rt_genid_bump_ipv6(net);
 		break;
 	}
-- 
cgit v1.2.3


From e6436be21e77e3659b4ff7e357ab5a8342d132d2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 26 Apr 2016 13:47:08 +0200
Subject: mac80211: fix statistics leak if dev_alloc_name() fails

In the case that dev_alloc_name() fails, e.g. because the name was
given by the user and already exists, we need to clean up properly
and free the per-CPU statistics. Fix that.

Cc: stable@vger.kernel.org
Fixes: 5a490510ba5f ("mac80211: use per-CPU TX/RX statistics")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 453b4e741780..e1cb22c16530 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1761,7 +1761,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 
 		ret = dev_alloc_name(ndev, ndev->name);
 		if (ret < 0) {
-			free_netdev(ndev);
+			ieee80211_if_free(ndev);
 			return ret;
 		}
 
@@ -1847,7 +1847,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 
 		ret = register_netdevice(ndev);
 		if (ret) {
-			free_netdev(ndev);
+			ieee80211_if_free(ndev);
 			return ret;
 		}
 	}
-- 
cgit v1.2.3


From a64b04d86d14c81f50f68e102f79ef301e3d0a0e Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Wed, 27 Apr 2016 11:29:06 +0200
Subject: gre: do not assign header_ops in collect metadata mode

In ipgre mode (i.e. not gretap) with collect metadata flag set, the tunnel
is incorrectly assumed to be mGRE in NBMA mode (see commit 6a5f44d7a048c).
This is not the case, we're controlling the encapsulation addresses by
lwtunnel metadata. And anyway, assigning dev->header_ops in collect metadata
mode does not make sense.

Although it would be more user firendly to reject requests that specify
both the collect metadata flag and a remote/local IP address, this would
break current users of gretap or introduce ugly code and differences in
handling ipgre and gretap configuration. Keep the current behavior of
remote/local IP address being ignored in such case.

v3: Back to v1, added explanation paragraph.
v2: Reject configuration specifying both remote/local address and collect
    metadata flag.

Fixes: 2e15ea390e6f4 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index af5d1f38217f..d0abde4236af 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -893,7 +893,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	netif_keep_dst(dev);
 	dev->addr_len		= 4;
 
-	if (iph->daddr) {
+	if (iph->daddr && !tunnel->collect_md) {
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 		if (ipv4_is_multicast(iph->daddr)) {
 			if (!iph->saddr)
@@ -902,8 +902,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
 			dev->header_ops = &ipgre_header_ops;
 		}
 #endif
-	} else
+	} else if (!tunnel->collect_md) {
 		dev->header_ops = &ipgre_header_ops;
+	}
 
 	return ip_tunnel_init(dev);
 }
-- 
cgit v1.2.3


From 2090714e1d6e80979dd6926be22b0de9ca432273 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Wed, 27 Apr 2016 11:29:07 +0200
Subject: gre: build header correctly for collect metadata tunnels

In ipgre (i.e. not gretap) + collect metadata mode, the skb was assumed to
contain Ethernet header and was encapsulated as ETH_P_TEB. This is not the
case, the interface is ARPHRD_IPGRE and the protocol to be used for
encapsulation is skb->protocol.

Fixes: 2e15ea390e6f4 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d0abde4236af..f973e0a58993 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -523,7 +523,8 @@ static struct rtable *gre_get_rt(struct sk_buff *skb,
 	return ip_route_output_key(net, fl);
 }
 
-static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+			__be16 proto)
 {
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
@@ -575,7 +576,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
-	build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+	build_header(skb, tunnel_hlen, flags, proto,
 		     tunnel_id_to_key(tun_info->key.tun_id), 0);
 
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
@@ -616,7 +617,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 	const struct iphdr *tnl_params;
 
 	if (tunnel->collect_md) {
-		gre_fb_xmit(skb, dev);
+		gre_fb_xmit(skb, dev, skb->protocol);
 		return NETDEV_TX_OK;
 	}
 
@@ -660,7 +661,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	if (tunnel->collect_md) {
-		gre_fb_xmit(skb, dev);
+		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
 		return NETDEV_TX_OK;
 	}
 
-- 
cgit v1.2.3


From 946b636f1730c64e05ff7fe8cf7136422fa8ea70 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Wed, 27 Apr 2016 14:08:01 +0200
Subject: gre: reject GUE and FOU in collect metadata mode

The collect metadata mode does not support GUE nor FOU. This might be
implemented later; until then, we should reject such config.

I think this is okay to be changed. It's unlikely anyone has such
configuration (as it doesn't work anyway) and we may need a way to
distinguish whether it's supported or not by the kernel later.

For backwards compatibility with iproute2, it's not possible to just check
the attribute presence (iproute2 always includes the attribute), the actual
value has to be checked, too.

Fixes: 2e15ea390e6f4 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f973e0a58993..f502d34bcb40 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -948,6 +948,11 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
 	if (flags & (GRE_VERSION|GRE_ROUTING))
 		return -EINVAL;
 
+	if (data[IFLA_GRE_COLLECT_METADATA] &&
+	    data[IFLA_GRE_ENCAP_TYPE] &&
+	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
+		return -EINVAL;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2871734e85e920503d49b3a8bc0afbe0773b6036 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Sat, 12 Mar 2016 11:12:59 +0100
Subject: batman-adv: fix DAT candidate selection (must use vid)

Now that DAT is VLAN aware, it must use the VID when
computing the DHT address of the candidate nodes where
an entry is going to be stored/retrieved.

Fixes: be1db4f6615b ("batman-adv: make the Distributed ARP Table vlan aware")
Signed-off-by: Antonio Quartulli <a@unstable.cc>
[sven@narfation.org: fix conflicts with current version]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/distributed-arp-table.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index e96d7c745b4a..3e6b2624f980 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -568,6 +568,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
  * be sent to
  * @bat_priv: the bat priv with all the soft interface information
  * @ip_dst: ipv4 to look up in the DHT
+ * @vid: VLAN identifier
  *
  * An originator O is selected if and only if its DHT_ID value is one of three
  * closest values (from the LEFT, with wrap around if needed) then the hash
@@ -576,7 +577,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
  * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM.
  */
 static struct batadv_dat_candidate *
-batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
+batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst,
+			     unsigned short vid)
 {
 	int select;
 	batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key;
@@ -592,7 +594,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
 		return NULL;
 
 	dat.ip = ip_dst;
-	dat.vid = 0;
+	dat.vid = vid;
 	ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat,
 						    BATADV_DAT_ADDR_MAX);
 
@@ -612,6 +614,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
  * @bat_priv: the bat priv with all the soft interface information
  * @skb: payload to send
  * @ip: the DHT key
+ * @vid: VLAN identifier
  * @packet_subtype: unicast4addr packet subtype to use
  *
  * This function copies the skb with pskb_copy() and is sent as unicast packet
@@ -622,7 +625,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
  */
 static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
 				 struct sk_buff *skb, __be32 ip,
-				 int packet_subtype)
+				 unsigned short vid, int packet_subtype)
 {
 	int i;
 	bool ret = false;
@@ -631,7 +634,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
 	struct sk_buff *tmp_skb;
 	struct batadv_dat_candidate *cand;
 
-	cand = batadv_dat_select_candidates(bat_priv, ip);
+	cand = batadv_dat_select_candidates(bat_priv, ip, vid);
 	if (!cand)
 		goto out;
 
@@ -1022,7 +1025,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 		ret = true;
 	} else {
 		/* Send the request to the DHT */
-		ret = batadv_dat_send_data(bat_priv, skb, ip_dst,
+		ret = batadv_dat_send_data(bat_priv, skb, ip_dst, vid,
 					   BATADV_P_DAT_DHT_GET);
 	}
 out:
@@ -1150,8 +1153,8 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
 	/* Send the ARP reply to the candidates for both the IP addresses that
 	 * the node obtained from the ARP reply
 	 */
-	batadv_dat_send_data(bat_priv, skb, ip_src, BATADV_P_DAT_DHT_PUT);
-	batadv_dat_send_data(bat_priv, skb, ip_dst, BATADV_P_DAT_DHT_PUT);
+	batadv_dat_send_data(bat_priv, skb, ip_src, vid, BATADV_P_DAT_DHT_PUT);
+	batadv_dat_send_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_PUT);
 }
 
 /**
-- 
cgit v1.2.3


From b6cf5d499fddbfcffe751e81fb9f1a07d6348026 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Thu, 14 Apr 2016 09:37:05 +0800
Subject: batman-adv: B.A.T.M.A.N V - make sure iface is reactivated upon
 NETDEV_UP event

At the moment there is no explicit reactivation of an hard-interface
upon NETDEV_UP event. In case of B.A.T.M.A.N. IV the interface is
reactivated as soon as the next OGM is scheduled for sending, but this
mechanism does not work with B.A.T.M.A.N. V. The latter does not rely
on the same scheduling mechanism as its predecessor and for this reason
the hard-interface remains deactivated forever after being brought down
once.

This patch fixes the reactivation mechanism by adding a new routing API
which explicitly allows each algorithm to perform any needed operation
upon interface re-activation.

Such API is optional and is implemented by B.A.T.M.A.N. V only and it
just takes care of setting the iface status to ACTIVE

Signed-off-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
---
 net/batman-adv/bat_v.c          | 12 ++++++++++++
 net/batman-adv/hard-interface.c |  3 +++
 net/batman-adv/types.h          |  3 +++
 3 files changed, 18 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 3315b9a598af..4026f198a734 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -32,10 +32,21 @@
 
 #include "bat_v_elp.h"
 #include "bat_v_ogm.h"
+#include "hard-interface.h"
 #include "hash.h"
 #include "originator.h"
 #include "packet.h"
 
+static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
+{
+	/* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can
+	 * set the interface as ACTIVE right away, without any risk of race
+	 * condition
+	 */
+	if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED)
+		hard_iface->if_status = BATADV_IF_ACTIVE;
+}
+
 static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
 {
 	int ret;
@@ -274,6 +285,7 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
 
 static struct batadv_algo_ops batadv_batman_v __read_mostly = {
 	.name = "BATMAN_V",
+	.bat_iface_activate = batadv_v_iface_activate,
 	.bat_iface_enable = batadv_v_iface_enable,
 	.bat_iface_disable = batadv_v_iface_disable,
 	.bat_iface_update_mac = batadv_v_iface_update_mac,
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index c61d5b0b24d2..0a7deaf2670a 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -407,6 +407,9 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
 
 	batadv_update_min_mtu(hard_iface->soft_iface);
 
+	if (bat_priv->bat_algo_ops->bat_iface_activate)
+		bat_priv->bat_algo_ops->bat_iface_activate(hard_iface);
+
 out:
 	if (primary_if)
 		batadv_hardif_put(primary_if);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 9abfb3e73c34..443e9b84e07d 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1250,6 +1250,8 @@ struct batadv_forw_packet {
  * struct batadv_algo_ops - mesh algorithm callbacks
  * @list: list node for the batadv_algo_list
  * @name: name of the algorithm
+ * @bat_iface_activate: start routing mechanisms when hard-interface is brought
+ *  up
  * @bat_iface_enable: init routing info when hard-interface is enabled
  * @bat_iface_disable: de-init routing info when hard-interface is disabled
  * @bat_iface_update_mac: (re-)init mac addresses of the protocol information
@@ -1277,6 +1279,7 @@ struct batadv_forw_packet {
 struct batadv_algo_ops {
 	struct hlist_node list;
 	char *name;
+	void (*bat_iface_activate)(struct batadv_hard_iface *hard_iface);
 	int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface);
 	void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface);
 	void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface);
-- 
cgit v1.2.3


From a33d970d0b54b09746d5540af8271fad4eb10229 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 11 Mar 2016 16:44:05 +0100
Subject: batman-adv: Fix reference counting of vlan object for tt_local_entry

The batadv_tt_local_entry was specific to a batadv_softif_vlan and held an
implicit reference to it. But this reference was never stored in form of a
pointer in the tt_local_entry itself. Instead batadv_tt_local_remove,
batadv_tt_local_table_free and batadv_tt_local_purge_pending_clients depend
on a consistent state of bat_priv->softif_vlan_list and that
batadv_softif_vlan_get always returns the batadv_softif_vlan object which
it has a reference for. But batadv_softif_vlan_get cannot guarantee that
because it is working only with rcu_read_lock on this list. It can
therefore happen that an vid is in this list twice or that
batadv_softif_vlan_get cannot find the batadv_softif_vlan for an vid due to
some other list operations taking place at the same time.

Instead add a batadv_softif_vlan pointer directly in batadv_tt_local_entry
which will be used for the reference counter decremented on release of
batadv_tt_local_entry.

Fixes: 35df3b298fc8 ("batman-adv: fix TT VLAN inconsistency on VLAN re-add")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Acked-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/translation-table.c | 42 ++++----------------------------------
 net/batman-adv/types.h             |  2 ++
 2 files changed, 6 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 0b43e86328a5..9b4551a86535 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -215,6 +215,8 @@ static void batadv_tt_local_entry_release(struct kref *ref)
 	tt_local_entry = container_of(ref, struct batadv_tt_local_entry,
 				      common.refcount);
 
+	batadv_softif_vlan_put(tt_local_entry->vlan);
+
 	kfree_rcu(tt_local_entry, common.rcu);
 }
 
@@ -673,6 +675,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
 	kref_get(&tt_local->common.refcount);
 	tt_local->last_seen = jiffies;
 	tt_local->common.added_at = tt_local->last_seen;
+	tt_local->vlan = vlan;
 
 	/* the batman interface mac and multicast addresses should never be
 	 * purged
@@ -991,7 +994,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
 	struct batadv_tt_common_entry *tt_common_entry;
 	struct batadv_tt_local_entry *tt_local;
 	struct batadv_hard_iface *primary_if;
-	struct batadv_softif_vlan *vlan;
 	struct hlist_head *head;
 	unsigned short vid;
 	u32 i;
@@ -1027,14 +1029,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
 			last_seen_msecs = last_seen_msecs % 1000;
 
 			no_purge = tt_common_entry->flags & np_flag;
-
-			vlan = batadv_softif_vlan_get(bat_priv, vid);
-			if (!vlan) {
-				seq_printf(seq, "Cannot retrieve VLAN %d\n",
-					   BATADV_PRINT_VID(vid));
-				continue;
-			}
-
 			seq_printf(seq,
 				   " * %pM %4i [%c%c%c%c%c%c] %3u.%03u   (%#.8x)\n",
 				   tt_common_entry->addr,
@@ -1052,9 +1046,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
 				     BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
 				   no_purge ? 0 : last_seen_secs,
 				   no_purge ? 0 : last_seen_msecs,
-				   vlan->tt.crc);
-
-			batadv_softif_vlan_put(vlan);
+				   tt_local->vlan->tt.crc);
 		}
 		rcu_read_unlock();
 	}
@@ -1099,7 +1091,6 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
 {
 	struct batadv_tt_local_entry *tt_local_entry;
 	u16 flags, curr_flags = BATADV_NO_FLAGS;
-	struct batadv_softif_vlan *vlan;
 	void *tt_entry_exists;
 
 	tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
@@ -1139,14 +1130,6 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
 	/* extra call to free the local tt entry */
 	batadv_tt_local_entry_put(tt_local_entry);
 
-	/* decrease the reference held for this vlan */
-	vlan = batadv_softif_vlan_get(bat_priv, vid);
-	if (!vlan)
-		goto out;
-
-	batadv_softif_vlan_put(vlan);
-	batadv_softif_vlan_put(vlan);
-
 out:
 	if (tt_local_entry)
 		batadv_tt_local_entry_put(tt_local_entry);
@@ -1219,7 +1202,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
 	spinlock_t *list_lock; /* protects write access to the hash lists */
 	struct batadv_tt_common_entry *tt_common_entry;
 	struct batadv_tt_local_entry *tt_local;
-	struct batadv_softif_vlan *vlan;
 	struct hlist_node *node_tmp;
 	struct hlist_head *head;
 	u32 i;
@@ -1241,14 +1223,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
 						struct batadv_tt_local_entry,
 						common);
 
-			/* decrease the reference held for this vlan */
-			vlan = batadv_softif_vlan_get(bat_priv,
-						      tt_common_entry->vid);
-			if (vlan) {
-				batadv_softif_vlan_put(vlan);
-				batadv_softif_vlan_put(vlan);
-			}
-
 			batadv_tt_local_entry_put(tt_local);
 		}
 		spin_unlock_bh(list_lock);
@@ -3309,7 +3283,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
 	struct batadv_hashtable *hash = bat_priv->tt.local_hash;
 	struct batadv_tt_common_entry *tt_common;
 	struct batadv_tt_local_entry *tt_local;
-	struct batadv_softif_vlan *vlan;
 	struct hlist_node *node_tmp;
 	struct hlist_head *head;
 	spinlock_t *list_lock; /* protects write access to the hash lists */
@@ -3339,13 +3312,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
 						struct batadv_tt_local_entry,
 						common);
 
-			/* decrease the reference held for this vlan */
-			vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid);
-			if (vlan) {
-				batadv_softif_vlan_put(vlan);
-				batadv_softif_vlan_put(vlan);
-			}
-
 			batadv_tt_local_entry_put(tt_local);
 		}
 		spin_unlock_bh(list_lock);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 443e9b84e07d..65afd090ab3e 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1073,10 +1073,12 @@ struct batadv_tt_common_entry {
  * struct batadv_tt_local_entry - translation table local entry data
  * @common: general translation table data
  * @last_seen: timestamp used for purging stale tt local entries
+ * @vlan: soft-interface vlan of the entry
  */
 struct batadv_tt_local_entry {
 	struct batadv_tt_common_entry common;
 	unsigned long last_seen;
+	struct batadv_softif_vlan *vlan;
 };
 
 /**
-- 
cgit v1.2.3


From abe59c65225ccd63a5964e2f2a73dd2995b948e7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 11 Mar 2016 16:44:06 +0100
Subject: batman-adv: Fix reference counting of hardif_neigh_node object for
 neigh_node

The batadv_neigh_node was specific to a batadv_hardif_neigh_node and held
an implicit reference to it. But this reference was never stored in form of
a pointer in the batadv_neigh_node itself. Instead
batadv_neigh_node_release depends on a consistent state of
hard_iface->neigh_list and that batadv_hardif_neigh_get always returns the
batadv_hardif_neigh_node object which it has a reference for. But
batadv_hardif_neigh_get cannot guarantee that because it is working only
with rcu_read_lock on this list. It can therefore happen that a neigh_addr
is in this list twice or that batadv_hardif_neigh_get cannot find the
batadv_hardif_neigh_node for an neigh_addr due to some other list
operations taking place at the same time.

Instead add a batadv_hardif_neigh_node pointer directly in
batadv_neigh_node which will be used for the reference counter decremented
on release of batadv_neigh_node.

Fixes: cef63419f7db ("batman-adv: add list of unique single hop neighbors per hard-interface")
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Signed-off-by: Antonio Quartulli <a@unstable.cc>
---
 net/batman-adv/originator.c | 16 +++++-----------
 net/batman-adv/types.h      |  2 ++
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index d52f67a0c057..c355a824713c 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -250,7 +250,6 @@ static void batadv_neigh_node_release(struct kref *ref)
 {
 	struct hlist_node *node_tmp;
 	struct batadv_neigh_node *neigh_node;
-	struct batadv_hardif_neigh_node *hardif_neigh;
 	struct batadv_neigh_ifinfo *neigh_ifinfo;
 	struct batadv_algo_ops *bao;
 
@@ -262,13 +261,7 @@ static void batadv_neigh_node_release(struct kref *ref)
 		batadv_neigh_ifinfo_put(neigh_ifinfo);
 	}
 
-	hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming,
-					       neigh_node->addr);
-	if (hardif_neigh) {
-		/* batadv_hardif_neigh_get() increases refcount too */
-		batadv_hardif_neigh_put(hardif_neigh);
-		batadv_hardif_neigh_put(hardif_neigh);
-	}
+	batadv_hardif_neigh_put(neigh_node->hardif_neigh);
 
 	if (bao->bat_neigh_free)
 		bao->bat_neigh_free(neigh_node);
@@ -665,6 +658,10 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	neigh_node->orig_node = orig_node;
 	neigh_node->last_seen = jiffies;
 
+	/* increment unique neighbor refcount */
+	kref_get(&hardif_neigh->refcount);
+	neigh_node->hardif_neigh = hardif_neigh;
+
 	/* extra reference for return */
 	kref_init(&neigh_node->refcount);
 	kref_get(&neigh_node->refcount);
@@ -673,9 +670,6 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
 	hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
 	spin_unlock_bh(&orig_node->neigh_list_lock);
 
-	/* increment unique neighbor refcount */
-	kref_get(&hardif_neigh->refcount);
-
 	batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
 		   "Creating new neighbor %pM for orig_node %pM on interface %s\n",
 		   neigh_addr, orig_node->orig, hard_iface->net_dev->name);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 65afd090ab3e..1e47fbe8bb7b 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -433,6 +433,7 @@ struct batadv_hardif_neigh_node {
  * @ifinfo_lock: lock protecting private ifinfo members and list
  * @if_incoming: pointer to incoming hard-interface
  * @last_seen: when last packet via this neighbor was received
+ * @hardif_neigh: hardif_neigh of this neighbor
  * @refcount: number of contexts the object is used
  * @rcu: struct used for freeing in an RCU-safe manner
  */
@@ -444,6 +445,7 @@ struct batadv_neigh_node {
 	spinlock_t ifinfo_lock;	/* protects ifinfo_list and its members */
 	struct batadv_hard_iface *if_incoming;
 	unsigned long last_seen;
+	struct batadv_hardif_neigh_node *hardif_neigh;
 	struct kref refcount;
 	struct rcu_head rcu;
 };
-- 
cgit v1.2.3


From f27337e16f2d0e52a8d05ea599ed13cd266ac291 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 28 Apr 2016 11:04:51 +0200
Subject: ip_tunnel: fix preempt warning in ip tunnel creation/updating

After the commit e09acddf873b ("ip_tunnel: replace dst_cache with generic
implementation"), a preemption debug warning is triggered on ip4
tunnels updating; the dst cache helper needs to be invoked in unpreemptible
context.

We don't need to load the cache on tunnel update, so this commit fixes
the warning replacing the load with a dst cache reset, which is
preempt safe.

Fixes: e09acddf873b ("ip_tunnel: replace dst_cache with generic implementation")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6aad0192443d..a69ed94bda1b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -326,12 +326,12 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 
 		if (!IS_ERR(rt)) {
 			tdev = rt->dst.dev;
-			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
-					  fl4.saddr);
 			ip_rt_put(rt);
 		}
 		if (dev->type != ARPHRD_ETHER)
 			dev->flags |= IFF_POINTOPOINT;
+
+		dst_cache_reset(&tunnel->dst_cache);
 	}
 
 	if (!tdev && tunnel->parms.link)
-- 
cgit v1.2.3


From 018f8258582381bcce484312f0e9ec2970d0383e Mon Sep 17 00:00:00 2001
From: Wang Shanker <shankerwangmiao@gmail.com>
Date: Fri, 29 Apr 2016 01:29:43 +0800
Subject: net: l2tp: fix reversed udp6 checksum flags

This patch fixes a bug which causes the behavior of whether to ignore
udp6 checksum of udp6 encapsulated l2tp tunnel contrary to what
userspace program requests.

When the flag `L2TP_ATTR_UDP_ZERO_CSUM6_RX` is set by userspace, it is
expected that udp6 checksums of received packets of the l2tp tunnel
to create should be ignored. In `l2tp_netlink.c`:
`l2tp_nl_cmd_tunnel_create()`, `cfg.udp6_zero_rx_checksums` is set
according to the flag, and then passed to `l2tp_core.c`:
`l2tp_tunnel_create()` and then `l2tp_tunnel_sock_create()`. In
`l2tp_tunnel_sock_create()`, `udp_conf.use_udp6_rx_checksums` is set
the same to `cfg.udp6_zero_rx_checksums`. However, if we want the
checksum to be ignored, `udp_conf.use_udp6_rx_checksums` should be set
to `false`, i.e. be set to the contrary. Similarly, the same should be
done to `udp_conf.use_udp6_tx_checksums`.

Signed-off-by: Miao Wang <shankerwangmiao@gmail.com>
Acked-by: James Chapman <jchapman@katalix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index afca2eb4dfa7..6edfa9980314 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1376,9 +1376,9 @@ static int l2tp_tunnel_sock_create(struct net *net,
 			memcpy(&udp_conf.peer_ip6, cfg->peer_ip6,
 			       sizeof(udp_conf.peer_ip6));
 			udp_conf.use_udp6_tx_checksums =
-			    cfg->udp6_zero_tx_checksums;
+			  ! cfg->udp6_zero_tx_checksums;
 			udp_conf.use_udp6_rx_checksums =
-			    cfg->udp6_zero_rx_checksums;
+			  ! cfg->udp6_zero_rx_checksums;
 		} else
 #endif
 		{
-- 
cgit v1.2.3


From 90e5d0db2b221f0cbbb91e9e61fdb7dbb9e1afc2 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Thu, 28 Apr 2016 19:24:32 -0400
Subject: soreuseport: Fix TCP listener hash collision

I forgot to include a check for listener port equality when deciding
if two sockets should belong to the same reuseport group.  This was
not caught previously because it's only necessary when two listening
sockets for the same user happen to hash to the same listener bucket.
The same error does not exist in the UDP path.

Fixes: c125e80b8868("soreuseport: fast reuseport TCP socket selection")
Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bc68eced0105..0d9e9d7bb029 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -470,6 +470,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 						     const struct sock *sk2,
 						     bool match_wildcard))
 {
+	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
 	struct sock *sk2;
 	struct hlist_nulls_node *node;
 	kuid_t uid = sock_i_uid(sk);
@@ -479,6 +480,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 		    sk2->sk_family == sk->sk_family &&
 		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
 		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+		    inet_csk(sk2)->icsk_bind_hash == tb &&
 		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
 		    saddr_same(sk, sk2, false))
 			return reuseport_add_sock(sk, sk2);
-- 
cgit v1.2.3


From efe790502be85c60daa65c8aa51f05c333186e12 Mon Sep 17 00:00:00 2001
From: Hamish Martin <hamish.martin@alliedtelesis.co.nz>
Date: Fri, 29 Apr 2016 10:40:24 -0400
Subject: tipc: only process unicast on intended node

We have observed complete lock up of broadcast-link transmission due to
unacknowledged packets never being removed from the 'transmq' queue. This
is traced to nodes having their ack field set beyond the sequence number
of packets that have actually been transmitted to them.
Consider an example where node 1 has sent 10 packets to node 2 on a
link and node 3 has sent 20 packets to node 2 on another link. We
see examples of an ack from node 2 destined for node 3 being treated as
an ack from node 2 at node 1. This leads to the ack on the node 1 to node
2 link being increased to 20 even though we have only sent 10 packets.
When node 1 does get around to sending further packets, none of the
packets with sequence numbers less than 21 are actually removed from the
transmq.
To resolve this we reinstate some code lost in commit d999297c3dbb ("tipc:
reduce locking scope during packet reception") which ensures that only
messages destined for the receiving node are processed by that node. This
prevents the sequence numbers from getting out of sync and resolves the
packet leakage, thereby resolving the broadcast-link transmission
lock-ups we observed.

While we are aware that this change only patches over a root problem that
we still haven't identified, this is a sanity test that it is always
legitimate to do. It will remain in the code even after we identify and
fix the real problem.

Reviewed-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Reviewed-by: John Thompson <john.thompson@alliedtelesis.co.nz>
Signed-off-by: Hamish Martin <hamish.martin@alliedtelesis.co.nz>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/node.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index ace178fd3850..9aaa1bc566ae 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1444,6 +1444,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 	int bearer_id = b->identity;
 	struct tipc_link_entry *le;
 	u16 bc_ack = msg_bcast_ack(hdr);
+	u32 self = tipc_own_addr(net);
 	int rc = 0;
 
 	__skb_queue_head_init(&xmitq);
@@ -1460,6 +1461,10 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 			return tipc_node_bc_rcv(net, skb, bearer_id);
 	}
 
+	/* Discard unicast link messages destined for another node */
+	if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self)))
+		goto discard;
+
 	/* Locate neighboring node that sent packet */
 	n = tipc_node_find(net, msg_prevnode(hdr));
 	if (unlikely(!n))
-- 
cgit v1.2.3


From b7f8fe251e4609e2a437bd2c2dea01e61db6849c Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Fri, 29 Apr 2016 23:31:32 +0200
Subject: gre: do not pull header in ICMP error processing

iptunnel_pull_header expects that IP header was already pulled; with this
expectation, it pulls the tunnel header. This is not true in gre_err.
Furthermore, ipv4_update_pmtu and ipv4_redirect expect that skb->data points
to the IP header.

We cannot pull the tunnel header in this path. It's just a matter of not
calling iptunnel_pull_header - we don't need any of its effects.

Fixes: bda7bb463436 ("gre: Allow multiple protocol listener for gre protocol.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f502d34bcb40..205a2b8a5a84 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -179,6 +179,7 @@ static __be16 tnl_flags_to_gre_flags(__be16 tflags)
 	return flags;
 }
 
+/* Fills in tpi and returns header length to be pulled. */
 static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 			    bool *csum_err)
 {
@@ -238,7 +239,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return -EINVAL;
 		}
 	}
-	return iptunnel_pull_header(skb, hdr_len, tpi->proto, false);
+	return hdr_len;
 }
 
 static void ipgre_err(struct sk_buff *skb, u32 info,
@@ -341,7 +342,7 @@ static void gre_err(struct sk_buff *skb, u32 info)
 	struct tnl_ptk_info tpi;
 	bool csum_err = false;
 
-	if (parse_gre_header(skb, &tpi, &csum_err)) {
+	if (parse_gre_header(skb, &tpi, &csum_err) < 0) {
 		if (!csum_err)		/* ignore csum errors. */
 			return;
 	}
@@ -419,6 +420,7 @@ static int gre_rcv(struct sk_buff *skb)
 {
 	struct tnl_ptk_info tpi;
 	bool csum_err = false;
+	int hdr_len;
 
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
@@ -428,7 +430,10 @@ static int gre_rcv(struct sk_buff *skb)
 	}
 #endif
 
-	if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+	hdr_len = parse_gre_header(skb, &tpi, &csum_err);
+	if (hdr_len < 0)
+		goto drop;
+	if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false) < 0)
 		goto drop;
 
 	if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
-- 
cgit v1.2.3


From 6071bd1aa13ed9e41824bafad845b7b7f4df5cfd Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 2 May 2016 12:20:15 -0400
Subject: netem: Segment GSO packets on enqueue

This was recently reported to me, and reproduced on the latest net kernel,
when attempting to run netperf from a host that had a netem qdisc attached
to the egress interface:

[  788.073771] ---------------------[ cut here ]---------------------------
[  788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda()
[  788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962
data_len=0 gso_size=1448 gso_type=1 ip_summed=3
[  788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif
ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul
glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si
i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter
pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c
sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt
i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci
crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp
serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log
dm_mod
[  788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G        W
------------   3.10.0-327.el7.x86_64 #1
[  788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012
[  788.542260]  ffff880437c036b8 f7afc56532a53db9 ffff880437c03670
ffffffff816351f1
[  788.576332]  ffff880437c036a8 ffffffff8107b200 ffff880633e74200
ffff880231674000
[  788.611943]  0000000000000001 0000000000000003 0000000000000000
ffff880437c03710
[  788.647241] Call Trace:
[  788.658817]  <IRQ>  [<ffffffff816351f1>] dump_stack+0x19/0x1b
[  788.686193]  [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0
[  788.713803]  [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80
[  788.741314]  [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100
[  788.767018]  [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda
[  788.796117]  [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190
[  788.823392]  [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem]
[  788.854487]  [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570
[  788.880870]  [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0
...

The problem occurs because netem is not prepared to handle GSO packets (as it
uses skb_checksum_help in its enqueue path, which cannot manipulate these
frames).

The solution I think is to simply segment the skb in a simmilar fashion to the
way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes.
When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt
the first segment, and enqueue the remaining ones.

tested successfully by myself on the latest net kernel, to which this applies

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Jamal Hadi Salim <jhs@mojatatu.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: netem@lists.linux-foundation.org
CC: eric.dumazet@gmail.com
CC: stephen@networkplumber.org
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 9640bb39a5d2..4befe97a9034 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -395,6 +395,25 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 	sch->q.qlen++;
 }
 
+/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
+ * when we statistically choose to corrupt one, we instead segment it, returning
+ * the first packet to be corrupted, and re-enqueue the remaining frames
+ */
+static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct sk_buff *segs;
+	netdev_features_t features = netif_skb_features(skb);
+
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+
+	if (IS_ERR_OR_NULL(segs)) {
+		qdisc_reshape_fail(skb, sch);
+		return NULL;
+	}
+	consume_skb(skb);
+	return segs;
+}
+
 /*
  * Insert one skb into qdisc.
  * Note: parent depends on return value to account for queue length.
@@ -407,7 +426,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	/* We don't fill cb now as skb_unshare() may invalidate it */
 	struct netem_skb_cb *cb;
 	struct sk_buff *skb2;
+	struct sk_buff *segs = NULL;
+	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
+	int nb = 0;
 	int count = 1;
+	int rc = NET_XMIT_SUCCESS;
 
 	/* Random duplication */
 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
@@ -453,10 +476,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 * do it now in software before we mangle it.
 	 */
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+		if (skb_is_gso(skb)) {
+			segs = netem_segment(skb, sch);
+			if (!segs)
+				return NET_XMIT_DROP;
+		} else {
+			segs = skb;
+		}
+
+		skb = segs;
+		segs = segs->next;
+
 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
-		     skb_checksum_help(skb)))
-			return qdisc_drop(skb, sch);
+		     skb_checksum_help(skb))) {
+			rc = qdisc_drop(skb, sch);
+			goto finish_segs;
+		}
 
 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
 			1<<(prandom_u32() % 8);
@@ -516,6 +552,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		sch->qstats.requeues++;
 	}
 
+finish_segs:
+	if (segs) {
+		while (segs) {
+			skb2 = segs->next;
+			segs->next = NULL;
+			qdisc_skb_cb(segs)->pkt_len = segs->len;
+			last_len = segs->len;
+			rc = qdisc_enqueue(segs, sch);
+			if (rc != NET_XMIT_SUCCESS) {
+				if (net_xmit_drop_count(rc))
+					qdisc_qstats_drop(sch);
+			} else {
+				nb++;
+				len += last_len;
+			}
+			segs = skb2;
+		}
+		sch->q.qlen += nb;
+		if (nb > 1)
+			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
+	}
 	return NET_XMIT_SUCCESS;
 }
 
-- 
cgit v1.2.3


From 996e802187889f1cd412e6929c9344b92ccb78c4 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <aduyck@mirantis.com>
Date: Mon, 2 May 2016 09:25:10 -0700
Subject: net: Disable segmentation if checksumming is not supported

In the case of the mlx4 and mlx5 driver they do not support IPv6 checksum
offload for tunnels.  With this being the case we should disable GSO in
addition to the checksum offload features when we find that a device cannot
perform a checksum on a given packet type.

Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 77a71cd68535..5c925ac50b95 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2802,7 +2802,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
 
 	if (skb->ip_summed != CHECKSUM_NONE &&
 	    !can_checksum_protocol(features, type)) {
-		features &= ~NETIF_F_CSUM_MASK;
+		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 	} else if (illegal_highdma(skb->dev, skb)) {
 		features &= ~NETIF_F_SG;
 	}
-- 
cgit v1.2.3


From eb192840266fab3e3da644018121eed30153355d Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Mon, 2 May 2016 11:24:51 -0700
Subject: RDS:TCP: Synchronize rds_tcp_accept_one with rds_send_xmit when
 resetting t_sock

There is a race condition between rds_send_xmit -> rds_tcp_xmit
and the code that deals with resolution of duelling syns added
by commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an
outgoing socket in rds_tcp_accept_one()").

Specifically, we may end up derefencing a null pointer in rds_send_xmit
if we have the interleaving sequence:
           rds_tcp_accept_one                  rds_send_xmit

                                             conn is RDS_CONN_UP, so
    					 invoke rds_tcp_xmit

                                             tc = conn->c_transport_data
        rds_tcp_restore_callbacks
            /* reset t_sock */
    					 null ptr deref from tc->t_sock

The race condition can be avoided without adding the overhead of
additional locking in the xmit path: have rds_tcp_accept_one wait
for rds_tcp_xmit threads to complete before resetting callbacks.
The synchronization can be done in the same manner as rds_conn_shutdown().
First set the rds_conn_state to something other than RDS_CONN_UP
(so that new threads cannot get into rds_tcp_xmit()), then wait for
RDS_IN_XMIT to be cleared in the conn->c_flags indicating that any
threads in rds_tcp_xmit are done.

Fixes: 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an
outgoing socket in rds_tcp_accept_one()")
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c        |  2 +-
 net/rds/tcp_listen.c | 40 ++++++++++++++++++++++++----------------
 2 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 61ed2a8764ba..9134544941c2 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -127,7 +127,7 @@ void rds_tcp_restore_callbacks(struct socket *sock,
 
 /*
  * This is the only path that sets tc->t_sock.  Send and receive trust that
- * it is set.  The RDS_CONN_CONNECTED bit protects those paths from being
+ * it is set.  The RDS_CONN_UP bit protects those paths from being
  * called while it isn't set.
  */
 void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 0936a4a32b47..0896187243d6 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -115,24 +115,32 @@ int rds_tcp_accept_one(struct socket *sock)
 	 * rds_tcp_state_change() will do that cleanup
 	 */
 	rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
-	if (rs_tcp->t_sock &&
-	    ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
-		struct sock *nsk = new_sock->sk;
-
-		nsk->sk_user_data = NULL;
-		nsk->sk_prot->disconnect(nsk, 0);
-		tcp_done(nsk);
-		new_sock = NULL;
-		ret = 0;
-		goto out;
-	} else if (rs_tcp->t_sock) {
-		rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
-		conn->c_outgoing = 0;
-	}
-
 	rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
+	if (rs_tcp->t_sock) {
+		/* Need to resolve a duelling SYN between peers.
+		 * We have an outstanding SYN to this peer, which may
+		 * potentially have transitioned to the RDS_CONN_UP state,
+		 * so we must quiesce any send threads before resetting
+		 * c_transport_data.
+		 */
+		wait_event(conn->c_waitq,
+			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
+		if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
+			struct sock *nsk = new_sock->sk;
+
+			nsk->sk_user_data = NULL;
+			nsk->sk_prot->disconnect(nsk, 0);
+			tcp_done(nsk);
+			new_sock = NULL;
+			ret = 0;
+			goto out;
+		} else if (rs_tcp->t_sock) {
+			rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+			conn->c_outgoing = 0;
+		}
+	}
 	rds_tcp_set_callbacks(new_sock, conn);
-	rds_connect_complete(conn);
+	rds_connect_complete(conn); /* marks RDS_CONN_UP */
 	new_sock = NULL;
 	ret = 0;
 
-- 
cgit v1.2.3


From bd7c5f983f3185b75cc23bdd5dbc3a676aef3d1e Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Mon, 2 May 2016 11:24:52 -0700
Subject: RDS: TCP: Synchronize accept() and connect() paths on t_conn_lock.

An arbitration scheme for duelling SYNs is implemented as part of
commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an
outgoing socket in rds_tcp_accept_one()") which ensures that both nodes
involved will arrive at the same arbitration decision. However, this
needs to be synchronized with an outgoing SYN to be generated by
rds_tcp_conn_connect(). This commit achieves the synchronization
through the t_conn_lock mutex in struct rds_tcp_connection.

The rds_conn_state is checked in rds_tcp_conn_connect() after acquiring
the t_conn_lock mutex.  A SYN is sent out only if the RDS connection is
not already UP (an UP would indicate that rds_tcp_accept_one() has
completed 3WH, so no SYN needs to be generated).

Similarly, the rds_conn_state is checked in rds_tcp_accept_one() after
acquiring the t_conn_lock mutex. The only acceptable states (to
allow continuation of the arbitration logic) are UP (i.e., outgoing SYN
was SYN-ACKed by peer after it sent us the SYN) or CONNECTING (we sent
outgoing SYN before we saw incoming SYN).

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c         |  1 +
 net/rds/tcp.h         |  4 ++++
 net/rds/tcp_connect.c |  8 ++++++++
 net/rds/tcp_listen.c  | 30 ++++++++++++++++++++----------
 4 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 9134544941c2..86187dad1440 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -216,6 +216,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	if (!tc)
 		return -ENOMEM;
 
+	mutex_init(&tc->t_conn_lock);
 	tc->t_sock = NULL;
 	tc->t_tinc = NULL;
 	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 64f873c0c6b6..41c228300525 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -12,6 +12,10 @@ struct rds_tcp_connection {
 
 	struct list_head	t_tcp_node;
 	struct rds_connection   *conn;
+	/* t_conn_lock synchronizes the connection establishment between
+	 * rds_tcp_accept_one and rds_tcp_conn_connect
+	 */
+	struct mutex		t_conn_lock;
 	struct socket		*t_sock;
 	void			*t_orig_write_space;
 	void			*t_orig_data_ready;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 5cb16875c460..49a3fcfed360 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -78,7 +78,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
 	struct socket *sock = NULL;
 	struct sockaddr_in src, dest;
 	int ret;
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+
+	mutex_lock(&tc->t_conn_lock);
 
+	if (rds_conn_up(conn)) {
+		mutex_unlock(&tc->t_conn_lock);
+		return 0;
+	}
 	ret = sock_create_kern(rds_conn_net(conn), PF_INET,
 			       SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (ret < 0)
@@ -120,6 +127,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
 	}
 
 out:
+	mutex_unlock(&tc->t_conn_lock);
 	if (sock)
 		sock_release(sock);
 	return ret;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 0896187243d6..be263cdf268b 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -76,7 +76,9 @@ int rds_tcp_accept_one(struct socket *sock)
 	struct rds_connection *conn;
 	int ret;
 	struct inet_sock *inet;
-	struct rds_tcp_connection *rs_tcp;
+	struct rds_tcp_connection *rs_tcp = NULL;
+	int conn_state;
+	struct sock *nsk;
 
 	ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
 			       sock->sk->sk_type, sock->sk->sk_protocol,
@@ -116,6 +118,10 @@ int rds_tcp_accept_one(struct socket *sock)
 	 */
 	rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
 	rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
+	mutex_lock(&rs_tcp->t_conn_lock);
+	conn_state = rds_conn_state(conn);
+	if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP)
+		goto rst_nsk;
 	if (rs_tcp->t_sock) {
 		/* Need to resolve a duelling SYN between peers.
 		 * We have an outstanding SYN to this peer, which may
@@ -126,14 +132,7 @@ int rds_tcp_accept_one(struct socket *sock)
 		wait_event(conn->c_waitq,
 			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
 		if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
-			struct sock *nsk = new_sock->sk;
-
-			nsk->sk_user_data = NULL;
-			nsk->sk_prot->disconnect(nsk, 0);
-			tcp_done(nsk);
-			new_sock = NULL;
-			ret = 0;
-			goto out;
+			goto rst_nsk;
 		} else if (rs_tcp->t_sock) {
 			rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
 			conn->c_outgoing = 0;
@@ -143,8 +142,19 @@ int rds_tcp_accept_one(struct socket *sock)
 	rds_connect_complete(conn); /* marks RDS_CONN_UP */
 	new_sock = NULL;
 	ret = 0;
-
+	goto out;
+rst_nsk:
+	/* reset the newly returned accept sock and bail */
+	nsk = new_sock->sk;
+	rds_tcp_stats_inc(s_tcp_listen_closed_stale);
+	nsk->sk_user_data = NULL;
+	nsk->sk_prot->disconnect(nsk, 0);
+	tcp_done(nsk);
+	new_sock = NULL;
+	ret = 0;
 out:
+	if (rs_tcp)
+		mutex_unlock(&rs_tcp->t_conn_lock);
 	if (new_sock)
 		sock_release(new_sock);
 	return ret;
-- 
cgit v1.2.3


From 79e8dc8b80bff0bc5bbb90ca5e73044bf207c8ac Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 3 May 2016 09:58:27 +0200
Subject: ipv6/ila: fix nlsize calculation for lwtunnel

The handler 'ila_fill_encap_info' adds one attribute: ILA_ATTR_LOCATOR.

Fixes: 65d7ab8de582 ("net: Identifier Locator Addressing module")
CC: Tom Herbert <tom@herbertland.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ila/ila_lwt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 2ae3c4fd8aab..41f18de5dcc2 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -120,8 +120,7 @@ nla_put_failure:
 
 static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
 {
-	/* No encapsulation overhead */
-	return 0;
+	return nla_total_size(sizeof(u64)); /* ILA_ATTR_LOCATOR */
 }
 
 static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
-- 
cgit v1.2.3