433 files changed, 24605 insertions, 3987 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt
index 392bef5bd399..93798c02e28b 100644
--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
@@ -110,3 +110,51 @@ Description:	When new NVM image is written to the non-active NVM
 		is directly the status value from the DMA configuration
 		based mailbox before the device is power cycled. Writing
 		0 here clears the status.
+
+What:		/sys/bus/thunderbolt/devices/<xdomain>.<service>/key
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	This contains name of the property directory the XDomain
+		service exposes. This entry describes the protocol in
+		question. Following directories are already reserved by
+		the Apple XDomain specification:
+
+		network:  IP/ethernet over Thunderbolt
+		targetdm: Target disk mode protocol over Thunderbolt
+		extdisp:  External display mode protocol over Thunderbolt
+
+What:		/sys/bus/thunderbolt/devices/<xdomain>.<service>/modalias
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	Stores the same MODALIAS value emitted by uevent for
+		the XDomain service. Format: tbtsvc:kSpNvNrN
+
+What:		/sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcid
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	This contains XDomain protocol identifier the XDomain
+		service supports.
+
+What:		/sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcvers
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	This contains XDomain protocol version the XDomain
+		service supports.
+
+What:		/sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcrevs
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	This contains XDomain software version the XDomain
+		service supports.
+
+What:		/sys/bus/thunderbolt/devices/<xdomain>.<service>/prtcstns
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	This contains XDomain service specific settings as
+		bitmask. Format: %x
diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index 6a4cd1f159ca..5c62d11d77e8 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -197,3 +197,27 @@ information is missing.
 
 To recover from this mode, one needs to flash a valid NVM image to the
 host host controller in the same way it is done in the previous chapter.
+
+Networking over Thunderbolt cable
+---------------------------------
+Thunderbolt technology allows software communication across two hosts
+connected by a Thunderbolt cable.
+
+It is possible to tunnel any kind of traffic over Thunderbolt link but
+currently we only support Apple ThunderboltIP protocol.
+
+If the other host is running Windows or macOS only thing you need to
+do is to connect Thunderbolt cable between the two hosts, the
+``thunderbolt-net`` is loaded automatically. If the other host is also
+Linux you should load ``thunderbolt-net`` manually on one host (it does
+not matter which one)::
+
+  # modprobe thunderbolt-net
+
+This triggers module load on the other host automatically. If the driver
+is built-in to the kernel image, there is no need to do anything.
+
+The driver will create one virtual ethernet interface per Thunderbolt
+port which are named like ``thunderbolt0`` and so on. From this point
+you can either use standard userspace tools like ``ifconfig`` to
+configure the interface or let your GUI to handle it automatically.
diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt
index 16723535e1aa..c902261893b9 100644
--- a/Documentation/devicetree/bindings/net/renesas,ravb.txt
+++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt
@@ -17,6 +17,8 @@ Required properties:
 
       - "renesas,etheravb-r8a7795" for the R8A7795 SoC.
       - "renesas,etheravb-r8a7796" for the R8A7796 SoC.
+      - "renesas,etheravb-r8a77970" for the R8A77970 SoC.
+      - "renesas,etheravb-r8a77995" for the R8A77995 SoC.
       - "renesas,etheravb-rcar-gen3" as a fallback for the above
 		R-Car Gen3 devices.
 
@@ -40,7 +42,7 @@ Optional properties:
 - interrupt-parent: the phandle for the interrupt controller that services
 		    interrupts for this device.
 - interrupt-names: A list of interrupt names.
-		   For the R8A779[56] SoCs this property is mandatory;
+		   For the R-Car Gen 3 SoCs this property is mandatory;
 		   it should include one entry per channel, named "ch%u",
 		   where %u is the channel number ranging from 0 to 24.
 		   For other SoCs this property is optional; if present
diff --git a/MAINTAINERS b/MAINTAINERS
index cc42c838ab4f..d0cbb3d7a0ca 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2725,7 +2725,7 @@ F:	net/core/filter.c
 F:	net/sched/act_bpf.c
 F:	net/sched/cls_bpf.c
 F:	samples/bpf/
-F:	tools/net/bpf*
+F:	tools/bpf/
 F:	tools/testing/selftests/bpf/
 
 BROADCOM B44 10/100 ETHERNET DRIVER
@@ -9428,7 +9428,6 @@ F:	include/uapi/linux/in.h
 F:	include/uapi/linux/net.h
 F:	include/uapi/linux/netdevice.h
 F:	include/uapi/linux/net_namespace.h
-F:	tools/net/
 F:	tools/testing/selftests/net/
 F:	lib/random32.c
 
@@ -13290,6 +13289,15 @@ M:	Mika Westerberg <mika.westerberg@linux.intel.com>
 M:	Yehezkel Bernat <yehezkel.bernat@intel.com>
 S:	Maintained
 F:	drivers/thunderbolt/
+F:	include/linux/thunderbolt.h
+
+THUNDERBOLT NETWORK DRIVER
+M:	Michael Jamet <michael.jamet@intel.com>
+M:	Mika Westerberg <mika.westerberg@linux.intel.com>
+M:	Yehezkel Bernat <yehezkel.bernat@intel.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/thunderbolt.c
 
 THUNDERX GPIO DRIVER
 M:	David Daney <david.daney@cavium.com>
diff --git a/arch/blackfin/mach-bf518/boards/ezbrd.c b/arch/blackfin/mach-bf518/boards/ezbrd.c
index d022112927c2..c51d1b810ac3 100644
--- a/arch/blackfin/mach-bf518/boards/ezbrd.c
+++ b/arch/blackfin/mach-bf518/boards/ezbrd.c
@@ -25,7 +25,6 @@
 #include <asm/dpmc.h>
 #include <asm/bfin_sdh.h>
 #include <linux/spi/ad7877.h>
-#include <net/dsa.h>
 
 /*
  * Name the Board for the /proc/cpuinfo
@@ -105,11 +104,7 @@ static const unsigned short bfin_mac_peripherals[] = {
 
 static struct bfin_phydev_platform_data bfin_phydev_data[] = {
 	{
-#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M)
-		.addr = 3,
-#else
 		.addr = 1,
-#endif
 		.irq = IRQ_MAC_PHYINT,
 	},
 };
@@ -119,9 +114,6 @@ static struct bfin_mii_bus_platform_data bfin_mii_bus_data = {
 	.phydev_data = bfin_phydev_data,
 	.phy_mode = PHY_INTERFACE_MODE_MII,
 	.mac_peripherals = bfin_mac_peripherals,
-#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M)
-	.phy_mask = 0xfff7, /* Only probe the port phy connect to the on chip MAC */
-#endif
 	.vlan1_mask = 1,
 	.vlan2_mask = 2,
 };
@@ -140,29 +132,6 @@ static struct platform_device bfin_mac_device = {
 	}
 };
 
-#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M)
-static struct dsa_chip_data ksz8893m_switch_chip_data = {
-	.mii_bus = &bfin_mii_bus.dev,
-	.port_names = {
-		NULL,
-		"eth%d",
-		"eth%d",
-		"cpu",
-	},
-};
-static struct dsa_platform_data ksz8893m_switch_data = {
-	.nr_chips = 1,
-	.netdev = &bfin_mac_device.dev,
-	.chip = &ksz8893m_switch_chip_data,
-};
-
-static struct platform_device ksz8893m_switch_device = {
-	.name		= "dsa",
-	.id		= 0,
-	.num_resources	= 0,
-	.dev.platform_data = &ksz8893m_switch_data,
-};
-#endif
 #endif
 
 #if IS_ENABLED(CONFIG_MTD_M25P80)
@@ -228,19 +197,6 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = {
 	},
 #endif
 
-#if IS_ENABLED(CONFIG_BFIN_MAC)
-#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M)
-	{
-		.modalias = "ksz8893m",
-		.max_speed_hz = 5000000,
-		.bus_num = 0,
-		.chip_select = 1,
-		.platform_data = NULL,
-		.mode = SPI_MODE_3,
-	},
-#endif
-#endif
-
 #if IS_ENABLED(CONFIG_MMC_SPI)
 	{
 		.modalias = "mmc_spi",
@@ -714,9 +670,6 @@ static struct platform_device *stamp_devices[] __initdata = {
 #if IS_ENABLED(CONFIG_BFIN_MAC)
 	&bfin_mii_bus,
 	&bfin_mac_device,
-#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M)
-	&ksz8893m_switch_device,
-#endif
 #endif
 
 #if IS_ENABLED(CONFIG_SPI_BFIN5XX)
diff --git a/arch/blackfin/mach-bf518/boards/tcm-bf518.c b/arch/blackfin/mach-bf518/boards/tcm-bf518.c
index 240d5cb1f02c..37d868085f6a 100644
--- a/arch/blackfin/mach-bf518/boards/tcm-bf518.c
+++ b/arch/blackfin/mach-bf518/boards/tcm-bf518.c
@@ -25,7 +25,6 @@
 #include <asm/dpmc.h>
 #include <asm/bfin_sdh.h>
 #include <linux/spi/ad7877.h>
-#include <net/dsa.h>
 
 /*
  * Name the Board for the /proc/cpuinfo
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index aba0d652095b..0936da592e12 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -483,6 +483,18 @@ config FUJITSU_ES
 	  This driver provides support for Extended Socket network device
           on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series.
 
+config THUNDERBOLT_NET
+	tristate "Networking over Thunderbolt cable"
+	depends on THUNDERBOLT && INET
+	help
+	  Select this if you want to create network between two
+	  computers over a Thunderbolt cable. The driver supports Apple
+	  ThunderboltIP protocol and allows communication with any host
+	  supporting the same protocol including Windows and macOS.
+
+	  To compile this driver a module, choose M here. The module will be
+	  called thunderbolt-net.
+
 source "drivers/net/hyperv/Kconfig"
 
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 8dff900085d6..7c8f4dd3a7c5 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -74,3 +74,6 @@ obj-$(CONFIG_HYPERV_NET) += hyperv/
 obj-$(CONFIG_NTB_NETDEV) += ntb_netdev.o
 
 obj-$(CONFIG_FUJITSU_ES) += fjes/
+
+thunderbolt-net-y += thunderbolt.o
+obj-$(CONFIG_THUNDERBOLT_NET) += thunderbolt-net.o
diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c
index 486e1e6997fc..caf04284711a 100644
--- a/drivers/net/appletalk/cops.c
+++ b/drivers/net/appletalk/cops.c
@@ -424,9 +424,7 @@ static int cops_open(struct net_device *dev)
 		 */
 		if(lp->board==TANGENT)	/* Poll 20 times per second */
 		{
-		    init_timer(&cops_timer);
-		    cops_timer.function = cops_poll;
-		    cops_timer.data 	= (unsigned long)dev;
+		    setup_timer(&cops_timer, cops_poll, (unsigned long)dev);
 		    cops_timer.expires 	= jiffies + HZ/20;
 		    add_timer(&cops_timer);
 		} 
diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c
index ac755d2950a6..e4aa374caa4d 100644
--- a/drivers/net/appletalk/ltpc.c
+++ b/drivers/net/appletalk/ltpc.c
@@ -1165,9 +1165,7 @@ struct net_device * __init ltpc_probe(void)
 		dev->irq = 0;
 		/* polled mode -- 20 times per second */
 		/* this is really, really slow... should it poll more often? */
-		init_timer(&ltpc_timer);
-		ltpc_timer.function=ltpc_poll;
-		ltpc_timer.data = (unsigned long) dev;
+		setup_timer(&ltpc_timer, ltpc_poll, (unsigned long)dev);
 
 		ltpc_timer.expires = jiffies + HZ/20;
 		add_timer(&ltpc_timer);
diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c
index fcfccbb3d9a2..13236b2cdf13 100644
--- a/drivers/net/arcnet/arcnet.c
+++ b/drivers/net/arcnet/arcnet.c
@@ -450,9 +450,7 @@ struct net_device *alloc_arcdev(const char *name)
 
 		lp->dev = dev;
 		spin_lock_init(&lp->lock);
-		init_timer(&lp->timer);
-		lp->timer.data = (unsigned long) dev;
-		lp->timer.function = arcnet_timer;
+		setup_timer(&lp->timer, arcnet_timer, (unsigned long)dev);
 	}
 
 	return dev;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index c99dc59d729b..172eeeb68152 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1217,14 +1217,15 @@ static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
 	}
 }
 
-static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave)
+static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
+				      struct netlink_ext_ack *extack)
 {
 	struct netdev_lag_upper_info lag_upper_info;
 	int err;
 
 	lag_upper_info.tx_type = bond_lag_tx_type(bond);
 	err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
-					   &lag_upper_info);
+					   &lag_upper_info, extack);
 	if (err)
 		return err;
 	rtmsg_ifinfo(RTM_NEWLINK, slave->dev, IFF_SLAVE, GFP_KERNEL);
@@ -1328,7 +1329,8 @@ void bond_lower_state_changed(struct slave *slave)
 }
 
 /* enslave device <slave> to bond device <master> */
-int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
+int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
+		 struct netlink_ext_ack *extack)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
@@ -1346,12 +1348,14 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 
 	/* already in-use? */
 	if (netdev_is_rx_handler_busy(slave_dev)) {
+		NL_SET_ERR_MSG(extack, "Device is in use and cannot be enslaved");
 		netdev_err(bond_dev,
 			   "Error: Device is in use and cannot be enslaved\n");
 		return -EBUSY;
 	}
 
 	if (bond_dev == slave_dev) {
+		NL_SET_ERR_MSG(extack, "Cannot enslave bond to itself.");
 		netdev_err(bond_dev, "cannot enslave bond to itself.\n");
 		return -EPERM;
 	}
@@ -1362,6 +1366,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		netdev_dbg(bond_dev, "%s is NETIF_F_VLAN_CHALLENGED\n",
 			   slave_dev->name);
 		if (vlan_uses_dev(bond_dev)) {
+			NL_SET_ERR_MSG(extack, "Can not enslave VLAN challenged device to VLAN enabled bond");
 			netdev_err(bond_dev, "Error: cannot enslave VLAN challenged slave %s on VLAN enabled bond %s\n",
 				   slave_dev->name, bond_dev->name);
 			return -EPERM;
@@ -1381,6 +1386,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	 * enslaving it; the old ifenslave will not.
 	 */
 	if (slave_dev->flags & IFF_UP) {
+		NL_SET_ERR_MSG(extack, "Device can not be enslaved while up");
 		netdev_err(bond_dev, "%s is up - this may be due to an out of date ifenslave\n",
 			   slave_dev->name);
 		return -EPERM;
@@ -1421,6 +1427,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 						 bond_dev);
 		}
 	} else if (bond_dev->type != slave_dev->type) {
+		NL_SET_ERR_MSG(extack, "Device type is different from other slaves");
 		netdev_err(bond_dev, "%s ether type (%d) is different from other slaves (%d), can not enslave it\n",
 			   slave_dev->name, slave_dev->type, bond_dev->type);
 		return -EINVAL;
@@ -1428,6 +1435,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 
 	if (slave_dev->type == ARPHRD_INFINIBAND &&
 	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
+		NL_SET_ERR_MSG(extack, "Only active-backup mode is supported for infiniband slaves");
 		netdev_warn(bond_dev, "Type (%d) supports only active-backup mode\n",
 			    slave_dev->type);
 		res = -EOPNOTSUPP;
@@ -1443,6 +1451,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 				bond->params.fail_over_mac = BOND_FOM_ACTIVE;
 				netdev_warn(bond_dev, "Setting fail_over_mac to active for active-backup mode\n");
 			} else {
+				NL_SET_ERR_MSG(extack, "Slave device does not support setting the MAC address, but fail_over_mac is not set to active");
 				netdev_err(bond_dev, "The slave device specified does not support setting the MAC address, but fail_over_mac is not set to active\n");
 				res = -EOPNOTSUPP;
 				goto err_undo_flags;
@@ -1709,7 +1718,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		goto err_detach;
 	}
 
-	res = bond_master_upper_dev_link(bond, new_slave);
+	res = bond_master_upper_dev_link(bond, new_slave, extack);
 	if (res) {
 		netdev_dbg(bond_dev, "Error %d calling bond_master_upper_dev_link\n", res);
 		goto err_unregister;
@@ -2491,7 +2500,8 @@ int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
 	struct slave *curr_active_slave, *curr_arp_slave;
 	unsigned char *arp_ptr;
 	__be32 sip, tip;
-	int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
+	int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
+	unsigned int alen;
 
 	if (!slave_do_arp_validate(bond, slave)) {
 		if ((slave_do_arp_validate_only(bond) && is_arp) ||
@@ -3072,7 +3082,16 @@ static int bond_slave_netdev_event(unsigned long event,
 		break;
 	case NETDEV_UP:
 	case NETDEV_CHANGE:
-		bond_update_speed_duplex(slave);
+		/* For 802.3ad mode only:
+		 * Getting invalid Speed/Duplex values here will put slave
+		 * in weird state. So mark it as link-down for the time
+		 * being and let link-monitoring (miimon) set it right when
+		 * correct speeds/duplex are available.
+		 */
+		if (bond_update_speed_duplex(slave) &&
+		    BOND_MODE(bond) == BOND_MODE_8023AD)
+			slave->link = BOND_LINK_DOWN;
+
 		if (BOND_MODE(bond) == BOND_MODE_8023AD)
 			bond_3ad_adapter_speed_duplex_changed(slave);
 		/* Fallthrough */
@@ -3482,7 +3501,7 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd
 	switch (cmd) {
 	case BOND_ENSLAVE_OLD:
 	case SIOCBONDENSLAVE:
-		res = bond_enslave(bond_dev, slave_dev);
+		res = bond_enslave(bond_dev, slave_dev, NULL);
 		break;
 	case BOND_RELEASE_OLD:
 	case SIOCBONDRELEASE:
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 5931aa2fe997..8a9b085c2a98 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -1383,7 +1383,7 @@ static int bond_option_slaves_set(struct bonding *bond,
 	switch (command[0]) {
 	case '+':
 		netdev_dbg(bond->dev, "Adding slave %s\n", dev->name);
-		ret = bond_enslave(bond->dev, dev);
+		ret = bond_enslave(bond->dev, dev, NULL);
 		break;
 
 	case '-':
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index 438966bf51c2..fed75e75207a 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -1211,17 +1211,14 @@ static int cfhsi_open(struct net_device *ndev)
 	init_waitqueue_head(&cfhsi->flush_fifo_wait);
 
 	/* Setup the inactivity timer. */
-	init_timer(&cfhsi->inactivity_timer);
-	cfhsi->inactivity_timer.data = (unsigned long)cfhsi;
-	cfhsi->inactivity_timer.function = cfhsi_inactivity_tout;
+	setup_timer(&cfhsi->inactivity_timer, cfhsi_inactivity_tout,
+		    (unsigned long)cfhsi);
 	/* Setup the slowpath RX timer. */
-	init_timer(&cfhsi->rx_slowpath_timer);
-	cfhsi->rx_slowpath_timer.data = (unsigned long)cfhsi;
-	cfhsi->rx_slowpath_timer.function = cfhsi_rx_slowpath;
+	setup_timer(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath,
+		    (unsigned long)cfhsi);
 	/* Setup the aggregation timer. */
-	init_timer(&cfhsi->aggregation_timer);
-	cfhsi->aggregation_timer.data = (unsigned long)cfhsi;
-	cfhsi->aggregation_timer.function = cfhsi_aggregation_tout;
+	setup_timer(&cfhsi->aggregation_timer, cfhsi_aggregation_tout,
+		    (unsigned long)cfhsi);
 
 	/* Activate HSI interface. */
 	res = cfhsi->ops->cfhsi_up(cfhsi->ops);
diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c
index a7be12d9a139..8570cfdaea75 100644
--- a/drivers/net/can/grcan.c
+++ b/drivers/net/can/grcan.c
@@ -1626,13 +1626,11 @@ static int grcan_setup_netdev(struct platform_device *ofdev,
 	spin_lock_init(&priv->lock);
 
 	if (priv->need_txbug_workaround) {
-		init_timer(&priv->rr_timer);
-		priv->rr_timer.function = grcan_running_reset;
-		priv->rr_timer.data = (unsigned long)dev;
+		setup_timer(&priv->rr_timer, grcan_running_reset,
+			    (unsigned long)dev);
 
-		init_timer(&priv->hang_timer);
-		priv->hang_timer.function = grcan_initiate_running_reset;
-		priv->hang_timer.data = (unsigned long)dev;
+		setup_timer(&priv->hang_timer, grcan_initiate_running_reset,
+			    (unsigned long)dev);
 	}
 
 	netif_napi_add(dev, &priv->napi, grcan_poll, GRCAN_NAPI_WEIGHT);
diff --git a/drivers/net/can/sja1000/peak_pcmcia.c b/drivers/net/can/sja1000/peak_pcmcia.c
index dd56133cc461..4b8758e10bd4 100644
--- a/drivers/net/can/sja1000/peak_pcmcia.c
+++ b/drivers/net/can/sja1000/peak_pcmcia.c
@@ -692,9 +692,7 @@ static int pcan_probe(struct pcmcia_device *pdev)
 	}
 
 	/* init the timer which controls the leds */
-	init_timer(&card->led_timer);
-	card->led_timer.function = pcan_led_timer;
-	card->led_timer.data = (unsigned long)card;
+	setup_timer(&card->led_timer, pcan_led_timer, (unsigned long)card);
 
 	/* request the given irq */
 	err = request_irq(pdev->irq, &pcan_isr, IRQF_SHARED, PCC_NAME, card);
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c
index 838545ce468d..7e10dbdded28 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c
@@ -798,9 +798,8 @@ static int pcan_usb_init(struct peak_usb_device *dev)
 	int err;
 
 	/* initialize a timer needed to wait for hardware restart */
-	init_timer(&pdev->restart_timer);
-	pdev->restart_timer.function = pcan_usb_restart;
-	pdev->restart_timer.data = (unsigned long)dev;
+	setup_timer(&pdev->restart_timer, pcan_usb_restart,
+		    (unsigned long)dev);
 
 	/*
 	 * explicit use of dev_xxx() instead of netdev_xxx() here:
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 274f3679f33d..d4ce092def83 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -484,7 +484,7 @@ static int b53_fast_age_vlan(struct b53_device *dev, u16 vid)
 	return b53_flush_arl(dev, FAST_AGE_VLAN);
 }
 
-static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
+void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
 {
 	struct b53_device *dev = ds->priv;
 	unsigned int i;
@@ -500,9 +500,9 @@ static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
 		b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(i), pvlan);
 	}
 }
+EXPORT_SYMBOL(b53_imp_vlan_setup);
 
-static int b53_enable_port(struct dsa_switch *ds, int port,
-			   struct phy_device *phy)
+int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
 	struct b53_device *dev = ds->priv;
 	unsigned int cpu_port = dev->cpu_port;
@@ -523,11 +523,15 @@ static int b53_enable_port(struct dsa_switch *ds, int port,
 
 	b53_imp_vlan_setup(ds, cpu_port);
 
+	/* If EEE was enabled, restore it */
+	if (dev->ports[port].eee.eee_enabled)
+		b53_eee_enable_set(ds, port, true);
+
 	return 0;
 }
+EXPORT_SYMBOL(b53_enable_port);
 
-static void b53_disable_port(struct dsa_switch *ds, int port,
-			     struct phy_device *phy)
+void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 {
 	struct b53_device *dev = ds->priv;
 	u8 reg;
@@ -537,20 +541,67 @@ static void b53_disable_port(struct dsa_switch *ds, int port,
 	reg |= PORT_CTRL_RX_DISABLE | PORT_CTRL_TX_DISABLE;
 	b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg);
 }
+EXPORT_SYMBOL(b53_disable_port);
 
-static void b53_enable_cpu_port(struct b53_device *dev)
+void b53_brcm_hdr_setup(struct dsa_switch *ds, int port)
+{
+	struct b53_device *dev = ds->priv;
+	u8 hdr_ctl, val;
+	u16 reg;
+
+	/* Resolve which bit controls the Broadcom tag */
+	switch (port) {
+	case 8:
+		val = BRCM_HDR_P8_EN;
+		break;
+	case 7:
+		val = BRCM_HDR_P7_EN;
+		break;
+	case 5:
+		val = BRCM_HDR_P5_EN;
+		break;
+	default:
+		val = 0;
+		break;
+	}
+
+	/* Enable Broadcom tags for IMP port */
+	b53_read8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, &hdr_ctl);
+	hdr_ctl |= val;
+	b53_write8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, hdr_ctl);
+
+	/* Registers below are only accessible on newer devices */
+	if (!is58xx(dev))
+		return;
+
+	/* Enable reception Broadcom tag for CPU TX (switch RX) to
+	 * allow us to tag outgoing frames
+	 */
+	b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, &reg);
+	reg &= ~BIT(port);
+	b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, reg);
+
+	/* Enable transmission of Broadcom tags from the switch (CPU RX) to
+	 * allow delivering frames to the per-port net_devices
+	 */
+	b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, &reg);
+	reg &= ~BIT(port);
+	b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, reg);
+}
+EXPORT_SYMBOL(b53_brcm_hdr_setup);
+
+static void b53_enable_cpu_port(struct b53_device *dev, int port)
 {
-	unsigned int cpu_port = dev->cpu_port;
 	u8 port_ctrl;
 
 	/* BCM5325 CPU port is at 8 */
-	if ((is5325(dev) || is5365(dev)) && cpu_port == B53_CPU_PORT_25)
-		cpu_port = B53_CPU_PORT;
+	if ((is5325(dev) || is5365(dev)) && port == B53_CPU_PORT_25)
+		port = B53_CPU_PORT;
 
 	port_ctrl = PORT_CTRL_RX_BCST_EN |
 		    PORT_CTRL_RX_MCST_EN |
 		    PORT_CTRL_RX_UCST_EN;
-	b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(cpu_port), port_ctrl);
+	b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), port_ctrl);
 }
 
 static void b53_enable_mib(struct b53_device *dev)
@@ -816,12 +867,13 @@ static int b53_setup(struct dsa_switch *ds)
 	if (ret)
 		dev_err(ds->dev, "failed to apply configuration\n");
 
+	/* Configure IMP/CPU port, disable unused ports. Enabled
+	 * ports will be configured with .port_enable
+	 */
 	for (port = 0; port < dev->num_ports; port++) {
-		if (BIT(port) & ds->enabled_port_mask)
-			b53_enable_port(ds, port, NULL);
-		else if (dsa_is_cpu_port(ds, port))
-			b53_enable_cpu_port(dev);
-		else
+		if (dsa_is_cpu_port(ds, port))
+			b53_enable_cpu_port(dev, port);
+		else if (!(BIT(port) & ds->enabled_port_mask))
 			b53_disable_port(ds, port, NULL);
 	}
 
@@ -832,6 +884,7 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
 			    struct phy_device *phydev)
 {
 	struct b53_device *dev = ds->priv;
+	struct ethtool_eee *p = &dev->ports[port].eee;
 	u8 rgmii_ctrl = 0, reg = 0, off;
 
 	if (!phy_is_pseudo_fixed_link(phydev))
@@ -953,6 +1006,9 @@ static void b53_adjust_link(struct dsa_switch *ds, int port,
 			b53_write8(dev, B53_CTRL_PAGE, po_reg, gmii_po);
 		}
 	}
+
+	/* Re-negotiate EEE if it was enabled already */
+	p->eee_enabled = b53_eee_init(ds, port, phydev);
 }
 
 int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering)
@@ -1280,7 +1336,7 @@ EXPORT_SYMBOL(b53_fdb_dump);
 int b53_br_join(struct dsa_switch *ds, int port, struct net_device *br)
 {
 	struct b53_device *dev = ds->priv;
-	s8 cpu_port = ds->dst->cpu_dp->index;
+	s8 cpu_port = ds->ports[port].cpu_dp->index;
 	u16 pvlan, reg;
 	unsigned int i;
 
@@ -1326,7 +1382,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *br)
 {
 	struct b53_device *dev = ds->priv;
 	struct b53_vlan *vl = &dev->vlans[0];
-	s8 cpu_port = ds->dst->cpu_dp->index;
+	s8 cpu_port = ds->ports[port].cpu_dp->index;
 	unsigned int i;
 	u16 pvlan, reg, pvid;
 
@@ -1484,6 +1540,69 @@ void b53_mirror_del(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL(b53_mirror_del);
 
+void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable)
+{
+	struct b53_device *dev = ds->priv;
+	u16 reg;
+
+	b53_read16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, &reg);
+	if (enable)
+		reg |= BIT(port);
+	else
+		reg &= ~BIT(port);
+	b53_write16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, reg);
+}
+EXPORT_SYMBOL(b53_eee_enable_set);
+
+
+/* Returns 0 if EEE was not enabled, or 1 otherwise
+ */
+int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy)
+{
+	int ret;
+
+	ret = phy_init_eee(phy, 0);
+	if (ret)
+		return 0;
+
+	b53_eee_enable_set(ds, port, true);
+
+	return 1;
+}
+EXPORT_SYMBOL(b53_eee_init);
+
+int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e)
+{
+	struct b53_device *dev = ds->priv;
+	struct ethtool_eee *p = &dev->ports[port].eee;
+	u16 reg;
+
+	if (is5325(dev) || is5365(dev))
+		return -EOPNOTSUPP;
+
+	b53_read16(dev, B53_EEE_PAGE, B53_EEE_LPI_INDICATE, &reg);
+	e->eee_enabled = p->eee_enabled;
+	e->eee_active = !!(reg & BIT(port));
+
+	return 0;
+}
+EXPORT_SYMBOL(b53_get_mac_eee);
+
+int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e)
+{
+	struct b53_device *dev = ds->priv;
+	struct ethtool_eee *p = &dev->ports[port].eee;
+
+	if (is5325(dev) || is5365(dev))
+		return -EOPNOTSUPP;
+
+	p->eee_enabled = e->eee_enabled;
+	b53_eee_enable_set(ds, port, e->eee_enabled);
+
+	return 0;
+}
+EXPORT_SYMBOL(b53_set_mac_eee);
+
 static const struct dsa_switch_ops b53_switch_ops = {
 	.get_tag_protocol	= b53_get_tag_protocol,
 	.setup			= b53_setup,
@@ -1495,6 +1614,8 @@ static const struct dsa_switch_ops b53_switch_ops = {
 	.adjust_link		= b53_adjust_link,
 	.port_enable		= b53_enable_port,
 	.port_disable		= b53_disable_port,
+	.get_mac_eee		= b53_get_mac_eee,
+	.set_mac_eee		= b53_set_mac_eee,
 	.port_bridge_join	= b53_br_join,
 	.port_bridge_leave	= b53_br_leave,
 	.port_stp_state_set	= b53_br_set_stp_state,
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 01bd8cbe9a3f..603c66d240d8 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -70,6 +70,7 @@ enum {
 
 struct b53_port {
 	u16		vlan_ctl_mask;
+	struct ethtool_eee eee;
 };
 
 struct b53_vlan {
@@ -186,11 +187,6 @@ static inline int is58xx(struct b53_device *dev)
 #define B53_CPU_PORT_25	5
 #define B53_CPU_PORT	8
 
-static inline int is_cpu_port(struct b53_device *dev, int port)
-{
-	return dev->cpu_port;
-}
-
 struct b53_device *b53_switch_alloc(struct device *base,
 				    const struct b53_io_ops *ops,
 				    void *priv);
@@ -204,119 +200,30 @@ static inline void b53_switch_remove(struct b53_device *dev)
 	dsa_unregister_switch(dev->ds);
 }
 
-static inline int b53_read8(struct b53_device *dev, u8 page, u8 reg, u8 *val)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->read8(dev, page, reg, val);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
+#define b53_build_op(type_op_size, val_type)				\
+static inline int b53_##type_op_size(struct b53_device *dev, u8 page,	\
+				     u8 reg, val_type val)		\
+{									\
+	int ret;							\
+									\
+	mutex_lock(&dev->reg_mutex);					\
+	ret = dev->ops->type_op_size(dev, page, reg, val);		\
+	mutex_unlock(&dev->reg_mutex);					\
+									\
+	return ret;							\
 }
 
-static inline int b53_read16(struct b53_device *dev, u8 page, u8 reg, u16 *val)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->read16(dev, page, reg, val);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
-
-static inline int b53_read32(struct b53_device *dev, u8 page, u8 reg, u32 *val)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->read32(dev, page, reg, val);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
-
-static inline int b53_read48(struct b53_device *dev, u8 page, u8 reg, u64 *val)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->read48(dev, page, reg, val);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
-
-static inline int b53_read64(struct b53_device *dev, u8 page, u8 reg, u64 *val)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->read64(dev, page, reg, val);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
+b53_build_op(read8, u8 *);
+b53_build_op(read16, u16 *);
+b53_build_op(read32, u32 *);
+b53_build_op(read48, u64 *);
+b53_build_op(read64, u64 *);
 
-static inline int b53_write8(struct b53_device *dev, u8 page, u8 reg, u8 value)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->write8(dev, page, reg, value);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
-
-static inline int b53_write16(struct b53_device *dev, u8 page, u8 reg,
-			      u16 value)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->write16(dev, page, reg, value);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
-
-static inline int b53_write32(struct b53_device *dev, u8 page, u8 reg,
-			      u32 value)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->write32(dev, page, reg, value);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
-
-static inline int b53_write48(struct b53_device *dev, u8 page, u8 reg,
-			      u64 value)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->write48(dev, page, reg, value);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
-
-static inline int b53_write64(struct b53_device *dev, u8 page, u8 reg,
-			       u64 value)
-{
-	int ret;
-
-	mutex_lock(&dev->reg_mutex);
-	ret = dev->ops->write64(dev, page, reg, value);
-	mutex_unlock(&dev->reg_mutex);
-
-	return ret;
-}
+b53_build_op(write8, u8);
+b53_build_op(write16, u16);
+b53_build_op(write32, u32);
+b53_build_op(write48, u64);
+b53_build_op(write64, u64);
 
 struct b53_arl_entry {
 	u8 port;
@@ -377,6 +284,7 @@ static inline int b53_switch_get_reset_gpio(struct b53_device *dev)
 #endif
 
 /* Exported functions towards other drivers */
+void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port);
 void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data);
 void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
 int b53_get_sset_count(struct dsa_switch *ds);
@@ -403,5 +311,12 @@ int b53_mirror_add(struct dsa_switch *ds, int port,
 		   struct dsa_mall_mirror_tc_entry *mirror, bool ingress);
 void b53_mirror_del(struct dsa_switch *ds, int port,
 		    struct dsa_mall_mirror_tc_entry *mirror);
+int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy);
+void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy);
+void b53_brcm_hdr_setup(struct dsa_switch *ds, int port);
+void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable);
+int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy);
+int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
+int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e);
 
 #endif
diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h
index e5c86d44667a..2a9f421680aa 100644
--- a/drivers/net/dsa/b53/b53_regs.h
+++ b/drivers/net/dsa/b53/b53_regs.h
@@ -50,6 +50,9 @@
 /* Jumbo Frame Registers */
 #define B53_JUMBO_PAGE			0x40
 
+/* EEE Control Registers Page */
+#define B53_EEE_PAGE			0x92
+
 /* CFP Configuration Registers Page */
 #define B53_CFP_PAGE			0xa1
 
@@ -210,6 +213,7 @@
 #define B53_BRCM_HDR			0x03
 #define   BRCM_HDR_P8_EN		BIT(0) /* Enable tagging on port 8 */
 #define   BRCM_HDR_P5_EN		BIT(1) /* Enable tagging on port 5 */
+#define   BRCM_HDR_P7_EN		BIT(2) /* Enable tagging on port 7 */
 
 /* Mirror capture control register (16 bit) */
 #define B53_MIR_CAP_CTL			0x10
@@ -249,6 +253,12 @@
 /* Revision ID register (8 bit) */
 #define B53_REV_ID			0x40
 
+/* Broadcom header RX control (16 bit) */
+#define B53_BRCM_HDR_RX_DIS		0x60
+
+/* Broadcom header TX control (16 bit)	*/
+#define B53_BRCM_HDR_TX_DIS		0x62
+
 /*************************************************************************
  * ARL Access Page Registers
  *************************************************************************/
@@ -465,6 +475,44 @@
 #define   JMS_MAX_SIZE			9724
 
 /*************************************************************************
+ * EEE Configuration Page Registers
+ *************************************************************************/
+
+/* EEE Enable control register (16 bit) */
+#define B53_EEE_EN_CTRL			0x00
+
+/* EEE LPI assert status register (16 bit) */
+#define B53_EEE_LPI_ASSERT_STS		0x02
+
+/* EEE LPI indicate status register (16 bit) */
+#define B53_EEE_LPI_INDICATE		0x4
+
+/* EEE Receiving idle symbols status register (16 bit) */
+#define B53_EEE_RX_IDLE_SYM_STS		0x6
+
+/* EEE Pipeline timer register (32 bit) */
+#define B53_EEE_PIP_TIMER		0xC
+
+/* EEE Sleep timer Gig register (32 bit) */
+#define B53_EEE_SLEEP_TIMER_GIG(i)	(0x10 + 4 * (i))
+
+/* EEE Sleep timer FE register (32 bit) */
+#define B53_EEE_SLEEP_TIMER_FE(i)	(0x34 + 4 * (i))
+
+/* EEE Minimum LP timer Gig register (32 bit) */
+#define B53_EEE_MIN_LP_TIMER_GIG(i)	(0x58 + 4 * (i))
+
+/* EEE Minimum LP timer FE register (32 bit) */
+#define B53_EEE_MIN_LP_TIMER_FE(i)	(0x7c + 4 * (i))
+
+/* EEE Wake timer Gig register (16 bit) */
+#define B53_EEE_WAKE_TIMER_GIG(i)	(0xa0 + 2 * (i))
+
+/* EEE Wake timer FE register (16 bit) */
+#define B53_EEE_WAKE_TIMER_FE(i)	(0xb2 + 2 * (i))
+
+
+/*************************************************************************
  * CFP Configuration Page Registers
  *************************************************************************/
 
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index d7b53d53c116..7aecc98d0a18 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -40,66 +40,6 @@ static enum dsa_tag_protocol bcm_sf2_sw_get_tag_protocol(struct dsa_switch *ds)
 	return DSA_TAG_PROTO_BRCM;
 }
 
-static void bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, int cpu_port)
-{
-	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	unsigned int i;
-	u32 reg;
-
-	/* Enable the IMP Port to be in the same VLAN as the other ports
-	 * on a per-port basis such that we only have Port i and IMP in
-	 * the same VLAN.
-	 */
-	for (i = 0; i < priv->hw_params.num_ports; i++) {
-		if (!((1 << i) & ds->enabled_port_mask))
-			continue;
-
-		reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i));
-		reg |= (1 << cpu_port);
-		core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i));
-	}
-}
-
-static void bcm_sf2_brcm_hdr_setup(struct bcm_sf2_priv *priv, int port)
-{
-	u32 reg, val;
-
-	/* Resolve which bit controls the Broadcom tag */
-	switch (port) {
-	case 8:
-		val = BRCM_HDR_EN_P8;
-		break;
-	case 7:
-		val = BRCM_HDR_EN_P7;
-		break;
-	case 5:
-		val = BRCM_HDR_EN_P5;
-		break;
-	default:
-		val = 0;
-		break;
-	}
-
-	/* Enable Broadcom tags for IMP port */
-	reg = core_readl(priv, CORE_BRCM_HDR_CTRL);
-	reg |= val;
-	core_writel(priv, reg, CORE_BRCM_HDR_CTRL);
-
-	/* Enable reception Broadcom tag for CPU TX (switch RX) to
-	 * allow us to tag outgoing frames
-	 */
-	reg = core_readl(priv, CORE_BRCM_HDR_RX_DIS);
-	reg &= ~(1 << port);
-	core_writel(priv, reg, CORE_BRCM_HDR_RX_DIS);
-
-	/* Enable transmission of Broadcom tags from the switch (CPU RX) to
-	 * allow delivering frames to the per-port net_devices
-	 */
-	reg = core_readl(priv, CORE_BRCM_HDR_TX_DIS);
-	reg &= ~(1 << port);
-	core_writel(priv, reg, CORE_BRCM_HDR_TX_DIS);
-}
-
 static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
@@ -138,7 +78,7 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 		reg |= i << (PRT_TO_QID_SHIFT * i);
 	core_writel(priv, reg, CORE_PORT_TC2_QOS_MAP_PORT(port));
 
-	bcm_sf2_brcm_hdr_setup(priv, port);
+	b53_brcm_hdr_setup(ds, port);
 
 	/* Force link status for IMP port */
 	reg = core_readl(priv, offset);
@@ -146,19 +86,6 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 	core_writel(priv, reg, offset);
 }
 
-static void bcm_sf2_eee_enable_set(struct dsa_switch *ds, int port, bool enable)
-{
-	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	u32 reg;
-
-	reg = core_readl(priv, CORE_EEE_EN_CTRL);
-	if (enable)
-		reg |= 1 << port;
-	else
-		reg &= ~(1 << port);
-	core_writel(priv, reg, CORE_EEE_EN_CTRL);
-}
-
 static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
@@ -236,7 +163,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port,
 			      struct phy_device *phy)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	s8 cpu_port = ds->dst->cpu_dp->index;
 	unsigned int i;
 	u32 reg;
 
@@ -247,7 +173,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port,
 
 	/* Enable Broadcom tags for that port if requested */
 	if (priv->brcm_tag_mask & BIT(port))
-		bcm_sf2_brcm_hdr_setup(priv, port);
+		b53_brcm_hdr_setup(ds, port);
 
 	/* Configure Traffic Class to QoS mapping, allow each priority to map
 	 * to a different queue number
@@ -257,9 +183,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port,
 		reg |= i << (PRT_TO_QID_SHIFT * i);
 	core_writel(priv, reg, CORE_PORT_TC2_QOS_MAP_PORT(port));
 
-	/* Clear the Rx and Tx disable bits and set to no spanning tree */
-	core_writel(priv, 0, CORE_G_PCTL_PORT(port));
-
 	/* Re-enable the GPHY and re-apply workarounds */
 	if (priv->int_phy_mask & 1 << port && priv->hw_params.num_gphy == 1) {
 		bcm_sf2_gphy_enable_set(ds, true);
@@ -282,23 +205,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port,
 	if (port == priv->moca_port)
 		bcm_sf2_port_intr_enable(priv, port);
 
-	/* Set this port, and only this one to be in the default VLAN,
-	 * if member of a bridge, restore its membership prior to
-	 * bringing down this port.
-	 */
-	reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(port));
-	reg &= ~PORT_VLAN_CTRL_MASK;
-	reg |= (1 << port);
-	reg |= priv->dev->ports[port].vlan_ctl_mask;
-	core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port));
-
-	bcm_sf2_imp_vlan_setup(ds, cpu_port);
-
-	/* If EEE was enabled, restore it */
-	if (priv->port_sts[port].eee.eee_enabled)
-		bcm_sf2_eee_enable_set(ds, port, true);
-
-	return 0;
+	return b53_enable_port(ds, port, phy);
 }
 
 static void bcm_sf2_port_disable(struct dsa_switch *ds, int port,
@@ -321,9 +228,7 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, int port,
 	else
 		off = CORE_G_PCTL_PORT(port);
 
-	reg = core_readl(priv, off);
-	reg |= RX_DIS | TX_DIS;
-	core_writel(priv, reg, off);
+	b53_disable_port(ds, port, phy);
 
 	/* Power down the port memory */
 	reg = core_readl(priv, CORE_MEM_PSM_VDD_CTRL);
@@ -331,47 +236,6 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, int port,
 	core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL);
 }
 
-/* Returns 0 if EEE was not enabled, or 1 otherwise
- */
-static int bcm_sf2_eee_init(struct dsa_switch *ds, int port,
-			    struct phy_device *phy)
-{
-	int ret;
-
-	ret = phy_init_eee(phy, 0);
-	if (ret)
-		return 0;
-
-	bcm_sf2_eee_enable_set(ds, port, true);
-
-	return 1;
-}
-
-static int bcm_sf2_sw_get_mac_eee(struct dsa_switch *ds, int port,
-				  struct ethtool_eee *e)
-{
-	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	struct ethtool_eee *p = &priv->port_sts[port].eee;
-	u32 reg;
-
-	reg = core_readl(priv, CORE_EEE_LPI_INDICATE);
-	e->eee_enabled = p->eee_enabled;
-	e->eee_active = !!(reg & (1 << port));
-
-	return 0;
-}
-
-static int bcm_sf2_sw_set_mac_eee(struct dsa_switch *ds, int port,
-				  struct ethtool_eee *e)
-{
-	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	struct ethtool_eee *p = &priv->port_sts[port].eee;
-
-	p->eee_enabled = e->eee_enabled;
-	bcm_sf2_eee_enable_set(ds, port, e->eee_enabled);
-
-	return 0;
-}
 
 static int bcm_sf2_sw_indir_rw(struct bcm_sf2_priv *priv, int op, int addr,
 			       int regnum, u16 val)
@@ -606,7 +470,7 @@ static void bcm_sf2_sw_adjust_link(struct dsa_switch *ds, int port,
 				   struct phy_device *phydev)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	struct ethtool_eee *p = &priv->port_sts[port].eee;
+	struct ethtool_eee *p = &priv->dev->ports[port].eee;
 	u32 id_mode_dis = 0, port_mode;
 	const char *str = NULL;
 	u32 reg, offset;
@@ -688,7 +552,7 @@ force_link:
 	core_writel(priv, reg, offset);
 
 	if (!phydev->is_pseudo_fixed_link)
-		p->eee_enabled = bcm_sf2_eee_init(ds, port, phydev);
+		p->eee_enabled = b53_eee_init(ds, port, phydev);
 }
 
 static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port,
@@ -797,7 +661,7 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds)
 static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 			       struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = ds->dst->cpu_dp->netdev;
+	struct net_device *p = ds->ports[port].cpu_dp->netdev;
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	struct ethtool_wolinfo pwol;
 
@@ -820,9 +684,9 @@ static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port,
 			      struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = ds->dst->cpu_dp->netdev;
+	struct net_device *p = ds->ports[port].cpu_dp->netdev;
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	s8 cpu_port = ds->dst->cpu_dp->index;
+	s8 cpu_port = ds->ports[port].cpu_dp->index;
 	struct ethtool_wolinfo pwol;
 
 	p->ethtool_ops->get_wol(p, &pwol);
@@ -1020,8 +884,8 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
 	.set_wol		= bcm_sf2_sw_set_wol,
 	.port_enable		= bcm_sf2_port_setup,
 	.port_disable		= bcm_sf2_port_disable,
-	.get_mac_eee		= bcm_sf2_sw_get_mac_eee,
-	.set_mac_eee		= bcm_sf2_sw_set_mac_eee,
+	.get_mac_eee		= b53_get_mac_eee,
+	.set_mac_eee		= b53_set_mac_eee,
 	.port_bridge_join	= b53_br_join,
 	.port_bridge_leave	= b53_br_leave,
 	.port_stp_state_set	= b53_br_set_stp_state,
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 02c499f9c56b..1922e027ff59 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -48,8 +48,6 @@ struct bcm_sf2_hw_params {
 
 struct bcm_sf2_port_status {
 	unsigned int link;
-
-	struct ethtool_eee eee;
 };
 
 struct bcm_sf2_cfp_priv {
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 8a1da7e67707..94649e1481ec 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -144,7 +144,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	 * destination port is enabled and that we are within the
 	 * number of ports supported by the switch
 	 */
-	port_num = fs->ring_cookie / 8;
+	port_num = fs->ring_cookie / SF2_NUM_EGRESS_QUEUES;
 
 	if (fs->ring_cookie == RX_CLS_FLOW_DISC ||
 	    !(BIT(port_num) & ds->enabled_port_mask) ||
@@ -280,7 +280,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	 * We have a small oddity where Port 6 just does not have a
 	 * valid bit here (so we subtract by one).
 	 */
-	queue_num = fs->ring_cookie % 8;
+	queue_num = fs->ring_cookie % SF2_NUM_EGRESS_QUEUES;
 	if (port_num >= 7)
 		port_num -= 1;
 
@@ -401,7 +401,7 @@ static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
 	/* There is no Port 6, so we compensate for that here */
 	if (nfc->fs.ring_cookie >= 6)
 		nfc->fs.ring_cookie++;
-	nfc->fs.ring_cookie *= 8;
+	nfc->fs.ring_cookie *= SF2_NUM_EGRESS_QUEUES;
 
 	/* Extract the destination queue */
 	queue_num = (reg >> NEW_TC_SHIFT) & NEW_TC_MASK;
diff --git a/drivers/net/dsa/bcm_sf2_regs.h b/drivers/net/dsa/bcm_sf2_regs.h
index 49695fcc2ea8..d8b8074a47b9 100644
--- a/drivers/net/dsa/bcm_sf2_regs.h
+++ b/drivers/net/dsa/bcm_sf2_regs.h
@@ -205,16 +205,8 @@ enum bcm_sf2_reg_offs {
 
 #define CORE_IMP0_PRT_ID		0x0804
 
-#define CORE_BRCM_HDR_CTRL		0x0080c
-#define  BRCM_HDR_EN_P8			(1 << 0)
-#define  BRCM_HDR_EN_P5			(1 << 1)
-#define  BRCM_HDR_EN_P7			(1 << 2)
-
 #define CORE_RST_MIB_CNT_EN		0x0950
 
-#define CORE_BRCM_HDR_RX_DIS		0x0980
-#define CORE_BRCM_HDR_TX_DIS		0x0988
-
 #define CORE_ARLA_VTBL_RWCTRL		0x1600
 #define  ARLA_VTBL_CMD_WRITE		0
 #define  ARLA_VTBL_CMD_READ		1
@@ -252,9 +244,6 @@ enum bcm_sf2_reg_offs {
 
 #define CORE_JOIN_ALL_VLAN_EN		0xd140
 
-#define CORE_EEE_EN_CTRL		0x24800
-#define CORE_EEE_LPI_INDICATE		0x24810
-
 #define CORE_CFP_ACC			0x28000
 #define  OP_STR_DONE			(1 << 0)
 #define  OP_SEL_SHIFT			1
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index b471413d3df9..07355db2ad81 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -17,6 +17,7 @@
 #include <linux/regmap.h>
 #include <linux/mutex.h>
 #include <linux/mii.h>
+#include <linux/phy.h>
 
 #include "lan9303.h"
 
@@ -57,6 +58,7 @@
 #define LAN9303_SWITCH_CSR_CMD_LANES (BIT(19) | BIT(18) | BIT(17) | BIT(16))
 #define LAN9303_VIRT_PHY_BASE 0x70
 #define LAN9303_VIRT_SPECIAL_CTRL 0x77
+#define  LAN9303_VIRT_SPECIAL_TURBO BIT(10) /*Turbo MII Enable*/
 
 /*13.4 Switch Fabric Control and Status Registers
  * Accessed indirectly via SWITCH_CSR_CMD, SWITCH_CSR_DATA.
@@ -760,6 +762,43 @@ static int lan9303_phy_write(struct dsa_switch *ds, int phy, int regnum,
 	return chip->ops->phy_write(chip, phy, regnum, val);
 }
 
+static void lan9303_adjust_link(struct dsa_switch *ds, int port,
+				struct phy_device *phydev)
+{
+	struct lan9303 *chip = ds->priv;
+	int ctl, res;
+
+	if (!phy_is_pseudo_fixed_link(phydev))
+		return;
+
+	ctl = lan9303_phy_read(ds, port, MII_BMCR);
+
+	ctl &= ~BMCR_ANENABLE;
+
+	if (phydev->speed == SPEED_100)
+		ctl |= BMCR_SPEED100;
+	else if (phydev->speed == SPEED_10)
+		ctl &= ~BMCR_SPEED100;
+	else
+		dev_err(ds->dev, "unsupported speed: %d\n", phydev->speed);
+
+	if (phydev->duplex == DUPLEX_FULL)
+		ctl |= BMCR_FULLDPLX;
+	else
+		ctl &= ~BMCR_FULLDPLX;
+
+	res =  lan9303_phy_write(ds, port, MII_BMCR, ctl);
+
+	if (port == chip->phy_addr_sel_strap) {
+		/* Virtual Phy: Remove Turbo 200Mbit mode */
+		lan9303_read(chip->regmap, LAN9303_VIRT_SPECIAL_CTRL, &ctl);
+
+		ctl &= ~LAN9303_VIRT_SPECIAL_TURBO;
+		res =  regmap_write(chip->regmap,
+				    LAN9303_VIRT_SPECIAL_CTRL, ctl);
+	}
+}
+
 static int lan9303_port_enable(struct dsa_switch *ds, int port,
 			       struct phy_device *phy)
 {
@@ -803,6 +842,7 @@ static const struct dsa_switch_ops lan9303_switch_ops = {
 	.get_strings = lan9303_get_strings,
 	.phy_read = lan9303_phy_read,
 	.phy_write = lan9303_phy_write,
+	.adjust_link = lan9303_adjust_link,
 	.get_ethtool_stats = lan9303_get_ethtool_stats,
 	.get_sset_count = lan9303_get_sset_count,
 	.port_enable = lan9303_port_enable,
diff --git a/drivers/net/dsa/lan9303_mdio.c b/drivers/net/dsa/lan9303_mdio.c
index fc16668a487f..0bc56b9900f9 100644
--- a/drivers/net/dsa/lan9303_mdio.c
+++ b/drivers/net/dsa/lan9303_mdio.c
@@ -67,14 +67,15 @@ static int lan9303_mdio_read(void *ctx, uint32_t reg, uint32_t *val)
 	return 0;
 }
 
-int lan9303_mdio_phy_write(struct lan9303 *chip, int phy, int reg, u16 val)
+static int lan9303_mdio_phy_write(struct lan9303 *chip, int phy, int reg,
+				  u16 val)
 {
 	struct lan9303_mdio *sw_dev = dev_get_drvdata(chip->dev);
 
 	return mdiobus_write_nested(sw_dev->device->bus, phy, reg, val);
 }
 
-int lan9303_mdio_phy_read(struct lan9303 *chip, int phy,  int reg)
+static int lan9303_mdio_phy_read(struct lan9303 *chip, int phy,  int reg)
 {
 	struct lan9303_mdio *sw_dev = dev_get_drvdata(chip->dev);
 
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index c142b97add2c..034241696ce2 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -564,7 +564,8 @@ static int mt7530_phy_read(struct dsa_switch *ds, int port, int regnum)
 	return mdiobus_read_nested(priv->bus, port, regnum);
 }
 
-int mt7530_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val)
+static int mt7530_phy_write(struct dsa_switch *ds, int port, int regnum,
+			    u16 val)
 {
 	struct mt7530_priv *priv = ds->priv;
 
@@ -928,11 +929,11 @@ mt7530_setup(struct dsa_switch *ds)
 	struct device_node *dn;
 	struct mt7530_dummy_poll p;
 
-	/* The parent node of cpu_dp->netdev which holds the common system
+	/* The parent node of master netdev which holds the common system
 	 * controller also is the container for two GMACs nodes representing
 	 * as two netdev instances.
 	 */
-	dn = ds->dst->cpu_dp->netdev->dev.of_node->parent;
+	dn = ds->ports[MT7530_CPU_PORT].netdev->dev.of_node->parent;
 	priv->ethernet = syscon_node_to_regmap(dn);
 	if (IS_ERR(priv->ethernet))
 		return PTR_ERR(priv->ethernet);
diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c
index dce7fa57eb55..621cdc46ad81 100644
--- a/drivers/net/dsa/mv88e6060.c
+++ b/drivers/net/dsa/mv88e6060.c
@@ -176,7 +176,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
 		  ((p & 0xf) << PORT_VLAN_MAP_DBNUM_SHIFT) |
 		   (dsa_is_cpu_port(ds, p) ?
 			ds->enabled_port_mask :
-			BIT(ds->dst->cpu_dp->index)));
+			BIT(ds->ports[p].cpu_dp->index)));
 
 	/* Port Association Vector: when learning source addresses
 	 * of packets, add the address to the address database using
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 5ada7a41449c..82f09711ac1a 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -506,7 +506,7 @@ qca8k_setup(struct dsa_switch *ds)
 		pr_warn("regmap initialization failed");
 
 	/* Initialize CPU port pad mode (xMII type, delays...) */
-	phy_mode = of_get_phy_mode(ds->dst->cpu_dp->dn);
+	phy_mode = of_get_phy_mode(ds->ports[QCA8K_CPU_PORT].dn);
 	if (phy_mode < 0) {
 		pr_err("Can't find phy-mode for master device\n");
 		return phy_mode;
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index d0a1f9ce3168..58483af80bdb 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -353,7 +353,7 @@ static void dummy_setup(struct net_device *dev)
 	eth_hw_addr_random(dev);
 
 	dev->min_mtu = 0;
-	dev->max_mtu = ETH_MAX_MTU;
+	dev->max_mtu = 0;
 }
 
 static int dummy_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -388,7 +388,7 @@ static int __init dummy_init_one(void)
 	int err;
 
 	dev_dummy = alloc_netdev(sizeof(struct dummy_priv),
-				 "dummy%d", NET_NAME_UNKNOWN, dummy_setup);
+				 "dummy%d", NET_NAME_ENUM, dummy_setup);
 	if (!dev_dummy)
 		return -ENOMEM;
 
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index fe13bfea30ac..fccce4b47778 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -178,10 +178,8 @@ static void __init eql_setup(struct net_device *dev)
 {
 	equalizer_t *eql = netdev_priv(dev);
 
-	init_timer(&eql->timer);
-	eql->timer.data     	= (unsigned long) eql;
+	setup_timer(&eql->timer, eql_timer, (unsigned long)eql);
 	eql->timer.expires  	= jiffies + EQL_DEFAULT_RESCHED_IVAL;
-	eql->timer.function 	= eql_timer;
 
 	spin_lock_init(&eql->queue.lock);
 	INIT_LIST_HEAD(&eql->queue.all_slaves);
diff --git a/drivers/net/ethernet/adi/bfin_mac.c b/drivers/net/ethernet/adi/bfin_mac.c
index a251de8d9a91..0658cde1586a 100644
--- a/drivers/net/ethernet/adi/bfin_mac.c
+++ b/drivers/net/ethernet/adi/bfin_mac.c
@@ -1650,9 +1650,8 @@ static int bfin_mac_probe(struct platform_device *pdev)
 	ndev->netdev_ops = &bfin_mac_netdev_ops;
 	ndev->ethtool_ops = &bfin_mac_ethtool_ops;
 
-	init_timer(&lp->tx_reclaim_timer);
-	lp->tx_reclaim_timer.data = (unsigned long)lp;
-	lp->tx_reclaim_timer.function = tx_reclaim_skb_timeout;
+	setup_timer(&lp->tx_reclaim_timer, tx_reclaim_skb_timeout,
+		    (unsigned long)lp);
 
 	lp->flags = 0;
 	netif_napi_add(ndev, &lp->napi, bfin_mac_poll, CONFIG_BFIN_RX_DESC_NUM);
diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c
index 54eff90e2f02..658e92f79d36 100644
--- a/drivers/net/ethernet/agere/et131x.c
+++ b/drivers/net/ethernet/agere/et131x.c
@@ -3624,11 +3624,10 @@ static int et131x_open(struct net_device *netdev)
 	int result;
 
 	/* Start the timer to track NIC errors */
-	init_timer(&adapter->error_timer);
+	setup_timer(&adapter->error_timer, et131x_error_timer_handler,
+		    (unsigned long)adapter);
 	adapter->error_timer.expires = jiffies +
 		msecs_to_jiffies(TX_ERROR_PERIOD);
-	adapter->error_timer.function = et131x_error_timer_handler;
-	adapter->error_timer.data = (unsigned long)adapter;
 	add_timer(&adapter->error_timer);
 
 	result = request_irq(irq, et131x_isr,
diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 52beba8c7a39..ded29af648c9 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -315,7 +315,7 @@ static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue
 					      cmd_size_in_bytes,
 					      comp,
 					      comp_size_in_bytes);
-	if (unlikely(IS_ERR(comp_ctx)))
+	if (IS_ERR(comp_ctx))
 		admin_queue->running_state = false;
 	spin_unlock_irqrestore(&admin_queue->q_lock, flags);
 
@@ -1130,7 +1130,7 @@ int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue,
 
 	comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size,
 					    comp, comp_size);
-	if (unlikely(IS_ERR(comp_ctx))) {
+	if (IS_ERR(comp_ctx)) {
 		if (comp_ctx == ERR_PTR(-ENODEV))
 			pr_debug("Failed to submit command [%ld]\n",
 				 PTR_ERR(comp_ctx));
diff --git a/drivers/net/ethernet/amd/a2065.c b/drivers/net/ethernet/amd/a2065.c
index e22f976a0d18..998d30e050a6 100644
--- a/drivers/net/ethernet/amd/a2065.c
+++ b/drivers/net/ethernet/amd/a2065.c
@@ -733,10 +733,9 @@ static int a2065_init_one(struct zorro_dev *z,
 	dev->watchdog_timeo = 5*HZ;
 	dev->dma = 0;
 
-	init_timer(&priv->multicast_timer);
-	priv->multicast_timer.data = (unsigned long) dev;
-	priv->multicast_timer.function =
-		(void (*)(unsigned long))lance_set_multicast;
+	setup_timer(&priv->multicast_timer,
+		    (void(*)(unsigned long))lance_set_multicast,
+		    (unsigned long)dev);
 
 	err = register_netdev(dev);
 	if (err) {
diff --git a/drivers/net/ethernet/amd/am79c961a.c b/drivers/net/ethernet/amd/am79c961a.c
index b11e910850f7..0612dbee00d2 100644
--- a/drivers/net/ethernet/amd/am79c961a.c
+++ b/drivers/net/ethernet/amd/am79c961a.c
@@ -728,9 +728,7 @@ static int am79c961_probe(struct platform_device *pdev)
 	am79c961_banner();
 
 	spin_lock_init(&priv->chip_lock);
-	init_timer(&priv->timer);
-	priv->timer.data = (unsigned long)dev;
-	priv->timer.function = am79c961_timer;
+	setup_timer(&priv->timer, am79c961_timer, (unsigned long)dev);
 
 	if (am79c961_hw_init(dev))
 		goto release;
diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c
index 7b5df562f30f..7f22af6e37e0 100644
--- a/drivers/net/ethernet/amd/amd8111e.c
+++ b/drivers/net/ethernet/amd/amd8111e.c
@@ -1883,9 +1883,8 @@ static int amd8111e_probe_one(struct pci_dev *pdev,
 
 	/* Initialize software ipg timer */
 	if(lp->options & OPTION_DYN_IPG_ENABLE){
-		init_timer(&lp->ipg_data.ipg_timer);
-		lp->ipg_data.ipg_timer.data = (unsigned long) dev;
-		lp->ipg_data.ipg_timer.function = (void *)&amd8111e_config_ipg;
+		setup_timer(&lp->ipg_data.ipg_timer,
+			    (void *)&amd8111e_config_ipg, (unsigned long)dev);
 		lp->ipg_data.ipg_timer.expires = jiffies +
 						 IPG_CONVERGE_JIFFIES;
 		lp->ipg_data.ipg = DEFAULT_IPG;
diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c
index 82cc81385033..9bdf81c2cd00 100644
--- a/drivers/net/ethernet/amd/declance.c
+++ b/drivers/net/ethernet/amd/declance.c
@@ -1246,9 +1246,9 @@ static int dec_lance_probe(struct device *bdev, const int type)
 	 * can occur from interrupts (ex. IPv6).  So we
 	 * use a timer to try again later when necessary. -DaveM
 	 */
-	init_timer(&lp->multicast_timer);
-	lp->multicast_timer.data = (unsigned long) dev;
-	lp->multicast_timer.function = lance_set_multicast_retry;
+	setup_timer(&lp->multicast_timer, lance_set_multicast_retry,
+		    (unsigned long)dev);
+
 
 	ret = register_netdev(dev);
 	if (ret) {
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index 7f60d17819ce..e46153654016 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -1970,9 +1970,8 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
 			lp->options |= PCNET32_PORT_MII;
 	}
 
-	init_timer(&lp->watchdog_timer);
-	lp->watchdog_timer.data = (unsigned long)dev;
-	lp->watchdog_timer.function = (void *)&pcnet32_watchdog;
+	setup_timer(&lp->watchdog_timer, (void *)&pcnet32_watchdog,
+		    (unsigned long)dev);
 
 	/* The PCNET32-specific entries in the device structure. */
 	dev->netdev_ops = &pcnet32_netdev_ops;
diff --git a/drivers/net/ethernet/amd/sunlance.c b/drivers/net/ethernet/amd/sunlance.c
index 291ca5187f12..0183ffb9d3ba 100644
--- a/drivers/net/ethernet/amd/sunlance.c
+++ b/drivers/net/ethernet/amd/sunlance.c
@@ -1459,9 +1459,8 @@ no_link_test:
 	 * can occur from interrupts (ex. IPv6).  So we
 	 * use a timer to try again later when necessary. -DaveM
 	 */
-	init_timer(&lp->multicast_timer);
-	lp->multicast_timer.data = (unsigned long) dev;
-	lp->multicast_timer.function = lance_set_multicast_retry;
+	setup_timer(&lp->multicast_timer, lance_set_multicast_retry,
+		    (unsigned long)dev);
 
 	if (register_netdev(dev)) {
 		printk(KERN_ERR "SunLance: Cannot register device.\n");
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index a1125d10c825..42e44fc03a18 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -1474,10 +1474,8 @@ static int b44_open(struct net_device *dev)
 		goto out;
 	}
 
-	init_timer(&bp->timer);
+	setup_timer(&bp->timer, b44_timer, (unsigned long)bp);
 	bp->timer.expires = jiffies + HZ;
-	bp->timer.data = (unsigned long) bp;
-	bp->timer.function = b44_timer;
 	add_timer(&bp->timer);
 
 	b44_enable_ints(bp);
diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 4f3845a58126..f6bc13fe8a99 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -1062,7 +1062,8 @@ static int bcm_enet_open(struct net_device *dev)
 	val = enet_readl(priv, ENET_CTL_REG);
 	val |= ENET_CTL_ENABLE_MASK;
 	enet_writel(priv, val, ENET_CTL_REG);
-	enet_dma_writel(priv, ENETDMA_CFG_EN_MASK, ENETDMA_CFG_REG);
+	if (priv->dma_has_sram)
+		enet_dma_writel(priv, ENETDMA_CFG_EN_MASK, ENETDMA_CFG_REG);
 	enet_dmac_writel(priv, priv->dma_chan_en_mask,
 			 ENETDMAC_CHANCFG, priv->rx_chan);
 
@@ -1721,10 +1722,8 @@ static int bcm_enet_probe(struct platform_device *pdev)
 	const char *clk_name;
 	int i, ret;
 
-	/* stop if shared driver failed, assume driver->probe will be
-	 * called in the same order we register devices (correct ?) */
 	if (!bcm_enet_shared_base[0])
-		return -ENODEV;
+		return -EPROBE_DEFER;
 
 	res_irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
 	res_irq_rx = platform_get_resource(pdev, IORESOURCE_IRQ, 1);
@@ -1768,12 +1767,14 @@ static int bcm_enet_probe(struct platform_device *pdev)
 		clk_name = "enet1";
 	}
 
-	priv->mac_clk = clk_get(&pdev->dev, clk_name);
+	priv->mac_clk = devm_clk_get(&pdev->dev, clk_name);
 	if (IS_ERR(priv->mac_clk)) {
 		ret = PTR_ERR(priv->mac_clk);
 		goto out;
 	}
-	clk_prepare_enable(priv->mac_clk);
+	ret = clk_prepare_enable(priv->mac_clk);
+	if (ret)
+		goto out;
 
 	/* initialize default and fetch platform data */
 	priv->rx_ring_size = BCMENET_DEF_RX_DESC;
@@ -1801,13 +1802,15 @@ static int bcm_enet_probe(struct platform_device *pdev)
 
 	if (priv->mac_id == 0 && priv->has_phy && !priv->use_external_mii) {
 		/* using internal PHY, enable clock */
-		priv->phy_clk = clk_get(&pdev->dev, "ephy");
+		priv->phy_clk = devm_clk_get(&pdev->dev, "ephy");
 		if (IS_ERR(priv->phy_clk)) {
 			ret = PTR_ERR(priv->phy_clk);
 			priv->phy_clk = NULL;
-			goto out_put_clk_mac;
+			goto out_disable_clk_mac;
 		}
-		clk_prepare_enable(priv->phy_clk);
+		ret = clk_prepare_enable(priv->phy_clk);
+		if (ret)
+			goto out_disable_clk_mac;
 	}
 
 	/* do minimal hardware init to be able to probe mii bus */
@@ -1857,9 +1860,8 @@ static int bcm_enet_probe(struct platform_device *pdev)
 	spin_lock_init(&priv->rx_lock);
 
 	/* init rx timeout (used for oom) */
-	init_timer(&priv->rx_timeout);
-	priv->rx_timeout.function = bcm_enet_refill_rx_timer;
-	priv->rx_timeout.data = (unsigned long)dev;
+	setup_timer(&priv->rx_timeout, bcm_enet_refill_rx_timer,
+		    (unsigned long)dev);
 
 	/* init the mib update lock&work */
 	mutex_init(&priv->mib_update_lock);
@@ -1901,14 +1903,10 @@ out_free_mdio:
 out_uninit_hw:
 	/* turn off mdc clock */
 	enet_writel(priv, 0, ENET_MIISC_REG);
-	if (priv->phy_clk) {
-		clk_disable_unprepare(priv->phy_clk);
-		clk_put(priv->phy_clk);
-	}
+	clk_disable_unprepare(priv->phy_clk);
 
-out_put_clk_mac:
+out_disable_clk_mac:
 	clk_disable_unprepare(priv->mac_clk);
-	clk_put(priv->mac_clk);
 out:
 	free_netdev(dev);
 	return ret;
@@ -1944,12 +1942,8 @@ static int bcm_enet_remove(struct platform_device *pdev)
 	}
 
 	/* disable hw block clocks */
-	if (priv->phy_clk) {
-		clk_disable_unprepare(priv->phy_clk);
-		clk_put(priv->phy_clk);
-	}
+	clk_disable_unprepare(priv->phy_clk);
 	clk_disable_unprepare(priv->mac_clk);
-	clk_put(priv->mac_clk);
 
 	free_netdev(dev);
 	return 0;
@@ -2332,11 +2326,8 @@ static int bcm_enetsw_open(struct net_device *dev)
 	}
 
 	/* start phy polling timer */
-	init_timer(&priv->swphy_poll);
-	priv->swphy_poll.function = swphy_poll_timer;
-	priv->swphy_poll.data = (unsigned long)priv;
-	priv->swphy_poll.expires = jiffies;
-	add_timer(&priv->swphy_poll);
+	setup_timer(&priv->swphy_poll, swphy_poll_timer, (unsigned long)priv);
+	mod_timer(&priv->swphy_poll, jiffies);
 	return 0;
 
 out:
@@ -2692,11 +2683,8 @@ static int bcm_enetsw_probe(struct platform_device *pdev)
 	struct resource *res_mem;
 	int ret, irq_rx, irq_tx;
 
-	/* stop if shared driver failed, assume driver->probe will be
-	 * called in the same order we register devices (correct ?)
-	 */
 	if (!bcm_enet_shared_base[0])
-		return -ENODEV;
+		return -EPROBE_DEFER;
 
 	res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	irq_rx = platform_get_irq(pdev, 0);
@@ -2735,24 +2723,20 @@ static int bcm_enetsw_probe(struct platform_device *pdev)
 	if (ret)
 		goto out;
 
-	if (!request_mem_region(res_mem->start, resource_size(res_mem),
-				"bcm63xx_enetsw")) {
-		ret = -EBUSY;
+	priv->base = devm_ioremap_resource(&pdev->dev, res_mem);
+	if (IS_ERR(priv->base)) {
+		ret = PTR_ERR(priv->base);
 		goto out;
 	}
 
-	priv->base = ioremap(res_mem->start, resource_size(res_mem));
-	if (priv->base == NULL) {
-		ret = -ENOMEM;
-		goto out_release_mem;
-	}
-
-	priv->mac_clk = clk_get(&pdev->dev, "enetsw");
+	priv->mac_clk = devm_clk_get(&pdev->dev, "enetsw");
 	if (IS_ERR(priv->mac_clk)) {
 		ret = PTR_ERR(priv->mac_clk);
-		goto out_unmap;
+		goto out;
 	}
-	clk_enable(priv->mac_clk);
+	ret = clk_prepare_enable(priv->mac_clk);
+	if (ret)
+		goto out;
 
 	priv->rx_chan = 0;
 	priv->tx_chan = 1;
@@ -2773,7 +2757,7 @@ static int bcm_enetsw_probe(struct platform_device *pdev)
 
 	ret = register_netdev(dev);
 	if (ret)
-		goto out_put_clk;
+		goto out_disable_clk;
 
 	netif_carrier_off(dev);
 	platform_set_drvdata(pdev, dev);
@@ -2782,14 +2766,8 @@ static int bcm_enetsw_probe(struct platform_device *pdev)
 
 	return 0;
 
-out_put_clk:
-	clk_put(priv->mac_clk);
-
-out_unmap:
-	iounmap(priv->base);
-
-out_release_mem:
-	release_mem_region(res_mem->start, resource_size(res_mem));
+out_disable_clk:
+	clk_disable_unprepare(priv->mac_clk);
 out:
 	free_netdev(dev);
 	return ret;
@@ -2801,17 +2779,13 @@ static int bcm_enetsw_remove(struct platform_device *pdev)
 {
 	struct bcm_enet_priv *priv;
 	struct net_device *dev;
-	struct resource *res;
 
 	/* stop netdevice */
 	dev = platform_get_drvdata(pdev);
 	priv = netdev_priv(dev);
 	unregister_netdev(dev);
 
-	/* release device resources */
-	iounmap(priv->base);
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	release_mem_region(res->start, resource_size(res));
+	clk_disable_unprepare(priv->mac_clk);
 
 	free_netdev(dev);
 	return 0;
diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.h b/drivers/net/ethernet/broadcom/bcm63xx_enet.h
index 0a1b7b2e55bd..dd6ae3077433 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.h
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.h
@@ -8,7 +8,6 @@
 #include <linux/platform_device.h>
 
 #include <bcm63xx_regs.h>
-#include <bcm63xx_irq.h>
 #include <bcm63xx_io.h>
 #include <bcm63xx_iudma.h>
 
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index e3af1f3cb61f..b3055a76dfbf 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -8462,10 +8462,8 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 	bnx2_set_default_link(bp);
 	bp->req_flow_ctrl = FLOW_CTRL_RX | FLOW_CTRL_TX;
 
-	init_timer(&bp->timer);
+	setup_timer(&bp->timer, bnx2_timer, (unsigned long)bp);
 	bp->timer.expires = RUN_AT(BNX2_TIMER_INTERVAL);
-	bp->timer.data = (unsigned long) bp;
-	bp->timer.function = bnx2_timer;
 
 #ifdef BCM_CNIC
 	if (bnx2_shmem_rd(bp, BNX2_ISCSI_INITIATOR) & BNX2_ISCSI_INITIATOR_EN)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index c12b4d3e946e..54d1571384a0 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -12414,10 +12414,8 @@ static int bnx2x_init_bp(struct bnx2x *bp)
 
 	bp->current_interval = CHIP_REV_IS_SLOW(bp) ? 5*HZ : HZ;
 
-	init_timer(&bp->timer);
+	setup_timer(&bp->timer, bnx2x_timer, (unsigned long)bp);
 	bp->timer.expires = jiffies + bp->current_interval;
-	bp->timer.data = (unsigned long) bp;
-	bp->timer.function = bnx2x_timer;
 
 	if (SHMEM2_HAS(bp, dcbx_lldp_params_offset) &&
 	    SHMEM2_HAS(bp, dcbx_lldp_dcbx_stat_offset) &&
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index aacec8bc19d5..5ba49938ba55 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -1491,7 +1491,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons,
 				   (struct rx_tpa_end_cmp *)rxcmp,
 				   (struct rx_tpa_end_cmp_ext *)rxcmp1, event);
 
-		if (unlikely(IS_ERR(skb)))
+		if (IS_ERR(skb))
 			return -EBUSY;
 
 		rc = -ENOMEM;
@@ -7190,9 +7190,7 @@ static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev)
 
 	bp->stats_coal_ticks = BNXT_DEF_STATS_COAL_TICKS;
 
-	init_timer(&bp->timer);
-	bp->timer.data = (unsigned long)bp;
-	bp->timer.function = bnxt_timer;
+	setup_timer(&bp->timer, bnxt_timer, (unsigned long)bp);
 	bp->current_interval = BNXT_TIMER_INTERVAL;
 
 	clear_bit(BNXT_STATE_OPEN, &bp->state);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index d8f0c837b72c..06ce63c00821 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 
 	xdp.data_hard_start = *data_ptr - offset;
 	xdp.data = *data_ptr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = *data_ptr + *len;
 	orig_data = xdp.data;
 	mapping = rx_buf->mapping - bp->rx_dma_offset;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 656e6af70f0a..d8d5f207c759 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -11087,9 +11087,7 @@ static void tg3_timer_init(struct tg3 *tp)
 	tp->asf_multiplier = (HZ / tp->timer_offset) *
 			     TG3_FW_UPDATE_FREQ_SEC;
 
-	init_timer(&tp->timer);
-	tp->timer.data = (unsigned long) tp;
-	tp->timer.function = tg3_timer;
+	setup_timer(&tp->timer, tg3_timer, (unsigned long)tp);
 }
 
 static void tg3_timer_start(struct tg3 *tp)
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index e7f54948173f..963803bc6633 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -59,9 +59,9 @@ static int debug = -1;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "NETIF_MSG debug bits");
 
-static char fw_type[LIO_MAX_FW_TYPE_LEN] = LIO_FW_NAME_TYPE_NIC;
+static char fw_type[LIO_MAX_FW_TYPE_LEN] = LIO_FW_NAME_TYPE_AUTO;
 module_param_string(fw_type, fw_type, sizeof(fw_type), 0444);
-MODULE_PARM_DESC(fw_type, "Type of firmware to be loaded. Default \"nic\".  Use \"none\" to load firmware from flash.");
+MODULE_PARM_DESC(fw_type, "Type of firmware to be loaded (default is \"auto\"), which uses firmware in flash, if present, else loads \"nic\".");
 
 static u32 console_bitmask;
 module_param(console_bitmask, int, 0644);
@@ -1115,10 +1115,10 @@ liquidio_probe(struct pci_dev *pdev,
 	return 0;
 }
 
-static bool fw_type_is_none(void)
+static bool fw_type_is_auto(void)
 {
-	return strncmp(fw_type, LIO_FW_NAME_TYPE_NONE,
-		       sizeof(LIO_FW_NAME_TYPE_NONE)) == 0;
+	return strncmp(fw_type, LIO_FW_NAME_TYPE_AUTO,
+		       sizeof(LIO_FW_NAME_TYPE_AUTO)) == 0;
 }
 
 /**
@@ -1302,7 +1302,7 @@ static void octeon_destroy_resources(struct octeon_device *oct)
 		 * Implementation note: only soft-reset the device
 		 * if it is a CN6XXX OR the LAST CN23XX device.
 		 */
-		if (fw_type_is_none())
+		if (atomic_read(oct->adapter_fw_state) == FW_IS_PRELOADED)
 			octeon_pci_flr(oct);
 		else if (OCTEON_CN6XXX(oct) || !refcount)
 			oct->fn_list.soft_reset(oct);
@@ -1934,10 +1934,12 @@ static int load_firmware(struct octeon_device *oct)
 	char fw_name[LIO_MAX_FW_FILENAME_LEN];
 	char *tmp_fw_type;
 
-	if (fw_type[0] == '\0')
+	if (fw_type_is_auto()) {
 		tmp_fw_type = LIO_FW_NAME_TYPE_NIC;
-	else
+		strncpy(fw_type, tmp_fw_type, sizeof(fw_type));
+	} else {
 		tmp_fw_type = fw_type;
+	}
 
 	sprintf(fw_name, "%s%s%s_%s%s", LIO_FW_DIR, LIO_FW_BASE_NAME,
 		octeon_get_conf(oct)->card_name, tmp_fw_type,
@@ -3303,7 +3305,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 {
 	struct lio *lio = NULL;
 	struct net_device *netdev;
-	u8 mac[6], i, j;
+	u8 mac[6], i, j, *fw_ver;
 	struct octeon_soft_command *sc;
 	struct liquidio_if_cfg_context *ctx;
 	struct liquidio_if_cfg_resp *resp;
@@ -3414,6 +3416,22 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 			goto setup_nic_dev_fail;
 		}
 
+		/* Verify f/w version (in case of 'auto' loading from flash) */
+		fw_ver = octeon_dev->fw_info.liquidio_firmware_version;
+		if (memcmp(LIQUIDIO_BASE_VERSION,
+			   fw_ver,
+			   strlen(LIQUIDIO_BASE_VERSION))) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"Unmatched firmware version. Expected %s.x, got %s.\n",
+				LIQUIDIO_BASE_VERSION, fw_ver);
+			goto setup_nic_dev_fail;
+		} else if (atomic_read(octeon_dev->adapter_fw_state) ==
+			   FW_IS_PRELOADED) {
+			dev_info(&octeon_dev->pci_dev->dev,
+				 "Using auto-loaded firmware version %s.\n",
+				 fw_ver);
+		}
+
 		octeon_swap_8B_data((u64 *)(&resp->cfg_info),
 				    (sizeof(struct liquidio_if_cfg_info)) >> 3);
 
@@ -3882,9 +3900,9 @@ octeon_recv_vf_drv_notice(struct octeon_recv_info *recv_info, void *buf)
 static int octeon_device_init(struct octeon_device *octeon_dev)
 {
 	int j, ret;
-	int fw_loaded = 0;
 	char bootcmd[] = "\n";
 	char *dbg_enb = NULL;
+	enum lio_fw_state fw_state;
 	struct octeon_device_priv *oct_priv =
 		(struct octeon_device_priv *)octeon_dev->priv;
 	atomic_set(&octeon_dev->status, OCT_DEV_BEGIN_STATE);
@@ -3916,24 +3934,40 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 
 	octeon_dev->app_mode = CVM_DRV_INVALID_APP;
 
-	if (OCTEON_CN23XX_PF(octeon_dev)) {
-		if (!cn23xx_fw_loaded(octeon_dev) && !fw_type_is_none()) {
-			fw_loaded = 0;
-			/* Do a soft reset of the Octeon device. */
-			if (octeon_dev->fn_list.soft_reset(octeon_dev))
-				return 1;
-			/* things might have changed */
-			if (!cn23xx_fw_loaded(octeon_dev))
-				fw_loaded = 0;
-			else
-				fw_loaded = 1;
-		} else {
-			fw_loaded = 1;
-		}
-	} else if (octeon_dev->fn_list.soft_reset(octeon_dev)) {
-		return 1;
+	/* CN23XX supports preloaded firmware if the following is true:
+	 *
+	 * The adapter indicates that firmware is currently running AND
+	 * 'fw_type' is 'auto'.
+	 *
+	 * (default state is NEEDS_TO_BE_LOADED, override it if appropriate).
+	 */
+	if (OCTEON_CN23XX_PF(octeon_dev) &&
+	    cn23xx_fw_loaded(octeon_dev) && fw_type_is_auto()) {
+		atomic_cmpxchg(octeon_dev->adapter_fw_state,
+			       FW_NEEDS_TO_BE_LOADED, FW_IS_PRELOADED);
 	}
 
+	/* If loading firmware, only first device of adapter needs to do so. */
+	fw_state = atomic_cmpxchg(octeon_dev->adapter_fw_state,
+				  FW_NEEDS_TO_BE_LOADED,
+				  FW_IS_BEING_LOADED);
+
+	/* Here, [local variable] 'fw_state' is set to one of:
+	 *
+	 *   FW_IS_PRELOADED:       No firmware is to be loaded (see above)
+	 *   FW_NEEDS_TO_BE_LOADED: The driver's first instance will load
+	 *                          firmware to the adapter.
+	 *   FW_IS_BEING_LOADED:    The driver's second instance will not load
+	 *                          firmware to the adapter.
+	 */
+
+	/* Prior to f/w load, perform a soft reset of the Octeon device;
+	 * if error resetting, return w/error.
+	 */
+	if (fw_state == FW_NEEDS_TO_BE_LOADED)
+		if (octeon_dev->fn_list.soft_reset(octeon_dev))
+			return 1;
+
 	/* Initialize the dispatch mechanism used to push packets arriving on
 	 * Octeon Output queues.
 	 */
@@ -4063,7 +4097,7 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 
 	atomic_set(&octeon_dev->status, OCT_DEV_IO_QUEUES_DONE);
 
-	if ((!OCTEON_CN23XX_PF(octeon_dev)) || !fw_loaded) {
+	if (fw_state == FW_NEEDS_TO_BE_LOADED) {
 		dev_dbg(&octeon_dev->pci_dev->dev, "Waiting for DDR initialization...\n");
 		if (!ddr_timeout) {
 			dev_info(&octeon_dev->pci_dev->dev,
@@ -4125,6 +4159,8 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 			dev_err(&octeon_dev->pci_dev->dev, "Could not load firmware to board\n");
 			return 1;
 		}
+
+		atomic_set(octeon_dev->adapter_fw_state, FW_HAS_BEEN_LOADED);
 	}
 
 	handshake[octeon_dev->octeon_id].init_ok = 1;
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_image.h b/drivers/net/ethernet/cavium/liquidio/liquidio_image.h
index 78a3685f6fe0..5bf5e8791dfb 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_image.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_image.h
@@ -24,6 +24,7 @@
 #define LIO_FW_BASE_NAME        "lio_"
 #define LIO_FW_NAME_SUFFIX      ".bin"
 #define LIO_FW_NAME_TYPE_NIC    "nic"
+#define LIO_FW_NAME_TYPE_AUTO   "auto"
 #define LIO_FW_NAME_TYPE_NONE   "none"
 #define LIO_MAX_FIRMWARE_VERSION_LEN 16
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
index 29d53b1763a7..e4aa3395a578 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -541,6 +541,7 @@ static char oct_dev_app_str[CVM_DRV_APP_COUNT + 1][32] = {
 
 static struct octeon_device *octeon_device[MAX_OCTEON_DEVICES];
 static atomic_t adapter_refcounts[MAX_OCTEON_DEVICES];
+static atomic_t adapter_fw_states[MAX_OCTEON_DEVICES];
 
 static u32 octeon_device_count;
 /* locks device array (i.e. octeon_device[]) */
@@ -770,6 +771,10 @@ int octeon_register_device(struct octeon_device *oct,
 	oct->adapter_refcount = &adapter_refcounts[oct->octeon_id];
 	atomic_set(oct->adapter_refcount, 0);
 
+	/* Like the reference count, the f/w state is shared 'per-adapter' */
+	oct->adapter_fw_state = &adapter_fw_states[oct->octeon_id];
+	atomic_set(oct->adapter_fw_state, FW_NEEDS_TO_BE_LOADED);
+
 	spin_lock(&octeon_devices_lock);
 	for (idx = (int)oct->octeon_id - 1; idx >= 0; idx--) {
 		if (!octeon_device[idx]) {
@@ -780,11 +785,15 @@ int octeon_register_device(struct octeon_device *oct,
 			atomic_inc(oct->adapter_refcount);
 			return 1; /* here, refcount is guaranteed to be 1 */
 		}
-		/* if another device is at same bus/dev, use its refcounter */
+		/* If another device is at same bus/dev, use its refcounter
+		 * (and f/w state variable).
+		 */
 		if ((octeon_device[idx]->loc.bus == bus) &&
 		    (octeon_device[idx]->loc.dev == dev)) {
 			oct->adapter_refcount =
 				octeon_device[idx]->adapter_refcount;
+			oct->adapter_fw_state =
+				octeon_device[idx]->adapter_fw_state;
 			break;
 		}
 	}
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index 894af199ddef..33d19c4509bc 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -50,6 +50,13 @@ enum octeon_pci_swap_mode {
 	OCTEON_PCI_32BIT_LW_SWAP = 3
 };
 
+enum lio_fw_state {
+	FW_IS_PRELOADED = 0,
+	FW_NEEDS_TO_BE_LOADED = 1,
+	FW_IS_BEING_LOADED = 2,
+	FW_HAS_BEEN_LOADED = 3,
+};
+
 enum {
 	OCTEON_CONFIG_TYPE_DEFAULT = 0,
 	NUM_OCTEON_CONFS,
@@ -557,6 +564,9 @@ struct octeon_device {
 	} loc;
 
 	atomic_t *adapter_refcount; /* reference count of adapter */
+
+	atomic_t *adapter_fw_state; /* per-adapter, lio_fw_state */
+
 	bool ptp_enable;
 };
 
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 49b80da51ba7..d68478afccbf 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 
 	xdp.data_hard_start = page_address(page);
 	xdp.data = (void *)cpu_addr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + len;
 	orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c
index 0f13a7f7c1d3..75e439918700 100644
--- a/drivers/net/ethernet/chelsio/cxgb/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
@@ -2075,9 +2075,8 @@ struct sge *t1_sge_create(struct adapter *adapter, struct sge_params *p)
 			goto nomem_port;
 	}
 
-	init_timer(&sge->tx_reclaim_timer);
-	sge->tx_reclaim_timer.data = (unsigned long)sge;
-	sge->tx_reclaim_timer.function = sge_tx_reclaim_cb;
+	setup_timer(&sge->tx_reclaim_timer, sge_tx_reclaim_cb,
+		    (unsigned long)sge);
 
 	if (is_T2(sge->adapter)) {
 		init_timer(&sge->espibug_timer);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile b/drivers/net/ethernet/chelsio/cxgb4/Makefile
index 817212702f0a..fecd7aab673b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/Makefile
+++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile
@@ -4,7 +4,9 @@
 
 obj-$(CONFIG_CHELSIO_T4) += cxgb4.o
 
-cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o cxgb4_ptp.o
+cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o \
+	      cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o \
+	      cxgb4_ptp.o cxgb4_tc_flower.o
 cxgb4-$(CONFIG_CHELSIO_T4_DCB) +=  cxgb4_dcb.o
 cxgb4-$(CONFIG_CHELSIO_T4_FCOE) +=  cxgb4_fcoe.o
 cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index ea72d2d2e1b4..0db3ab6ad094 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -549,6 +549,7 @@ enum {                                 /* adapter flags */
 	MASTER_PF          = (1 << 7),
 	FW_OFLD_CONN       = (1 << 9),
 	ROOT_NO_RELAXED_ORDERING = (1 << 10),
+	SHUTTING_DOWN	   = (1 << 11),
 };
 
 enum {
@@ -904,6 +905,10 @@ struct adapter {
 	/* TC u32 offload */
 	struct cxgb4_tc_u32_table *tc_u32;
 	struct chcr_stats_debug chcr_stats;
+
+	/* TC flower offload */
+	DECLARE_HASHTABLE(flower_anymatch_tbl, 9);
+	struct timer_list flower_stats_timer;
 };
 
 /* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
index 6ee2ed30626b..4e7f72b17e82 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
@@ -40,8 +40,7 @@ static inline bool cxgb4_dcb_state_synced(enum cxgb4_dcb_state state)
 		return false;
 }
 
-/* Initialize a port's Data Center Bridging state.  Typically used after a
- * Link Down event.
+/* Initialize a port's Data Center Bridging state.
  */
 void cxgb4_dcb_state_init(struct net_device *dev)
 {
@@ -106,6 +105,15 @@ static void cxgb4_dcb_cleanup_apps(struct net_device *dev)
 	}
 }
 
+/* Reset a port's Data Center Bridging state.  Typically used after a
+ * Link Down event.
+ */
+void cxgb4_dcb_reset(struct net_device *dev)
+{
+	cxgb4_dcb_cleanup_apps(dev);
+	cxgb4_dcb_state_init(dev);
+}
+
 /* Finite State machine for Data Center Bridging.
  */
 void cxgb4_dcb_state_fsm(struct net_device *dev,
@@ -194,8 +202,7 @@ void cxgb4_dcb_state_fsm(struct net_device *dev,
 			 * state.  We need to reset back to a ground state
 			 * of incomplete.
 			 */
-			cxgb4_dcb_cleanup_apps(dev);
-			cxgb4_dcb_state_init(dev);
+			cxgb4_dcb_reset(dev);
 			dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE;
 			dcb->supported = CXGB4_DCBX_FW_SUPPORT;
 			linkwatch_fire_event(dev);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
index ccf24d3dc982..02040b99c78a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
@@ -131,6 +131,7 @@ struct port_dcb_info {
 
 void cxgb4_dcb_state_init(struct net_device *);
 void cxgb4_dcb_version_init(struct net_device *);
+void cxgb4_dcb_reset(struct net_device *dev);
 void cxgb4_dcb_state_fsm(struct net_device *, enum cxgb4_dcb_state_input);
 void cxgb4_dcb_handle_fw_update(struct adapter *, const struct fw_port_cmd *);
 void cxgb4_dcb_set_caps(struct adapter *, const struct fw_port_cmd *);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 45b5853ca2f1..15361ca2857c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -148,6 +148,106 @@ static int get_filter_steerq(struct net_device *dev,
 	return iq;
 }
 
+static int get_filter_count(struct adapter *adapter, unsigned int fidx,
+			    u64 *pkts, u64 *bytes)
+{
+	unsigned int tcb_base, tcbaddr;
+	unsigned int word_offset;
+	struct filter_entry *f;
+	__be64 be64_byte_count;
+	int ret;
+
+	tcb_base = t4_read_reg(adapter, TP_CMM_TCB_BASE_A);
+	if ((fidx != (adapter->tids.nftids + adapter->tids.nsftids - 1)) &&
+	    fidx >= adapter->tids.nftids)
+		return -E2BIG;
+
+	f = &adapter->tids.ftid_tab[fidx];
+	if (!f->valid)
+		return -EINVAL;
+
+	tcbaddr = tcb_base + f->tid * TCB_SIZE;
+
+	spin_lock(&adapter->win0_lock);
+	if (is_t4(adapter->params.chip)) {
+		__be64 be64_count;
+
+		/* T4 doesn't maintain byte counts in hw */
+		*bytes = 0;
+
+		/* Get pkts */
+		word_offset = 4;
+		ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0,
+				   tcbaddr + (word_offset * sizeof(__be32)),
+				   sizeof(be64_count),
+				   (__be32 *)&be64_count,
+				   T4_MEMORY_READ);
+		if (ret < 0)
+			goto out;
+		*pkts = be64_to_cpu(be64_count);
+	} else {
+		__be32 be32_count;
+
+		/* Get bytes */
+		word_offset = 4;
+		ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0,
+				   tcbaddr + (word_offset * sizeof(__be32)),
+				   sizeof(be64_byte_count),
+				   &be64_byte_count,
+				   T4_MEMORY_READ);
+		if (ret < 0)
+			goto out;
+		*bytes = be64_to_cpu(be64_byte_count);
+
+		/* Get pkts */
+		word_offset = 6;
+		ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0,
+				   tcbaddr + (word_offset * sizeof(__be32)),
+				   sizeof(be32_count),
+				   &be32_count,
+				   T4_MEMORY_READ);
+		if (ret < 0)
+			goto out;
+		*pkts = (u64)be32_to_cpu(be32_count);
+	}
+
+out:
+	spin_unlock(&adapter->win0_lock);
+	return ret;
+}
+
+int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx,
+			      u64 *hitcnt, u64 *bytecnt)
+{
+	struct adapter *adapter = netdev2adap(dev);
+
+	return get_filter_count(adapter, fidx, hitcnt, bytecnt);
+}
+
+int cxgb4_get_free_ftid(struct net_device *dev, int family)
+{
+	struct adapter *adap = netdev2adap(dev);
+	struct tid_info *t = &adap->tids;
+	int ftid;
+
+	spin_lock_bh(&t->ftid_lock);
+	if (family == PF_INET) {
+		ftid = find_first_zero_bit(t->ftid_bmap, t->nftids);
+		if (ftid >= t->nftids)
+			ftid = -1;
+	} else {
+		ftid = bitmap_find_free_region(t->ftid_bmap, t->nftids, 2);
+		if (ftid < 0)
+			goto out_unlock;
+
+		/* this is only a lookup, keep the found region unallocated */
+		bitmap_release_region(t->ftid_bmap, ftid, 2);
+	}
+out_unlock:
+	spin_unlock_bh(&t->ftid_lock);
+	return ftid;
+}
+
 static int cxgb4_set_ftid(struct tid_info *t, int fidx, int family)
 {
 	spin_lock_bh(&t->ftid_lock);
@@ -191,7 +291,8 @@ static int del_filter_wr(struct adapter *adapter, int fidx)
 		return -ENOMEM;
 
 	fwr = __skb_put(skb, len);
-	t4_mk_filtdelwr(f->tid, fwr, adapter->sge.fw_evtq.abs_id);
+	t4_mk_filtdelwr(f->tid, fwr, (adapter->flags & SHUTTING_DOWN) ? -1
+			: adapter->sge.fw_evtq.abs_id);
 
 	/* Mark the filter as "pending" and ship off the Filter Work Request.
 	 * When we get the Work Request Reply we'll clear the pending status.
@@ -636,6 +737,10 @@ int cxgb4_del_filter(struct net_device *dev, int filter_id)
 	struct filter_ctx ctx;
 	int ret;
 
+	/* If we are shutting down the adapter do not wait for completion */
+	if (netdev2adap(dev)->flags & SHUTTING_DOWN)
+		return __cxgb4_del_filter(dev, filter_id, NULL);
+
 	init_completion(&ctx.completion);
 
 	ret = __cxgb4_del_filter(dev, filter_id, &ctx);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 92d9d795d874..fe4cbe22d5d7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -79,6 +79,7 @@
 #include "l2t.h"
 #include "sched.h"
 #include "cxgb4_tc_u32.h"
+#include "cxgb4_tc_flower.h"
 #include "cxgb4_ptp.h"
 
 char cxgb4_driver_name[] = KBUILD_MODNAME;
@@ -280,7 +281,7 @@ void t4_os_link_changed(struct adapter *adapter, int port_id, int link_stat)
 		else {
 #ifdef CONFIG_CHELSIO_T4_DCB
 			if (cxgb4_dcb_enabled(dev)) {
-				cxgb4_dcb_state_init(dev);
+				cxgb4_dcb_reset(dev);
 				dcb_tx_queue_prio_enable(dev, false);
 			}
 #endif /* CONFIG_CHELSIO_T4_DCB */
@@ -2303,10 +2304,16 @@ static int cxgb_close(struct net_device *dev)
 {
 	struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
+	int ret;
 
 	netif_tx_stop_all_queues(dev);
 	netif_carrier_off(dev);
-	return t4_enable_vi(adapter, adapter->pf, pi->viid, false, false);
+	ret = t4_enable_vi(adapter, adapter->pf, pi->viid, false, false);
+#ifdef CONFIG_CHELSIO_T4_DCB
+	cxgb4_dcb_reset(dev);
+	dcb_tx_queue_prio_enable(dev, false);
+#endif
+	return ret;
 }
 
 int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
@@ -2873,6 +2880,25 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate)
 	return err;
 }
 
+static int cxgb_setup_tc_flower(struct net_device *dev,
+				struct tc_cls_flower_offload *cls_flower)
+{
+	if (!is_classid_clsact_ingress(cls_flower->common.classid) ||
+	    cls_flower->common.chain_index)
+		return -EOPNOTSUPP;
+
+	switch (cls_flower->command) {
+	case TC_CLSFLOWER_REPLACE:
+		return cxgb4_tc_flower_replace(dev, cls_flower);
+	case TC_CLSFLOWER_DESTROY:
+		return cxgb4_tc_flower_destroy(dev, cls_flower);
+	case TC_CLSFLOWER_STATS:
+		return cxgb4_tc_flower_stats(dev, cls_flower);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static int cxgb_setup_tc_cls_u32(struct net_device *dev,
 				 struct tc_cls_u32_offload *cls_u32)
 {
@@ -2907,6 +2933,8 @@ static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	switch (type) {
 	case TC_SETUP_CLSU32:
 		return cxgb_setup_tc_cls_u32(dev, type_data);
+	case TC_SETUP_CLSFLOWER:
+		return cxgb_setup_tc_flower(dev, type_data);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -4615,6 +4643,7 @@ static void free_some_resources(struct adapter *adapter)
 	kvfree(adapter->l2t);
 	t4_cleanup_sched(adapter);
 	kvfree(adapter->tids.tid_tab);
+	cxgb4_cleanup_tc_flower(adapter);
 	cxgb4_cleanup_tc_u32(adapter);
 	kfree(adapter->sge.egr_map);
 	kfree(adapter->sge.ingr_map);
@@ -4995,7 +5024,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		netdev->priv_flags |= IFF_UNICAST_FLT;
 
 		/* MTU range: 81 - 9600 */
-		netdev->min_mtu = 81;
+		netdev->min_mtu = 81;              /* accommodate SACK */
 		netdev->max_mtu = MAX_MTU;
 
 		netdev->netdev_ops = &cxgb4_netdev_ops;
@@ -5083,6 +5112,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		if (!adapter->tc_u32)
 			dev_warn(&pdev->dev,
 				 "could not offload tc u32, continuing\n");
+
+		cxgb4_init_tc_flower(adapter);
 	}
 
 	if (is_offload(adapter)) {
@@ -5254,6 +5285,8 @@ static void remove_one(struct pci_dev *pdev)
 		return;
 	}
 
+	adapter->flags |= SHUTTING_DOWN;
+
 	if (adapter->pf == 4) {
 		int i;
 
@@ -5339,6 +5372,8 @@ static void shutdown_one(struct pci_dev *pdev)
 		return;
 	}
 
+	adapter->flags |= SHUTTING_DOWN;
+
 	if (adapter->pf == 4) {
 		int i;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
new file mode 100644
index 000000000000..92a311767381
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -0,0 +1,452 @@
+/*
+ * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <net/tc_act/tc_gact.h>
+#include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_vlan.h>
+
+#include "cxgb4.h"
+#include "cxgb4_tc_flower.h"
+
+#define STATS_CHECK_PERIOD (HZ / 2)
+
+static struct ch_tc_flower_entry *allocate_flower_entry(void)
+{
+	struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL);
+	spin_lock_init(&new->lock);
+	return new;
+}
+
+/* Must be called with either RTNL or rcu_read_lock */
+static struct ch_tc_flower_entry *ch_flower_lookup(struct adapter *adap,
+						   unsigned long flower_cookie)
+{
+	struct ch_tc_flower_entry *flower_entry;
+
+	hash_for_each_possible_rcu(adap->flower_anymatch_tbl, flower_entry,
+				   link, flower_cookie)
+		if (flower_entry->tc_flower_cookie == flower_cookie)
+			return flower_entry;
+	return NULL;
+}
+
+static void cxgb4_process_flow_match(struct net_device *dev,
+				     struct tc_cls_flower_offload *cls,
+				     struct ch_filter_specification *fs)
+{
+	u16 addr_type = 0;
+
+	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
+		struct flow_dissector_key_control *key =
+			skb_flow_dissector_target(cls->dissector,
+						  FLOW_DISSECTOR_KEY_CONTROL,
+						  cls->key);
+
+		addr_type = key->addr_type;
+	}
+
+	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *key =
+			skb_flow_dissector_target(cls->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  cls->key);
+		struct flow_dissector_key_basic *mask =
+			skb_flow_dissector_target(cls->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  cls->mask);
+		u16 ethtype_key = ntohs(key->n_proto);
+		u16 ethtype_mask = ntohs(mask->n_proto);
+
+		if (ethtype_key == ETH_P_ALL) {
+			ethtype_key = 0;
+			ethtype_mask = 0;
+		}
+
+		fs->val.ethtype = ethtype_key;
+		fs->mask.ethtype = ethtype_mask;
+		fs->val.proto = key->ip_proto;
+		fs->mask.proto = mask->ip_proto;
+	}
+
+	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+		struct flow_dissector_key_ipv4_addrs *key =
+			skb_flow_dissector_target(cls->dissector,
+						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						  cls->key);
+		struct flow_dissector_key_ipv4_addrs *mask =
+			skb_flow_dissector_target(cls->dissector,
+						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						  cls->mask);
+		fs->type = 0;
+		memcpy(&fs->val.lip[0], &key->dst, sizeof(key->dst));
+		memcpy(&fs->val.fip[0], &key->src, sizeof(key->src));
+		memcpy(&fs->mask.lip[0], &mask->dst, sizeof(mask->dst));
+		memcpy(&fs->mask.fip[0], &mask->src, sizeof(mask->src));
+	}
+
+	if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+		struct flow_dissector_key_ipv6_addrs *key =
+			skb_flow_dissector_target(cls->dissector,
+						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						  cls->key);
+		struct flow_dissector_key_ipv6_addrs *mask =
+			skb_flow_dissector_target(cls->dissector,
+						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						  cls->mask);
+
+		fs->type = 1;
+		memcpy(&fs->val.lip[0], key->dst.s6_addr, sizeof(key->dst));
+		memcpy(&fs->val.fip[0], key->src.s6_addr, sizeof(key->src));
+		memcpy(&fs->mask.lip[0], mask->dst.s6_addr, sizeof(mask->dst));
+		memcpy(&fs->mask.fip[0], mask->src.s6_addr, sizeof(mask->src));
+	}
+
+	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+		struct flow_dissector_key_ports *key, *mask;
+
+		key = skb_flow_dissector_target(cls->dissector,
+						FLOW_DISSECTOR_KEY_PORTS,
+						cls->key);
+		mask = skb_flow_dissector_target(cls->dissector,
+						 FLOW_DISSECTOR_KEY_PORTS,
+						 cls->mask);
+		fs->val.lport = cpu_to_be16(key->dst);
+		fs->mask.lport = cpu_to_be16(mask->dst);
+		fs->val.fport = cpu_to_be16(key->src);
+		fs->mask.fport = cpu_to_be16(mask->src);
+	}
+
+	/* Match only packets coming from the ingress port where this
+	 * filter will be created.
+	 */
+	fs->val.iport = netdev2pinfo(dev)->port_id;
+	fs->mask.iport = ~0;
+}
+
+static int cxgb4_validate_flow_match(struct net_device *dev,
+				     struct tc_cls_flower_offload *cls)
+{
+	if (cls->dissector->used_keys &
+	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+	      BIT(FLOW_DISSECTOR_KEY_BASIC) |
+	      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_PORTS))) {
+		netdev_warn(dev, "Unsupported key used: 0x%x\n",
+			    cls->dissector->used_keys);
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+static void cxgb4_process_flow_actions(struct net_device *in,
+				       struct tc_cls_flower_offload *cls,
+				       struct ch_filter_specification *fs)
+{
+	const struct tc_action *a;
+	LIST_HEAD(actions);
+
+	tcf_exts_to_list(cls->exts, &actions);
+	list_for_each_entry(a, &actions, list) {
+		if (is_tcf_gact_shot(a)) {
+			fs->action = FILTER_DROP;
+		} else if (is_tcf_mirred_egress_redirect(a)) {
+			int ifindex = tcf_mirred_ifindex(a);
+			struct net_device *out = __dev_get_by_index(dev_net(in),
+								    ifindex);
+			struct port_info *pi = netdev_priv(out);
+
+			fs->action = FILTER_SWITCH;
+			fs->eport = pi->port_id;
+		} else if (is_tcf_vlan(a)) {
+			u32 vlan_action = tcf_vlan_action(a);
+			u8 prio = tcf_vlan_push_prio(a);
+			u16 vid = tcf_vlan_push_vid(a);
+			u16 vlan_tci = (prio << VLAN_PRIO_SHIFT) | vid;
+
+			switch (vlan_action) {
+			case TCA_VLAN_ACT_POP:
+				fs->newvlan |= VLAN_REMOVE;
+				break;
+			case TCA_VLAN_ACT_PUSH:
+				fs->newvlan |= VLAN_INSERT;
+				fs->vlan = vlan_tci;
+				break;
+			case TCA_VLAN_ACT_MODIFY:
+				fs->newvlan |= VLAN_REWRITE;
+				fs->vlan = vlan_tci;
+				break;
+			default:
+				break;
+			}
+		}
+	}
+}
+
+static int cxgb4_validate_flow_actions(struct net_device *dev,
+				       struct tc_cls_flower_offload *cls)
+{
+	const struct tc_action *a;
+	LIST_HEAD(actions);
+
+	tcf_exts_to_list(cls->exts, &actions);
+	list_for_each_entry(a, &actions, list) {
+		if (is_tcf_gact_shot(a)) {
+			/* Do nothing */
+		} else if (is_tcf_mirred_egress_redirect(a)) {
+			struct adapter *adap = netdev2adap(dev);
+			struct net_device *n_dev;
+			unsigned int i, ifindex;
+			bool found = false;
+
+			ifindex = tcf_mirred_ifindex(a);
+			for_each_port(adap, i) {
+				n_dev = adap->port[i];
+				if (ifindex == n_dev->ifindex) {
+					found = true;
+					break;
+				}
+			}
+
+			/* If interface doesn't belong to our hw, then
+			 * the provided output port is not valid
+			 */
+			if (!found) {
+				netdev_err(dev, "%s: Out port invalid\n",
+					   __func__);
+				return -EINVAL;
+			}
+		} else if (is_tcf_vlan(a)) {
+			u16 proto = be16_to_cpu(tcf_vlan_push_proto(a));
+			u32 vlan_action = tcf_vlan_action(a);
+
+			switch (vlan_action) {
+			case TCA_VLAN_ACT_POP:
+				break;
+			case TCA_VLAN_ACT_PUSH:
+			case TCA_VLAN_ACT_MODIFY:
+				if (proto != ETH_P_8021Q) {
+					netdev_err(dev, "%s: Unsupported vlan proto\n",
+						   __func__);
+					return -EOPNOTSUPP;
+				}
+				break;
+			default:
+				netdev_err(dev, "%s: Unsupported vlan action\n",
+					   __func__);
+				return -EOPNOTSUPP;
+			}
+		} else {
+			netdev_err(dev, "%s: Unsupported action\n", __func__);
+			return -EOPNOTSUPP;
+		}
+	}
+	return 0;
+}
+
+int cxgb4_tc_flower_replace(struct net_device *dev,
+			    struct tc_cls_flower_offload *cls)
+{
+	struct adapter *adap = netdev2adap(dev);
+	struct ch_tc_flower_entry *ch_flower;
+	struct ch_filter_specification *fs;
+	struct filter_ctx ctx;
+	int fidx;
+	int ret;
+
+	if (cxgb4_validate_flow_actions(dev, cls))
+		return -EOPNOTSUPP;
+
+	if (cxgb4_validate_flow_match(dev, cls))
+		return -EOPNOTSUPP;
+
+	ch_flower = allocate_flower_entry();
+	if (!ch_flower) {
+		netdev_err(dev, "%s: ch_flower alloc failed.\n", __func__);
+		return -ENOMEM;
+	}
+
+	fs = &ch_flower->fs;
+	fs->hitcnts = 1;
+	cxgb4_process_flow_actions(dev, cls, fs);
+	cxgb4_process_flow_match(dev, cls, fs);
+
+	fidx = cxgb4_get_free_ftid(dev, fs->type ? PF_INET6 : PF_INET);
+	if (fidx < 0) {
+		netdev_err(dev, "%s: No fidx for offload.\n", __func__);
+		ret = -ENOMEM;
+		goto free_entry;
+	}
+
+	init_completion(&ctx.completion);
+	ret = __cxgb4_set_filter(dev, fidx, fs, &ctx);
+	if (ret) {
+		netdev_err(dev, "%s: filter creation err %d\n",
+			   __func__, ret);
+		goto free_entry;
+	}
+
+	/* Wait for reply */
+	ret = wait_for_completion_timeout(&ctx.completion, 10 * HZ);
+	if (!ret) {
+		ret = -ETIMEDOUT;
+		goto free_entry;
+	}
+
+	ret = ctx.result;
+	/* Check if hw returned error for filter creation */
+	if (ret) {
+		netdev_err(dev, "%s: filter creation err %d\n",
+			   __func__, ret);
+		goto free_entry;
+	}
+
+	INIT_HLIST_NODE(&ch_flower->link);
+	ch_flower->tc_flower_cookie = cls->cookie;
+	ch_flower->filter_id = ctx.tid;
+	hash_add_rcu(adap->flower_anymatch_tbl, &ch_flower->link, cls->cookie);
+
+	return ret;
+
+free_entry:
+	kfree(ch_flower);
+	return ret;
+}
+
+int cxgb4_tc_flower_destroy(struct net_device *dev,
+			    struct tc_cls_flower_offload *cls)
+{
+	struct adapter *adap = netdev2adap(dev);
+	struct ch_tc_flower_entry *ch_flower;
+	int ret;
+
+	ch_flower = ch_flower_lookup(adap, cls->cookie);
+	if (!ch_flower)
+		return -ENOENT;
+
+	ret = cxgb4_del_filter(dev, ch_flower->filter_id);
+	if (ret)
+		goto err;
+
+	hash_del_rcu(&ch_flower->link);
+	kfree_rcu(ch_flower, rcu);
+
+err:
+	return ret;
+}
+
+static void ch_flower_stats_cb(unsigned long data)
+{
+	struct adapter *adap = (struct adapter *)data;
+	struct ch_tc_flower_entry *flower_entry;
+	struct ch_tc_flower_stats *ofld_stats;
+	unsigned int i;
+	u64 packets;
+	u64 bytes;
+	int ret;
+
+	rcu_read_lock();
+	hash_for_each_rcu(adap->flower_anymatch_tbl, i, flower_entry, link) {
+		ret = cxgb4_get_filter_counters(adap->port[0],
+						flower_entry->filter_id,
+						&packets, &bytes);
+		if (!ret) {
+			spin_lock(&flower_entry->lock);
+			ofld_stats = &flower_entry->stats;
+
+			if (ofld_stats->prev_packet_count != packets) {
+				ofld_stats->prev_packet_count = packets;
+				ofld_stats->last_used = jiffies;
+			}
+			spin_unlock(&flower_entry->lock);
+		}
+	}
+	rcu_read_unlock();
+	mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD);
+}
+
+int cxgb4_tc_flower_stats(struct net_device *dev,
+			  struct tc_cls_flower_offload *cls)
+{
+	struct adapter *adap = netdev2adap(dev);
+	struct ch_tc_flower_stats *ofld_stats;
+	struct ch_tc_flower_entry *ch_flower;
+	u64 packets;
+	u64 bytes;
+	int ret;
+
+	ch_flower = ch_flower_lookup(adap, cls->cookie);
+	if (!ch_flower) {
+		ret = -ENOENT;
+		goto err;
+	}
+
+	ret = cxgb4_get_filter_counters(dev, ch_flower->filter_id,
+					&packets, &bytes);
+	if (ret < 0)
+		goto err;
+
+	spin_lock_bh(&ch_flower->lock);
+	ofld_stats = &ch_flower->stats;
+	if (ofld_stats->packet_count != packets) {
+		if (ofld_stats->prev_packet_count != packets)
+			ofld_stats->last_used = jiffies;
+		tcf_exts_stats_update(cls->exts, bytes - ofld_stats->byte_count,
+				      packets - ofld_stats->packet_count,
+				      ofld_stats->last_used);
+
+		ofld_stats->packet_count = packets;
+		ofld_stats->byte_count = bytes;
+		ofld_stats->prev_packet_count = packets;
+	}
+	spin_unlock_bh(&ch_flower->lock);
+	return 0;
+
+err:
+	return ret;
+}
+
+void cxgb4_init_tc_flower(struct adapter *adap)
+{
+	hash_init(adap->flower_anymatch_tbl);
+	setup_timer(&adap->flower_stats_timer, ch_flower_stats_cb,
+		    (unsigned long)adap);
+	mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD);
+}
+
+void cxgb4_cleanup_tc_flower(struct adapter *adap)
+{
+	if (adap->flower_stats_timer.function)
+		del_timer_sync(&adap->flower_stats_timer);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
new file mode 100644
index 000000000000..604feffc752e
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
@@ -0,0 +1,66 @@
+/*
+ * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CXGB4_TC_FLOWER_H
+#define __CXGB4_TC_FLOWER_H
+
+#include <net/pkt_cls.h>
+
+struct ch_tc_flower_stats {
+	u64 prev_packet_count;
+	u64 packet_count;
+	u64 byte_count;
+	u64 last_used;
+};
+
+struct ch_tc_flower_entry {
+	struct ch_filter_specification fs;
+	struct ch_tc_flower_stats stats;
+	unsigned long tc_flower_cookie;
+	struct hlist_node link;
+	struct rcu_head rcu;
+	spinlock_t lock; /* lock for stats */
+	u32 filter_id;
+};
+
+int cxgb4_tc_flower_replace(struct net_device *dev,
+			    struct tc_cls_flower_offload *cls);
+int cxgb4_tc_flower_destroy(struct net_device *dev,
+			    struct tc_cls_flower_offload *cls);
+int cxgb4_tc_flower_stats(struct net_device *dev,
+			  struct tc_cls_flower_offload *cls);
+
+void cxgb4_init_tc_flower(struct adapter *adap);
+void cxgb4_cleanup_tc_flower(struct adapter *adap);
+#endif /* __CXGB4_TC_FLOWER_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 84541fce94c5..52324c77a4fe 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -212,6 +212,7 @@ struct filter_ctx {
 
 struct ch_filter_specification;
 
+int cxgb4_get_free_ftid(struct net_device *dev, int family);
 int __cxgb4_set_filter(struct net_device *dev, int filter_id,
 		       struct ch_filter_specification *fs,
 		       struct filter_ctx *ctx);
@@ -220,6 +221,8 @@ int __cxgb4_del_filter(struct net_device *dev, int filter_id,
 int cxgb4_set_filter(struct net_device *dev, int filter_id,
 		     struct ch_filter_specification *fs);
 int cxgb4_del_filter(struct net_device *dev, int filter_id);
+int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx,
+			      u64 *hitcnt, u64 *bytecnt);
 
 static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue)
 {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
index f7ef8871dd0b..1817a0307d26 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
@@ -422,7 +422,7 @@ struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh,
 	u8 lport;
 	u16 vlan;
 	struct l2t_entry *e;
-	int addr_len = neigh->tbl->key_len;
+	unsigned int addr_len = neigh->tbl->key_len;
 	u32 *addr = (u32 *)neigh->primary_key;
 	int ifidx = neigh->dev->ifindex;
 	int hash = addr_hash(d, addr, addr_len, ifidx);
@@ -536,7 +536,7 @@ void t4_l2t_update(struct adapter *adap, struct neighbour *neigh)
 	struct l2t_entry *e;
 	struct sk_buff_head *arpq = NULL;
 	struct l2t_data *d = adap->l2t;
-	int addr_len = neigh->tbl->key_len;
+	unsigned int addr_len = neigh->tbl->key_len;
 	u32 *addr = (u32 *) neigh->primary_key;
 	int ifidx = neigh->dev->ifindex;
 	int hash = addr_hash(d, addr, addr_len, ifidx);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
index aa28299aef5f..633e9751a25e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
@@ -176,6 +176,11 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
 	CH_PCI_ID_TABLE_FENTRY(0x50a2), /* Custom T540-KR4 */
 	CH_PCI_ID_TABLE_FENTRY(0x50a3), /* Custom T580-KR4 */
 	CH_PCI_ID_TABLE_FENTRY(0x50a4), /* Custom 2x T540-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x50a5), /* Custom T522-BT */
+	CH_PCI_ID_TABLE_FENTRY(0x50a6), /* Custom T522-BT-SO */
+	CH_PCI_ID_TABLE_FENTRY(0x50a7), /* Custom T580-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x50a8), /* Custom T580-KR */
+	CH_PCI_ID_TABLE_FENTRY(0x50a9), /* Custom T580-KR */
 
 	/* T6 adapters:
 	 */
@@ -197,6 +202,7 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
 	CH_PCI_ID_TABLE_FENTRY(0x6082), /* Custom T6225-CR SFP28 */
 	CH_PCI_ID_TABLE_FENTRY(0x6083), /* Custom T62100-CR QSFP28 */
 	CH_PCI_ID_TABLE_FENTRY(0x6084), /* Custom T64100-CR QSFP28 */
+	CH_PCI_ID_TABLE_FENTRY(0x6085), /* Custom T6240-SO */
 CH_PCI_DEVICE_ID_TABLE_DEFINE_END;
 
 #endif /* __T4_PCI_ID_TBL_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
index a8d94963b4d0..67aec59a14e6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
@@ -1812,7 +1812,7 @@ int t4vf_eth_eq_free(struct adapter *adapter, unsigned int eqid)
  *
  *	Returns a string representation of the Link Down Reason Code.
  */
-const char *t4vf_link_down_rc_str(unsigned char link_down_rc)
+static const char *t4vf_link_down_rc_str(unsigned char link_down_rc)
 {
 	static const char * const reason[] = {
 		"Link Down",
@@ -1838,8 +1838,8 @@ const char *t4vf_link_down_rc_str(unsigned char link_down_rc)
  *
  *	Processes a GET_PORT_INFO FW reply message.
  */
-void t4vf_handle_get_port_info(struct port_info *pi,
-			       const struct fw_port_cmd *cmd)
+static void t4vf_handle_get_port_info(struct port_info *pi,
+				      const struct fw_port_cmd *cmd)
 {
 	int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16));
 	struct adapter *adapter = pi->adapter;
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.h b/drivers/net/ethernet/cisco/enic/enic_clsf.h
index 6aa9f89d073b..4bfbf25f9ddc 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.h
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.h
@@ -19,9 +19,8 @@ void enic_flow_may_expire(unsigned long data);
 
 static inline void enic_rfs_timer_start(struct enic *enic)
 {
-	init_timer(&enic->rfs_h.rfs_may_expire);
-	enic->rfs_h.rfs_may_expire.function = enic_flow_may_expire;
-	enic->rfs_h.rfs_may_expire.data = (unsigned long)enic;
+	setup_timer(&enic->rfs_h.rfs_may_expire, enic_flow_may_expire,
+		    (unsigned long)enic);
 	mod_timer(&enic->rfs_h.rfs_may_expire, jiffies + HZ/4);
 }
 
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index d24ee1ad3be1..4a11baffe02d 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -2846,9 +2846,8 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Setup notification timer, HW reset task, and wq locks
 	 */
 
-	init_timer(&enic->notify_timer);
-	enic->notify_timer.function = enic_notify_timer;
-	enic->notify_timer.data = (unsigned long)enic;
+	setup_timer(&enic->notify_timer, enic_notify_timer,
+		    (unsigned long)enic);
 
 	enic_set_rx_coal_setting(enic);
 	INIT_WORK(&enic->reset, enic_reset);
diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c
index 0affee9c8aa2..299812e92db7 100644
--- a/drivers/net/ethernet/dec/tulip/de4x5.c
+++ b/drivers/net/ethernet/dec/tulip/de4x5.c
@@ -1147,9 +1147,8 @@ de4x5_hw_init(struct net_device *dev, u_long iobase, struct device *gendev)
 	lp->timeout = -1;
 	lp->gendev = gendev;
 	spin_lock_init(&lp->lock);
-	init_timer(&lp->timer);
-	lp->timer.function = (void (*)(unsigned long))de4x5_ast;
-	lp->timer.data = (unsigned long)dev;
+	setup_timer(&lp->timer, (void (*)(unsigned long))de4x5_ast,
+		    (unsigned long)dev);
 	de4x5_parse_params(dev);
 
 	/*
diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c
index 07e10a45beaa..6585f737d08b 100644
--- a/drivers/net/ethernet/dec/tulip/dmfe.c
+++ b/drivers/net/ethernet/dec/tulip/dmfe.c
@@ -596,10 +596,8 @@ static int dmfe_open(struct net_device *dev)
 	netif_wake_queue(dev);
 
 	/* set and active a timer process */
-	init_timer(&db->timer);
+	setup_timer(&db->timer, dmfe_timer, (unsigned long)dev);
 	db->timer.expires = DMFE_TIMER_WUT + HZ * 2;
-	db->timer.data = (unsigned long)dev;
-	db->timer.function = dmfe_timer;
 	add_timer(&db->timer);
 
 	return 0;
diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c
index 7fc248efc4ba..5fbbc0caba99 100644
--- a/drivers/net/ethernet/dec/tulip/uli526x.c
+++ b/drivers/net/ethernet/dec/tulip/uli526x.c
@@ -491,10 +491,8 @@ static int uli526x_open(struct net_device *dev)
 	netif_wake_queue(dev);
 
 	/* set and active a timer process */
-	init_timer(&db->timer);
+	setup_timer(&db->timer, uli526x_timer, (unsigned long)dev);
 	db->timer.expires = ULI526X_TIMER_WUT + HZ * 2;
-	db->timer.data = (unsigned long)dev;
-	db->timer.function = uli526x_timer;
 	add_timer(&db->timer);
 
 	return 0;
diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c
index 32d7229544fa..6f88d687b6d2 100644
--- a/drivers/net/ethernet/dec/tulip/winbond-840.c
+++ b/drivers/net/ethernet/dec/tulip/winbond-840.c
@@ -655,10 +655,8 @@ static int netdev_open(struct net_device *dev)
 		netdev_dbg(dev, "Done netdev_open()\n");
 
 	/* Set the timer to check for link beat. */
-	init_timer(&np->timer);
+	setup_timer(&np->timer, netdev_timer, (unsigned long)dev);
 	np->timer.expires = jiffies + 1*HZ;
-	np->timer.data = (unsigned long)dev;
-	np->timer.function = netdev_timer;				/* timer handler */
 	add_timer(&np->timer);
 	return 0;
 out_err:
diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c
index 2704bcf023be..6ca9e981ad57 100644
--- a/drivers/net/ethernet/dlink/sundance.c
+++ b/drivers/net/ethernet/dlink/sundance.c
@@ -913,10 +913,8 @@ static int netdev_open(struct net_device *dev)
 			   ioread16(ioaddr + MACCtrl1), ioread16(ioaddr + MACCtrl0));
 
 	/* Set the timer to check for link beat. */
-	init_timer(&np->timer);
+	setup_timer(&np->timer, netdev_timer, (unsigned long)dev);
 	np->timer.expires = jiffies + 3*HZ;
-	np->timer.data = (unsigned long)dev;
-	np->timer.function = netdev_timer;				/* timer handler */
 	add_timer(&np->timer);
 
 	/* Enable interrupts by setting the interrupt mask. */
diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c
index e92859dab7ae..c8982313d850 100644
--- a/drivers/net/ethernet/fealnx.c
+++ b/drivers/net/ethernet/fealnx.c
@@ -909,17 +909,13 @@ static int netdev_open(struct net_device *dev)
 		printk(KERN_DEBUG "%s: Done netdev_open().\n", dev->name);
 
 	/* Set the timer to check for link beat. */
-	init_timer(&np->timer);
+	setup_timer(&np->timer, netdev_timer, (unsigned long)dev);
 	np->timer.expires = RUN_AT(3 * HZ);
-	np->timer.data = (unsigned long) dev;
-	np->timer.function = netdev_timer;
 
 	/* timer handler */
 	add_timer(&np->timer);
 
-	init_timer(&np->reset_timer);
-	np->reset_timer.data = (unsigned long) dev;
-	np->reset_timer.function = reset_timer;
+	setup_timer(&np->reset_timer, reset_timer, (unsigned long)dev);
 	np->reset_timer_armed = 0;
 	return rc;
 }
diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index f77ba9fa257b..a96b838cffce 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -3857,8 +3857,9 @@ static int ucc_geth_probe(struct platform_device* ofdev)
 	}
 
 	if (netif_msg_probe(&debug))
-		pr_info("UCC%1d at 0x%8x (irq = %d)\n",
-			ug_info->uf_info.ucc_num + 1, ug_info->uf_info.regs,
+		pr_info("UCC%1d at 0x%8llx (irq = %d)\n",
+			ug_info->uf_info.ucc_num + 1,
+			(u64)ug_info->uf_info.regs,
 			ug_info->uf_info.irq);
 
 	/* Create an ethernet device instance */
diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig
index 91c7bdb9b43c..9d7cb0387bf7 100644
--- a/drivers/net/ethernet/hisilicon/Kconfig
+++ b/drivers/net/ethernet/hisilicon/Kconfig
@@ -103,4 +103,13 @@ config HNS3_ENET
 	  family of SoCs. This module depends upon HNAE3 driver to access the HNAE3
 	  devices and their associated operations.
 
+config HNS3_DCB
+	bool "Hisilicon HNS3 Data Center Bridge Support"
+	default n
+	depends on HNS3 && HNS3_HCLGE && DCB
+	---help---
+	  Say Y here if you want to use Data Center Bridging (DCB) in the HNS3 driver.
+
+	  If unsure, say N.
+
 endif # NET_VENDOR_HISILICON
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 1a01cadfe5f3..c677530841cf 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -28,6 +28,7 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/dcbnl.h>
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/module.h>
@@ -131,6 +132,7 @@ struct hnae3_client_ops {
 	int (*init_instance)(struct hnae3_handle *handle);
 	void (*uninit_instance)(struct hnae3_handle *handle, bool reset);
 	void (*link_status_change)(struct hnae3_handle *handle, bool state);
+	int (*setup_tc)(struct hnae3_handle *handle, u8 tc);
 };
 
 #define HNAE3_CLIENT_NAME_LENGTH 16
@@ -363,6 +365,20 @@ struct hnae3_ae_ops {
 				  u16 vlan, u8 qos, __be16 proto);
 };
 
+struct hnae3_dcb_ops {
+	/* IEEE 802.1Qaz std */
+	int (*ieee_getets)(struct hnae3_handle *, struct ieee_ets *);
+	int (*ieee_setets)(struct hnae3_handle *, struct ieee_ets *);
+	int (*ieee_getpfc)(struct hnae3_handle *, struct ieee_pfc *);
+	int (*ieee_setpfc)(struct hnae3_handle *, struct ieee_pfc *);
+
+	/* DCBX configuration */
+	u8   (*getdcbx)(struct hnae3_handle *);
+	u8   (*setdcbx)(struct hnae3_handle *, u8);
+
+	int (*map_update)(struct hnae3_handle *);
+};
+
 struct hnae3_ae_algo {
 	const struct hnae3_ae_ops *ops;
 	struct list_head node;
@@ -394,6 +410,7 @@ struct hnae3_knic_private_info {
 
 	u16 num_tqps;		  /* total number of TQPs in this handle */
 	struct hnae3_queue **tqp;  /* array base of all TQPs in this instance */
+	const struct hnae3_dcb_ops *dcb_ops;
 };
 
 struct hnae3_roce_private_info {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile
index 162e8a42acd0..d2b20d01a58c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile
@@ -7,5 +7,9 @@ ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3
 obj-$(CONFIG_HNS3_HCLGE) += hclge.o
 hclge-objs = hclge_main.o hclge_cmd.o hclge_mdio.o hclge_tm.o
 
+hclge-$(CONFIG_HNS3_DCB) += hclge_dcb.o
+
 obj-$(CONFIG_HNS3_ENET) += hns3.o
 hns3-objs = hns3_enet.o hns3_ethtool.o
+
+hns3-$(CONFIG_HNS3_DCB) += hns3_dcbnl.o
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index 758cf3948131..6b6d28eff664 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -311,6 +311,7 @@ struct hclge_tc_thrd {
 struct hclge_priv_buf {
 	struct hclge_waterline wl;	/* Waterline for low and high*/
 	u32 buf_size;	/* TC private buffer size */
+	u32 tx_buf_size;
 	u32 enable;	/* Enable TC private buffer or not */
 };
 
@@ -321,6 +322,11 @@ struct hclge_shared_buf {
 	u32 buf_size;
 };
 
+struct hclge_pkt_buf_alloc {
+	struct hclge_priv_buf priv_buf[HCLGE_MAX_TC_NUM];
+	struct hclge_shared_buf s_buf;
+};
+
 #define HCLGE_RX_COM_WL_EN_B	15
 struct hclge_rx_com_wl_buf {
 	__le16 high_wl;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
new file mode 100644
index 000000000000..1b30a6f966d8
--- /dev/null
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2016-2017 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include "hclge_main.h"
+#include "hclge_tm.h"
+#include "hnae3.h"
+
+#define BW_PERCENT	100
+
+static int hclge_ieee_ets_to_tm_info(struct hclge_dev *hdev,
+				     struct ieee_ets *ets)
+{
+	u8 i;
+
+	for (i = 0; i < HNAE3_MAX_TC; i++) {
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_STRICT:
+			hdev->tm_info.tc_info[i].tc_sch_mode =
+				HCLGE_SCH_MODE_SP;
+			hdev->tm_info.pg_info[0].tc_dwrr[i] = 0;
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			hdev->tm_info.tc_info[i].tc_sch_mode =
+				HCLGE_SCH_MODE_DWRR;
+			hdev->tm_info.pg_info[0].tc_dwrr[i] =
+				ets->tc_tx_bw[i];
+			break;
+		default:
+			/* Hardware only supports SP (strict priority)
+			 * or ETS (enhanced transmission selection)
+			 * algorithms, if we receive some other value
+			 * from dcbnl, then throw an error.
+			 */
+			return -EINVAL;
+		}
+	}
+
+	return hclge_tm_prio_tc_info_update(hdev, ets->prio_tc);
+}
+
+static void hclge_tm_info_to_ieee_ets(struct hclge_dev *hdev,
+				      struct ieee_ets *ets)
+{
+	u32 i;
+
+	memset(ets, 0, sizeof(*ets));
+	ets->willing = 1;
+	ets->ets_cap = hdev->tc_max;
+
+	for (i = 0; i < HNAE3_MAX_TC; i++) {
+		ets->prio_tc[i] = hdev->tm_info.prio_tc[i];
+		ets->tc_tx_bw[i] = hdev->tm_info.pg_info[0].tc_dwrr[i];
+
+		if (hdev->tm_info.tc_info[i].tc_sch_mode ==
+		    HCLGE_SCH_MODE_SP)
+			ets->tc_tsa[i] = IEEE_8021QAZ_TSA_STRICT;
+		else
+			ets->tc_tsa[i] = IEEE_8021QAZ_TSA_ETS;
+	}
+}
+
+/* IEEE std */
+static int hclge_ieee_getets(struct hnae3_handle *h, struct ieee_ets *ets)
+{
+	struct hclge_vport *vport = hclge_get_vport(h);
+	struct hclge_dev *hdev = vport->back;
+
+	hclge_tm_info_to_ieee_ets(hdev, ets);
+
+	return 0;
+}
+
+static int hclge_ets_validate(struct hclge_dev *hdev, struct ieee_ets *ets,
+			      u8 *tc, bool *changed)
+{
+	u32 total_ets_bw = 0;
+	u8 max_tc = 0;
+	u8 i;
+
+	for (i = 0; i < HNAE3_MAX_TC; i++) {
+		if (ets->prio_tc[i] >= hdev->tc_max ||
+		    i >= hdev->tc_max)
+			return -EINVAL;
+
+		if (ets->prio_tc[i] != hdev->tm_info.prio_tc[i])
+			*changed = true;
+
+		if (ets->prio_tc[i] > max_tc)
+			max_tc = ets->prio_tc[i];
+
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_STRICT:
+			if (hdev->tm_info.tc_info[i].tc_sch_mode !=
+				HCLGE_SCH_MODE_SP)
+				*changed = true;
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			if (hdev->tm_info.tc_info[i].tc_sch_mode !=
+				HCLGE_SCH_MODE_DWRR)
+				*changed = true;
+
+			total_ets_bw += ets->tc_tx_bw[i];
+		break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (total_ets_bw != BW_PERCENT)
+		return -EINVAL;
+
+	*tc = max_tc + 1;
+	if (*tc != hdev->tm_info.num_tc)
+		*changed = true;
+
+	return 0;
+}
+
+static int hclge_map_update(struct hnae3_handle *h)
+{
+	struct hclge_vport *vport = hclge_get_vport(h);
+	struct hclge_dev *hdev = vport->back;
+	int ret;
+
+	ret = hclge_tm_map_cfg(hdev);
+	if (ret)
+		return ret;
+
+	ret = hclge_tm_schd_mode_hw(hdev);
+	if (ret)
+		return ret;
+
+	ret = hclge_pause_setup_hw(hdev);
+	if (ret)
+		return ret;
+
+	ret = hclge_buffer_alloc(hdev);
+	if (ret)
+		return ret;
+
+	return hclge_rss_init_hw(hdev);
+}
+
+static int hclge_client_setup_tc(struct hclge_dev *hdev)
+{
+	struct hclge_vport *vport = hdev->vport;
+	struct hnae3_client *client;
+	struct hnae3_handle *handle;
+	int ret;
+	u32 i;
+
+	for (i = 0; i < hdev->num_vmdq_vport + 1; i++) {
+		handle = &vport[i].nic;
+		client = handle->client;
+
+		if (!client || !client->ops || !client->ops->setup_tc)
+			continue;
+
+		ret = client->ops->setup_tc(handle, hdev->tm_info.num_tc);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int hclge_ieee_setets(struct hnae3_handle *h, struct ieee_ets *ets)
+{
+	struct hclge_vport *vport = hclge_get_vport(h);
+	struct hclge_dev *hdev = vport->back;
+	bool map_changed = false;
+	u8 num_tc = 0;
+	int ret;
+
+	if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE))
+		return -EINVAL;
+
+	ret = hclge_ets_validate(hdev, ets, &num_tc, &map_changed);
+	if (ret)
+		return ret;
+
+	hclge_tm_schd_info_update(hdev, num_tc);
+
+	ret = hclge_ieee_ets_to_tm_info(hdev, ets);
+	if (ret)
+		return ret;
+
+	if (map_changed) {
+		ret = hclge_client_setup_tc(hdev);
+		if (ret)
+			return ret;
+	}
+
+	return hclge_tm_dwrr_cfg(hdev);
+}
+
+static int hclge_ieee_getpfc(struct hnae3_handle *h, struct ieee_pfc *pfc)
+{
+	struct hclge_vport *vport = hclge_get_vport(h);
+	struct hclge_dev *hdev = vport->back;
+	u8 i, j, pfc_map, *prio_tc;
+
+	memset(pfc, 0, sizeof(*pfc));
+	pfc->pfc_cap = hdev->pfc_max;
+	prio_tc = hdev->tm_info.prio_tc;
+	pfc_map = hdev->tm_info.hw_pfc_map;
+
+	/* Pfc setting is based on TC */
+	for (i = 0; i < hdev->tm_info.num_tc; i++) {
+		for (j = 0; j < HNAE3_MAX_USER_PRIO; j++) {
+			if ((prio_tc[j] == i) && (pfc_map & BIT(i)))
+				pfc->pfc_en |= BIT(j);
+		}
+	}
+
+	return 0;
+}
+
+static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc)
+{
+	struct hclge_vport *vport = hclge_get_vport(h);
+	struct hclge_dev *hdev = vport->back;
+	u8 i, j, pfc_map, *prio_tc;
+
+	if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE))
+		return -EINVAL;
+
+	prio_tc = hdev->tm_info.prio_tc;
+	pfc_map = 0;
+
+	for (i = 0; i < hdev->tm_info.num_tc; i++) {
+		for (j = 0; j < HNAE3_MAX_USER_PRIO; j++) {
+			if ((prio_tc[j] == i) && (pfc->pfc_en & BIT(j))) {
+				pfc_map |= BIT(i);
+				break;
+			}
+		}
+	}
+
+	if (pfc_map == hdev->tm_info.hw_pfc_map)
+		return 0;
+
+	hdev->tm_info.hw_pfc_map = pfc_map;
+
+	return hclge_pause_setup_hw(hdev);
+}
+
+/* DCBX configuration */
+static u8 hclge_getdcbx(struct hnae3_handle *h)
+{
+	struct hclge_vport *vport = hclge_get_vport(h);
+	struct hclge_dev *hdev = vport->back;
+
+	return hdev->dcbx_cap;
+}
+
+static u8 hclge_setdcbx(struct hnae3_handle *h, u8 mode)
+{
+	struct hclge_vport *vport = hclge_get_vport(h);
+	struct hclge_dev *hdev = vport->back;
+
+	/* No support for LLD_MANAGED modes or CEE */
+	if ((mode & DCB_CAP_DCBX_LLD_MANAGED) ||
+	    (mode & DCB_CAP_DCBX_VER_CEE) ||
+	    !(mode & DCB_CAP_DCBX_HOST))
+		return 1;
+
+	hdev->dcbx_cap = mode;
+
+	return 0;
+}
+
+static const struct hnae3_dcb_ops hns3_dcb_ops = {
+	.ieee_getets	= hclge_ieee_getets,
+	.ieee_setets	= hclge_ieee_setets,
+	.ieee_getpfc	= hclge_ieee_getpfc,
+	.ieee_setpfc	= hclge_ieee_setpfc,
+	.getdcbx	= hclge_getdcbx,
+	.setdcbx	= hclge_setdcbx,
+	.map_update	= hclge_map_update,
+};
+
+void hclge_dcb_ops_set(struct hclge_dev *hdev)
+{
+	struct hclge_vport *vport = hdev->vport;
+	struct hnae3_knic_private_info *kinfo;
+
+	/* Hdev does not support DCB or vport is
+	 * not a pf, then dcb_ops is not set.
+	 */
+	if (!hnae3_dev_dcb_supported(hdev) ||
+	    vport->vport_id != 0)
+		return;
+
+	kinfo = &vport->nic.kinfo;
+	kinfo->dcb_ops = &hns3_dcb_ops;
+	hdev->dcbx_cap = DCB_CAP_DCBX_VER_IEEE | DCB_CAP_DCBX_HOST;
+}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h
new file mode 100644
index 000000000000..7d808ee96694
--- /dev/null
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016~2017 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __HCLGE_DCB_H__
+#define __HCLGE_DCB_H__
+
+#include "hclge_main.h"
+
+#ifdef CONFIG_HNS3_DCB
+void hclge_dcb_ops_set(struct hclge_dev *hdev);
+#else
+static inline void hclge_dcb_ops_set(struct hclge_dev *hdev) {}
+#endif
+
+#endif /* __HCLGE_DCB_H__ */
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c1cdbfd83bdb..1a13614af3de 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -19,6 +19,7 @@
 #include <linux/platform_device.h>
 
 #include "hclge_cmd.h"
+#include "hclge_dcb.h"
 #include "hclge_main.h"
 #include "hclge_mdio.h"
 #include "hclge_tm.h"
@@ -30,7 +31,6 @@
 #define HCLGE_64BIT_STATS_FIELD_OFF(f) (offsetof(struct hclge_64_bit_stats, f))
 #define HCLGE_32BIT_STATS_FIELD_OFF(f) (offsetof(struct hclge_32_bit_stats, f))
 
-static int hclge_rss_init_hw(struct hclge_dev *hdev);
 static int hclge_set_mta_filter_mode(struct hclge_dev *hdev,
 				     enum hclge_mta_dmac_sel_type mta_mac_sel,
 				     bool enable);
@@ -1058,7 +1058,7 @@ static int hclge_configure(struct hclge_dev *hdev)
 	hdev->hw.mac.phy_addr = cfg.phy_addr;
 	hdev->num_desc = cfg.tqp_desc_num;
 	hdev->tm_info.num_pg = 1;
-	hdev->tm_info.num_tc = cfg.tc_num;
+	hdev->tc_max = cfg.tc_num;
 	hdev->tm_info.hw_pfc_map = 0;
 
 	ret = hclge_parse_speed(cfg.default_speed, &hdev->hw.mac.speed);
@@ -1067,15 +1067,25 @@ static int hclge_configure(struct hclge_dev *hdev)
 		return ret;
 	}
 
-	if ((hdev->tm_info.num_tc > HNAE3_MAX_TC) ||
-	    (hdev->tm_info.num_tc < 1)) {
+	if ((hdev->tc_max > HNAE3_MAX_TC) ||
+	    (hdev->tc_max < 1)) {
 		dev_warn(&hdev->pdev->dev, "TC num = %d.\n",
-			 hdev->tm_info.num_tc);
-		hdev->tm_info.num_tc = 1;
+			 hdev->tc_max);
+		hdev->tc_max = 1;
 	}
 
+	/* Dev does not support DCB */
+	if (!hnae3_dev_dcb_supported(hdev)) {
+		hdev->tc_max = 1;
+		hdev->pfc_max = 0;
+	} else {
+		hdev->pfc_max = hdev->tc_max;
+	}
+
+	hdev->tm_info.num_tc = hdev->tc_max;
+
 	/* Currently not support uncontiuous tc */
-	for (i = 0; i < cfg.tc_num; i++)
+	for (i = 0; i < hdev->tm_info.num_tc; i++)
 		hnae_set_bit(hdev->hw_tc_map, i, 1);
 
 	if (!hdev->num_vmdq_vport && !hdev->num_req_vfs)
@@ -1324,7 +1334,8 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
+static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev,
+				    struct hclge_pkt_buf_alloc *buf_alloc)
 {
 /* TX buffer size is unit by 128 byte */
 #define HCLGE_BUF_SIZE_UNIT_SHIFT	7
@@ -1337,10 +1348,13 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
 	req = (struct hclge_tx_buff_alloc *)desc.data;
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
-	for (i = 0; i < HCLGE_TC_NUM; i++)
+	for (i = 0; i < HCLGE_TC_NUM; i++) {
+		u32 buf_size = buf_alloc->priv_buf[i].tx_buf_size;
+
 		req->tx_pkt_buff[i] =
 			cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) |
 				     HCLGE_BUF_SIZE_UPDATE_EN_MSK);
+	}
 
 	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
 	if (ret) {
@@ -1352,9 +1366,10 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
 	return 0;
 }
 
-static int hclge_tx_buffer_alloc(struct hclge_dev *hdev, u32 buf_size)
+static int hclge_tx_buffer_alloc(struct hclge_dev *hdev,
+				 struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	int ret = hclge_cmd_alloc_tx_buff(hdev, buf_size);
+	int ret = hclge_cmd_alloc_tx_buff(hdev, buf_alloc);
 
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
@@ -1387,13 +1402,14 @@ static int hclge_get_pfc_enalbe_num(struct hclge_dev *hdev)
 }
 
 /* Get the number of pfc enabled TCs, which have private buffer */
-static int hclge_get_pfc_priv_num(struct hclge_dev *hdev)
+static int hclge_get_pfc_priv_num(struct hclge_dev *hdev,
+				  struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	int i, cnt = 0;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if ((hdev->tm_info.hw_pfc_map & BIT(i)) &&
 		    priv->enable)
 			cnt++;
@@ -1403,13 +1419,14 @@ static int hclge_get_pfc_priv_num(struct hclge_dev *hdev)
 }
 
 /* Get the number of pfc disabled TCs, which have private buffer */
-static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev)
+static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev,
+				     struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	int i, cnt = 0;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (hdev->hw_tc_map & BIT(i) &&
 		    !(hdev->tm_info.hw_pfc_map & BIT(i)) &&
 		    priv->enable)
@@ -1419,21 +1436,33 @@ static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev)
 	return cnt;
 }
 
-static u32 hclge_get_rx_priv_buff_alloced(struct hclge_dev *hdev)
+static u32 hclge_get_rx_priv_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	u32 rx_priv = 0;
 	int i;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (priv->enable)
 			rx_priv += priv->buf_size;
 	}
 	return rx_priv;
 }
 
-static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
+static u32 hclge_get_tx_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc)
+{
+	u32 i, total_tx_size = 0;
+
+	for (i = 0; i < HCLGE_MAX_TC_NUM; i++)
+		total_tx_size += buf_alloc->priv_buf[i].tx_buf_size;
+
+	return total_tx_size;
+}
+
+static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev,
+				struct hclge_pkt_buf_alloc *buf_alloc,
+				u32 rx_all)
 {
 	u32 shared_buf_min, shared_buf_tc, shared_std;
 	int tc_num, pfc_enable_num;
@@ -1454,46 +1483,74 @@ static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
 			hdev->mps;
 	shared_std = max_t(u32, shared_buf_min, shared_buf_tc);
 
-	rx_priv = hclge_get_rx_priv_buff_alloced(hdev);
+	rx_priv = hclge_get_rx_priv_buff_alloced(buf_alloc);
 	if (rx_all <= rx_priv + shared_std)
 		return false;
 
 	shared_buf = rx_all - rx_priv;
-	hdev->s_buf.buf_size = shared_buf;
-	hdev->s_buf.self.high = shared_buf;
-	hdev->s_buf.self.low =  2 * hdev->mps;
+	buf_alloc->s_buf.buf_size = shared_buf;
+	buf_alloc->s_buf.self.high = shared_buf;
+	buf_alloc->s_buf.self.low =  2 * hdev->mps;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
 		if ((hdev->hw_tc_map & BIT(i)) &&
 		    (hdev->tm_info.hw_pfc_map & BIT(i))) {
-			hdev->s_buf.tc_thrd[i].low = hdev->mps;
-			hdev->s_buf.tc_thrd[i].high = 2 * hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].low = hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].high = 2 * hdev->mps;
 		} else {
-			hdev->s_buf.tc_thrd[i].low = 0;
-			hdev->s_buf.tc_thrd[i].high = hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].low = 0;
+			buf_alloc->s_buf.tc_thrd[i].high = hdev->mps;
 		}
 	}
 
 	return true;
 }
 
+static int hclge_tx_buffer_calc(struct hclge_dev *hdev,
+				struct hclge_pkt_buf_alloc *buf_alloc)
+{
+	u32 i, total_size;
+
+	total_size = hdev->pkt_buf_size;
+
+	/* alloc tx buffer for all enabled tc */
+	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
+		struct hclge_priv_buf *priv = &buf_alloc->priv_buf[i];
+
+		if (total_size < HCLGE_DEFAULT_TX_BUF)
+			return -ENOMEM;
+
+		if (hdev->hw_tc_map & BIT(i))
+			priv->tx_buf_size = HCLGE_DEFAULT_TX_BUF;
+		else
+			priv->tx_buf_size = 0;
+
+		total_size -= priv->tx_buf_size;
+	}
+
+	return 0;
+}
+
 /* hclge_rx_buffer_calc: calculate the rx private buffer size for all TCs
  * @hdev: pointer to struct hclge_dev
- * @tx_size: the allocated tx buffer for all TCs
+ * @buf_alloc: pointer to buffer calculation data
  * @return: 0: calculate sucessful, negative: fail
  */
-int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
+int hclge_rx_buffer_calc(struct hclge_dev *hdev,
+			 struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	u32 rx_all = hdev->pkt_buf_size - tx_size;
+	u32 rx_all = hdev->pkt_buf_size;
 	int no_pfc_priv_num, pfc_priv_num;
 	struct hclge_priv_buf *priv;
 	int i;
 
+	rx_all -= hclge_get_tx_buff_alloced(buf_alloc);
+
 	/* When DCB is not supported, rx private
 	 * buffer is not allocated.
 	 */
 	if (!hnae3_dev_dcb_supported(hdev)) {
-		if (!hclge_is_rx_buf_ok(hdev, rx_all))
+		if (!hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 			return -ENOMEM;
 
 		return 0;
@@ -1501,7 +1558,7 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
 
 	/* step 1, try to alloc private buffer for all enabled tc */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (hdev->hw_tc_map & BIT(i)) {
 			priv->enable = 1;
 			if (hdev->tm_info.hw_pfc_map & BIT(i)) {
@@ -1522,14 +1579,14 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
 		}
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 2, try to decrease the buffer size of
 	 * no pfc TC's private buffer
 	 */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		priv->enable = 0;
 		priv->wl.low = 0;
@@ -1552,18 +1609,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
 		}
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 3, try to reduce the number of pfc disabled TCs,
 	 * which have private buffer
 	 */
 	/* get the total no pfc enable TC number, which have private buffer */
-	no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev);
+	no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev, buf_alloc);
 
 	/* let the last to be cleared first */
 	for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		if (hdev->hw_tc_map & BIT(i) &&
 		    !(hdev->tm_info.hw_pfc_map & BIT(i))) {
@@ -1575,22 +1632,22 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
 			no_pfc_priv_num--;
 		}
 
-		if (hclge_is_rx_buf_ok(hdev, rx_all) ||
+		if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) ||
 		    no_pfc_priv_num == 0)
 			break;
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 4, try to reduce the number of pfc enabled TCs
 	 * which have private buffer.
 	 */
-	pfc_priv_num = hclge_get_pfc_priv_num(hdev);
+	pfc_priv_num = hclge_get_pfc_priv_num(hdev, buf_alloc);
 
 	/* let the last to be cleared first */
 	for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		if (hdev->hw_tc_map & BIT(i) &&
 		    hdev->tm_info.hw_pfc_map & BIT(i)) {
@@ -1602,17 +1659,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
 			pfc_priv_num--;
 		}
 
-		if (hclge_is_rx_buf_ok(hdev, rx_all) ||
+		if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) ||
 		    pfc_priv_num == 0)
 			break;
 	}
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	return -ENOMEM;
 }
 
-static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
+static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev,
+				   struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_rx_priv_buff *req;
 	struct hclge_desc desc;
@@ -1624,7 +1682,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 
 	/* Alloc private buffer TCs */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		struct hclge_priv_buf *priv = &hdev->priv_buf[i];
+		struct hclge_priv_buf *priv = &buf_alloc->priv_buf[i];
 
 		req->buf_num[i] =
 			cpu_to_le16(priv->buf_size >> HCLGE_BUF_UNIT_S);
@@ -1633,7 +1691,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 	}
 
 	req->shared_buf =
-		cpu_to_le16((hdev->s_buf.buf_size >> HCLGE_BUF_UNIT_S) |
+		cpu_to_le16((buf_alloc->s_buf.buf_size >> HCLGE_BUF_UNIT_S) |
 			    (1 << HCLGE_TC0_PRI_BUF_EN_B));
 
 	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -1648,7 +1706,8 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 
 #define HCLGE_PRIV_ENABLE(a) ((a) > 0 ? 1 : 0)
 
-static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
+static int hclge_rx_priv_wl_config(struct hclge_dev *hdev,
+				   struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_rx_priv_wl_buf *req;
 	struct hclge_priv_buf *priv;
@@ -1668,7 +1727,9 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
 			desc[i].flag &= ~cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
 
 		for (j = 0; j < HCLGE_TC_NUM_ONE_DESC; j++) {
-			priv = &hdev->priv_buf[i * HCLGE_TC_NUM_ONE_DESC + j];
+			u32 idx = i * HCLGE_TC_NUM_ONE_DESC + j;
+
+			priv = &buf_alloc->priv_buf[idx];
 			req->tc_wl[j].high =
 				cpu_to_le16(priv->wl.high >> HCLGE_BUF_UNIT_S);
 			req->tc_wl[j].high |=
@@ -1693,9 +1754,10 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_common_thrd_config(struct hclge_dev *hdev)
+static int hclge_common_thrd_config(struct hclge_dev *hdev,
+				    struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	struct hclge_shared_buf *s_buf = &hdev->s_buf;
+	struct hclge_shared_buf *s_buf = &buf_alloc->s_buf;
 	struct hclge_rx_com_thrd *req;
 	struct hclge_desc desc[2];
 	struct hclge_tc_thrd *tc;
@@ -1739,9 +1801,10 @@ static int hclge_common_thrd_config(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_common_wl_config(struct hclge_dev *hdev)
+static int hclge_common_wl_config(struct hclge_dev *hdev,
+				  struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	struct hclge_shared_buf *buf = &hdev->s_buf;
+	struct hclge_shared_buf *buf = &buf_alloc->s_buf;
 	struct hclge_rx_com_wl *req;
 	struct hclge_desc desc;
 	int ret;
@@ -1771,63 +1834,68 @@ static int hclge_common_wl_config(struct hclge_dev *hdev)
 
 int hclge_buffer_alloc(struct hclge_dev *hdev)
 {
-	u32 tx_buf_size = HCLGE_DEFAULT_TX_BUF;
+	struct hclge_pkt_buf_alloc *pkt_buf;
 	int ret;
 
-	hdev->priv_buf = devm_kmalloc_array(&hdev->pdev->dev, HCLGE_MAX_TC_NUM,
-					    sizeof(struct hclge_priv_buf),
-					    GFP_KERNEL | __GFP_ZERO);
-	if (!hdev->priv_buf)
+	pkt_buf = kzalloc(sizeof(*pkt_buf), GFP_KERNEL);
+	if (!pkt_buf)
 		return -ENOMEM;
 
-	ret = hclge_tx_buffer_alloc(hdev, tx_buf_size);
+	ret = hclge_tx_buffer_calc(hdev, pkt_buf);
+	if (ret) {
+		dev_err(&hdev->pdev->dev,
+			"could not calc tx buffer size for all TCs %d\n", ret);
+		goto out;
+	}
+
+	ret = hclge_tx_buffer_alloc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not alloc tx buffers %d\n", ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_rx_buffer_calc(hdev, tx_buf_size);
+	ret = hclge_rx_buffer_calc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not calc rx priv buffer size for all TCs %d\n",
 			ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_rx_priv_buf_alloc(hdev);
+	ret = hclge_rx_priv_buf_alloc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev, "could not alloc rx priv buffer %d\n",
 			ret);
-		return ret;
+		goto out;
 	}
 
 	if (hnae3_dev_dcb_supported(hdev)) {
-		ret = hclge_rx_priv_wl_config(hdev);
+		ret = hclge_rx_priv_wl_config(hdev, pkt_buf);
 		if (ret) {
 			dev_err(&hdev->pdev->dev,
 				"could not configure rx private waterline %d\n",
 				ret);
-			return ret;
+			goto out;
 		}
 
-		ret = hclge_common_thrd_config(hdev);
+		ret = hclge_common_thrd_config(hdev, pkt_buf);
 		if (ret) {
 			dev_err(&hdev->pdev->dev,
 				"could not configure common threshold %d\n",
 				ret);
-			return ret;
+			goto out;
 		}
 	}
 
-	ret = hclge_common_wl_config(hdev);
-	if (ret) {
+	ret = hclge_common_wl_config(hdev, pkt_buf);
+	if (ret)
 		dev_err(&hdev->pdev->dev,
 			"could not configure common waterline %d\n", ret);
-		return ret;
-	}
 
-	return 0;
+out:
+	kfree(pkt_buf);
+	return ret;
 }
 
 static int hclge_init_roce_base_info(struct hclge_vport *vport)
@@ -2597,7 +2665,7 @@ static int hclge_get_tc_size(struct hnae3_handle *handle)
 	return hdev->rss_size_max;
 }
 
-static int hclge_rss_init_hw(struct hclge_dev *hdev)
+int hclge_rss_init_hw(struct hclge_dev *hdev)
 {
 	const  u8 hfunc = HCLGE_RSS_HASH_ALGO_TOEPLITZ;
 	struct hclge_vport *vport = hdev->vport;
@@ -4182,6 +4250,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 		return ret;
 	}
 
+	hclge_dcb_ops_set(hdev);
+
 	setup_timer(&hdev->service_timer, hclge_service_timer,
 		    (unsigned long)hdev);
 	INIT_WORK(&hdev->service_task, hclge_service_task);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 9fcfd9395424..7c66c00e8a3e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -421,8 +421,11 @@ struct hclge_dev {
 #define HCLGE_FLAG_TC_BASE_SCH_MODE		1
 #define HCLGE_FLAG_VNET_BASE_SCH_MODE		2
 	u8 tx_sch_mode;
+	u8 tc_max;
+	u8 pfc_max;
 
 	u8 default_up;
+	u8 dcbx_cap;
 	struct hclge_tm_info tm_info;
 
 	u16 num_msi;
@@ -463,8 +466,6 @@ struct hclge_dev {
 
 	u32 pkt_buf_size; /* Total pf buf size for tx/rx */
 	u32 mps; /* Max packet size */
-	struct hclge_priv_buf *priv_buf;
-	struct hclge_shared_buf s_buf;
 
 	enum hclge_mta_dmac_sel_type mta_mac_sel_type;
 	bool enable_mta; /* Mutilcast filter enable */
@@ -517,4 +518,7 @@ static inline int hclge_get_queue_id(struct hnae3_queue *queue)
 int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex);
 int hclge_set_vf_vlan_common(struct hclge_dev *vport, int vfid,
 			     bool is_kill, u16 vlan, u8 qos, __be16 proto);
+
+int hclge_buffer_alloc(struct hclge_dev *hdev);
+int hclge_rss_init_hw(struct hclge_dev *hdev);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 73a75d7cc551..359ee670d1e1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -124,6 +124,20 @@ static int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx)
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
+static int hclge_pfc_pause_en_cfg(struct hclge_dev *hdev, u8 tx_rx_bitmap,
+				  u8 pfc_bitmap)
+{
+	struct hclge_desc desc;
+	struct hclge_pfc_en_cmd *pfc = (struct hclge_pfc_en_cmd *)&desc.data;
+
+	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_PFC_PAUSE_EN, false);
+
+	pfc->tx_rx_en_bitmap = tx_rx_bitmap;
+	pfc->pri_en_bitmap = pfc_bitmap;
+
+	return hclge_cmd_send(&hdev->hw, &desc, 1);
+}
+
 static int hclge_fill_pri_array(struct hclge_dev *hdev, u8 *pri, u8 pri_id)
 {
 	u8 tc;
@@ -287,6 +301,34 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev,
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
+static int hclge_tm_port_shaper_cfg(struct hclge_dev *hdev)
+{
+	struct hclge_port_shapping_cmd *shap_cfg_cmd;
+	struct hclge_desc desc;
+	u32 shapping_para = 0;
+	u8 ir_u, ir_b, ir_s;
+	int ret;
+
+	ret = hclge_shaper_para_calc(HCLGE_ETHER_MAX_RATE,
+				     HCLGE_SHAPER_LVL_PORT,
+				     &ir_b, &ir_u, &ir_s);
+	if (ret)
+		return ret;
+
+	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TM_PORT_SHAPPING, false);
+	shap_cfg_cmd = (struct hclge_port_shapping_cmd *)desc.data;
+
+	hclge_tm_set_field(shapping_para, IR_B, ir_b);
+	hclge_tm_set_field(shapping_para, IR_U, ir_u);
+	hclge_tm_set_field(shapping_para, IR_S, ir_s);
+	hclge_tm_set_field(shapping_para, BS_B, HCLGE_SHAPER_BS_U_DEF);
+	hclge_tm_set_field(shapping_para, BS_S, HCLGE_SHAPER_BS_S_DEF);
+
+	shap_cfg_cmd->port_shapping_para = cpu_to_le32(shapping_para);
+
+	return hclge_cmd_send(&hdev->hw, &desc, 1);
+}
+
 static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev,
 				     enum hclge_shap_bucket bucket, u8 pri_id,
 				     u8 ir_b, u8 ir_u, u8 ir_s,
@@ -346,13 +388,13 @@ static int hclge_tm_pri_schd_mode_cfg(struct hclge_dev *hdev, u8 pri_id)
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
-static int hclge_tm_qs_schd_mode_cfg(struct hclge_dev *hdev, u16 qs_id)
+static int hclge_tm_qs_schd_mode_cfg(struct hclge_dev *hdev, u16 qs_id, u8 mode)
 {
 	struct hclge_desc desc;
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TM_QS_SCH_MODE_CFG, false);
 
-	if (hdev->tm_info.tc_info[qs_id].tc_sch_mode == HCLGE_SCH_MODE_DWRR)
+	if (mode == HCLGE_SCH_MODE_DWRR)
 		desc.data[1] = cpu_to_le32(HCLGE_TM_TX_SCHD_DWRR_MSK);
 	else
 		desc.data[1] = 0;
@@ -444,7 +486,11 @@ static void hclge_tm_tc_info_init(struct hclge_dev *hdev)
 		hdev->tm_info.prio_tc[i] =
 			(i >= hdev->tm_info.num_tc) ? 0 : i;
 
-	hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE;
+	/* DCB is enabled if we have more than 1 TC */
+	if (hdev->tm_info.num_tc > 1)
+		hdev->flag |= HCLGE_FLAG_DCB_ENABLE;
+	else
+		hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE;
 }
 
 static void hclge_tm_pg_info_init(struct hclge_dev *hdev)
@@ -470,6 +516,24 @@ static void hclge_tm_pg_info_init(struct hclge_dev *hdev)
 	}
 }
 
+static void hclge_pfc_info_init(struct hclge_dev *hdev)
+{
+	if (!(hdev->flag & HCLGE_FLAG_DCB_ENABLE)) {
+		if (hdev->fc_mode_last_time == HCLGE_FC_PFC)
+			dev_warn(&hdev->pdev->dev,
+				 "DCB is disable, but last mode is FC_PFC\n");
+
+		hdev->tm_info.fc_mode = hdev->fc_mode_last_time;
+	} else if (hdev->tm_info.fc_mode != HCLGE_FC_PFC) {
+		/* fc_mode_last_time record the last fc_mode when
+		 * DCB is enabled, so that fc_mode can be set to
+		 * the correct value when DCB is disabled.
+		 */
+		hdev->fc_mode_last_time = hdev->tm_info.fc_mode;
+		hdev->tm_info.fc_mode = HCLGE_FC_PFC;
+	}
+}
+
 static int hclge_tm_schd_info_init(struct hclge_dev *hdev)
 {
 	if ((hdev->tx_sch_mode != HCLGE_FLAG_TC_BASE_SCH_MODE) &&
@@ -482,8 +546,7 @@ static int hclge_tm_schd_info_init(struct hclge_dev *hdev)
 
 	hclge_tm_vport_info_update(hdev);
 
-	hdev->tm_info.fc_mode = HCLGE_FC_NONE;
-	hdev->fc_mode_last_time = hdev->tm_info.fc_mode;
+	hclge_pfc_info_init(hdev);
 
 	return 0;
 }
@@ -596,17 +659,18 @@ static int hclge_tm_pri_q_qs_cfg(struct hclge_dev *hdev)
 {
 	struct hclge_vport *vport = hdev->vport;
 	int ret;
-	u32 i;
+	u32 i, k;
 
 	if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) {
 		/* Cfg qs -> pri mapping, one by one mapping */
-		for (i = 0; i < hdev->tm_info.num_tc; i++) {
-			ret = hclge_tm_qs_to_pri_map_cfg(hdev, i, i);
-			if (ret)
-				return ret;
-		}
+		for (k = 0; k < hdev->num_alloc_vport; k++)
+			for (i = 0; i < hdev->tm_info.num_tc; i++) {
+				ret = hclge_tm_qs_to_pri_map_cfg(
+					hdev, vport[k].qs_offset + i, i);
+				if (ret)
+					return ret;
+			}
 	} else if (hdev->tx_sch_mode == HCLGE_FLAG_VNET_BASE_SCH_MODE) {
-		int k;
 		/* Cfg qs -> pri mapping,  qs = tc, pri = vf, 8 qs -> 1 pri */
 		for (k = 0; k < hdev->num_alloc_vport; k++)
 			for (i = 0; i < HNAE3_MAX_TC; i++) {
@@ -755,10 +819,11 @@ static int hclge_tm_pri_shaper_cfg(struct hclge_dev *hdev)
 
 static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev)
 {
+	struct hclge_vport *vport = hdev->vport;
 	struct hclge_pg_info *pg_info;
 	u8 dwrr;
 	int ret;
-	u32 i;
+	u32 i, k;
 
 	for (i = 0; i < hdev->tm_info.num_tc; i++) {
 		pg_info =
@@ -769,9 +834,13 @@ static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev)
 		if (ret)
 			return ret;
 
-		ret = hclge_tm_qs_weight_cfg(hdev, i, dwrr);
-		if (ret)
-			return ret;
+		for (k = 0; k < hdev->num_alloc_vport; k++) {
+			ret = hclge_tm_qs_weight_cfg(
+				hdev, vport[k].qs_offset + i,
+				vport[k].dwrr);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
@@ -835,10 +904,14 @@ static int hclge_tm_pri_dwrr_cfg(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_tm_map_cfg(struct hclge_dev *hdev)
+int hclge_tm_map_cfg(struct hclge_dev *hdev)
 {
 	int ret;
 
+	ret = hclge_up_to_tc_map(hdev);
+	if (ret)
+		return ret;
+
 	ret = hclge_tm_pg_to_pri_map(hdev);
 	if (ret)
 		return ret;
@@ -850,6 +923,10 @@ static int hclge_tm_shaper_cfg(struct hclge_dev *hdev)
 {
 	int ret;
 
+	ret = hclge_tm_port_shaper_cfg(hdev);
+	if (ret)
+		return ret;
+
 	ret = hclge_tm_pg_shaper_cfg(hdev);
 	if (ret)
 		return ret;
@@ -898,7 +975,10 @@ static int hclge_tm_schd_mode_vnet_base_cfg(struct hclge_vport *vport)
 		return ret;
 
 	for (i = 0; i < kinfo->num_tc; i++) {
-		ret = hclge_tm_qs_schd_mode_cfg(hdev, vport->qs_offset + i);
+		u8 sch_mode = hdev->tm_info.tc_info[i].tc_sch_mode;
+
+		ret = hclge_tm_qs_schd_mode_cfg(hdev, vport->qs_offset + i,
+						sch_mode);
 		if (ret)
 			return ret;
 	}
@@ -910,7 +990,7 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev)
 {
 	struct hclge_vport *vport = hdev->vport;
 	int ret;
-	u8 i;
+	u8 i, k;
 
 	if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) {
 		for (i = 0; i < hdev->tm_info.num_tc; i++) {
@@ -918,9 +998,13 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev)
 			if (ret)
 				return ret;
 
-			ret = hclge_tm_qs_schd_mode_cfg(hdev, i);
-			if (ret)
-				return ret;
+			for (k = 0; k < hdev->num_alloc_vport; k++) {
+				ret = hclge_tm_qs_schd_mode_cfg(
+					hdev, vport[k].qs_offset + i,
+					HCLGE_SCH_MODE_DWRR);
+				if (ret)
+					return ret;
+			}
 		}
 	} else {
 		for (i = 0; i < hdev->num_alloc_vport; i++) {
@@ -935,7 +1019,7 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_tm_schd_mode_hw(struct hclge_dev *hdev)
+int hclge_tm_schd_mode_hw(struct hclge_dev *hdev)
 {
 	int ret;
 
@@ -969,27 +1053,109 @@ static int hclge_tm_schd_setup_hw(struct hclge_dev *hdev)
 	return hclge_tm_schd_mode_hw(hdev);
 }
 
+static int hclge_pfc_setup_hw(struct hclge_dev *hdev)
+{
+	u8 enable_bitmap = 0;
+
+	if (hdev->tm_info.fc_mode == HCLGE_FC_PFC)
+		enable_bitmap = HCLGE_TX_MAC_PAUSE_EN_MSK |
+				HCLGE_RX_MAC_PAUSE_EN_MSK;
+
+	return hclge_pfc_pause_en_cfg(hdev, enable_bitmap,
+				      hdev->tm_info.hw_pfc_map);
+}
+
+static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
+{
+	bool tx_en, rx_en;
+
+	switch (hdev->tm_info.fc_mode) {
+	case HCLGE_FC_NONE:
+		tx_en = false;
+		rx_en = false;
+		break;
+	case HCLGE_FC_RX_PAUSE:
+		tx_en = false;
+		rx_en = true;
+		break;
+	case HCLGE_FC_TX_PAUSE:
+		tx_en = true;
+		rx_en = false;
+		break;
+	case HCLGE_FC_FULL:
+		tx_en = true;
+		rx_en = true;
+		break;
+	default:
+		tx_en = true;
+		rx_en = true;
+	}
+
+	return hclge_mac_pause_en_cfg(hdev, tx_en, rx_en);
+}
+
 int hclge_pause_setup_hw(struct hclge_dev *hdev)
 {
-	bool en = hdev->tm_info.fc_mode != HCLGE_FC_PFC;
 	int ret;
 	u8 i;
 
-	ret = hclge_mac_pause_en_cfg(hdev, en, en);
-	if (ret)
-		return ret;
+	if (hdev->tm_info.fc_mode != HCLGE_FC_PFC)
+		return hclge_mac_pause_setup_hw(hdev);
 
-	/* Only DCB-supported dev supports qset back pressure setting */
+	/* Only DCB-supported dev supports qset back pressure and pfc cmd */
 	if (!hnae3_dev_dcb_supported(hdev))
 		return 0;
 
+	/* When MAC is GE Mode, hdev does not support pfc setting */
+	ret = hclge_pfc_setup_hw(hdev);
+	if (ret)
+		dev_warn(&hdev->pdev->dev, "set pfc pause failed:%d\n", ret);
+
 	for (i = 0; i < hdev->tm_info.num_tc; i++) {
 		ret = hclge_tm_qs_bp_cfg(hdev, i);
 		if (ret)
 			return ret;
 	}
 
-	return hclge_up_to_tc_map(hdev);
+	return 0;
+}
+
+int hclge_tm_prio_tc_info_update(struct hclge_dev *hdev, u8 *prio_tc)
+{
+	struct hclge_vport *vport = hdev->vport;
+	struct hnae3_knic_private_info *kinfo;
+	u32 i, k;
+
+	for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) {
+		if (prio_tc[i] >= hdev->tm_info.num_tc)
+			return -EINVAL;
+		hdev->tm_info.prio_tc[i] = prio_tc[i];
+
+		for (k = 0;  k < hdev->num_alloc_vport; k++) {
+			kinfo = &vport[k].nic.kinfo;
+			kinfo->prio_tc[i] = prio_tc[i];
+		}
+	}
+	return 0;
+}
+
+void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc)
+{
+	u8 i, bit_map = 0;
+
+	hdev->tm_info.num_tc = num_tc;
+
+	for (i = 0; i < hdev->tm_info.num_tc; i++)
+		bit_map |= BIT(i);
+
+	if (!bit_map) {
+		bit_map = 1;
+		hdev->tm_info.num_tc = 1;
+	}
+
+	hdev->hw_tc_map = bit_map;
+
+	hclge_tm_schd_info_init(hdev);
 }
 
 int hclge_tm_init_hw(struct hclge_dev *hdev)
@@ -1013,8 +1179,13 @@ int hclge_tm_init_hw(struct hclge_dev *hdev)
 
 int hclge_tm_schd_init(struct hclge_dev *hdev)
 {
-	int ret = hclge_tm_schd_info_init(hdev);
+	int ret;
+
+	/* fc_mode is HCLGE_FC_FULL on reset */
+	hdev->tm_info.fc_mode = HCLGE_FC_FULL;
+	hdev->fc_mode_last_time = hdev->tm_info.fc_mode;
 
+	ret = hclge_tm_schd_info_init(hdev);
 	if (ret)
 		return ret;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
index 85158b0d73fe..bf59961918ab 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
@@ -94,6 +94,15 @@ struct hclge_bp_to_qs_map_cmd {
 	u32 rsvd1;
 };
 
+struct hclge_pfc_en_cmd {
+	u8 tx_rx_en_bitmap;
+	u8 pri_en_bitmap;
+};
+
+struct hclge_port_shapping_cmd {
+	__le32 port_shapping_para;
+};
+
 #define hclge_tm_set_field(dest, string, val) \
 			hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \
 				       (HCLGE_TM_SHAP_##string##_LSH), val)
@@ -103,4 +112,10 @@ struct hclge_bp_to_qs_map_cmd {
 
 int hclge_tm_schd_init(struct hclge_dev *hdev);
 int hclge_pause_setup_hw(struct hclge_dev *hdev);
+int hclge_tm_schd_mode_hw(struct hclge_dev *hdev);
+int hclge_tm_prio_tc_info_update(struct hclge_dev *hdev, u8 *prio_tc);
+void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc);
+int hclge_tm_dwrr_cfg(struct hclge_dev *hdev);
+int hclge_tm_map_cfg(struct hclge_dev *hdev);
+int hclge_tm_init_hw(struct hclge_dev *hdev);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c
new file mode 100644
index 000000000000..9832172bfb08
--- /dev/null
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016-2017 Hisilicon Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include "hnae3.h"
+#include "hns3_enet.h"
+
+static
+int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets)
+{
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hnae3_handle *h = priv->ae_handle;
+
+	if (h->kinfo.dcb_ops->ieee_getets)
+		return h->kinfo.dcb_ops->ieee_getets(h, ets);
+
+	return -EOPNOTSUPP;
+}
+
+static
+int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets)
+{
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hnae3_handle *h = priv->ae_handle;
+
+	if (h->kinfo.dcb_ops->ieee_setets)
+		return h->kinfo.dcb_ops->ieee_setets(h, ets);
+
+	return -EOPNOTSUPP;
+}
+
+static
+int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc)
+{
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hnae3_handle *h = priv->ae_handle;
+
+	if (h->kinfo.dcb_ops->ieee_getpfc)
+		return h->kinfo.dcb_ops->ieee_getpfc(h, pfc);
+
+	return -EOPNOTSUPP;
+}
+
+static
+int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc)
+{
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hnae3_handle *h = priv->ae_handle;
+
+	if (h->kinfo.dcb_ops->ieee_setpfc)
+		return h->kinfo.dcb_ops->ieee_setpfc(h, pfc);
+
+	return -EOPNOTSUPP;
+}
+
+/* DCBX configuration */
+static u8 hns3_dcbnl_getdcbx(struct net_device *ndev)
+{
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hnae3_handle *h = priv->ae_handle;
+
+	if (h->kinfo.dcb_ops->getdcbx)
+		return h->kinfo.dcb_ops->getdcbx(h);
+
+	return 0;
+}
+
+/* return 0 if successful, otherwise fail */
+static u8 hns3_dcbnl_setdcbx(struct net_device *ndev, u8 mode)
+{
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hnae3_handle *h = priv->ae_handle;
+
+	if (h->kinfo.dcb_ops->setdcbx)
+		return h->kinfo.dcb_ops->setdcbx(h, mode);
+
+	return 1;
+}
+
+static const struct dcbnl_rtnl_ops hns3_dcbnl_ops = {
+	.ieee_getets	= hns3_dcbnl_ieee_getets,
+	.ieee_setets	= hns3_dcbnl_ieee_setets,
+	.ieee_getpfc	= hns3_dcbnl_ieee_getpfc,
+	.ieee_setpfc	= hns3_dcbnl_ieee_setpfc,
+	.getdcbx	= hns3_dcbnl_getdcbx,
+	.setdcbx	= hns3_dcbnl_setdcbx,
+};
+
+/* hclge_dcbnl_setup - DCBNL setup
+ * @handle: the corresponding vport handle
+ * Set up DCBNL
+ */
+void hns3_dcbnl_setup(struct hnae3_handle *handle)
+{
+	struct net_device *dev = handle->kinfo.netdev;
+
+	if (!handle->kinfo.dcb_ops)
+		return;
+
+	dev->dcbnl_ops = &hns3_dcbnl_ops;
+}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
index 35369e1c8036..c31506514e5d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
@@ -196,6 +196,32 @@ static void hns3_vector_gl_rl_init(struct hns3_enet_tqp_vector *tqp_vector)
 	tqp_vector->tx_group.flow_level = HNS3_FLOW_LOW;
 }
 
+static int hns3_nic_set_real_num_queue(struct net_device *netdev)
+{
+	struct hns3_nic_priv *priv = netdev_priv(netdev);
+	struct hnae3_handle *h = priv->ae_handle;
+	struct hnae3_knic_private_info *kinfo = &h->kinfo;
+	unsigned int queue_size = kinfo->rss_size * kinfo->num_tc;
+	int ret;
+
+	ret = netif_set_real_num_tx_queues(netdev, queue_size);
+	if (ret) {
+		netdev_err(netdev,
+			   "netif_set_real_num_tx_queues fail, ret=%d!\n",
+			   ret);
+		return ret;
+	}
+
+	ret = netif_set_real_num_rx_queues(netdev, queue_size);
+	if (ret) {
+		netdev_err(netdev,
+			   "netif_set_real_num_rx_queues fail, ret=%d!\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
 static int hns3_nic_net_up(struct net_device *netdev)
 {
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
@@ -232,26 +258,13 @@ out_start_err:
 
 static int hns3_nic_net_open(struct net_device *netdev)
 {
-	struct hns3_nic_priv *priv = netdev_priv(netdev);
-	struct hnae3_handle *h = priv->ae_handle;
 	int ret;
 
 	netif_carrier_off(netdev);
 
-	ret = netif_set_real_num_tx_queues(netdev, h->kinfo.num_tqps);
-	if (ret) {
-		netdev_err(netdev,
-			   "netif_set_real_num_tx_queues fail, ret=%d!\n",
-			   ret);
+	ret = hns3_nic_set_real_num_queue(netdev);
+	if (ret)
 		return ret;
-	}
-
-	ret = netif_set_real_num_rx_queues(netdev, h->kinfo.num_tqps);
-	if (ret) {
-		netdev_err(netdev,
-			   "netif_set_real_num_rx_queues fail, ret=%d!\n", ret);
-		return ret;
-	}
 
 	ret = hns3_nic_net_up(netdev);
 	if (ret) {
@@ -2790,6 +2803,8 @@ static int hns3_client_init(struct hnae3_handle *handle)
 		goto out_reg_netdev_fail;
 	}
 
+	hns3_dcbnl_setup(handle);
+
 	/* MTU range: (ETH_MIN_MTU(kernel default) - 9706) */
 	netdev->max_mtu = HNS3_MAX_MTU - (ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN);
 
@@ -2846,10 +2861,71 @@ static void hns3_link_status_change(struct hnae3_handle *handle, bool linkup)
 	}
 }
 
+static int hns3_client_setup_tc(struct hnae3_handle *handle, u8 tc)
+{
+	struct hnae3_knic_private_info *kinfo = &handle->kinfo;
+	struct net_device *ndev = kinfo->netdev;
+	bool if_running;
+	int ret;
+	u8 i;
+
+	if (tc > HNAE3_MAX_TC)
+		return -EINVAL;
+
+	if (!ndev)
+		return -ENODEV;
+
+	if_running = netif_running(ndev);
+
+	ret = netdev_set_num_tc(ndev, tc);
+	if (ret)
+		return ret;
+
+	if (if_running) {
+		(void)hns3_nic_net_stop(ndev);
+		msleep(100);
+	}
+
+	ret = (kinfo->dcb_ops && kinfo->dcb_ops->map_update) ?
+		kinfo->dcb_ops->map_update(handle) : -EOPNOTSUPP;
+	if (ret)
+		goto err_out;
+
+	if (tc <= 1) {
+		netdev_reset_tc(ndev);
+		goto out;
+	}
+
+	for (i = 0; i < HNAE3_MAX_TC; i++) {
+		struct hnae3_tc_info *tc_info = &kinfo->tc_info[i];
+
+		if (tc_info->enable)
+			netdev_set_tc_queue(ndev,
+					    tc_info->tc,
+					    tc_info->tqp_count,
+					    tc_info->tqp_offset);
+	}
+
+	for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) {
+		netdev_set_prio_tc_map(ndev, i,
+				       kinfo->prio_tc[i]);
+	}
+
+out:
+	ret = hns3_nic_set_real_num_queue(ndev);
+
+err_out:
+	if (if_running)
+		(void)hns3_nic_net_open(ndev);
+
+	return ret;
+}
+
 const struct hnae3_client_ops client_ops = {
 	.init_instance = hns3_client_init,
 	.uninit_instance = hns3_client_uninit,
 	.link_status_change = hns3_link_status_change,
+	.setup_tc = hns3_client_setup_tc,
 };
 
 /* hns3_init_module - Driver registration routine
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
index 7e8746189747..481eada73e2d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
@@ -590,4 +590,11 @@ static inline void hns3_write_reg(void __iomem *base, u32 reg, u32 value)
 void hns3_ethtool_set_ops(struct net_device *netdev);
 
 int hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget);
+
+#ifdef CONFIG_HNS3_DCB
+void hns3_dcbnl_setup(struct hnae3_handle *handle);
+#else
+static inline void hns3_dcbnl_setup(struct hnae3_handle *handle) {}
+#endif
+
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index 1d4f712b15a8..e2e5cdc7119c 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -26,6 +26,7 @@
 #include <linux/skbuff.h>
 #include <linux/dma-mapping.h>
 #include <linux/prefetch.h>
+#include <linux/cpumask.h>
 #include <asm/barrier.h>
 
 #include "hinic_common.h"
@@ -171,11 +172,10 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq)
 	struct hinic_sge sge;
 	dma_addr_t dma_addr;
 	struct sk_buff *skb;
-	int i, alloc_more;
 	u16 prod_idx;
+	int i;
 
 	free_wqebbs = hinic_get_rq_free_wqebbs(rxq->rq);
-	alloc_more = 0;
 
 	/* Limit the allocation chunks */
 	if (free_wqebbs > nic_dev->rx_weight)
@@ -185,7 +185,6 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq)
 		skb = rx_alloc_skb(rxq, &dma_addr);
 		if (!skb) {
 			netdev_err(rxq->netdev, "Failed to alloc Rx skb\n");
-			alloc_more = 1;
 			goto skb_out;
 		}
 
@@ -195,7 +194,6 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq)
 					  &prod_idx);
 		if (!rq_wqe) {
 			rx_free_skb(rxq, skb, dma_addr);
-			alloc_more = 1;
 			goto skb_out;
 		}
 
@@ -211,9 +209,7 @@ skb_out:
 		hinic_rq_update(rxq->rq, prod_idx);
 	}
 
-	if (alloc_more)
-		tasklet_schedule(&rxq->rx_task);
-
+	tasklet_schedule(&rxq->rx_task);
 	return i;
 }
 
@@ -357,7 +353,7 @@ static int rxq_recv(struct hinic_rxq *rxq, int budget)
 	}
 
 	if (pkts)
-		tasklet_schedule(&rxq->rx_task); /* hinic_rx_alloc_pkts */
+		tasklet_schedule(&rxq->rx_task); /* rx_alloc_pkts */
 
 	u64_stats_update_begin(&rxq->rxq_stats.syncp);
 	rxq->rxq_stats.pkts += pkts;
@@ -417,6 +413,8 @@ static int rx_request_irq(struct hinic_rxq *rxq)
 	struct hinic_dev *nic_dev = netdev_priv(rxq->netdev);
 	struct hinic_hwdev *hwdev = nic_dev->hwdev;
 	struct hinic_rq *rq = rxq->rq;
+	struct hinic_qp *qp;
+	struct cpumask mask;
 	int err;
 
 	rx_add_napi(rxq);
@@ -432,7 +430,9 @@ static int rx_request_irq(struct hinic_rxq *rxq)
 		return err;
 	}
 
-	return 0;
+	qp = container_of(rq, struct hinic_qp, rq);
+	cpumask_set_cpu(qp->q_id % num_online_cpus(), &mask);
+	return irq_set_affinity_hint(rq->irq, &mask);
 }
 
 static void rx_free_irq(struct hinic_rxq *rxq)
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
index abe3e38cd342..9128858479c4 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
@@ -212,10 +212,19 @@ netdev_tx_t hinic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 
 	sq_wqe = hinic_sq_get_wqe(txq->sq, wqe_size, &prod_idx);
 	if (!sq_wqe) {
-		tx_unmap_skb(nic_dev, skb, txq->sges);
-
 		netif_stop_subqueue(netdev, qp->q_id);
 
+		/* Check for the case free_tx_poll is called in another cpu
+		 * and we stopped the subqueue after free_tx_poll check.
+		 */
+		sq_wqe = hinic_sq_get_wqe(txq->sq, wqe_size, &prod_idx);
+		if (sq_wqe) {
+			netif_wake_subqueue(nic_dev->netdev, qp->q_id);
+			goto process_sq_wqe;
+		}
+
+		tx_unmap_skb(nic_dev, skb, txq->sges);
+
 		u64_stats_update_begin(&txq->txq_stats.syncp);
 		txq->txq_stats.tx_busy++;
 		u64_stats_update_end(&txq->txq_stats.syncp);
@@ -223,6 +232,7 @@ netdev_tx_t hinic_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 		goto flush_skbs;
 	}
 
+process_sq_wqe:
 	hinic_sq_prepare_wqe(txq->sq, prod_idx, sq_wqe, txq->sges, nr_sges);
 
 	hinic_sq_write_wqe(txq->sq, prod_idx, sq_wqe, skb, wqe_size);
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index cb8182f4fdfa..4bc14a901571 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -927,6 +927,7 @@ static int ibmvnic_open(struct net_device *netdev)
 	}
 
 	rc = __ibmvnic_open(netdev);
+	netif_carrier_on(netdev);
 	mutex_unlock(&adapter->reset_lock);
 
 	return rc;
@@ -3899,6 +3900,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	if (rc)
 		goto ibmvnic_init_fail;
 
+	netif_carrier_off(netdev);
 	rc = register_netdev(netdev);
 	if (rc) {
 		dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc);
diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c
index 4d10270ddf8f..184f11242f56 100644
--- a/drivers/net/ethernet/intel/e100.c
+++ b/drivers/net/ethernet/intel/e100.c
@@ -1910,11 +1910,10 @@ static int e100_alloc_cbs(struct nic *nic)
 	nic->cb_to_use = nic->cb_to_send = nic->cb_to_clean = NULL;
 	nic->cbs_avail = 0;
 
-	nic->cbs = pci_pool_alloc(nic->cbs_pool, GFP_KERNEL,
-				  &nic->cbs_dma_addr);
+	nic->cbs = pci_pool_zalloc(nic->cbs_pool, GFP_KERNEL,
+				   &nic->cbs_dma_addr);
 	if (!nic->cbs)
 		return -ENOMEM;
-	memset(nic->cbs, 0, count * sizeof(struct cb));
 
 	for (cb = nic->cbs, i = 0; i < count; cb++, i++) {
 		cb->next = (i + 1 < count) ? cb + 1 : nic->cbs;
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 327dfe5bedc0..8436c5f2c3e8 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -7252,13 +7252,10 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_eeprom;
 	}
 
-	init_timer(&adapter->watchdog_timer);
-	adapter->watchdog_timer.function = e1000_watchdog;
-	adapter->watchdog_timer.data = (unsigned long)adapter;
-
-	init_timer(&adapter->phy_info_timer);
-	adapter->phy_info_timer.function = e1000_update_phy_info;
-	adapter->phy_info_timer.data = (unsigned long)adapter;
+	setup_timer(&adapter->watchdog_timer, e1000_watchdog,
+		    (unsigned long)adapter);
+	setup_timer(&adapter->phy_info_timer, e1000_update_phy_info,
+		    (unsigned long)adapter);
 
 	INIT_WORK(&adapter->reset_task, e1000_reset_task);
 	INIT_WORK(&adapter->watchdog_task, e1000_watchdog_task);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index 689c413b7782..46973fb234c5 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -248,6 +248,29 @@ struct fm10k_udp_port {
 	__be16			port;
 };
 
+enum fm10k_macvlan_request_type {
+	FM10K_UC_MAC_REQUEST,
+	FM10K_MC_MAC_REQUEST,
+	FM10K_VLAN_REQUEST
+};
+
+struct fm10k_macvlan_request {
+	enum fm10k_macvlan_request_type type;
+	struct list_head list;
+	union {
+		struct fm10k_mac_request {
+			u8 addr[ETH_ALEN];
+			u16 glort;
+			u16 vid;
+		} mac;
+		struct fm10k_vlan_request {
+			u32 vid;
+			u8 vsi;
+		} vlan;
+	};
+	bool set;
+};
+
 /* one work queue for entire driver */
 extern struct workqueue_struct *fm10k_workqueue;
 
@@ -270,11 +293,15 @@ enum fm10k_flags_t {
 
 enum fm10k_state_t {
 	__FM10K_RESETTING,
+	__FM10K_RESET_DETACHED,
+	__FM10K_RESET_SUSPENDED,
 	__FM10K_DOWN,
 	__FM10K_SERVICE_SCHED,
 	__FM10K_SERVICE_REQUEST,
 	__FM10K_SERVICE_DISABLE,
-	__FM10K_MBX_LOCK,
+	__FM10K_MACVLAN_SCHED,
+	__FM10K_MACVLAN_REQUEST,
+	__FM10K_MACVLAN_DISABLE,
 	__FM10K_LINK_DOWN,
 	__FM10K_UPDATING_STATS,
 	/* This value must be last and determines the BITMAP size */
@@ -344,6 +371,8 @@ struct fm10k_intfc {
 
 	struct fm10k_hw_stats stats;
 	struct fm10k_hw hw;
+	/* Mailbox lock */
+	spinlock_t mbx_lock;
 	u32 __iomem *uc_addr;
 	u32 __iomem *sw_addr;
 	u16 msg_enable;
@@ -365,6 +394,12 @@ struct fm10k_intfc {
 	struct list_head vxlan_port;
 	struct list_head geneve_port;
 
+	/* MAC/VLAN update queue */
+	struct list_head macvlan_requests;
+	struct delayed_work macvlan_task;
+	/* MAC/VLAN update queue lock */
+	spinlock_t macvlan_lock;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dbg_intfc;
 #endif /* CONFIG_DEBUG_FS */
@@ -384,23 +419,17 @@ struct fm10k_intfc {
 
 static inline void fm10k_mbx_lock(struct fm10k_intfc *interface)
 {
-	/* busy loop if we cannot obtain the lock as some calls
-	 * such as ndo_set_rx_mode may be made in atomic context
-	 */
-	while (test_and_set_bit(__FM10K_MBX_LOCK, interface->state))
-		udelay(20);
+	spin_lock(&interface->mbx_lock);
 }
 
 static inline void fm10k_mbx_unlock(struct fm10k_intfc *interface)
 {
-	/* flush memory to make sure state is correct */
-	smp_mb__before_atomic();
-	clear_bit(__FM10K_MBX_LOCK, interface->state);
+	spin_unlock(&interface->mbx_lock);
 }
 
 static inline int fm10k_mbx_trylock(struct fm10k_intfc *interface)
 {
-	return !test_and_set_bit(__FM10K_MBX_LOCK, interface->state);
+	return spin_trylock(&interface->mbx_lock);
 }
 
 /* fm10k_test_staterr - test bits in Rx descriptor status and error fields */
@@ -490,6 +519,7 @@ void fm10k_up(struct fm10k_intfc *interface);
 void fm10k_down(struct fm10k_intfc *interface);
 void fm10k_update_stats(struct fm10k_intfc *interface);
 void fm10k_service_event_schedule(struct fm10k_intfc *interface);
+void fm10k_macvlan_schedule(struct fm10k_intfc *interface);
 void fm10k_update_rx_drop_en(struct fm10k_intfc *interface);
 #ifdef CONFIG_NET_POLL_CONTROLLER
 void fm10k_netpoll(struct net_device *netdev);
@@ -510,6 +540,12 @@ void fm10k_reset_rx_state(struct fm10k_intfc *);
 int fm10k_setup_tc(struct net_device *dev, u8 tc);
 int fm10k_open(struct net_device *netdev);
 int fm10k_close(struct net_device *netdev);
+int fm10k_queue_vlan_request(struct fm10k_intfc *interface, u32 vid,
+			     u8 vsi, bool set);
+int fm10k_queue_mac_request(struct fm10k_intfc *interface, u16 glort,
+			    const unsigned char *addr, u16 vid, bool set);
+void fm10k_clear_macvlan_queue(struct fm10k_intfc *interface,
+			       u16 glort, bool vlans);
 
 /* Ethtool */
 void fm10k_set_ethtool_ops(struct net_device *dev);
@@ -526,8 +562,8 @@ s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid);
 int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac);
 int fm10k_ndo_set_vf_vlan(struct net_device *netdev,
 			  int vf_idx, u16 vid, u8 qos, __be16 vlan_proto);
-int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate,
-			int unused);
+int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx,
+			int __always_unused min_rate, int max_rate);
 int fm10k_ndo_get_vf_config(struct net_device *netdev,
 			    int vf_idx, struct ifla_vf_info *ivi);
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_common.c b/drivers/net/ethernet/intel/fm10k/fm10k_common.c
index 62a6ad9b3eed..736a9f087bc9 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_common.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_common.c
@@ -1,5 +1,5 @@
 /* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
+ * Copyright(c) 2013 - 2017 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -517,8 +517,8 @@ s32 fm10k_get_host_state_generic(struct fm10k_hw *hw, bool *host_ready)
 		goto out;
 	}
 
-	/* verify Mailbox is still valid */
-	if (!mbx->ops.tx_ready(mbx, FM10K_VFMBX_MSG_MTU))
+	/* verify Mailbox is still open */
+	if (mbx->state != FM10K_STATE_OPEN)
 		goto out;
 
 	/* interface cannot receive traffic without logical ports */
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
index 5116fd043630..14df09e2d964 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
@@ -52,9 +52,9 @@ static void fm10k_dbg_desc_seq_stop(struct seq_file __always_unused *s,
 static void fm10k_dbg_desc_break(struct seq_file *s, int i)
 {
 	while (i--)
-		seq_puts(s, "-");
+		seq_putc(s, '-');
 
-	seq_puts(s, "\n");
+	seq_putc(s, '\n');
 }
 
 static int fm10k_dbg_tx_desc_seq_show(struct seq_file *s, void *v)
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
index 5f4dac0d36ef..ea3ab24265ee 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
@@ -1,5 +1,5 @@
 /* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
+ * Copyright(c) 2013 - 2017 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -35,10 +35,133 @@ static s32 fm10k_iov_msg_error(struct fm10k_hw *hw, u32 **results,
 	return fm10k_tlv_msg_error(hw, results, mbx);
 }
 
+/**
+ *  fm10k_iov_msg_queue_mac_vlan - Message handler for MAC/VLAN request from VF
+ *  @hw: Pointer to hardware structure
+ *  @results: Pointer array to message, results[0] is pointer to message
+ *  @mbx: Pointer to mailbox information structure
+ *
+ *  This function is a custom handler for MAC/VLAN requests from the VF. The
+ *  assumption is that it is acceptable to directly hand off the message from
+ *  the VF to the PF's switch manager. However, we use a MAC/VLAN message
+ *  queue to avoid overloading the mailbox when a large number of requests
+ *  come in.
+ **/
+static s32 fm10k_iov_msg_queue_mac_vlan(struct fm10k_hw *hw, u32 **results,
+					struct fm10k_mbx_info *mbx)
+{
+	struct fm10k_vf_info *vf_info = (struct fm10k_vf_info *)mbx;
+	struct fm10k_intfc *interface = hw->back;
+	u8 mac[ETH_ALEN];
+	u32 *result;
+	int err = 0;
+	bool set;
+	u16 vlan;
+	u32 vid;
+
+	/* we shouldn't be updating rules on a disabled interface */
+	if (!FM10K_VF_FLAG_ENABLED(vf_info))
+		err = FM10K_ERR_PARAM;
+
+	if (!err && !!results[FM10K_MAC_VLAN_MSG_VLAN]) {
+		result = results[FM10K_MAC_VLAN_MSG_VLAN];
+
+		/* record VLAN id requested */
+		err = fm10k_tlv_attr_get_u32(result, &vid);
+		if (err)
+			return err;
+
+		set = !(vid & FM10K_VLAN_CLEAR);
+		vid &= ~FM10K_VLAN_CLEAR;
+
+		/* if the length field has been set, this is a multi-bit
+		 * update request. For multi-bit requests, simply disallow
+		 * them when the pf_vid has been set. In this case, the PF
+		 * should have already cleared the VLAN_TABLE, and if we
+		 * allowed them, it could allow a rogue VF to receive traffic
+		 * on a VLAN it was not assigned. In the single-bit case, we
+		 * need to modify requests for VLAN 0 to use the default PF or
+		 * SW vid when assigned.
+		 */
+
+		if (vid >> 16) {
+			/* prevent multi-bit requests when PF has
+			 * administratively set the VLAN for this VF
+			 */
+			if (vf_info->pf_vid)
+				return FM10K_ERR_PARAM;
+		} else {
+			err = fm10k_iov_select_vid(vf_info, (u16)vid);
+			if (err < 0)
+				return err;
+
+			vid = err;
+		}
+
+		/* update VSI info for VF in regards to VLAN table */
+		err = hw->mac.ops.update_vlan(hw, vid, vf_info->vsi, set);
+	}
+
+	if (!err && !!results[FM10K_MAC_VLAN_MSG_MAC]) {
+		result = results[FM10K_MAC_VLAN_MSG_MAC];
+
+		/* record unicast MAC address requested */
+		err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan);
+		if (err)
+			return err;
+
+		/* block attempts to set MAC for a locked device */
+		if (is_valid_ether_addr(vf_info->mac) &&
+		    !ether_addr_equal(mac, vf_info->mac))
+			return FM10K_ERR_PARAM;
+
+		set = !(vlan & FM10K_VLAN_CLEAR);
+		vlan &= ~FM10K_VLAN_CLEAR;
+
+		err = fm10k_iov_select_vid(vf_info, vlan);
+		if (err < 0)
+			return err;
+
+		vlan = (u16)err;
+
+		/* Add this request to the MAC/VLAN queue */
+		err = fm10k_queue_mac_request(interface, vf_info->glort,
+					      mac, vlan, set);
+	}
+
+	if (!err && !!results[FM10K_MAC_VLAN_MSG_MULTICAST]) {
+		result = results[FM10K_MAC_VLAN_MSG_MULTICAST];
+
+		/* record multicast MAC address requested */
+		err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan);
+		if (err)
+			return err;
+
+		/* verify that the VF is allowed to request multicast */
+		if (!(vf_info->vf_flags & FM10K_VF_FLAG_MULTI_ENABLED))
+			return FM10K_ERR_PARAM;
+
+		set = !(vlan & FM10K_VLAN_CLEAR);
+		vlan &= ~FM10K_VLAN_CLEAR;
+
+		err = fm10k_iov_select_vid(vf_info, vlan);
+		if (err < 0)
+			return err;
+
+		vlan = (u16)err;
+
+		/* Add this request to the MAC/VLAN queue */
+		err = fm10k_queue_mac_request(interface, vf_info->glort,
+					      mac, vlan, set);
+	}
+
+	return err;
+}
+
 static const struct fm10k_msg_data iov_mbx_data[] = {
 	FM10K_TLV_MSG_TEST_HANDLER(fm10k_tlv_msg_test),
 	FM10K_VF_MSG_MSIX_HANDLER(fm10k_iov_msg_msix_pf),
-	FM10K_VF_MSG_MAC_VLAN_HANDLER(fm10k_iov_msg_mac_vlan_pf),
+	FM10K_VF_MSG_MAC_VLAN_HANDLER(fm10k_iov_msg_queue_mac_vlan),
 	FM10K_VF_MSG_LPORT_STATE_HANDLER(fm10k_iov_msg_lport_state_pf),
 	FM10K_TLV_MSG_ERROR_HANDLER(fm10k_iov_msg_error),
 };
@@ -66,25 +189,21 @@ s32 fm10k_iov_event(struct fm10k_intfc *interface)
 		goto read_unlock;
 
 	/* read VFLRE to determine if any VFs have been reset */
-	do {
-		vflre = fm10k_read_reg(hw, FM10K_PFVFLRE(0));
-		vflre <<= 32;
-		vflre |= fm10k_read_reg(hw, FM10K_PFVFLRE(1));
-		vflre = (vflre << 32) | (vflre >> 32);
-		vflre |= fm10k_read_reg(hw, FM10K_PFVFLRE(0));
+	vflre = fm10k_read_reg(hw, FM10K_PFVFLRE(1));
+	vflre <<= 32;
+	vflre |= fm10k_read_reg(hw, FM10K_PFVFLRE(0));
 
-		i = iov_data->num_vfs;
+	i = iov_data->num_vfs;
 
-		for (vflre <<= 64 - i; vflre && i--; vflre += vflre) {
-			struct fm10k_vf_info *vf_info = &iov_data->vf_info[i];
+	for (vflre <<= 64 - i; vflre && i--; vflre += vflre) {
+		struct fm10k_vf_info *vf_info = &iov_data->vf_info[i];
 
-			if (vflre >= 0)
-				continue;
+		if (vflre >= 0)
+			continue;
 
-			hw->iov.ops.reset_resources(hw, vf_info);
-			vf_info->mbx.ops.connect(hw, &vf_info->mbx);
-		}
-	} while (i != iov_data->num_vfs);
+		hw->iov.ops.reset_resources(hw, vf_info);
+		vf_info->mbx.ops.connect(hw, &vf_info->mbx);
+	}
 
 read_unlock:
 	rcu_read_unlock();
@@ -126,9 +245,14 @@ process_mbx:
 		struct fm10k_mbx_info *mbx = &vf_info->mbx;
 		u16 glort = vf_info->glort;
 
+		/* process the SM mailbox first to drain outgoing messages */
+		hw->mbx.ops.process(hw, &hw->mbx);
+
 		/* verify port mapping is valid, if not reset port */
-		if (vf_info->vf_flags && !fm10k_glort_valid_pf(hw, glort))
+		if (vf_info->vf_flags && !fm10k_glort_valid_pf(hw, glort)) {
 			hw->iov.ops.reset_lport(hw, vf_info);
+			fm10k_clear_macvlan_queue(interface, glort, false);
+		}
 
 		/* reset VFs that have mailbox timed out */
 		if (!mbx->timeout) {
@@ -140,6 +264,10 @@ process_mbx:
 		if (!hw->mbx.ops.tx_ready(&hw->mbx, FM10K_VFMBX_MSG_MTU)) {
 			/* keep track of how many times this occurs */
 			interface->hw_sm_mbx_full++;
+
+			/* make sure we try again momentarily */
+			fm10k_service_event_schedule(interface);
+
 			break;
 		}
 
@@ -187,6 +315,7 @@ void fm10k_iov_suspend(struct pci_dev *pdev)
 
 		hw->iov.ops.reset_resources(hw, vf_info);
 		hw->iov.ops.reset_lport(hw, vf_info);
+		fm10k_clear_macvlan_queue(interface, vf_info->glort, false);
 	}
 }
 
@@ -411,6 +540,8 @@ static inline void fm10k_reset_vf_info(struct fm10k_intfc *interface,
 	/* disable LPORT for this VF which clears switch rules */
 	hw->iov.ops.reset_lport(hw, vf_info);
 
+	fm10k_clear_macvlan_queue(interface, vf_info->glort, false);
+
 	/* assign new MAC+VLAN for this VF */
 	hw->iov.ops.assign_default_mac_vlan(hw, vf_info);
 
@@ -482,7 +613,7 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid,
 }
 
 int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx,
-			int __always_unused unused, int rate)
+			int __always_unused min_rate, int max_rate)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
 	struct fm10k_iov_data *iov_data = interface->iov_data;
@@ -493,14 +624,15 @@ int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx,
 		return -EINVAL;
 
 	/* rate limit cannot be less than 10Mbs or greater than link speed */
-	if (rate && ((rate < FM10K_VF_TC_MIN) || rate > FM10K_VF_TC_MAX))
+	if (max_rate &&
+	    (max_rate < FM10K_VF_TC_MIN || max_rate > FM10K_VF_TC_MAX))
 		return -EINVAL;
 
 	/* store values */
-	iov_data->vf_info[vf_idx].rate = rate;
+	iov_data->vf_info[vf_idx].rate = max_rate;
 
 	/* update hardware configuration */
-	hw->iov.ops.configure_tc(hw, vf_idx, rate);
+	hw->iov.ops.configure_tc(hw, vf_idx, max_rate);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 9dffaba85ae6..dbd69310f263 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -28,7 +28,7 @@
 
 #include "fm10k.h"
 
-#define DRV_VERSION	"0.21.7-k"
+#define DRV_VERSION	"0.22.1-k"
 #define DRV_SUMMARY	"Intel(R) Ethernet Switch Host Interface Driver"
 const char fm10k_driver_version[] = DRV_VERSION;
 char fm10k_driver_name[] = "fm10k";
@@ -806,9 +806,10 @@ static int fm10k_tso(struct fm10k_ring *tx_ring,
 	tx_desc->mss = cpu_to_le16(skb_shinfo(skb)->gso_size);
 
 	return 1;
+
 err_vxlan:
 	tx_ring->netdev->features &= ~NETIF_F_GSO_UDP_TUNNEL;
-	if (!net_ratelimit())
+	if (net_ratelimit())
 		netdev_err(tx_ring->netdev,
 			   "TSO requested for unsupported tunnel, disabling offload\n");
 	return -1;
@@ -876,6 +877,7 @@ static void fm10k_tx_csum(struct fm10k_ring *tx_ring,
 	case IPPROTO_GRE:
 		if (skb->encapsulation)
 			break;
+		/* fall through */
 	default:
 		if (unlikely(net_ratelimit())) {
 			dev_warn(tx_ring->dev,
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
index 334088a101c3..244d3ad58ca7 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
@@ -1,5 +1,5 @@
 /* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
+ * Copyright(c) 2013 - 2017 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -1586,7 +1586,7 @@ s32 fm10k_pfvf_mbx_init(struct fm10k_hw *hw, struct fm10k_mbx_info *mbx,
 			mbx->mbmem_reg = FM10K_MBMEM_VF(id, 0);
 			break;
 		}
-		/* fallthough */
+		/* fall through */
 	default:
 		return FM10K_MBX_ERR_NO_MBX;
 	}
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index e69d49d91d67..81e4425f0529 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -643,9 +643,13 @@ int fm10k_close(struct net_device *netdev)
 static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev)
 {
 	struct fm10k_intfc *interface = netdev_priv(dev);
+	int num_tx_queues = READ_ONCE(interface->num_tx_queues);
 	unsigned int r_idx = skb->queue_mapping;
 	int err;
 
+	if (!num_tx_queues)
+		return NETDEV_TX_BUSY;
+
 	if ((skb->protocol == htons(ETH_P_8021Q)) &&
 	    !skb_vlan_tag_present(skb)) {
 		/* FM10K only supports hardware tagging, any tags in frame
@@ -698,8 +702,8 @@ static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev)
 		__skb_put(skb, pad_len);
 	}
 
-	if (r_idx >= interface->num_tx_queues)
-		r_idx %= interface->num_tx_queues;
+	if (r_idx >= num_tx_queues)
+		r_idx %= num_tx_queues;
 
 	err = fm10k_xmit_frame_ring(skb, interface->tx_ring[r_idx]);
 
@@ -754,11 +758,132 @@ static bool fm10k_host_mbx_ready(struct fm10k_intfc *interface)
 	return (hw->mac.type == fm10k_mac_vf || interface->host_ready);
 }
 
+/**
+ * fm10k_queue_vlan_request - Queue a VLAN update request
+ * @interface: the fm10k interface structure
+ * @vid: the VLAN vid
+ * @vsi: VSI index number
+ * @set: whether to set or clear
+ *
+ * This function queues up a VLAN update. For VFs, this must be sent to the
+ * managing PF over the mailbox. For PFs, we'll use the same handling so that
+ * it's similar to the VF. This avoids storming the PF<->VF mailbox with too
+ * many VLAN updates during reset.
+ */
+int fm10k_queue_vlan_request(struct fm10k_intfc *interface,
+			     u32 vid, u8 vsi, bool set)
+{
+	struct fm10k_macvlan_request *request;
+	unsigned long flags;
+
+	/* This must be atomic since we may be called while the netdev
+	 * addr_list_lock is held
+	 */
+	request = kzalloc(sizeof(*request), GFP_ATOMIC);
+	if (!request)
+		return -ENOMEM;
+
+	request->type = FM10K_VLAN_REQUEST;
+	request->vlan.vid = vid;
+	request->vlan.vsi = vsi;
+	request->set = set;
+
+	spin_lock_irqsave(&interface->macvlan_lock, flags);
+	list_add_tail(&request->list, &interface->macvlan_requests);
+	spin_unlock_irqrestore(&interface->macvlan_lock, flags);
+
+	fm10k_macvlan_schedule(interface);
+
+	return 0;
+}
+
+/**
+ * fm10k_queue_mac_request - Queue a MAC update request
+ * @interface: the fm10k interface structure
+ * @glort: the target glort for this update
+ * @addr: the address to update
+ * @vid: the vid to update
+ * @sync: whether to add or remove
+ *
+ * This function queues up a MAC request for sending to the switch manager.
+ * A separate thread monitors the queue and sends updates to the switch
+ * manager. Return 0 on success, and negative error code on failure.
+ **/
+int fm10k_queue_mac_request(struct fm10k_intfc *interface, u16 glort,
+			    const unsigned char *addr, u16 vid, bool set)
+{
+	struct fm10k_macvlan_request *request;
+	unsigned long flags;
+
+	/* This must be atomic since we may be called while the netdev
+	 * addr_list_lock is held
+	 */
+	request = kzalloc(sizeof(*request), GFP_ATOMIC);
+	if (!request)
+		return -ENOMEM;
+
+	if (is_multicast_ether_addr(addr))
+		request->type = FM10K_MC_MAC_REQUEST;
+	else
+		request->type = FM10K_UC_MAC_REQUEST;
+
+	ether_addr_copy(request->mac.addr, addr);
+	request->mac.glort = glort;
+	request->mac.vid = vid;
+	request->set = set;
+
+	spin_lock_irqsave(&interface->macvlan_lock, flags);
+	list_add_tail(&request->list, &interface->macvlan_requests);
+	spin_unlock_irqrestore(&interface->macvlan_lock, flags);
+
+	fm10k_macvlan_schedule(interface);
+
+	return 0;
+}
+
+/**
+ * fm10k_clear_macvlan_queue - Cancel pending updates for a given glort
+ * @interface: the fm10k interface structure
+ * @glort: the target glort to clear
+ * @vlans: true to clear VLAN messages, false to ignore them
+ *
+ * Cancel any outstanding MAC/VLAN requests for a given glort. This is
+ * expected to be called when a logical port goes down.
+ **/
+void fm10k_clear_macvlan_queue(struct fm10k_intfc *interface,
+			       u16 glort, bool vlans)
+
+{
+	struct fm10k_macvlan_request *r, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&interface->macvlan_lock, flags);
+
+	/* Free any outstanding MAC/VLAN requests for this interface */
+	list_for_each_entry_safe(r, tmp, &interface->macvlan_requests, list) {
+		switch (r->type) {
+		case FM10K_MC_MAC_REQUEST:
+		case FM10K_UC_MAC_REQUEST:
+			/* Don't free requests for other interfaces */
+			if (r->mac.glort != glort)
+				break;
+			/* fall through */
+		case FM10K_VLAN_REQUEST:
+			if (vlans) {
+				list_del(&r->list);
+				kfree(r);
+			}
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&interface->macvlan_lock, flags);
+}
+
 static int fm10k_uc_vlan_unsync(struct net_device *netdev,
 				const unsigned char *uc_addr)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
-	struct fm10k_hw *hw = &interface->hw;
 	u16 glort = interface->glort;
 	u16 vid = interface->vid;
 	bool set = !!(vid / VLAN_N_VID);
@@ -767,10 +892,7 @@ static int fm10k_uc_vlan_unsync(struct net_device *netdev,
 	/* drop any leading bits on the VLAN ID */
 	vid &= VLAN_N_VID - 1;
 
-	if (fm10k_host_mbx_ready(interface))
-		err = hw->mac.ops.update_uc_addr(hw, glort, uc_addr,
-						 vid, set, 0);
-
+	err = fm10k_queue_mac_request(interface, glort, uc_addr, vid, set);
 	if (err)
 		return err;
 
@@ -782,7 +904,6 @@ static int fm10k_mc_vlan_unsync(struct net_device *netdev,
 				const unsigned char *mc_addr)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
-	struct fm10k_hw *hw = &interface->hw;
 	u16 glort = interface->glort;
 	u16 vid = interface->vid;
 	bool set = !!(vid / VLAN_N_VID);
@@ -791,9 +912,7 @@ static int fm10k_mc_vlan_unsync(struct net_device *netdev,
 	/* drop any leading bits on the VLAN ID */
 	vid &= VLAN_N_VID - 1;
 
-	if (fm10k_host_mbx_ready(interface))
-		err = hw->mac.ops.update_mc_addr(hw, glort, mc_addr, vid, set);
-
+	err = fm10k_queue_mac_request(interface, glort, mc_addr, vid, set);
 	if (err)
 		return err;
 
@@ -851,18 +970,14 @@ static int fm10k_update_vid(struct net_device *netdev, u16 vid, bool set)
 
 	/* only need to update the VLAN if not in promiscuous mode */
 	if (!(netdev->flags & IFF_PROMISC)) {
-		err = hw->mac.ops.update_vlan(hw, vid, 0, set);
+		err = fm10k_queue_vlan_request(interface, vid, 0, set);
 		if (err)
 			goto err_out;
 	}
 
-	/* update our base MAC address if host's mailbox is ready */
-	if (fm10k_host_mbx_ready(interface))
-		err = hw->mac.ops.update_uc_addr(hw, interface->glort,
-						 hw->mac.addr, vid, set, 0);
-	else
-		err = -EHOSTDOWN;
-
+	/* Update our base MAC address */
+	err = fm10k_queue_mac_request(interface, interface->glort,
+				      hw->mac.addr, vid, set);
 	if (err)
 		goto err_out;
 
@@ -906,7 +1021,6 @@ static u16 fm10k_find_next_vlan(struct fm10k_intfc *interface, u16 vid)
 
 static void fm10k_clear_unused_vlans(struct fm10k_intfc *interface)
 {
-	struct fm10k_hw *hw = &interface->hw;
 	u32 vid, prev_vid;
 
 	/* loop through and find any gaps in the table */
@@ -918,7 +1032,7 @@ static void fm10k_clear_unused_vlans(struct fm10k_intfc *interface)
 
 		/* send request to clear multiple bits at a time */
 		prev_vid += (vid - prev_vid - 1) << FM10K_VLAN_LENGTH_SHIFT;
-		hw->mac.ops.update_vlan(hw, prev_vid, 0, false);
+		fm10k_queue_vlan_request(interface, prev_vid, 0, false);
 	}
 }
 
@@ -933,15 +1047,11 @@ static int __fm10k_uc_sync(struct net_device *dev,
 	if (!is_valid_ether_addr(addr))
 		return -EADDRNOTAVAIL;
 
-	/* update table with current entries if host's mailbox is ready */
-	if (!fm10k_host_mbx_ready(interface))
-		return -EHOSTDOWN;
-
 	for (vid = hw->mac.default_vid ? fm10k_find_next_vlan(interface, 0) : 1;
 	     vid < VLAN_N_VID;
 	     vid = fm10k_find_next_vlan(interface, vid)) {
-		err = hw->mac.ops.update_uc_addr(hw, glort, addr,
-						 vid, sync, 0);
+		err = fm10k_queue_mac_request(interface, glort,
+					      addr, vid, sync);
 		if (err)
 			return err;
 	}
@@ -998,15 +1108,18 @@ static int __fm10k_mc_sync(struct net_device *dev,
 	struct fm10k_intfc *interface = netdev_priv(dev);
 	struct fm10k_hw *hw = &interface->hw;
 	u16 vid, glort = interface->glort;
+	s32 err;
 
-	/* update table with current entries if host's mailbox is ready */
-	if (!fm10k_host_mbx_ready(interface))
-		return 0;
+	if (!is_multicast_ether_addr(addr))
+		return -EADDRNOTAVAIL;
 
 	for (vid = hw->mac.default_vid ? fm10k_find_next_vlan(interface, 0) : 1;
 	     vid < VLAN_N_VID;
 	     vid = fm10k_find_next_vlan(interface, vid)) {
-		hw->mac.ops.update_mc_addr(hw, glort, addr, vid, sync);
+		err = fm10k_queue_mac_request(interface, glort,
+					      addr, vid, sync);
+		if (err)
+			return err;
 	}
 
 	return 0;
@@ -1046,7 +1159,8 @@ static void fm10k_set_rx_mode(struct net_device *dev)
 	if (interface->xcast_mode != xcast_mode) {
 		/* update VLAN table */
 		if (xcast_mode == FM10K_XCAST_MODE_PROMISC)
-			hw->mac.ops.update_vlan(hw, FM10K_VLAN_ALL, 0, true);
+			fm10k_queue_vlan_request(interface, FM10K_VLAN_ALL,
+						 0, true);
 		if (interface->xcast_mode == FM10K_XCAST_MODE_PROMISC)
 			fm10k_clear_unused_vlans(interface);
 
@@ -1094,22 +1208,20 @@ void fm10k_restore_rx_state(struct fm10k_intfc *interface)
 					       interface->glort_count, true);
 
 	/* update VLAN table */
-	hw->mac.ops.update_vlan(hw, FM10K_VLAN_ALL, 0,
-				xcast_mode == FM10K_XCAST_MODE_PROMISC);
+	fm10k_queue_vlan_request(interface, FM10K_VLAN_ALL, 0,
+				 xcast_mode == FM10K_XCAST_MODE_PROMISC);
 
 	/* Add filter for VLAN 0 */
-	hw->mac.ops.update_vlan(hw, 0, 0, true);
+	fm10k_queue_vlan_request(interface, 0, 0, true);
 
 	/* update table with current entries */
 	for (vid = hw->mac.default_vid ? fm10k_find_next_vlan(interface, 0) : 1;
 	     vid < VLAN_N_VID;
 	     vid = fm10k_find_next_vlan(interface, vid)) {
-		hw->mac.ops.update_vlan(hw, vid, 0, true);
+		fm10k_queue_vlan_request(interface, vid, 0, true);
 
-		/* Update unicast entries if host's mailbox is ready */
-		if (fm10k_host_mbx_ready(interface))
-			hw->mac.ops.update_uc_addr(hw, glort, hw->mac.addr,
-						   vid, true, 0);
+		fm10k_queue_mac_request(interface, glort,
+					hw->mac.addr, vid, true);
 	}
 
 	/* update xcast mode before synchronizing addresses if host's mailbox
@@ -1136,6 +1248,13 @@ void fm10k_reset_rx_state(struct fm10k_intfc *interface)
 	struct net_device *netdev = interface->netdev;
 	struct fm10k_hw *hw = &interface->hw;
 
+	/* Wait for MAC/VLAN work to finish */
+	while (test_bit(__FM10K_MACVLAN_SCHED, interface->state))
+		usleep_range(1000, 2000);
+
+	/* Cancel pending MAC/VLAN requests */
+	fm10k_clear_macvlan_queue(interface, interface->glort, true);
+
 	fm10k_mbx_lock(interface);
 
 	/* clear the logical port state on lower device if host's mailbox is
@@ -1370,8 +1489,8 @@ static void *fm10k_dfwd_add_station(struct net_device *dev,
 	if (fm10k_host_mbx_ready(interface)) {
 		hw->mac.ops.update_xcast_mode(hw, glort,
 					      FM10K_XCAST_MODE_MULTI);
-		hw->mac.ops.update_uc_addr(hw, glort, sdev->dev_addr,
-					   0, true, 0);
+		fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
+					0, true);
 	}
 
 	fm10k_mbx_unlock(interface);
@@ -1410,8 +1529,8 @@ static void fm10k_dfwd_del_station(struct net_device *dev, void *priv)
 	if (fm10k_host_mbx_ready(interface)) {
 		hw->mac.ops.update_xcast_mode(hw, glort,
 					      FM10K_XCAST_MODE_NONE);
-		hw->mac.ops.update_uc_addr(hw, glort, sdev->dev_addr,
-					   0, false, 0);
+		fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
+					0, false);
 	}
 
 	fm10k_mbx_unlock(interface);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index 63784576ae8b..1e9ae3197b17 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -91,6 +91,76 @@ static int fm10k_hw_ready(struct fm10k_intfc *interface)
 	return FM10K_REMOVED(hw->hw_addr) ? -ENODEV : 0;
 }
 
+/**
+ * fm10k_macvlan_schedule - Schedule MAC/VLAN queue task
+ * @interface: fm10k private interface structure
+ *
+ * Schedule the MAC/VLAN queue monitor task. If the MAC/VLAN task cannot be
+ * started immediately, request that it be restarted when possible.
+ */
+void fm10k_macvlan_schedule(struct fm10k_intfc *interface)
+{
+	/* Avoid processing the MAC/VLAN queue when the service task is
+	 * disabled, or when we're resetting the device.
+	 */
+	if (!test_bit(__FM10K_MACVLAN_DISABLE, interface->state) &&
+	    !test_and_set_bit(__FM10K_MACVLAN_SCHED, interface->state)) {
+		clear_bit(__FM10K_MACVLAN_REQUEST, interface->state);
+		/* We delay the actual start of execution in order to allow
+		 * multiple MAC/VLAN updates to accumulate before handling
+		 * them, and to allow some time to let the mailbox drain
+		 * between runs.
+		 */
+		queue_delayed_work(fm10k_workqueue,
+				   &interface->macvlan_task, 10);
+	} else {
+		set_bit(__FM10K_MACVLAN_REQUEST, interface->state);
+	}
+}
+
+/**
+ * fm10k_stop_macvlan_task - Stop the MAC/VLAN queue monitor
+ * @interface: fm10k private interface structure
+ *
+ * Wait until the MAC/VLAN queue task has stopped, and cancel any future
+ * requests.
+ */
+static void fm10k_stop_macvlan_task(struct fm10k_intfc *interface)
+{
+	/* Disable the MAC/VLAN work item */
+	set_bit(__FM10K_MACVLAN_DISABLE, interface->state);
+
+	/* Make sure we waited until any current invocations have stopped */
+	cancel_delayed_work_sync(&interface->macvlan_task);
+
+	/* We set the __FM10K_MACVLAN_SCHED bit when we schedule the task.
+	 * However, it may not be unset of the MAC/VLAN task never actually
+	 * got a chance to run. Since we've canceled the task here, and it
+	 * cannot be rescheuled right now, we need to ensure the scheduled bit
+	 * gets unset.
+	 */
+	clear_bit(__FM10K_MACVLAN_SCHED, interface->state);
+}
+
+/**
+ * fm10k_resume_macvlan_task - Restart the MAC/VLAN queue monitor
+ * @interface: fm10k private interface structure
+ *
+ * Clear the __FM10K_MACVLAN_DISABLE bit and, if a request occurred, schedule
+ * the MAC/VLAN work monitor.
+ */
+static void fm10k_resume_macvlan_task(struct fm10k_intfc *interface)
+{
+	/* Re-enable the MAC/VLAN work item */
+	clear_bit(__FM10K_MACVLAN_DISABLE, interface->state);
+
+	/* We might have received a MAC/VLAN request while disabled. If so,
+	 * kick off the queue now.
+	 */
+	if (test_bit(__FM10K_MACVLAN_REQUEST, interface->state))
+		fm10k_macvlan_schedule(interface);
+}
+
 void fm10k_service_event_schedule(struct fm10k_intfc *interface)
 {
 	if (!test_bit(__FM10K_SERVICE_DISABLE, interface->state) &&
@@ -118,6 +188,27 @@ static void fm10k_service_event_complete(struct fm10k_intfc *interface)
 		fm10k_service_event_schedule(interface);
 }
 
+static void fm10k_stop_service_event(struct fm10k_intfc *interface)
+{
+	set_bit(__FM10K_SERVICE_DISABLE, interface->state);
+	cancel_work_sync(&interface->service_task);
+
+	/* It's possible that cancel_work_sync stopped the service task from
+	 * running before it could actually start. In this case the
+	 * __FM10K_SERVICE_SCHED bit will never be cleared. Since we know that
+	 * the service task cannot be running at this point, we need to clear
+	 * the scheduled bit, as otherwise the service task may never be
+	 * restarted.
+	 */
+	clear_bit(__FM10K_SERVICE_SCHED, interface->state);
+}
+
+static void fm10k_start_service_event(struct fm10k_intfc *interface)
+{
+	clear_bit(__FM10K_SERVICE_DISABLE, interface->state);
+	fm10k_service_event_schedule(interface);
+}
+
 /**
  * fm10k_service_timer - Timer Call-back
  * @data: pointer to interface cast into an unsigned long
@@ -132,36 +223,15 @@ static void fm10k_service_timer(unsigned long data)
 	fm10k_service_event_schedule(interface);
 }
 
-static void fm10k_detach_subtask(struct fm10k_intfc *interface)
-{
-	struct net_device *netdev = interface->netdev;
-	u32 __iomem *hw_addr;
-	u32 value;
-
-	/* do nothing if device is still present or hw_addr is set */
-	if (netif_device_present(netdev) || interface->hw.hw_addr)
-		return;
-
-	/* check the real address space to see if we've recovered */
-	hw_addr = READ_ONCE(interface->uc_addr);
-	value = readl(hw_addr);
-	if (~value) {
-		interface->hw.hw_addr = interface->uc_addr;
-		netif_device_attach(netdev);
-		set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags);
-		netdev_warn(netdev, "PCIe link restored, device now attached\n");
-		return;
-	}
-
-	rtnl_lock();
-
-	if (netif_running(netdev))
-		dev_close(netdev);
-
-	rtnl_unlock();
-}
-
-static void fm10k_prepare_for_reset(struct fm10k_intfc *interface)
+/**
+ * fm10k_prepare_for_reset - Prepare the driver and device for a pending reset
+ * @interface: fm10k private data structure
+ *
+ * This function prepares for a device reset by shutting as much down as we
+ * can. It does nothing and returns false if __FM10K_RESETTING was already set
+ * prior to calling this function. It returns true if it actually did work.
+ */
+static bool fm10k_prepare_for_reset(struct fm10k_intfc *interface)
 {
 	struct net_device *netdev = interface->netdev;
 
@@ -170,8 +240,15 @@ static void fm10k_prepare_for_reset(struct fm10k_intfc *interface)
 	/* put off any impending NetWatchDogTimeout */
 	netif_trans_update(netdev);
 
-	while (test_and_set_bit(__FM10K_RESETTING, interface->state))
-		usleep_range(1000, 2000);
+	/* Nothing to do if a reset is already in progress */
+	if (test_and_set_bit(__FM10K_RESETTING, interface->state))
+		return false;
+
+	/* As the MAC/VLAN task will be accessing registers it must not be
+	 * running while we reset. Although the task will not be scheduled
+	 * once we start resetting it may already be running
+	 */
+	fm10k_stop_macvlan_task(interface);
 
 	rtnl_lock();
 
@@ -189,6 +266,8 @@ static void fm10k_prepare_for_reset(struct fm10k_intfc *interface)
 	interface->last_reset = jiffies + (10 * HZ);
 
 	rtnl_unlock();
+
+	return true;
 }
 
 static int fm10k_handle_reset(struct fm10k_intfc *interface)
@@ -197,6 +276,8 @@ static int fm10k_handle_reset(struct fm10k_intfc *interface)
 	struct fm10k_hw *hw = &interface->hw;
 	int err;
 
+	WARN_ON(!test_bit(__FM10K_RESETTING, interface->state));
+
 	rtnl_lock();
 
 	pci_set_master(interface->pdev);
@@ -253,6 +334,8 @@ static int fm10k_handle_reset(struct fm10k_intfc *interface)
 
 	rtnl_unlock();
 
+	fm10k_resume_macvlan_task(interface);
+
 	clear_bit(__FM10K_RESETTING, interface->state);
 
 	return err;
@@ -270,27 +353,80 @@ reinit_err:
 	return err;
 }
 
-static void fm10k_reinit(struct fm10k_intfc *interface)
+static void fm10k_detach_subtask(struct fm10k_intfc *interface)
 {
+	struct net_device *netdev = interface->netdev;
+	u32 __iomem *hw_addr;
+	u32 value;
 	int err;
 
-	fm10k_prepare_for_reset(interface);
+	/* do nothing if netdev is still present or hw_addr is set */
+	if (netif_device_present(netdev) || interface->hw.hw_addr)
+		return;
 
-	err = fm10k_handle_reset(interface);
-	if (err)
-		dev_err(&interface->pdev->dev,
-			"fm10k_handle_reset failed: %d\n", err);
+	/* We've lost the PCIe register space, and can no longer access the
+	 * device. Shut everything except the detach subtask down and prepare
+	 * to reset the device in case we recover. If we actually prepare for
+	 * reset, indicate that we're detached.
+	 */
+	if (fm10k_prepare_for_reset(interface))
+		set_bit(__FM10K_RESET_DETACHED, interface->state);
+
+	/* check the real address space to see if we've recovered */
+	hw_addr = READ_ONCE(interface->uc_addr);
+	value = readl(hw_addr);
+	if (~value) {
+		/* Make sure the reset was initiated because we detached,
+		 * otherwise we might race with a different reset flow.
+		 */
+		if (!test_and_clear_bit(__FM10K_RESET_DETACHED,
+					interface->state))
+			return;
+
+		/* Restore the hardware address */
+		interface->hw.hw_addr = interface->uc_addr;
+
+		/* PCIe link has been restored, and the device is active
+		 * again. Restore everything and reset the device.
+		 */
+		err = fm10k_handle_reset(interface);
+		if (err) {
+			netdev_err(netdev, "Unable to reset device: %d\n", err);
+			interface->hw.hw_addr = NULL;
+			return;
+		}
+
+		/* Re-attach the netdev */
+		netif_device_attach(netdev);
+		netdev_warn(netdev, "PCIe link restored, device now attached\n");
+		return;
+	}
 }
 
 static void fm10k_reset_subtask(struct fm10k_intfc *interface)
 {
+	int err;
+
 	if (!test_and_clear_bit(FM10K_FLAG_RESET_REQUESTED,
 				interface->flags))
 		return;
 
+	/* If another thread has already prepared to reset the device, we
+	 * should not attempt to handle a reset here, since we'd race with
+	 * that thread. This may happen if we suspend the device or if the
+	 * PCIe link is lost. In this case, we'll just ignore the RESET
+	 * request, as it will (eventually) be taken care of when the thread
+	 * which actually started the reset is finished.
+	 */
+	if (!fm10k_prepare_for_reset(interface))
+		return;
+
 	netdev_err(interface->netdev, "Reset interface\n");
 
-	fm10k_reinit(interface);
+	err = fm10k_handle_reset(interface);
+	if (err)
+		dev_err(&interface->pdev->dev,
+			"fm10k_handle_reset failed: %d\n", err);
 }
 
 /**
@@ -360,6 +496,10 @@ static void fm10k_watchdog_update_host_state(struct fm10k_intfc *interface)
  **/
 static void fm10k_mbx_subtask(struct fm10k_intfc *interface)
 {
+	/* If we're resetting, bail out */
+	if (test_bit(__FM10K_RESETTING, interface->state))
+		return;
+
 	/* process upstream mailbox and update device state */
 	fm10k_watchdog_update_host_state(interface);
 
@@ -609,9 +749,11 @@ static void fm10k_service_task(struct work_struct *work)
 
 	interface = container_of(work, struct fm10k_intfc, service_task);
 
+	/* Check whether we're detached first */
+	fm10k_detach_subtask(interface);
+
 	/* tasks run even when interface is down */
 	fm10k_mbx_subtask(interface);
-	fm10k_detach_subtask(interface);
 	fm10k_reset_subtask(interface);
 
 	/* tasks only run when interface is up */
@@ -623,6 +765,112 @@ static void fm10k_service_task(struct work_struct *work)
 }
 
 /**
+ * fm10k_macvlan_task - send queued MAC/VLAN requests to switch manager
+ * @work: pointer to work_struct containing our data
+ *
+ * This work item handles sending MAC/VLAN updates to the switch manager. When
+ * the interface is up, it will attempt to queue mailbox messages to the
+ * switch manager requesting updates for MAC/VLAN pairs. If the Tx fifo of the
+ * mailbox is full, it will reschedule itself to try again in a short while.
+ * This ensures that the driver does not overload the switch mailbox with too
+ * many simultaneous requests, causing an unnecessary reset.
+ **/
+static void fm10k_macvlan_task(struct work_struct *work)
+{
+	struct fm10k_macvlan_request *item;
+	struct fm10k_intfc *interface;
+	struct delayed_work *dwork;
+	struct list_head *requests;
+	struct fm10k_hw *hw;
+	unsigned long flags;
+
+	dwork = to_delayed_work(work);
+	interface = container_of(dwork, struct fm10k_intfc, macvlan_task);
+	hw = &interface->hw;
+	requests = &interface->macvlan_requests;
+
+	do {
+		/* Pop the first item off the list */
+		spin_lock_irqsave(&interface->macvlan_lock, flags);
+		item = list_first_entry_or_null(requests,
+						struct fm10k_macvlan_request,
+						list);
+		if (item)
+			list_del_init(&item->list);
+
+		spin_unlock_irqrestore(&interface->macvlan_lock, flags);
+
+		/* We have no more items to process */
+		if (!item)
+			goto done;
+
+		fm10k_mbx_lock(interface);
+
+		/* Check that we have plenty of space to send the message. We
+		 * want to ensure that the mailbox stays low enough to avoid a
+		 * change in the host state, otherwise we may see spurious
+		 * link up / link down notifications.
+		 */
+		if (!hw->mbx.ops.tx_ready(&hw->mbx, FM10K_VFMBX_MSG_MTU + 5)) {
+			hw->mbx.ops.process(hw, &hw->mbx);
+			set_bit(__FM10K_MACVLAN_REQUEST, interface->state);
+			fm10k_mbx_unlock(interface);
+
+			/* Put the request back on the list */
+			spin_lock_irqsave(&interface->macvlan_lock, flags);
+			list_add(&item->list, requests);
+			spin_unlock_irqrestore(&interface->macvlan_lock, flags);
+			break;
+		}
+
+		switch (item->type) {
+		case FM10K_MC_MAC_REQUEST:
+			hw->mac.ops.update_mc_addr(hw,
+						   item->mac.glort,
+						   item->mac.addr,
+						   item->mac.vid,
+						   item->set);
+			break;
+		case FM10K_UC_MAC_REQUEST:
+			hw->mac.ops.update_uc_addr(hw,
+						   item->mac.glort,
+						   item->mac.addr,
+						   item->mac.vid,
+						   item->set,
+						   0);
+			break;
+		case FM10K_VLAN_REQUEST:
+			hw->mac.ops.update_vlan(hw,
+						item->vlan.vid,
+						item->vlan.vsi,
+						item->set);
+			break;
+		default:
+			break;
+		}
+
+		fm10k_mbx_unlock(interface);
+
+		/* Free the item now that we've sent the update */
+		kfree(item);
+	} while (true);
+
+done:
+	WARN_ON(!test_bit(__FM10K_MACVLAN_SCHED, interface->state));
+
+	/* flush memory to make sure state is correct */
+	smp_mb__before_atomic();
+	clear_bit(__FM10K_MACVLAN_SCHED, interface->state);
+
+	/* If a MAC/VLAN request was scheduled since we started, we should
+	 * re-schedule. However, there is no reason to re-schedule if there is
+	 * no work to do.
+	 */
+	if (test_bit(__FM10K_MACVLAN_REQUEST, interface->state))
+		fm10k_macvlan_schedule(interface);
+}
+
+/**
  * fm10k_configure_tx_ring - Configure Tx ring after Reset
  * @interface: board private structure
  * @ring: structure containing ring specific data
@@ -1544,7 +1792,7 @@ int fm10k_qv_request_irq(struct fm10k_intfc *interface)
 	struct net_device *dev = interface->netdev;
 	struct fm10k_hw *hw = &interface->hw;
 	struct msix_entry *entry;
-	int ri = 0, ti = 0;
+	unsigned int ri = 0, ti = 0;
 	int vector, err;
 
 	entry = &interface->msix_entries[NON_Q_VECTORS(hw)];
@@ -1554,15 +1802,15 @@ int fm10k_qv_request_irq(struct fm10k_intfc *interface)
 
 		/* name the vector */
 		if (q_vector->tx.count && q_vector->rx.count) {
-			snprintf(q_vector->name, sizeof(q_vector->name) - 1,
-				 "%s-TxRx-%d", dev->name, ri++);
+			snprintf(q_vector->name, sizeof(q_vector->name),
+				 "%s-TxRx-%u", dev->name, ri++);
 			ti++;
 		} else if (q_vector->rx.count) {
-			snprintf(q_vector->name, sizeof(q_vector->name) - 1,
-				 "%s-rx-%d", dev->name, ri++);
+			snprintf(q_vector->name, sizeof(q_vector->name),
+				 "%s-rx-%u", dev->name, ri++);
 		} else if (q_vector->tx.count) {
-			snprintf(q_vector->name, sizeof(q_vector->name) - 1,
-				 "%s-tx-%d", dev->name, ti++);
+			snprintf(q_vector->name, sizeof(q_vector->name),
+				 "%s-tx-%u", dev->name, ti++);
 		} else {
 			/* skip this unused q_vector */
 			continue;
@@ -1800,9 +2048,6 @@ static int fm10k_sw_init(struct fm10k_intfc *interface,
 		netdev->vlan_features |= NETIF_F_HIGHDMA;
 	}
 
-	/* delay any future reset requests */
-	interface->last_reset = jiffies + (10 * HZ);
-
 	/* reset and initialize the hardware so it is in a known state */
 	err = hw->mac.ops.reset_hw(hw);
 	if (err) {
@@ -1857,9 +2102,16 @@ static int fm10k_sw_init(struct fm10k_intfc *interface,
 	INIT_LIST_HEAD(&interface->vxlan_port);
 	INIT_LIST_HEAD(&interface->geneve_port);
 
+	/* Initialize the MAC/VLAN queue */
+	INIT_LIST_HEAD(&interface->macvlan_requests);
+
 	netdev_rss_key_fill(rss_key, sizeof(rss_key));
 	memcpy(interface->rssrk, rss_key, sizeof(rss_key));
 
+	/* Initialize the mailbox lock */
+	spin_lock_init(&interface->mbx_lock);
+	spin_lock_init(&interface->macvlan_lock);
+
 	/* Start off interface as being down */
 	set_bit(__FM10K_DOWN, interface->state);
 	set_bit(__FM10K_UPDATING_STATS, interface->state);
@@ -2067,6 +2319,9 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		    (unsigned long)interface);
 	INIT_WORK(&interface->service_task, fm10k_service_task);
 
+	/* Setup the MAC/VLAN queue */
+	INIT_DELAYED_WORK(&interface->macvlan_task, fm10k_macvlan_task);
+
 	/* kick off service timer now, even when interface is down */
 	mod_timer(&interface->service_timer, (HZ * 2) + jiffies);
 
@@ -2079,8 +2334,9 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* enable SR-IOV after registering netdev to enforce PF/VF ordering */
 	fm10k_iov_configure(pdev, 0);
 
-	/* clear the service task disable bit to allow service task to start */
+	/* clear the service task disable bit and kick off service task */
 	clear_bit(__FM10K_SERVICE_DISABLE, interface->state);
+	fm10k_service_event_schedule(interface);
 
 	return 0;
 
@@ -2118,8 +2374,11 @@ static void fm10k_remove(struct pci_dev *pdev)
 
 	del_timer_sync(&interface->service_timer);
 
-	set_bit(__FM10K_SERVICE_DISABLE, interface->state);
-	cancel_work_sync(&interface->service_task);
+	fm10k_stop_service_event(interface);
+	fm10k_stop_macvlan_task(interface);
+
+	/* Remove all pending MAC/VLAN requests */
+	fm10k_clear_macvlan_queue(interface, interface->glort, true);
 
 	/* free netdev, this may bounce the interrupts due to setup_tc */
 	if (netdev->reg_state == NETREG_REGISTERED)
@@ -2156,11 +2415,14 @@ static void fm10k_prepare_suspend(struct fm10k_intfc *interface)
 	 * a surprise remove if the PCIe device is disabled while we're
 	 * stopped. We stop the watchdog task until after we resume software
 	 * activity.
+	 *
+	 * Note that the MAC/VLAN task will be stopped as part of preparing
+	 * for reset so we don't need to handle it here.
 	 */
-	set_bit(__FM10K_SERVICE_DISABLE, interface->state);
-	cancel_work_sync(&interface->service_task);
+	fm10k_stop_service_event(interface);
 
-	fm10k_prepare_for_reset(interface);
+	if (fm10k_prepare_for_reset(interface))
+		set_bit(__FM10K_RESET_SUSPENDED, interface->state);
 }
 
 static int fm10k_handle_resume(struct fm10k_intfc *interface)
@@ -2168,6 +2430,13 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface)
 	struct fm10k_hw *hw = &interface->hw;
 	int err;
 
+	/* Even if we didn't properly prepare for reset in
+	 * fm10k_prepare_suspend, we'll attempt to resume anyways.
+	 */
+	if (!test_and_clear_bit(__FM10K_RESET_SUSPENDED, interface->state))
+		dev_warn(&interface->pdev->dev,
+			 "Device was shut down as part of suspend... Attempting to recover\n");
+
 	/* reset statistics starting values */
 	hw->mac.ops.rebind_hw_stats(hw, &interface->stats);
 
@@ -2185,45 +2454,30 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface)
 	interface->link_down_event = jiffies + (HZ);
 	set_bit(__FM10K_LINK_DOWN, interface->state);
 
-	/* clear the service task disable bit to allow service task to start */
-	clear_bit(__FM10K_SERVICE_DISABLE, interface->state);
-	fm10k_service_event_schedule(interface);
+	/* restart the service task */
+	fm10k_start_service_event(interface);
+
+	/* Restart the MAC/VLAN request queue in-case of outstanding events */
+	fm10k_macvlan_schedule(interface);
 
 	return err;
 }
 
 #ifdef CONFIG_PM
 /**
- * fm10k_resume - Restore device to pre-sleep state
- * @pdev: PCI device information struct
+ * fm10k_resume - Generic PM resume hook
+ * @dev: generic device structure
  *
- * fm10k_resume is called after the system has powered back up from a sleep
- * state and is ready to resume operation.  This function is meant to restore
- * the device back to its pre-sleep state.
+ * Generic PM hook used when waking the device from a low power state after
+ * suspend or hibernation. This function does not need to handle lower PCIe
+ * device state as the stack takes care of that for us.
  **/
-static int fm10k_resume(struct pci_dev *pdev)
+static int fm10k_resume(struct device *dev)
 {
-	struct fm10k_intfc *interface = pci_get_drvdata(pdev);
+	struct fm10k_intfc *interface = pci_get_drvdata(to_pci_dev(dev));
 	struct net_device *netdev = interface->netdev;
 	struct fm10k_hw *hw = &interface->hw;
-	u32 err;
-
-	pci_set_power_state(pdev, PCI_D0);
-	pci_restore_state(pdev);
-
-	/* pci_restore_state clears dev->state_saved so call
-	 * pci_save_state to restore it.
-	 */
-	pci_save_state(pdev);
-
-	err = pci_enable_device_mem(pdev);
-	if (err) {
-		dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n");
-		return err;
-	}
-	pci_set_master(pdev);
-
-	pci_wake_from_d3(pdev, false);
+	int err;
 
 	/* refresh hw_addr in case it was dropped */
 	hw->hw_addr = interface->uc_addr;
@@ -2238,36 +2492,27 @@ static int fm10k_resume(struct pci_dev *pdev)
 }
 
 /**
- * fm10k_suspend - Prepare the device for a system sleep state
- * @pdev: PCI device information struct
+ * fm10k_suspend - Generic PM suspend hook
+ * @dev: generic device structure
  *
- * fm10k_suspend is meant to shutdown the device prior to the system entering
- * a sleep state.  The fm10k hardware does not support wake on lan so the
- * driver simply needs to shut down the device so it is in a low power state.
+ * Generic PM hook used when setting the device into a low power state for
+ * system suspend or hibernation. This function does not need to handle lower
+ * PCIe device state as the stack takes care of that for us.
  **/
-static int fm10k_suspend(struct pci_dev *pdev,
-			 pm_message_t __always_unused state)
+static int fm10k_suspend(struct device *dev)
 {
-	struct fm10k_intfc *interface = pci_get_drvdata(pdev);
+	struct fm10k_intfc *interface = pci_get_drvdata(to_pci_dev(dev));
 	struct net_device *netdev = interface->netdev;
-	int err = 0;
 
 	netif_device_detach(netdev);
 
 	fm10k_prepare_suspend(interface);
 
-	err = pci_save_state(pdev);
-	if (err)
-		return err;
-
-	pci_disable_device(pdev);
-	pci_wake_from_d3(pdev, false);
-	pci_set_power_state(pdev, PCI_D3hot);
-
 	return 0;
 }
 
 #endif /* CONFIG_PM */
+
 /**
  * fm10k_io_error_detected - called when PCI error is detected
  * @pdev: Pointer to PCI device
@@ -2343,11 +2588,18 @@ static void fm10k_io_resume(struct pci_dev *pdev)
 
 	if (err)
 		dev_warn(&pdev->dev,
-			 "fm10k_io_resume failed: %d\n", err);
+			 "%s failed: %d\n", __func__, err);
 	else
 		netif_device_attach(netdev);
 }
 
+/**
+ * fm10k_io_reset_prepare - called when PCI function is about to be reset
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the PCI function is about to be reset,
+ * allowing the device driver to prepare for it.
+ */
 static void fm10k_io_reset_prepare(struct pci_dev *pdev)
 {
 	/* warn incase we have any active VF devices */
@@ -2357,6 +2609,13 @@ static void fm10k_io_reset_prepare(struct pci_dev *pdev)
 	fm10k_prepare_suspend(pci_get_drvdata(pdev));
 }
 
+/**
+ * fm10k_io_reset_done - called when PCI function has finished resetting
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called just after the PCI function is reset, such as via
+ * /sys/class/net/<enpX>/device/reset or similar.
+ */
 static void fm10k_io_reset_done(struct pci_dev *pdev)
 {
 	struct fm10k_intfc *interface = pci_get_drvdata(pdev);
@@ -2364,7 +2623,7 @@ static void fm10k_io_reset_done(struct pci_dev *pdev)
 
 	if (err) {
 		dev_warn(&pdev->dev,
-			 "fm10k_io_reset_notify failed: %d\n", err);
+			 "%s failed: %d\n", __func__, err);
 		netif_device_detach(interface->netdev);
 	}
 }
@@ -2377,15 +2636,18 @@ static const struct pci_error_handlers fm10k_err_handler = {
 	.reset_done = fm10k_io_reset_done,
 };
 
+static SIMPLE_DEV_PM_OPS(fm10k_pm_ops, fm10k_suspend, fm10k_resume);
+
 static struct pci_driver fm10k_driver = {
 	.name			= fm10k_driver_name,
 	.id_table		= fm10k_pci_tbl,
 	.probe			= fm10k_probe,
 	.remove			= fm10k_remove,
 #ifdef CONFIG_PM
-	.suspend		= fm10k_suspend,
-	.resume			= fm10k_resume,
-#endif
+	.driver = {
+		.pm		= &fm10k_pm_ops,
+	},
+#endif /* CONFIG_PM */
 	.sriov_configure	= fm10k_iov_configure,
 	.err_handler		= &fm10k_err_handler
 };
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index 40ee0242a80a..425d814aed4d 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -1,5 +1,5 @@
 /* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
+ * Copyright(c) 2013 - 2017 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -1186,7 +1186,7 @@ s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *hw, u32 **results,
  * Will report an error if the VLAN ID is out of range. For VID = 0, it will
  * return either the pf_vid or sw_vid depending on which one is set.
  */
-static s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid)
+s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid)
 {
 	if (!vid)
 		return vf_info->pf_vid ? vf_info->pf_vid : vf_info->sw_vid;
@@ -1334,19 +1334,19 @@ static u8 fm10k_iov_supported_xcast_mode_pf(struct fm10k_vf_info *vf_info,
 	case FM10K_XCAST_MODE_PROMISC:
 		if (vf_flags & FM10K_VF_FLAG_PROMISC_CAPABLE)
 			return FM10K_XCAST_MODE_PROMISC;
-		/* fallthough */
+		/* fall through */
 	case FM10K_XCAST_MODE_ALLMULTI:
 		if (vf_flags & FM10K_VF_FLAG_ALLMULTI_CAPABLE)
 			return FM10K_XCAST_MODE_ALLMULTI;
-		/* fallthough */
+		/* fall through */
 	case FM10K_XCAST_MODE_MULTI:
 		if (vf_flags & FM10K_VF_FLAG_MULTI_CAPABLE)
 			return FM10K_XCAST_MODE_MULTI;
-		/* fallthough */
+		/* fall through */
 	case FM10K_XCAST_MODE_NONE:
 		if (vf_flags & FM10K_VF_FLAG_NONE_CAPABLE)
 			return FM10K_XCAST_MODE_NONE;
-		/* fallthough */
+		/* fall through */
 	default:
 		break;
 	}
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
index 3336d3c10760..e04d41f1a532 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
@@ -1,5 +1,5 @@
 /* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
+ * Copyright(c) 2013 - 2017 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -114,6 +114,7 @@ extern const struct fm10k_tlv_attr fm10k_err_msg_attr[];
 #define FM10K_PF_MSG_ERR_HANDLER(msg, func) \
 	FM10K_MSG_HANDLER(FM10K_PF_MSG_ID_##msg, fm10k_err_msg_attr, func)
 
+s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid);
 s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *);
 s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *, u32 **,
 			      struct fm10k_mbx_info *);
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index d0c1bf5441d8..439c63cb2a0c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -77,6 +77,7 @@
 #define i40e_default_queues_per_vmdq(pf) \
 		(((pf)->hw_features & I40E_HW_RSS_AQ_CAPABLE) ? 4 : 1)
 #define I40E_DEFAULT_QUEUES_PER_VF	4
+#define I40E_MAX_VF_QUEUES		16
 #define I40E_DEFAULT_QUEUES_PER_TC	1 /* should be a power of 2 */
 #define i40e_pf_get_max_q_per_tc(pf) \
 		(((pf)->hw_features & I40E_HW_128_QP_RSS_CAPABLE) ? 128 : 64)
@@ -136,6 +137,7 @@ enum i40e_state_t {
 	__I40E_MDD_EVENT_PENDING,
 	__I40E_VFLR_EVENT_PENDING,
 	__I40E_RESET_RECOVERY_PENDING,
+	__I40E_MISC_IRQ_REQUESTED,
 	__I40E_RESET_INTR_RECEIVED,
 	__I40E_REINIT_REQUESTED,
 	__I40E_PF_RESET_REQUESTED,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index ba04988e0598..9dcb2a961197 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -607,6 +607,18 @@ i40e_status i40e_init_adminq(struct i40e_hw *hw)
 			   &oem_lo);
 	hw->nvm.oem_ver = ((u32)oem_hi << 16) | oem_lo;
 
+	if (hw->mac.type == I40E_MAC_XL710 &&
+	    hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
+	    hw->aq.api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_XL710) {
+		hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE;
+	}
+
+	/* The ability to RX (not drop) 802.1ad frames was added in API 1.7 */
+	if (hw->aq.api_maj_ver > 1 ||
+	    (hw->aq.api_maj_ver == 1 &&
+	     hw->aq.api_min_ver >= 7))
+		hw->flags |= I40E_HW_FLAG_802_1AD_CAPABLE;
+
 	if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) {
 		ret_code = I40E_ERR_FIRMWARE_API_VERSION;
 		goto init_adminq_free_arq;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 5d5f422cbae5..4c85ea9cd89a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -34,7 +34,15 @@
  */
 
 #define I40E_FW_API_VERSION_MAJOR	0x0001
-#define I40E_FW_API_VERSION_MINOR	0x0005
+#define I40E_FW_API_VERSION_MINOR_X722	0x0005
+#define I40E_FW_API_VERSION_MINOR_X710	0x0007
+
+#define I40E_FW_MINOR_VERSION(_h) ((_h)->mac.type == I40E_MAC_XL710 ? \
+					I40E_FW_API_VERSION_MINOR_X710 : \
+					I40E_FW_API_VERSION_MINOR_X722)
+
+/* API version 1.7 implements additional link and PHY-specific APIs  */
+#define I40E_MINOR_VER_GET_LINK_INFO_XL710 0x0007
 
 struct i40e_aq_desc {
 	__le16 flags;
@@ -236,6 +244,8 @@ enum i40e_admin_queue_opc {
 	i40e_aqc_opc_set_phy_debug		= 0x0622,
 	i40e_aqc_opc_upload_ext_phy_fm		= 0x0625,
 	i40e_aqc_opc_run_phy_activity		= 0x0626,
+	i40e_aqc_opc_set_phy_register		= 0x0628,
+	i40e_aqc_opc_get_phy_register		= 0x0629,
 
 	/* NVM commands */
 	i40e_aqc_opc_nvm_read			= 0x0701,
@@ -765,7 +775,22 @@ struct i40e_aqc_set_switch_config {
 #define I40E_AQ_SET_SWITCH_CFG_PROMISC		0x0001
 #define I40E_AQ_SET_SWITCH_CFG_L2_FILTER	0x0002
 	__le16	valid_flags;
-	u8	reserved[12];
+	/* The ethertype in switch_tag is dropped on ingress and used
+	 * internally by the switch. Set this to zero for the default
+	 * of 0x88a8 (802.1ad). Should be zero for firmware API
+	 * versions lower than 1.7.
+	 */
+	__le16	switch_tag;
+	/* The ethertypes in first_tag and second_tag are used to
+	 * match the outer and inner VLAN tags (respectively) when HW
+	 * double VLAN tagging is enabled via the set port parameters
+	 * AQ command. Otherwise these are both ignored. Set them to
+	 * zero for their defaults of 0x8100 (802.1Q). Should be zero
+	 * for firmware API versions lower than 1.7.
+	 */
+	__le16	first_tag;
+	__le16	second_tag;
+	u8	reserved[6];
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
@@ -1726,6 +1751,8 @@ enum i40e_aq_phy_type {
 	I40E_PHY_TYPE_10GBASE_CR1_CU		= 0xB,
 	I40E_PHY_TYPE_10GBASE_AOC		= 0xC,
 	I40E_PHY_TYPE_40GBASE_AOC		= 0xD,
+	I40E_PHY_TYPE_UNRECOGNIZED		= 0xE,
+	I40E_PHY_TYPE_UNSUPPORTED		= 0xF,
 	I40E_PHY_TYPE_100BASE_TX		= 0x11,
 	I40E_PHY_TYPE_1000BASE_T		= 0x12,
 	I40E_PHY_TYPE_10GBASE_T			= 0x13,
@@ -1744,6 +1771,8 @@ enum i40e_aq_phy_type {
 	I40E_PHY_TYPE_25GBASE_CR		= 0x20,
 	I40E_PHY_TYPE_25GBASE_SR		= 0x21,
 	I40E_PHY_TYPE_25GBASE_LR		= 0x22,
+	I40E_PHY_TYPE_EMPTY			= 0xFE,
+	I40E_PHY_TYPE_DEFAULT			= 0xFF,
 	I40E_PHY_TYPE_MAX
 };
 
@@ -1934,19 +1963,31 @@ struct i40e_aqc_get_link_status {
 #define I40E_AQ_25G_SERDES_UCODE_ERR	0X04
 #define I40E_AQ_25G_NIMB_UCODE_ERR	0X05
 	u8	loopback; /* use defines from i40e_aqc_set_lb_mode */
+/* Since firmware API 1.7 loopback field keeps power class info as well */
+#define I40E_AQ_LOOPBACK_MASK		0x07
+#define I40E_AQ_PWR_CLASS_SHIFT_LB	6
+#define I40E_AQ_PWR_CLASS_MASK_LB	(0x03 << I40E_AQ_PWR_CLASS_SHIFT_LB)
 	__le16	max_frame_size;
 	u8	config;
 #define I40E_AQ_CONFIG_FEC_KR_ENA	0x01
 #define I40E_AQ_CONFIG_FEC_RS_ENA	0x02
 #define I40E_AQ_CONFIG_CRC_ENA		0x04
 #define I40E_AQ_CONFIG_PACING_MASK	0x78
-	u8	power_desc;
+	union {
+		struct {
+			u8	power_desc;
 #define I40E_AQ_LINK_POWER_CLASS_1	0x00
 #define I40E_AQ_LINK_POWER_CLASS_2	0x01
 #define I40E_AQ_LINK_POWER_CLASS_3	0x02
 #define I40E_AQ_LINK_POWER_CLASS_4	0x03
 #define I40E_AQ_PWR_CLASS_MASK		0x03
-	u8	reserved[4];
+			u8	reserved[4];
+		};
+		struct {
+			u8	link_type[4];
+			u8	link_type_ext;
+		};
+	};
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_get_link_status);
@@ -2029,6 +2070,22 @@ struct i40e_aqc_run_phy_activity {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_run_phy_activity);
 
+/* Set PHY Register command (0x0628) */
+/* Get PHY Register command (0x0629) */
+struct i40e_aqc_phy_register_access {
+	u8	phy_interface;
+#define I40E_AQ_PHY_REG_ACCESS_INTERNAL	0
+#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL	1
+#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE	2
+	u8	dev_address;
+	u8	reserved1[2];
+	__le32	reg_address;
+	__le32	reg_value;
+	u8	reserved2[4];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_phy_register_access);
+
 /* NVM Read command (indirect 0x0701)
  * NVM Erase commands (direct 0x0702)
  * NVM Update commands (indirect 0x0703)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 111426ba5fbc..60542beda7ad 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1593,8 +1593,10 @@ i40e_status i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
 		status = I40E_ERR_UNKNOWN_PHY;
 
 	if (report_init) {
-		hw->phy.phy_types = le32_to_cpu(abilities->phy_type);
-		hw->phy.phy_types |= ((u64)abilities->phy_type_ext << 32);
+		if (hw->mac.type ==  I40E_MAC_XL710 &&
+		    hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
+		    hw->aq.api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_XL710)
+			status = i40e_aq_get_link_info(hw, true, NULL, NULL);
 	}
 
 	return status;
@@ -1819,7 +1821,7 @@ i40e_status i40e_aq_get_link_info(struct i40e_hw *hw,
 	hw_link_info->fec_info = resp->config & (I40E_AQ_CONFIG_FEC_KR_ENA |
 						 I40E_AQ_CONFIG_FEC_RS_ENA);
 	hw_link_info->ext_info = resp->ext_info;
-	hw_link_info->loopback = resp->loopback;
+	hw_link_info->loopback = resp->loopback & I40E_AQ_LOOPBACK_MASK;
 	hw_link_info->max_frame_size = le16_to_cpu(resp->max_frame_size);
 	hw_link_info->pacing = resp->config & I40E_AQ_CONFIG_PACING_MASK;
 
@@ -1850,6 +1852,15 @@ i40e_status i40e_aq_get_link_info(struct i40e_hw *hw,
 	     hw->aq.fw_min_ver < 40)) && hw_link_info->phy_type == 0xE)
 		hw_link_info->phy_type = I40E_PHY_TYPE_10GBASE_SFPP_CU;
 
+	if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
+	    hw->aq.api_min_ver >= 7) {
+		__le32 tmp;
+
+		memcpy(&tmp, resp->link_type, sizeof(tmp));
+		hw->phy.phy_types = le32_to_cpu(tmp);
+		hw->phy.phy_types |= ((u64)resp->link_type_ext << 32);
+	}
+
 	/* save link status information */
 	if (link)
 		*link = *hw_link_info;
@@ -2391,7 +2402,11 @@ enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw,
 					  i40e_aqc_opc_set_switch_config);
 	scfg->flags = cpu_to_le16(flags);
 	scfg->valid_flags = cpu_to_le16(valid_flags);
-
+	if (hw->flags & I40E_HW_FLAG_802_1AD_CAPABLE) {
+		scfg->switch_tag = cpu_to_le16(hw->switch_tag);
+		scfg->first_tag = cpu_to_le16(hw->first_tag);
+		scfg->second_tag = cpu_to_le16(hw->second_tag);
+	}
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
 
 	return status;
@@ -4826,6 +4841,74 @@ phy_blinking_end:
 }
 
 /**
+ * i40e_led_get_reg - read LED register
+ * @hw: pointer to the HW structure
+ * @led_addr: LED register address
+ * @reg_val: read register value
+ **/
+static enum i40e_status_code i40e_led_get_reg(struct i40e_hw *hw, u16 led_addr,
+					      u32 *reg_val)
+{
+	enum i40e_status_code status;
+	u8 phy_addr = 0;
+	u8 port_num;
+	u32 i;
+
+	*reg_val = 0;
+	if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) {
+		status =
+		       i40e_aq_get_phy_register(hw,
+						I40E_AQ_PHY_REG_ACCESS_EXTERNAL,
+						I40E_PHY_COM_REG_PAGE,
+						I40E_PHY_LED_PROV_REG_1,
+						reg_val, NULL);
+	} else {
+		i = rd32(hw, I40E_PFGEN_PORTNUM);
+		port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
+		phy_addr = i40e_get_phy_address(hw, port_num);
+		status = i40e_read_phy_register_clause45(hw,
+							 I40E_PHY_COM_REG_PAGE,
+							 led_addr, phy_addr,
+							 (u16 *)reg_val);
+	}
+	return status;
+}
+
+/**
+ * i40e_led_set_reg - write LED register
+ * @hw: pointer to the HW structure
+ * @led_addr: LED register address
+ * @reg_val: register value to write
+ **/
+static enum i40e_status_code i40e_led_set_reg(struct i40e_hw *hw, u16 led_addr,
+					      u32 reg_val)
+{
+	enum i40e_status_code status;
+	u8 phy_addr = 0;
+	u8 port_num;
+	u32 i;
+
+	if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) {
+		status =
+		       i40e_aq_set_phy_register(hw,
+						I40E_AQ_PHY_REG_ACCESS_EXTERNAL,
+						I40E_PHY_COM_REG_PAGE,
+						I40E_PHY_LED_PROV_REG_1,
+						reg_val, NULL);
+	} else {
+		i = rd32(hw, I40E_PFGEN_PORTNUM);
+		port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
+		phy_addr = i40e_get_phy_address(hw, port_num);
+		status = i40e_write_phy_register_clause45(hw,
+							  I40E_PHY_COM_REG_PAGE,
+							  led_addr, phy_addr,
+							  (u16)reg_val);
+	}
+
+	return status;
+}
+
+/**
  * i40e_led_get_phy - return current on/off mode
  * @hw: pointer to the hw struct
  * @led_addr: address of led register to use
@@ -4842,7 +4925,19 @@ i40e_status i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr,
 	u16 temp_addr;
 	u8 port_num;
 	u32 i;
-
+	u32 reg_val_aq;
+
+	if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) {
+		status =
+		      i40e_aq_get_phy_register(hw,
+					       I40E_AQ_PHY_REG_ACCESS_EXTERNAL,
+					       I40E_PHY_COM_REG_PAGE,
+					       I40E_PHY_LED_PROV_REG_1,
+					       &reg_val_aq, NULL);
+		if (status == I40E_SUCCESS)
+			*val = (u16)reg_val_aq;
+		return status;
+	}
 	temp_addr = I40E_PHY_LED_PROV_REG_1;
 	i = rd32(hw, I40E_PFGEN_PORTNUM);
 	port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
@@ -4877,51 +4972,38 @@ i40e_status i40e_led_set_phy(struct i40e_hw *hw, bool on,
 			     u16 led_addr, u32 mode)
 {
 	i40e_status status = 0;
-	u16 led_ctl = 0;
-	u16 led_reg = 0;
-	u8 phy_addr = 0;
-	u8 port_num;
-	u32 i;
+	u32 led_ctl = 0;
+	u32 led_reg = 0;
 
-	i = rd32(hw, I40E_PFGEN_PORTNUM);
-	port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
-	phy_addr = i40e_get_phy_address(hw, port_num);
-	status = i40e_read_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
-						 led_addr, phy_addr, &led_reg);
+	status = i40e_led_get_reg(hw, led_addr, &led_reg);
 	if (status)
 		return status;
 	led_ctl = led_reg;
 	if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) {
 		led_reg = 0;
-		status = i40e_write_phy_register_clause45(hw,
-							  I40E_PHY_COM_REG_PAGE,
-							  led_addr, phy_addr,
-							  led_reg);
+		status = i40e_led_set_reg(hw, led_addr, led_reg);
 		if (status)
 			return status;
 	}
-	status = i40e_read_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
-						 led_addr, phy_addr, &led_reg);
+	status = i40e_led_get_reg(hw, led_addr, &led_reg);
 	if (status)
 		goto restore_config;
 	if (on)
 		led_reg = I40E_PHY_LED_MANUAL_ON;
 	else
 		led_reg = 0;
-	status = i40e_write_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
-						  led_addr, phy_addr, led_reg);
+
+	status = i40e_led_set_reg(hw, led_addr, led_reg);
 	if (status)
 		goto restore_config;
 	if (mode & I40E_PHY_LED_MODE_ORIG) {
 		led_ctl = (mode & I40E_PHY_LED_MODE_MASK);
-		status = i40e_write_phy_register_clause45(hw,
-						 I40E_PHY_COM_REG_PAGE,
-						 led_addr, phy_addr, led_ctl);
+		status = i40e_led_set_reg(hw, led_addr, led_ctl);
 	}
 	return status;
+
 restore_config:
-	status = i40e_write_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
-						  led_addr, phy_addr, led_ctl);
+	status = i40e_led_set_reg(hw, led_addr, led_ctl);
 	return status;
 }
 
@@ -5052,6 +5134,75 @@ do_retry:
 }
 
 /**
+ * i40e_aq_set_phy_register
+ * @hw: pointer to the hw struct
+ * @phy_select: select which phy should be accessed
+ * @dev_addr: PHY device address
+ * @reg_addr: PHY register address
+ * @reg_val: new register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Write the external PHY register.
+ **/
+i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw,
+				     u8 phy_select, u8 dev_addr,
+				     u32 reg_addr, u32 reg_val,
+				     struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_phy_register_access *cmd =
+		(struct i40e_aqc_phy_register_access *)&desc.params.raw;
+	i40e_status status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_set_phy_register);
+
+	cmd->phy_interface = phy_select;
+	cmd->dev_address = dev_addr;
+	cmd->reg_address = cpu_to_le32(reg_addr);
+	cmd->reg_value = cpu_to_le32(reg_val);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40e_aq_get_phy_register
+ * @hw: pointer to the hw struct
+ * @phy_select: select which phy should be accessed
+ * @dev_addr: PHY device address
+ * @reg_addr: PHY register address
+ * @reg_val: read register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Read the external PHY register.
+ **/
+i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw,
+				     u8 phy_select, u8 dev_addr,
+				     u32 reg_addr, u32 *reg_val,
+				     struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_phy_register_access *cmd =
+		(struct i40e_aqc_phy_register_access *)&desc.params.raw;
+	i40e_status status;
+
+	i40e_fill_default_direct_cmd_desc(&desc,
+					  i40e_aqc_opc_get_phy_register);
+
+	cmd->phy_interface = phy_select;
+	cmd->dev_address = dev_addr;
+	cmd->reg_address = cpu_to_le32(reg_addr);
+
+	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (!status)
+		*reg_val = le32_to_cpu(cmd->reg_value);
+
+	return status;
+}
+
+/**
  * i40e_aq_write_ppp - Write pipeline personalization profile (ppp)
  * @hw: pointer to the hw struct
  * @buff: command buffer (size in bytes = buff_size)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 05e89864f781..1136d02e2e95 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -4196,6 +4196,158 @@ flags_complete:
 	return 0;
 }
 
+/**
+ * i40e_get_module_info - get (Q)SFP+ module type info
+ * @netdev: network interface device structure
+ * @modinfo: module EEPROM size and layout information structure
+ **/
+static int i40e_get_module_info(struct net_device *netdev,
+				struct ethtool_modinfo *modinfo)
+{
+	struct i40e_netdev_priv *np = netdev_priv(netdev);
+	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_pf *pf = vsi->back;
+	struct i40e_hw *hw = &pf->hw;
+	u32 sff8472_comp = 0;
+	u32 sff8472_swap = 0;
+	u32 sff8636_rev = 0;
+	i40e_status status;
+	u32 type = 0;
+
+	/* Check if firmware supports reading module EEPROM. */
+	if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)) {
+		netdev_err(vsi->netdev, "Module EEPROM memory read not supported. Please update the NVM image.\n");
+		return -EINVAL;
+	}
+
+	status = i40e_update_link_info(hw);
+	if (status)
+		return -EIO;
+
+	if (hw->phy.link_info.phy_type == I40E_PHY_TYPE_EMPTY) {
+		netdev_err(vsi->netdev, "Cannot read module EEPROM memory. No module connected.\n");
+		return -EINVAL;
+	}
+
+	type = hw->phy.link_info.module_type[0];
+
+	switch (type) {
+	case I40E_MODULE_TYPE_SFP:
+		status = i40e_aq_get_phy_register(hw,
+				I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE,
+				I40E_I2C_EEPROM_DEV_ADDR,
+				I40E_MODULE_SFF_8472_COMP,
+				&sff8472_comp, NULL);
+		if (status)
+			return -EIO;
+
+		status = i40e_aq_get_phy_register(hw,
+				I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE,
+				I40E_I2C_EEPROM_DEV_ADDR,
+				I40E_MODULE_SFF_8472_SWAP,
+				&sff8472_swap, NULL);
+		if (status)
+			return -EIO;
+
+		/* Check if the module requires address swap to access
+		 * the other EEPROM memory page.
+		 */
+		if (sff8472_swap & I40E_MODULE_SFF_ADDR_MODE) {
+			netdev_warn(vsi->netdev, "Module address swap to access page 0xA2 is not supported.\n");
+			modinfo->type = ETH_MODULE_SFF_8079;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN;
+		} else if (sff8472_comp == 0x00) {
+			/* Module is not SFF-8472 compliant */
+			modinfo->type = ETH_MODULE_SFF_8079;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN;
+		} else {
+			modinfo->type = ETH_MODULE_SFF_8472;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		}
+		break;
+	case I40E_MODULE_TYPE_QSFP_PLUS:
+		/* Read from memory page 0. */
+		status = i40e_aq_get_phy_register(hw,
+				I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE,
+				0,
+				I40E_MODULE_REVISION_ADDR,
+				&sff8636_rev, NULL);
+		if (status)
+			return -EIO;
+		/* Determine revision compliance byte */
+		if (sff8636_rev > 0x02) {
+			/* Module is SFF-8636 compliant */
+			modinfo->type = ETH_MODULE_SFF_8636;
+			modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN;
+		} else {
+			modinfo->type = ETH_MODULE_SFF_8436;
+			modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN;
+		}
+		break;
+	case I40E_MODULE_TYPE_QSFP28:
+		modinfo->type = ETH_MODULE_SFF_8636;
+		modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN;
+		break;
+	default:
+		netdev_err(vsi->netdev, "Module type unrecognized\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * i40e_get_module_eeprom - fills buffer with (Q)SFP+ module memory contents
+ * @netdev: network interface device structure
+ * @ee: EEPROM dump request structure
+ * @data: buffer to be filled with EEPROM contents
+ **/
+static int i40e_get_module_eeprom(struct net_device *netdev,
+				  struct ethtool_eeprom *ee,
+				  u8 *data)
+{
+	struct i40e_netdev_priv *np = netdev_priv(netdev);
+	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_pf *pf = vsi->back;
+	struct i40e_hw *hw = &pf->hw;
+	bool is_sfp = false;
+	i40e_status status;
+	u32 value = 0;
+	int i;
+
+	if (!ee || !ee->len || !data)
+		return -EINVAL;
+
+	if (hw->phy.link_info.module_type[0] == I40E_MODULE_TYPE_SFP)
+		is_sfp = true;
+
+	for (i = 0; i < ee->len; i++) {
+		u32 offset = i + ee->offset;
+		u32 addr = is_sfp ? I40E_I2C_EEPROM_DEV_ADDR : 0;
+
+		/* Check if we need to access the other memory page */
+		if (is_sfp) {
+			if (offset >= ETH_MODULE_SFF_8079_LEN) {
+				offset -= ETH_MODULE_SFF_8079_LEN;
+				addr = I40E_I2C_EEPROM_DEV_ADDR2;
+			}
+		} else {
+			while (offset >= ETH_MODULE_SFF_8436_LEN) {
+				/* Compute memory page number and offset. */
+				offset -= ETH_MODULE_SFF_8436_LEN / 2;
+				addr++;
+			}
+		}
+
+		status = i40e_aq_get_phy_register(hw,
+				I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE,
+				addr, offset, &value, NULL);
+		if (status)
+			return -EIO;
+		data[i] = value;
+	}
+	return 0;
+}
+
 static const struct ethtool_ops i40e_ethtool_ops = {
 	.get_drvinfo		= i40e_get_drvinfo,
 	.get_regs_len		= i40e_get_regs_len,
@@ -4228,6 +4380,8 @@ static const struct ethtool_ops i40e_ethtool_ops = {
 	.set_rxfh		= i40e_set_rxfh,
 	.get_channels		= i40e_get_channels,
 	.set_channels		= i40e_set_channels,
+	.get_module_info	= i40e_get_module_info,
+	.get_module_eeprom	= i40e_get_module_eeprom,
 	.get_ts_info		= i40e_get_ts_info,
 	.get_priv_flags		= i40e_get_priv_flags,
 	.set_priv_flags		= i40e_set_priv_flags,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 6498da8806cb..3f9e89b054ec 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -600,6 +600,20 @@ static void i40e_stat_update32(struct i40e_hw *hw, u32 reg,
 }
 
 /**
+ * i40e_stat_update_and_clear32 - read and clear hw reg, update a 32 bit stat
+ * @hw: ptr to the hardware info
+ * @reg: the hw reg to read and clear
+ * @stat: ptr to the stat
+ **/
+static void i40e_stat_update_and_clear32(struct i40e_hw *hw, u32 reg, u64 *stat)
+{
+	u32 new_data = rd32(hw, reg);
+
+	wr32(hw, reg, 1); /* must write a nonzero value to clear register */
+	*stat += new_data;
+}
+
+/**
  * i40e_update_eth_stats - Update VSI-specific ethernet statistics counters.
  * @vsi: the VSI to be updated
  **/
@@ -1040,18 +1054,15 @@ static void i40e_update_pf_stats(struct i40e_pf *pf)
 			   &osd->rx_jabber, &nsd->rx_jabber);
 
 	/* FDIR stats */
-	i40e_stat_update32(hw,
-			   I40E_GLQF_PCNT(I40E_FD_ATR_STAT_IDX(pf->hw.pf_id)),
-			   pf->stat_offsets_loaded,
-			   &osd->fd_atr_match, &nsd->fd_atr_match);
-	i40e_stat_update32(hw,
-			   I40E_GLQF_PCNT(I40E_FD_SB_STAT_IDX(pf->hw.pf_id)),
-			   pf->stat_offsets_loaded,
-			   &osd->fd_sb_match, &nsd->fd_sb_match);
-	i40e_stat_update32(hw,
-		      I40E_GLQF_PCNT(I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id)),
-		      pf->stat_offsets_loaded,
-		      &osd->fd_atr_tunnel_match, &nsd->fd_atr_tunnel_match);
+	i40e_stat_update_and_clear32(hw,
+			I40E_GLQF_PCNT(I40E_FD_ATR_STAT_IDX(hw->pf_id)),
+			&nsd->fd_atr_match);
+	i40e_stat_update_and_clear32(hw,
+			I40E_GLQF_PCNT(I40E_FD_SB_STAT_IDX(hw->pf_id)),
+			&nsd->fd_sb_match);
+	i40e_stat_update_and_clear32(hw,
+			I40E_GLQF_PCNT(I40E_FD_ATR_TUNNEL_STAT_IDX(hw->pf_id)),
+			&nsd->fd_atr_tunnel_match);
 
 	val = rd32(hw, I40E_PRTPM_EEE_STAT);
 	nsd->tx_lpi_status =
@@ -3593,14 +3604,20 @@ static int i40e_vsi_enable_irq(struct i40e_vsi *vsi)
 }
 
 /**
- * i40e_stop_misc_vector - Stop the vector that handles non-queue events
+ * i40e_free_misc_vector - Free the vector that handles non-queue events
  * @pf: board private structure
  **/
-static void i40e_stop_misc_vector(struct i40e_pf *pf)
+static void i40e_free_misc_vector(struct i40e_pf *pf)
 {
 	/* Disable ICR 0 */
 	wr32(&pf->hw, I40E_PFINT_ICR0_ENA, 0);
 	i40e_flush(&pf->hw);
+
+	if (pf->flags & I40E_FLAG_MSIX_ENABLED && pf->msix_entries) {
+		synchronize_irq(pf->msix_entries[0].vector);
+		free_irq(pf->msix_entries[0].vector, pf);
+		clear_bit(__I40E_MISC_IRQ_REQUESTED, pf->state);
+	}
 }
 
 /**
@@ -4455,11 +4472,7 @@ static void i40e_clear_interrupt_scheme(struct i40e_pf *pf)
 {
 	int i;
 
-	i40e_stop_misc_vector(pf);
-	if (pf->flags & I40E_FLAG_MSIX_ENABLED && pf->msix_entries) {
-		synchronize_irq(pf->msix_entries[0].vector);
-		free_irq(pf->msix_entries[0].vector, pf);
-	}
+	i40e_free_misc_vector(pf);
 
 	i40e_put_lump(pf->irq_pile, pf->iwarp_base_vector,
 		      I40E_IWARP_IRQ_PILE_ID);
@@ -5346,13 +5359,14 @@ out:
 void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 {
 	enum i40e_aq_link_speed new_speed;
+	struct i40e_pf *pf = vsi->back;
 	char *speed = "Unknown";
 	char *fc = "Unknown";
 	char *fec = "";
 	char *req_fec = "";
 	char *an = "";
 
-	new_speed = vsi->back->hw.phy.link_info.link_speed;
+	new_speed = pf->hw.phy.link_info.link_speed;
 
 	if ((vsi->current_isup == isup) && (vsi->current_speed == new_speed))
 		return;
@@ -5366,13 +5380,13 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 	/* Warn user if link speed on NPAR enabled partition is not at
 	 * least 10GB
 	 */
-	if (vsi->back->hw.func_caps.npar_enable &&
-	    (vsi->back->hw.phy.link_info.link_speed == I40E_LINK_SPEED_1GB ||
-	     vsi->back->hw.phy.link_info.link_speed == I40E_LINK_SPEED_100MB))
+	if (pf->hw.func_caps.npar_enable &&
+	    (pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_1GB ||
+	     pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_100MB))
 		netdev_warn(vsi->netdev,
 			    "The partition detected link speed that is less than 10Gbps\n");
 
-	switch (vsi->back->hw.phy.link_info.link_speed) {
+	switch (pf->hw.phy.link_info.link_speed) {
 	case I40E_LINK_SPEED_40GB:
 		speed = "40 G";
 		break;
@@ -5395,7 +5409,7 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 		break;
 	}
 
-	switch (vsi->back->hw.fc.current_mode) {
+	switch (pf->hw.fc.current_mode) {
 	case I40E_FC_FULL:
 		fc = "RX/TX";
 		break;
@@ -5410,18 +5424,18 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 		break;
 	}
 
-	if (vsi->back->hw.phy.link_info.link_speed == I40E_LINK_SPEED_25GB) {
+	if (pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_25GB) {
 		req_fec = ", Requested FEC: None";
 		fec = ", FEC: None";
 		an = ", Autoneg: False";
 
-		if (vsi->back->hw.phy.link_info.an_info & I40E_AQ_AN_COMPLETED)
+		if (pf->hw.phy.link_info.an_info & I40E_AQ_AN_COMPLETED)
 			an = ", Autoneg: True";
 
-		if (vsi->back->hw.phy.link_info.fec_info &
+		if (pf->hw.phy.link_info.fec_info &
 		    I40E_AQ_CONFIG_FEC_KR_ENA)
 			fec = ", FEC: CL74 FC-FEC/BASE-R";
-		else if (vsi->back->hw.phy.link_info.fec_info &
+		else if (pf->hw.phy.link_info.fec_info &
 			 I40E_AQ_CONFIG_FEC_RS_ENA)
 			fec = ", FEC: CL108 RS-FEC";
 
@@ -5470,15 +5484,6 @@ static int i40e_up_complete(struct i40e_vsi *vsi)
 		i40e_print_link_message(vsi, true);
 		netif_tx_start_all_queues(vsi->netdev);
 		netif_carrier_on(vsi->netdev);
-	} else if (vsi->netdev) {
-		i40e_print_link_message(vsi, false);
-		/* need to check for qualified module here*/
-		if ((pf->hw.phy.link_info.link_info &
-			I40E_AQ_MEDIA_AVAILABLE) &&
-		    (!(pf->hw.phy.link_info.an_info &
-			I40E_AQ_QUALIFIED_MODULE)))
-			netdev_err(vsi->netdev,
-				   "the driver failed to link because an unqualified module was detected.");
 	}
 
 	/* replay FDIR SB filters */
@@ -6429,8 +6434,7 @@ static void i40e_link_event(struct i40e_pf *pf)
 	     new_link == netif_carrier_ok(vsi->netdev)))
 		return;
 
-	if (!test_bit(__I40E_VSI_DOWN, vsi->state))
-		i40e_print_link_message(vsi, new_link);
+	i40e_print_link_message(vsi, new_link);
 
 	/* Notify the base of the switch tree connected to
 	 * the link.  Floating VEBs are not notified.
@@ -8350,6 +8354,57 @@ static int i40e_init_interrupt_scheme(struct i40e_pf *pf)
 	return 0;
 }
 
+#ifdef CONFIG_PM
+/**
+ * i40e_restore_interrupt_scheme - Restore the interrupt scheme
+ * @pf: private board data structure
+ *
+ * Restore the interrupt scheme that was cleared when we suspended the
+ * device. This should be called during resume to re-allocate the q_vectors
+ * and reacquire IRQs.
+ */
+static int i40e_restore_interrupt_scheme(struct i40e_pf *pf)
+{
+	int err, i;
+
+	/* We cleared the MSI and MSI-X flags when disabling the old interrupt
+	 * scheme. We need to re-enabled them here in order to attempt to
+	 * re-acquire the MSI or MSI-X vectors
+	 */
+	pf->flags |= (I40E_FLAG_MSIX_ENABLED | I40E_FLAG_MSI_ENABLED);
+
+	err = i40e_init_interrupt_scheme(pf);
+	if (err)
+		return err;
+
+	/* Now that we've re-acquired IRQs, we need to remap the vectors and
+	 * rings together again.
+	 */
+	for (i = 0; i < pf->num_alloc_vsi; i++) {
+		if (pf->vsi[i]) {
+			err = i40e_vsi_alloc_q_vectors(pf->vsi[i]);
+			if (err)
+				goto err_unwind;
+			i40e_vsi_map_rings_to_vectors(pf->vsi[i]);
+		}
+	}
+
+	err = i40e_setup_misc_vector(pf);
+	if (err)
+		goto err_unwind;
+
+	return 0;
+
+err_unwind:
+	while (i--) {
+		if (pf->vsi[i])
+			i40e_vsi_free_q_vectors(pf->vsi[i]);
+	}
+
+	return err;
+}
+#endif /* CONFIG_PM */
+
 /**
  * i40e_setup_misc_vector - Setup the misc vector to handle non queue events
  * @pf: board private structure
@@ -8363,13 +8418,12 @@ static int i40e_setup_misc_vector(struct i40e_pf *pf)
 	struct i40e_hw *hw = &pf->hw;
 	int err = 0;
 
-	/* Only request the irq if this is the first time through, and
-	 * not when we're rebuilding after a Reset
-	 */
-	if (!test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) {
+	/* Only request the IRQ once, the first time through. */
+	if (!test_and_set_bit(__I40E_MISC_IRQ_REQUESTED, pf->state)) {
 		err = request_irq(pf->msix_entries[0].vector,
 				  i40e_intr, 0, pf->int_name, pf);
 		if (err) {
+			clear_bit(__I40E_MISC_IRQ_REQUESTED, pf->state);
 			dev_info(&pf->pdev->dev,
 				 "request_irq for %s failed: %d\n",
 				 pf->int_name, err);
@@ -11039,6 +11093,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit)
 static void i40e_determine_queue_usage(struct i40e_pf *pf)
 {
 	int queues_left;
+	int q_max;
 
 	pf->num_lan_qps = 0;
 
@@ -11085,10 +11140,12 @@ static void i40e_determine_queue_usage(struct i40e_pf *pf)
 					I40E_FLAG_DCB_ENABLED);
 			dev_info(&pf->pdev->dev, "not enough queues for DCB. DCB is disabled.\n");
 		}
-		pf->num_lan_qps = max_t(int, pf->rss_size_max,
-					num_online_cpus());
-		pf->num_lan_qps = min_t(int, pf->num_lan_qps,
-					pf->hw.func_caps.num_tx_qp);
+
+		/* limit lan qps to the smaller of qps, cpus or msix */
+		q_max = max_t(int, pf->rss_size_max, num_online_cpus());
+		q_max = min_t(int, q_max, pf->hw.func_caps.num_tx_qp);
+		q_max = min_t(int, q_max, pf->hw.func_caps.num_msix_vectors);
+		pf->num_lan_qps = q_max;
 
 		queues_left -= pf->num_lan_qps;
 	}
@@ -11304,6 +11361,13 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	hw->bus.bus_id = pdev->bus->number;
 	pf->instance = pfs_found;
 
+	/* Select something other than the 802.1ad ethertype for the
+	 * switch to use internally and drop on ingress.
+	 */
+	hw->switch_tag = 0xffff;
+	hw->first_tag = ETH_P_8021AD;
+	hw->second_tag = ETH_P_8021Q;
+
 	INIT_LIST_HEAD(&pf->l3_flex_pit_list);
 	INIT_LIST_HEAD(&pf->l4_flex_pit_list);
 
@@ -11380,11 +11444,10 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		 i40e_nvm_version_str(hw));
 
 	if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
-	    hw->aq.api_min_ver > I40E_FW_API_VERSION_MINOR)
+	    hw->aq.api_min_ver > I40E_FW_MINOR_VERSION(hw))
 		dev_info(&pdev->dev,
 			 "The driver for the device detected a newer version of the NVM image than expected. Please install the most recent version of the network driver.\n");
-	else if (hw->aq.api_maj_ver < I40E_FW_API_VERSION_MAJOR ||
-		 hw->aq.api_min_ver < (I40E_FW_API_VERSION_MINOR - 1))
+	else if (hw->aq.api_maj_ver == 1 && hw->aq.api_min_ver < 4)
 		dev_info(&pdev->dev,
 			 "The driver for the device detected an older version of the NVM image than expected. Please update the NVM image.\n");
 
@@ -12048,18 +12111,25 @@ static void i40e_shutdown(struct pci_dev *pdev)
 
 #ifdef CONFIG_PM
 /**
- * i40e_suspend - PCI callback for moving to D3
- * @pdev: PCI device information struct
+ * i40e_suspend - PM callback for moving to D3
+ * @dev: generic device information structure
  **/
-static int i40e_suspend(struct pci_dev *pdev, pm_message_t state)
+static int i40e_suspend(struct device *dev)
 {
+	struct pci_dev *pdev = to_pci_dev(dev);
 	struct i40e_pf *pf = pci_get_drvdata(pdev);
 	struct i40e_hw *hw = &pf->hw;
-	int retval = 0;
 
-	set_bit(__I40E_SUSPENDED, pf->state);
+	/* If we're already suspended, then there is nothing to do */
+	if (test_and_set_bit(__I40E_SUSPENDED, pf->state))
+		return 0;
+
 	set_bit(__I40E_DOWN, pf->state);
 
+	/* Ensure service task will not be running */
+	del_timer_sync(&pf->service_timer);
+	cancel_work_sync(&pf->service_task);
+
 	if (pf->wol_en && (pf->hw_features & I40E_HW_WOL_MC_MAGIC_PKT_WAKE))
 		i40e_enable_mc_magic_wake(pf);
 
@@ -12068,81 +12138,72 @@ static int i40e_suspend(struct pci_dev *pdev, pm_message_t state)
 	wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0));
 	wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0));
 
-	i40e_stop_misc_vector(pf);
-	if (pf->msix_entries) {
-		synchronize_irq(pf->msix_entries[0].vector);
-		free_irq(pf->msix_entries[0].vector, pf);
-	}
-	retval = pci_save_state(pdev);
-	if (retval)
-		return retval;
-
-	pci_wake_from_d3(pdev, pf->wol_en);
-	pci_set_power_state(pdev, PCI_D3hot);
+	/* Clear the interrupt scheme and release our IRQs so that the system
+	 * can safely hibernate even when there are a large number of CPUs.
+	 * Otherwise hibernation might fail when mapping all the vectors back
+	 * to CPU0.
+	 */
+	i40e_clear_interrupt_scheme(pf);
 
-	return retval;
+	return 0;
 }
 
 /**
- * i40e_resume - PCI callback for waking up from D3
- * @pdev: PCI device information struct
+ * i40e_resume - PM callback for waking up from D3
+ * @dev: generic device information structure
  **/
-static int i40e_resume(struct pci_dev *pdev)
+static int i40e_resume(struct device *dev)
 {
+	struct pci_dev *pdev = to_pci_dev(dev);
 	struct i40e_pf *pf = pci_get_drvdata(pdev);
-	u32 err;
+	int err;
 
-	pci_set_power_state(pdev, PCI_D0);
-	pci_restore_state(pdev);
-	/* pci_restore_state() clears dev->state_saves, so
-	 * call pci_save_state() again to restore it.
-	 */
-	pci_save_state(pdev);
+	/* If we're not suspended, then there is nothing to do */
+	if (!test_bit(__I40E_SUSPENDED, pf->state))
+		return 0;
 
-	err = pci_enable_device_mem(pdev);
+	/* We cleared the interrupt scheme when we suspended, so we need to
+	 * restore it now to resume device functionality.
+	 */
+	err = i40e_restore_interrupt_scheme(pf);
 	if (err) {
-		dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n");
-		return err;
+		dev_err(&pdev->dev, "Cannot restore interrupt scheme: %d\n",
+			err);
 	}
-	pci_set_master(pdev);
 
-	/* no wakeup events while running */
-	pci_wake_from_d3(pdev, false);
-
-	/* handling the reset will rebuild the device state */
-	if (test_and_clear_bit(__I40E_SUSPENDED, pf->state)) {
-		clear_bit(__I40E_DOWN, pf->state);
-		if (pf->msix_entries) {
-			err = request_irq(pf->msix_entries[0].vector,
-					  i40e_intr, 0, pf->int_name, pf);
-			if (err) {
-				dev_err(&pf->pdev->dev,
-					"request_irq for %s failed: %d\n",
-					pf->int_name, err);
-			}
-		}
-		i40e_reset_and_rebuild(pf, false, false);
-	}
+	clear_bit(__I40E_DOWN, pf->state);
+	i40e_reset_and_rebuild(pf, false, false);
+
+	/* Clear suspended state last after everything is recovered */
+	clear_bit(__I40E_SUSPENDED, pf->state);
+
+	/* Restart the service task */
+	mod_timer(&pf->service_timer,
+		  round_jiffies(jiffies + pf->service_timer_period));
 
 	return 0;
 }
 
-#endif
+#endif /* CONFIG_PM */
+
 static const struct pci_error_handlers i40e_err_handler = {
 	.error_detected = i40e_pci_error_detected,
 	.slot_reset = i40e_pci_error_slot_reset,
 	.resume = i40e_pci_error_resume,
 };
 
+static SIMPLE_DEV_PM_OPS(i40e_pm_ops, i40e_suspend, i40e_resume);
+
 static struct pci_driver i40e_driver = {
 	.name     = i40e_driver_name,
 	.id_table = i40e_pci_tbl,
 	.probe    = i40e_probe,
 	.remove   = i40e_remove,
 #ifdef CONFIG_PM
-	.suspend  = i40e_suspend,
-	.resume   = i40e_resume,
-#endif
+	.driver   = {
+		.pm = &i40e_pm_ops,
+	},
+#endif /* CONFIG_PM */
 	.shutdown = i40e_shutdown,
 	.err_handler = &i40e_err_handler,
 	.sriov_configure = i40e_pci_sriov_configure,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index a39b13197891..01502561035c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -360,6 +360,15 @@ i40e_status i40e_aq_rx_ctl_write_register(struct i40e_hw *hw,
 				u32 reg_addr, u32 reg_val,
 				struct i40e_asq_cmd_details *cmd_details);
 void i40e_write_rx_ctl(struct i40e_hw *hw, u32 reg_addr, u32 reg_val);
+i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw,
+				     u8 phy_select, u8 dev_addr,
+				     u32 reg_addr, u32 reg_val,
+				     struct i40e_asq_cmd_details *cmd_details);
+i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw,
+				     u8 phy_select, u8 dev_addr,
+				     u32 reg_addr, u32 *reg_val,
+				     struct i40e_asq_cmd_details *cmd_details);
+
 i40e_status i40e_read_phy_register_clause22(struct i40e_hw *hw,
 					    u16 reg, u8 phy_addr, u16 *value);
 i40e_status i40e_write_phy_register_clause22(struct i40e_hw *hw,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1519dfb851d0..d9fdf69bbc6e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -960,14 +960,14 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
 {
 	enum i40e_latency_range new_latency_range = rc->latency_range;
 	u32 new_itr = rc->itr;
-	int bytes_per_int;
+	int bytes_per_usec;
 	unsigned int usecs, estimated_usecs;
 
 	if (rc->total_packets == 0 || !rc->itr)
 		return false;
 
 	usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
-	bytes_per_int = rc->total_bytes / usecs;
+	bytes_per_usec = rc->total_bytes / usecs;
 
 	/* The calculations in this algorithm depend on interrupts actually
 	 * firing at the ITR rate. This may not happen if the packet rate is
@@ -993,18 +993,18 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
 	 */
 	switch (new_latency_range) {
 	case I40E_LOWEST_LATENCY:
-		if (bytes_per_int > 10)
+		if (bytes_per_usec > 10)
 			new_latency_range = I40E_LOW_LATENCY;
 		break;
 	case I40E_LOW_LATENCY:
-		if (bytes_per_int > 20)
+		if (bytes_per_usec > 20)
 			new_latency_range = I40E_BULK_LATENCY;
-		else if (bytes_per_int <= 10)
+		else if (bytes_per_usec <= 10)
 			new_latency_range = I40E_LOWEST_LATENCY;
 		break;
 	case I40E_BULK_LATENCY:
 	default:
-		if (bytes_per_int <= 20)
+		if (bytes_per_usec <= 20)
 			new_latency_range = I40E_LOW_LATENCY;
 		break;
 	}
@@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index fd4bbdd88b57..4b32b1d38a66 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -428,6 +428,18 @@ struct i40e_nvm_access {
 	u8 data[1];
 };
 
+/* (Q)SFP module access definitions */
+#define I40E_I2C_EEPROM_DEV_ADDR	0xA0
+#define I40E_I2C_EEPROM_DEV_ADDR2	0xA2
+#define I40E_MODULE_TYPE_ADDR		0x00
+#define I40E_MODULE_REVISION_ADDR	0x01
+#define I40E_MODULE_SFF_8472_COMP	0x5E
+#define I40E_MODULE_SFF_8472_SWAP	0x5C
+#define I40E_MODULE_SFF_ADDR_MODE	0x04
+#define I40E_MODULE_TYPE_QSFP_PLUS	0x0D
+#define I40E_MODULE_TYPE_QSFP28		0x11
+#define I40E_MODULE_QSFP_MAX_LEN	640
+
 /* PCI bus types */
 enum i40e_bus_type {
 	i40e_bus_type_unknown = 0,
@@ -598,8 +610,15 @@ struct i40e_hw {
 	struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */
 
 #define I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE BIT_ULL(0)
+#define I40E_HW_FLAG_802_1AD_CAPABLE        BIT_ULL(1)
+#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE  BIT_ULL(2)
 	u64 flags;
 
+	/* Used in set switch config AQ command */
+	u16 switch_tag;
+	u16 first_tag;
+	u16 second_tag;
+
 	/* debug mask */
 	u32 debug_mask;
 	char err_str[16];
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 4d1e670f490e..04568137e029 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -154,15 +154,30 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf)
 
 /**
  * i40e_vc_disable_vf
- * @pf: pointer to the PF info
  * @vf: pointer to the VF info
  *
- * Disable the VF through a SW reset
+ * Disable the VF through a SW reset.
  **/
-static inline void i40e_vc_disable_vf(struct i40e_pf *pf, struct i40e_vf *vf)
+static inline void i40e_vc_disable_vf(struct i40e_vf *vf)
 {
+	int i;
+
 	i40e_vc_notify_vf_reset(vf);
-	i40e_reset_vf(vf, false);
+
+	/* We want to ensure that an actual reset occurs initiated after this
+	 * function was called. However, we do not want to wait forever, so
+	 * we'll give a reasonable time and print a message if we failed to
+	 * ensure a reset.
+	 */
+	for (i = 0; i < 20; i++) {
+		if (i40e_reset_vf(vf, false))
+			return;
+		usleep_range(10000, 20000);
+	}
+
+	dev_warn(&vf->pf->pdev->dev,
+		 "Failed to initiate reset for VF %d after 200 milliseconds\n",
+		 vf->vf_id);
 }
 
 /**
@@ -423,6 +438,9 @@ static int i40e_config_iwarp_qvlist(struct i40e_vf *vf,
 	       (sizeof(struct virtchnl_iwarp_qv_info) *
 						(qvlist_info->num_vectors - 1));
 	vf->qvlist_info = kzalloc(size, GFP_KERNEL);
+	if (!vf->qvlist_info)
+		return -ENOMEM;
+
 	vf->qvlist_info->num_vectors = qvlist_info->num_vectors;
 
 	msix_vf = pf->hw.func_caps.num_msix_vectors_vf;
@@ -815,6 +833,14 @@ static void i40e_free_vf_res(struct i40e_vf *vf)
 	 */
 	clear_bit(I40E_VF_STATE_INIT, &vf->vf_states);
 
+	/* It's possible the VF had requeuested more queues than the default so
+	 * do the accounting here when we're about to free them.
+	 */
+	if (vf->num_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) {
+		pf->queues_left += vf->num_queue_pairs -
+				   I40E_DEFAULT_QUEUES_PER_VF;
+	}
+
 	/* free vsi & disconnect it from the parent uplink */
 	if (vf->lan_vsi_idx) {
 		i40e_vsi_release(pf->vsi[vf->lan_vsi_idx]);
@@ -853,7 +879,8 @@ static void i40e_free_vf_res(struct i40e_vf *vf)
 	}
 	/* reset some of the state variables keeping track of the resources */
 	vf->num_queue_pairs = 0;
-	vf->vf_states = 0;
+	clear_bit(I40E_VF_STATE_MC_PROMISC, &vf->vf_states);
+	clear_bit(I40E_VF_STATE_UC_PROMISC, &vf->vf_states);
 }
 
 /**
@@ -868,12 +895,27 @@ static int i40e_alloc_vf_res(struct i40e_vf *vf)
 	int total_queue_pairs = 0;
 	int ret;
 
+	if (vf->num_req_queues &&
+	    vf->num_req_queues <= pf->queues_left + I40E_DEFAULT_QUEUES_PER_VF)
+		pf->num_vf_qps = vf->num_req_queues;
+	else
+		pf->num_vf_qps = I40E_DEFAULT_QUEUES_PER_VF;
+
 	/* allocate hw vsi context & associated resources */
 	ret = i40e_alloc_vsi_res(vf, I40E_VSI_SRIOV);
 	if (ret)
 		goto error_alloc;
 	total_queue_pairs += pf->vsi[vf->lan_vsi_idx]->alloc_queue_pairs;
 
+	/* We account for each VF to get a default number of queue pairs.  If
+	 * the VF has now requested more, we need to account for that to make
+	 * certain we never request more queues than we actually have left in
+	 * HW.
+	 */
+	if (total_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF)
+		pf->queues_left -=
+			total_queue_pairs - I40E_DEFAULT_QUEUES_PER_VF;
+
 	if (vf->trusted)
 		set_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps);
 	else
@@ -1008,8 +1050,8 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf)
 		set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states);
 		clear_bit(I40E_VF_STATE_DISABLED, &vf->vf_states);
 		/* Do not notify the client during VF init */
-		if (test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE,
-				       &vf->vf_states))
+		if (!test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE,
+					&vf->vf_states))
 			i40e_notify_client_of_vf_reset(pf, abs_vf_id);
 		vf->num_vlan = 0;
 	}
@@ -1026,9 +1068,9 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf)
  * @vf: pointer to the VF structure
  * @flr: VFLR was issued or not
  *
- * reset the VF
+ * Returns true if the VF is reset, false otherwise.
  **/
-void i40e_reset_vf(struct i40e_vf *vf, bool flr)
+bool i40e_reset_vf(struct i40e_vf *vf, bool flr)
 {
 	struct i40e_pf *pf = vf->pf;
 	struct i40e_hw *hw = &pf->hw;
@@ -1036,9 +1078,11 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr)
 	u32 reg;
 	int i;
 
-	/* If VFs have been disabled, there is no need to reset */
+	/* If the VFs have been disabled, this means something else is
+	 * resetting the VF, so we shouldn't continue.
+	 */
 	if (test_and_set_bit(__I40E_VF_DISABLE, pf->state))
-		return;
+		return false;
 
 	i40e_trigger_vf_reset(vf, flr);
 
@@ -1075,6 +1119,8 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr)
 
 	i40e_flush(hw);
 	clear_bit(__I40E_VF_DISABLE, pf->state);
+
+	return true;
 }
 
 /**
@@ -1086,8 +1132,10 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr)
  * VF, then do all the waiting in one chunk, and finally finish restoring each
  * VF after the wait. This is useful during PF routines which need to reset
  * all VFs, as otherwise it must perform these resets in a serialized fashion.
+ *
+ * Returns true if any VFs were reset, and false otherwise.
  **/
-void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr)
+bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr)
 {
 	struct i40e_hw *hw = &pf->hw;
 	struct i40e_vf *vf;
@@ -1096,11 +1144,11 @@ void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr)
 
 	/* If we don't have any VFs, then there is nothing to reset */
 	if (!pf->num_alloc_vfs)
-		return;
+		return false;
 
 	/* If VFs have been disabled, there is no need to reset */
 	if (test_and_set_bit(__I40E_VF_DISABLE, pf->state))
-		return;
+		return false;
 
 	/* Begin reset on all VFs at once */
 	for (v = 0; v < pf->num_alloc_vfs; v++)
@@ -1175,6 +1223,8 @@ void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr)
 
 	i40e_flush(hw);
 	clear_bit(__I40E_VF_DISABLE, pf->state);
+
+	return true;
 }
 
 /**
@@ -1537,6 +1587,8 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg)
 	    (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_IWARP)) {
 		vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_IWARP;
 		set_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states);
+	} else {
+		clear_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states);
 	}
 
 	if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PF) {
@@ -1579,6 +1631,9 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg)
 					VIRTCHNL_VF_OFFLOAD_WB_ON_ITR;
 	}
 
+	if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES)
+		vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_REQ_QUEUES;
+
 	vfres->num_vsis = num_vsis;
 	vfres->num_queue_pairs = vf->num_queue_pairs;
 	vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf;
@@ -1987,6 +2042,52 @@ error_param:
 }
 
 /**
+ * i40e_vc_request_queues_msg
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ * @msglen: msg length
+ *
+ * VFs get a default number of queues but can use this message to request a
+ * different number.  Will respond with either the number requested or the
+ * maximum we can support.
+ **/
+static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen)
+{
+	struct virtchnl_vf_res_request *vfres =
+		(struct virtchnl_vf_res_request *)msg;
+	int req_pairs = vfres->num_queue_pairs;
+	int cur_pairs = vf->num_queue_pairs;
+	struct i40e_pf *pf = vf->pf;
+
+	if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states))
+		return -EINVAL;
+
+	if (req_pairs <= 0) {
+		dev_err(&pf->pdev->dev,
+			"VF %d tried to request %d queues.  Ignoring.\n",
+			vf->vf_id, req_pairs);
+	} else if (req_pairs > I40E_MAX_VF_QUEUES) {
+		dev_err(&pf->pdev->dev,
+			"VF %d tried to request more than %d queues.\n",
+			vf->vf_id,
+			I40E_MAX_VF_QUEUES);
+		vfres->num_queue_pairs = I40E_MAX_VF_QUEUES;
+	} else if (req_pairs - cur_pairs > pf->queues_left) {
+		dev_warn(&pf->pdev->dev,
+			 "VF %d requested %d more queues, but only %d left.\n",
+			 vf->vf_id,
+			 req_pairs - cur_pairs,
+			 pf->queues_left);
+		vfres->num_queue_pairs = pf->queues_left + cur_pairs;
+	} else {
+		vf->num_req_queues = req_pairs;
+	}
+
+	return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, 0,
+				      (u8 *)vfres, sizeof(vfres));
+}
+
+/**
  * i40e_vc_get_stats_msg
  * @vf: pointer to the VF info
  * @msg: pointer to the msg buffer
@@ -2708,6 +2809,9 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode,
 	case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING:
 		ret = i40e_vc_disable_vlan_stripping(vf, msg, msglen);
 		break;
+	case VIRTCHNL_OP_REQUEST_QUEUES:
+		ret = i40e_vc_request_queues_msg(vf, msg, msglen);
+		break;
 
 	case VIRTCHNL_OP_UNKNOWN:
 	default:
@@ -2840,7 +2944,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
 	}
 
 	/* Force the VF driver stop so it has to reload with new MAC address */
-	i40e_vc_disable_vf(pf, vf);
+	i40e_vc_disable_vf(vf);
 	dev_info(&pf->pdev->dev, "Reload the VF driver to make this change effective.\n");
 
 error_param:
@@ -2848,6 +2952,34 @@ error_param:
 }
 
 /**
+ * i40e_vsi_has_vlans - True if VSI has configured VLANs
+ * @vsi: pointer to the vsi
+ *
+ * Check if a VSI has configured any VLANs. False if we have a port VLAN or if
+ * we have no configured VLANs. Do not call while holding the
+ * mac_filter_hash_lock.
+ */
+static bool i40e_vsi_has_vlans(struct i40e_vsi *vsi)
+{
+	bool have_vlans;
+
+	/* If we have a port VLAN, then the VSI cannot have any VLANs
+	 * configured, as all MAC/VLAN filters will be assigned to the PVID.
+	 */
+	if (vsi->info.pvid)
+		return false;
+
+	/* Since we don't have a PVID, we know that if the device is in VLAN
+	 * mode it must be because of a VLAN filter configured on this VSI.
+	 */
+	spin_lock_bh(&vsi->mac_filter_hash_lock);
+	have_vlans = i40e_is_vsi_in_vlan(vsi);
+	spin_unlock_bh(&vsi->mac_filter_hash_lock);
+
+	return have_vlans;
+}
+
+/**
  * i40e_ndo_set_vf_port_vlan
  * @netdev: network interface device structure
  * @vf_id: VF identifier
@@ -2899,10 +3031,7 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
 		/* duplicate request, so just return success */
 		goto error_pvid;
 
-	/* Locked once because multiple functions below iterate list */
-	spin_lock_bh(&vsi->mac_filter_hash_lock);
-
-	if (le16_to_cpu(vsi->info.pvid) == 0 && i40e_is_vsi_in_vlan(vsi)) {
+	if (i40e_vsi_has_vlans(vsi)) {
 		dev_err(&pf->pdev->dev,
 			"VF %d has already configured VLAN filters and the administrator is requesting a port VLAN override.\nPlease unload and reload the VF driver for this change to take effect.\n",
 			vf_id);
@@ -2910,11 +3039,14 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
 		 * the right thing by reconfiguring his network correctly
 		 * and then reloading the VF driver.
 		 */
-		i40e_vc_disable_vf(pf, vf);
+		i40e_vc_disable_vf(vf);
 		/* During reset the VF got a new VSI, so refresh the pointer. */
 		vsi = pf->vsi[vf->lan_vsi_idx];
 	}
 
+	/* Locked once because multiple functions below iterate list */
+	spin_lock_bh(&vsi->mac_filter_hash_lock);
+
 	/* Check for condition where there was already a port VLAN ID
 	 * filter set and now it is being deleted by setting it to zero.
 	 * Additionally check for the condition where there was a port
@@ -3279,14 +3411,11 @@ int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting)
 
 	vf = &pf->vf[vf_id];
 
-	if (!vf)
-		return -EINVAL;
 	if (setting == vf->trusted)
 		goto out;
 
 	vf->trusted = setting;
-	i40e_vc_notify_vf_reset(vf);
-	i40e_reset_vf(vf, false);
+	i40e_vc_disable_vf(vf);
 	dev_info(&pf->pdev->dev, "VF %u is now %strusted\n",
 		 vf_id, setting ? "" : "un");
 out:
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
index 1f4b0c504368..5efc4f92bb37 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
@@ -56,7 +56,6 @@ enum i40e_vf_states {
 	I40E_VF_STATE_INIT = 0,
 	I40E_VF_STATE_ACTIVE,
 	I40E_VF_STATE_IWARPENA,
-	I40E_VF_STATE_FCOEENA,
 	I40E_VF_STATE_DISABLED,
 	I40E_VF_STATE_MC_PROMISC,
 	I40E_VF_STATE_UC_PROMISC,
@@ -97,6 +96,7 @@ struct i40e_vf {
 	u16 lan_vsi_id;		/* ID as used by firmware */
 
 	u8 num_queue_pairs;	/* num of qps assigned to VF vsis */
+	u8 num_req_queues;	/* num of requested qps */
 	u64 num_mdd_events;	/* num of mdd events detected */
 	/* num of continuous malformed or invalid msgs detected */
 	u64 num_invalid_msgs;
@@ -121,8 +121,8 @@ int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs);
 int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode,
 			   u32 v_retval, u8 *msg, u16 msglen);
 int i40e_vc_process_vflr_event(struct i40e_pf *pf);
-void i40e_reset_vf(struct i40e_vf *vf, bool flr);
-void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr);
+bool i40e_reset_vf(struct i40e_vf *vf, bool flr);
+bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr);
 void i40e_vc_notify_vf_reset(struct i40e_vf *vf);
 
 /* VF configuration related iplink handlers */
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
index 83e63e55c4b4..ed5602f4bbcd 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
@@ -34,7 +34,15 @@
  */
 
 #define I40E_FW_API_VERSION_MAJOR	0x0001
-#define I40E_FW_API_VERSION_MINOR	0x0005
+#define I40E_FW_API_VERSION_MINOR_X722	0x0005
+#define I40E_FW_API_VERSION_MINOR_X710	0x0007
+
+#define I40E_FW_MINOR_VERSION(_h) ((_h)->mac.type == I40E_MAC_XL710 ? \
+					I40E_FW_API_VERSION_MINOR_X710 : \
+					I40E_FW_API_VERSION_MINOR_X722)
+
+/* API version 1.7 implements additional link and PHY-specific APIs  */
+#define I40E_MINOR_VER_GET_LINK_INFO_XL710 0x0007
 
 struct i40e_aq_desc {
 	__le16 flags;
@@ -236,6 +244,8 @@ enum i40e_admin_queue_opc {
 	i40e_aqc_opc_set_phy_debug		= 0x0622,
 	i40e_aqc_opc_upload_ext_phy_fm		= 0x0625,
 	i40e_aqc_opc_run_phy_activity		= 0x0626,
+	i40e_aqc_opc_set_phy_register		= 0x0628,
+	i40e_aqc_opc_get_phy_register		= 0x0629,
 
 	/* NVM commands */
 	i40e_aqc_opc_nvm_read			= 0x0701,
@@ -761,7 +771,22 @@ struct i40e_aqc_set_switch_config {
 #define I40E_AQ_SET_SWITCH_CFG_PROMISC		0x0001
 #define I40E_AQ_SET_SWITCH_CFG_L2_FILTER	0x0002
 	__le16	valid_flags;
-	u8	reserved[12];
+	/* The ethertype in switch_tag is dropped on ingress and used
+	 * internally by the switch. Set this to zero for the default
+	 * of 0x88a8 (802.1ad). Should be zero for firmware API
+	 * versions lower than 1.7.
+	 */
+	__le16	switch_tag;
+	/* The ethertypes in first_tag and second_tag are used to
+	 * match the outer and inner VLAN tags (respectively) when HW
+	 * double VLAN tagging is enabled via the set port parameters
+	 * AQ command. Otherwise these are both ignored. Set them to
+	 * zero for their defaults of 0x8100 (802.1Q). Should be zero
+	 * for firmware API versions lower than 1.7.
+	 */
+	__le16	first_tag;
+	__le16	second_tag;
+	u8	reserved[6];
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
@@ -1722,6 +1747,8 @@ enum i40e_aq_phy_type {
 	I40E_PHY_TYPE_10GBASE_CR1_CU		= 0xB,
 	I40E_PHY_TYPE_10GBASE_AOC		= 0xC,
 	I40E_PHY_TYPE_40GBASE_AOC		= 0xD,
+	I40E_PHY_TYPE_UNRECOGNIZED		= 0xE,
+	I40E_PHY_TYPE_UNSUPPORTED		= 0xF,
 	I40E_PHY_TYPE_100BASE_TX		= 0x11,
 	I40E_PHY_TYPE_1000BASE_T		= 0x12,
 	I40E_PHY_TYPE_10GBASE_T			= 0x13,
@@ -1740,6 +1767,8 @@ enum i40e_aq_phy_type {
 	I40E_PHY_TYPE_25GBASE_CR		= 0x20,
 	I40E_PHY_TYPE_25GBASE_SR		= 0x21,
 	I40E_PHY_TYPE_25GBASE_LR		= 0x22,
+	I40E_PHY_TYPE_EMPTY			= 0xFE,
+	I40E_PHY_TYPE_DEFAULT			= 0xFF,
 	I40E_PHY_TYPE_MAX
 };
 
@@ -1930,19 +1959,31 @@ struct i40e_aqc_get_link_status {
 #define I40E_AQ_25G_SERDES_UCODE_ERR	0X04
 #define I40E_AQ_25G_NIMB_UCODE_ERR	0X05
 	u8	loopback; /* use defines from i40e_aqc_set_lb_mode */
+/* Since firmware API 1.7 loopback field keeps power class info as well */
+#define I40E_AQ_LOOPBACK_MASK		0x07
+#define I40E_AQ_PWR_CLASS_SHIFT_LB	6
+#define I40E_AQ_PWR_CLASS_MASK_LB	(0x03 << I40E_AQ_PWR_CLASS_SHIFT_LB)
 	__le16	max_frame_size;
 	u8	config;
 #define I40E_AQ_CONFIG_FEC_KR_ENA	0x01
 #define I40E_AQ_CONFIG_FEC_RS_ENA	0x02
 #define I40E_AQ_CONFIG_CRC_ENA		0x04
 #define I40E_AQ_CONFIG_PACING_MASK	0x78
-	u8	power_desc;
+	union {
+		struct {
+			u8	power_desc;
 #define I40E_AQ_LINK_POWER_CLASS_1	0x00
 #define I40E_AQ_LINK_POWER_CLASS_2	0x01
 #define I40E_AQ_LINK_POWER_CLASS_3	0x02
 #define I40E_AQ_LINK_POWER_CLASS_4	0x03
 #define I40E_AQ_PWR_CLASS_MASK		0x03
-	u8	reserved[4];
+			u8	reserved[4];
+		};
+		struct {
+			u8	link_type[4];
+			u8	link_type_ext;
+		};
+	};
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_get_link_status);
@@ -2022,6 +2063,22 @@ struct i40e_aqc_run_phy_activity {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_run_phy_activity);
 
+/* Set PHY Register command (0x0628) */
+/* Get PHY Register command (0x0629) */
+struct i40e_aqc_phy_register_access {
+	u8	phy_interface;
+#define I40E_AQ_PHY_REG_ACCESS_INTERNAL	0
+#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL	1
+#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE	2
+	u8	dev_address;
+	u8	reserved1[2];
+	__le32	reg_address;
+	__le32	reg_value;
+	u8	reserved2[4];
+};
+
+I40E_CHECK_CMD_LENGTH(i40e_aqc_phy_register_access);
+
 /* NVM Read command (indirect 0x0701)
  * NVM Erase commands (direct 0x0702)
  * NVM Update commands (indirect 0x0703)
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c b/drivers/net/ethernet/intel/i40evf/i40e_common.c
index 8d3a2bfe186a..7d70bf69b249 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c
@@ -1042,6 +1042,75 @@ do_retry:
 }
 
 /**
+ * i40evf_aq_set_phy_register
+ * @hw: pointer to the hw struct
+ * @phy_select: select which phy should be accessed
+ * @dev_addr: PHY device address
+ * @reg_addr: PHY register address
+ * @reg_val: new register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Reset the external PHY.
+ **/
+i40e_status i40evf_aq_set_phy_register(struct i40e_hw *hw,
+				       u8 phy_select, u8 dev_addr,
+				       u32 reg_addr, u32 reg_val,
+				       struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_phy_register_access *cmd =
+		(struct i40e_aqc_phy_register_access *)&desc.params.raw;
+	i40e_status status;
+
+	i40evf_fill_default_direct_cmd_desc(&desc,
+					    i40e_aqc_opc_set_phy_register);
+
+	cmd->phy_interface = phy_select;
+	cmd->dev_address = dev_addr;
+	cmd->reg_address = cpu_to_le32(reg_addr);
+	cmd->reg_value = cpu_to_le32(reg_val);
+
+	status = i40evf_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+
+	return status;
+}
+
+/**
+ * i40evf_aq_get_phy_register
+ * @hw: pointer to the hw struct
+ * @phy_select: select which phy should be accessed
+ * @dev_addr: PHY device address
+ * @reg_addr: PHY register address
+ * @reg_val: read register value
+ * @cmd_details: pointer to command details structure or NULL
+ *
+ * Reset the external PHY.
+ **/
+i40e_status i40evf_aq_get_phy_register(struct i40e_hw *hw,
+				       u8 phy_select, u8 dev_addr,
+				       u32 reg_addr, u32 *reg_val,
+				       struct i40e_asq_cmd_details *cmd_details)
+{
+	struct i40e_aq_desc desc;
+	struct i40e_aqc_phy_register_access *cmd =
+		(struct i40e_aqc_phy_register_access *)&desc.params.raw;
+	i40e_status status;
+
+	i40evf_fill_default_direct_cmd_desc(&desc,
+					    i40e_aqc_opc_get_phy_register);
+
+	cmd->phy_interface = phy_select;
+	cmd->dev_address = dev_addr;
+	cmd->reg_address = cpu_to_le32(reg_addr);
+
+	status = i40evf_asq_send_command(hw, &desc, NULL, 0, cmd_details);
+	if (!status)
+		*reg_val = le32_to_cpu(cmd->reg_value);
+
+	return status;
+}
+
+/**
  * i40e_aq_send_msg_to_pf
  * @hw: pointer to the hardware structure
  * @v_opcode: opcodes for VF-PF communication
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
index c9836bba487d..b624b5994075 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
@@ -111,6 +111,15 @@ i40e_status i40evf_aq_rx_ctl_write_register(struct i40e_hw *hw,
 				u32 reg_addr, u32 reg_val,
 				struct i40e_asq_cmd_details *cmd_details);
 void i40evf_write_rx_ctl(struct i40e_hw *hw, u32 reg_addr, u32 reg_val);
+i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw,
+				     u8 phy_select, u8 dev_addr,
+				     u32 reg_addr, u32 reg_val,
+				     struct i40e_asq_cmd_details *cmd_details);
+i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw,
+				     u8 phy_select, u8 dev_addr,
+				     u32 reg_addr, u32 *reg_val,
+				     struct i40e_asq_cmd_details *cmd_details);
+
 i40e_status i40e_read_phy_register(struct i40e_hw *hw, u8 page,
 				   u16 reg, u8 phy_addr, u16 *value);
 i40e_status i40e_write_phy_register(struct i40e_hw *hw, u8 page,
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index c32c62462c84..37e1de886d48 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -358,14 +358,14 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
 {
 	enum i40e_latency_range new_latency_range = rc->latency_range;
 	u32 new_itr = rc->itr;
-	int bytes_per_int;
+	int bytes_per_usec;
 	unsigned int usecs, estimated_usecs;
 
 	if (rc->total_packets == 0 || !rc->itr)
 		return false;
 
 	usecs = (rc->itr << 1) * ITR_COUNTDOWN_START;
-	bytes_per_int = rc->total_bytes / usecs;
+	bytes_per_usec = rc->total_bytes / usecs;
 
 	/* The calculations in this algorithm depend on interrupts actually
 	 * firing at the ITR rate. This may not happen if the packet rate is
@@ -391,18 +391,18 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
 	 */
 	switch (new_latency_range) {
 	case I40E_LOWEST_LATENCY:
-		if (bytes_per_int > 10)
+		if (bytes_per_usec > 10)
 			new_latency_range = I40E_LOW_LATENCY;
 		break;
 	case I40E_LOW_LATENCY:
-		if (bytes_per_int > 20)
+		if (bytes_per_usec > 20)
 			new_latency_range = I40E_BULK_LATENCY;
-		else if (bytes_per_int <= 10)
+		else if (bytes_per_usec <= 10)
 			new_latency_range = I40E_LOWEST_LATENCY;
 		break;
 	case I40E_BULK_LATENCY:
 	default:
-		if (bytes_per_int <= 20)
+		if (bytes_per_usec <= 20)
 			new_latency_range = I40E_LOW_LATENCY;
 		break;
 	}
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index 2ea919d9cdcf..9364b67fff9c 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -401,6 +401,18 @@ struct i40e_nvm_access {
 	u8 data[1];
 };
 
+/* (Q)SFP module access definitions */
+#define I40E_I2C_EEPROM_DEV_ADDR	0xA0
+#define I40E_I2C_EEPROM_DEV_ADDR2	0xA2
+#define I40E_MODULE_TYPE_ADDR		0x00
+#define I40E_MODULE_REVISION_ADDR	0x01
+#define I40E_MODULE_SFF_8472_COMP	0x5E
+#define I40E_MODULE_SFF_8472_SWAP	0x5C
+#define I40E_MODULE_SFF_ADDR_MODE	0x04
+#define I40E_MODULE_TYPE_QSFP_PLUS	0x0D
+#define I40E_MODULE_TYPE_QSFP28		0x11
+#define I40E_MODULE_QSFP_MAX_LEN	640
+
 /* PCI bus types */
 enum i40e_bus_type {
 	i40e_bus_type_unknown = 0,
@@ -556,11 +568,19 @@ struct i40e_hw {
 	/* LLDP/DCBX Status */
 	u16 dcbx_status;
 
+#define I40E_HW_FLAG_802_1AD_CAPABLE        BIT_ULL(1)
+#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE  BIT_ULL(2)
+
 	/* DCBX info */
 	struct i40e_dcbx_config local_dcbx_config; /* Oper/Local Cfg */
 	struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */
 	struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */
 
+	/* Used in set switch config AQ command */
+	u16 switch_tag;
+	u16 first_tag;
+	u16 second_tag;
+
 	/* debug mask */
 	u32 debug_mask;
 	char err_str[16];
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h
index 82f69031e5cd..5982362c5643 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf.h
@@ -102,6 +102,7 @@ struct i40e_vsi {
 #define I40E_TX_CTXTDESC(R, i) \
 	(&(((struct i40e_tx_context_desc *)((R)->desc))[i]))
 #define MAX_QUEUES 16
+#define I40EVF_MAX_REQ_QUEUES 4
 
 #define I40EVF_HKEY_ARRAY_SIZE ((I40E_VFQF_HKEY_MAX_INDEX + 1) * 4)
 #define I40EVF_HLUT_ARRAY_SIZE ((I40E_VFQF_HLUT_MAX_INDEX + 1) * 4)
@@ -200,6 +201,7 @@ struct i40evf_adapter {
 	struct list_head vlan_filter_list;
 	char misc_vector_name[IFNAMSIZ + 9];
 	int num_active_queues;
+	int num_req_queues;
 
 	/* TX */
 	struct i40e_ring *tx_rings;
@@ -235,6 +237,7 @@ struct i40evf_adapter {
 #define I40EVF_FLAG_PROMISC_ON			BIT(18)
 #define I40EVF_FLAG_ALLMULTI_ON			BIT(19)
 #define I40EVF_FLAG_LEGACY_RX			BIT(20)
+#define I40EVF_FLAG_REINIT_ITR_NEEDED		BIT(21)
 /* duplicates for common code */
 #define I40E_FLAG_DCB_ENABLED			0
 #define I40E_FLAG_RX_CSUM_ENABLED		I40EVF_FLAG_RX_CSUM_ENABLED
@@ -349,6 +352,7 @@ void i40evf_deconfigure_queues(struct i40evf_adapter *adapter);
 void i40evf_enable_queues(struct i40evf_adapter *adapter);
 void i40evf_disable_queues(struct i40evf_adapter *adapter);
 void i40evf_map_queues(struct i40evf_adapter *adapter);
+int i40evf_request_queues(struct i40evf_adapter *adapter, int num);
 void i40evf_add_ether_addrs(struct i40evf_adapter *adapter);
 void i40evf_del_ether_addrs(struct i40evf_adapter *adapter);
 void i40evf_add_vlans(struct i40evf_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
index 65874d6b3ab9..da006fa3fec1 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
@@ -669,7 +669,7 @@ static void i40evf_get_channels(struct net_device *netdev,
 	struct i40evf_adapter *adapter = netdev_priv(netdev);
 
 	/* Report maximum channels */
-	ch->max_combined = adapter->num_active_queues;
+	ch->max_combined = I40EVF_MAX_REQ_QUEUES;
 
 	ch->max_other = NONQ_VECS;
 	ch->other_count = NONQ_VECS;
@@ -678,6 +678,41 @@ static void i40evf_get_channels(struct net_device *netdev,
 }
 
 /**
+ * i40evf_set_channels: set the new channel count
+ * @netdev: network interface device structure
+ * @ch: channel information structure
+ *
+ * Negotiate a new number of channels with the PF then do a reset.  During
+ * reset we'll realloc queues and fix the RSS table.  Returns 0 on success,
+ * negative on failure.
+ **/
+static int i40evf_set_channels(struct net_device *netdev,
+			       struct ethtool_channels *ch)
+{
+	struct i40evf_adapter *adapter = netdev_priv(netdev);
+	int num_req = ch->combined_count;
+
+	if (num_req != adapter->num_active_queues &&
+	    !(adapter->vf_res->vf_cap_flags &
+	      VIRTCHNL_VF_OFFLOAD_REQ_QUEUES)) {
+		dev_info(&adapter->pdev->dev, "PF is not capable of queue negotiation.\n");
+		return -EINVAL;
+	}
+
+	/* All of these should have already been checked by ethtool before this
+	 * even gets to us, but just to be sure.
+	 */
+	if (num_req <= 0 || num_req > I40EVF_MAX_REQ_QUEUES)
+		return -EINVAL;
+
+	if (ch->rx_count || ch->tx_count || ch->other_count != NONQ_VECS)
+		return -EINVAL;
+
+	adapter->num_req_queues = num_req;
+	return i40evf_request_queues(adapter, num_req);
+}
+
+/**
  * i40evf_get_rxfh_key_size - get the RSS hash key size
  * @netdev: network interface device structure
  *
@@ -785,6 +820,7 @@ static const struct ethtool_ops i40evf_ethtool_ops = {
 	.get_rxfh		= i40evf_get_rxfh,
 	.set_rxfh		= i40evf_set_rxfh,
 	.get_channels		= i40evf_get_channels,
+	.set_channels		= i40evf_set_channels,
 	.get_rxfh_key_size	= i40evf_get_rxfh_key_size,
 	.get_link_ksettings	= i40evf_get_link_ksettings,
 };
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 1825d956bb00..f2f1e754c2ce 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -46,7 +46,7 @@ static const char i40evf_driver_string[] =
 
 #define DRV_VERSION_MAJOR 3
 #define DRV_VERSION_MINOR 0
-#define DRV_VERSION_BUILD 0
+#define DRV_VERSION_BUILD 1
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 	     __stringify(DRV_VERSION_MINOR) "." \
 	     __stringify(DRV_VERSION_BUILD) \
@@ -430,57 +430,26 @@ i40evf_map_vector_to_txq(struct i40evf_adapter *adapter, int v_idx, int t_idx)
  * group the rings as "efficiently" as possible.  You would add new
  * mapping configurations in here.
  **/
-static int i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter)
+static void i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter)
 {
+	int rings_remaining = adapter->num_active_queues;
+	int ridx = 0, vidx = 0;
 	int q_vectors;
-	int v_start = 0;
-	int rxr_idx = 0, txr_idx = 0;
-	int rxr_remaining = adapter->num_active_queues;
-	int txr_remaining = adapter->num_active_queues;
-	int i, j;
-	int rqpv, tqpv;
-	int err = 0;
 
 	q_vectors = adapter->num_msix_vectors - NONQ_VECS;
 
-	/* The ideal configuration...
-	 * We have enough vectors to map one per queue.
-	 */
-	if (q_vectors >= (rxr_remaining * 2)) {
-		for (; rxr_idx < rxr_remaining; v_start++, rxr_idx++)
-			i40evf_map_vector_to_rxq(adapter, v_start, rxr_idx);
-
-		for (; txr_idx < txr_remaining; v_start++, txr_idx++)
-			i40evf_map_vector_to_txq(adapter, v_start, txr_idx);
-		goto out;
-	}
+	for (; ridx < rings_remaining; ridx++) {
+		i40evf_map_vector_to_rxq(adapter, vidx, ridx);
+		i40evf_map_vector_to_txq(adapter, vidx, ridx);
 
-	/* If we don't have enough vectors for a 1-to-1
-	 * mapping, we'll have to group them so there are
-	 * multiple queues per vector.
-	 * Re-adjusting *qpv takes care of the remainder.
-	 */
-	for (i = v_start; i < q_vectors; i++) {
-		rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - i);
-		for (j = 0; j < rqpv; j++) {
-			i40evf_map_vector_to_rxq(adapter, i, rxr_idx);
-			rxr_idx++;
-			rxr_remaining--;
-		}
-	}
-	for (i = v_start; i < q_vectors; i++) {
-		tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - i);
-		for (j = 0; j < tqpv; j++) {
-			i40evf_map_vector_to_txq(adapter, i, txr_idx);
-			txr_idx++;
-			txr_remaining--;
-		}
+		/* In the case where we have more queues than vectors, continue
+		 * round-robin on vectors until all queues are mapped.
+		 */
+		if (++vidx >= q_vectors)
+			vidx = 0;
 	}
 
-out:
 	adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS;
-
-	return err;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -1217,9 +1186,18 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter)
 {
 	int i, num_active_queues;
 
-	num_active_queues = min_t(int,
-				  adapter->vsi_res->num_queue_pairs,
-				  (int)(num_online_cpus()));
+	/* If we're in reset reallocating queues we don't actually know yet for
+	 * certain the PF gave us the number of queues we asked for but we'll
+	 * assume it did.  Once basic reset is finished we'll confirm once we
+	 * start negotiating config with PF.
+	 */
+	if (adapter->num_req_queues)
+		num_active_queues = adapter->num_req_queues;
+	else
+		num_active_queues = min_t(int,
+					  adapter->vsi_res->num_queue_pairs,
+					  (int)(num_online_cpus()));
+
 
 	adapter->tx_rings = kcalloc(num_active_queues,
 				    sizeof(struct i40e_ring), GFP_KERNEL);
@@ -1568,6 +1546,46 @@ static void i40evf_free_rss(struct i40evf_adapter *adapter)
 }
 
 /**
+ * i40evf_reinit_interrupt_scheme - Reallocate queues and vectors
+ * @adapter: board private structure
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40evf_reinit_interrupt_scheme(struct i40evf_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	int err;
+
+	if (netif_running(netdev))
+		i40evf_free_traffic_irqs(adapter);
+	i40evf_free_misc_irq(adapter);
+	i40evf_reset_interrupt_capability(adapter);
+	i40evf_free_q_vectors(adapter);
+	i40evf_free_queues(adapter);
+
+	err =  i40evf_init_interrupt_scheme(adapter);
+	if (err)
+		goto err;
+
+	netif_tx_stop_all_queues(netdev);
+
+	err = i40evf_request_misc_irq(adapter);
+	if (err)
+		goto err;
+
+	set_bit(__I40E_VSI_DOWN, adapter->vsi.state);
+
+	i40evf_map_rings_to_vectors(adapter);
+
+	if (RSS_AQ(adapter))
+		adapter->aq_required |= I40EVF_FLAG_AQ_CONFIGURE_RSS;
+	else
+		err = i40evf_init_rss(adapter);
+err:
+	return err;
+}
+
+/**
  * i40evf_watchdog_timer - Periodic call-back timer
  * @data: pointer to adapter disguised as unsigned long
  **/
@@ -1913,8 +1931,15 @@ continue_reset:
 	if (err)
 		dev_info(&adapter->pdev->dev, "Failed to init adminq: %d\n",
 			 err);
+	adapter->aq_required = 0;
+
+	if (adapter->flags & I40EVF_FLAG_REINIT_ITR_NEEDED) {
+		err = i40evf_reinit_interrupt_scheme(adapter);
+		if (err)
+			goto reset_err;
+	}
 
-	adapter->aq_required = I40EVF_FLAG_AQ_GET_CONFIG;
+	adapter->aq_required |= I40EVF_FLAG_AQ_GET_CONFIG;
 	adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS;
 
 	/* re-add all MAC filters */
@@ -1944,6 +1969,15 @@ continue_reset:
 		if (err)
 			goto reset_err;
 
+		if (adapter->flags & I40EVF_FLAG_REINIT_ITR_NEEDED) {
+			err = i40evf_request_traffic_irqs(adapter,
+							  netdev->name);
+			if (err)
+				goto reset_err;
+
+			adapter->flags &= ~I40EVF_FLAG_REINIT_ITR_NEEDED;
+		}
+
 		i40evf_configure(adapter);
 
 		i40evf_up_complete(adapter);
@@ -2459,9 +2493,9 @@ static int i40evf_check_reset_complete(struct i40e_hw *hw)
 int i40evf_process_config(struct i40evf_adapter *adapter)
 {
 	struct virtchnl_vf_resource *vfres = adapter->vf_res;
+	int i, num_req_queues = adapter->num_req_queues;
 	struct net_device *netdev = adapter->netdev;
 	struct i40e_vsi *vsi = &adapter->vsi;
-	int i;
 	netdev_features_t hw_enc_features;
 	netdev_features_t hw_features;
 
@@ -2475,6 +2509,23 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
 		return -ENODEV;
 	}
 
+	if (num_req_queues &&
+	    num_req_queues != adapter->vsi_res->num_queue_pairs) {
+		/* Problem.  The PF gave us fewer queues than what we had
+		 * negotiated in our request.  Need a reset to see if we can't
+		 * get back to a working state.
+		 */
+		dev_err(&adapter->pdev->dev,
+			"Requested %d queues, but PF only gave us %d.\n",
+			num_req_queues,
+			adapter->vsi_res->num_queue_pairs);
+		adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED;
+		adapter->num_req_queues = adapter->vsi_res->num_queue_pairs;
+		i40evf_schedule_reset(adapter);
+		return -ENODEV;
+	}
+	adapter->num_req_queues = 0;
+
 	hw_enc_features = NETIF_F_SG			|
 			  NETIF_F_IP_CSUM		|
 			  NETIF_F_IPV6_CSUM		|
@@ -2686,9 +2737,8 @@ static void i40evf_init_task(struct work_struct *work)
 		ether_addr_copy(netdev->perm_addr, adapter->hw.mac.addr);
 	}
 
-	init_timer(&adapter->watchdog_timer);
-	adapter->watchdog_timer.function = &i40evf_watchdog_timer;
-	adapter->watchdog_timer.data = (unsigned long)adapter;
+	setup_timer(&adapter->watchdog_timer, &i40evf_watchdog_timer,
+		    (unsigned long)adapter);
 	mod_timer(&adapter->watchdog_timer, jiffies + 1);
 
 	adapter->tx_desc_count = I40EVF_DEFAULT_TXD;
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
index 85876f4fb1fb..2bb81c39d85f 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
@@ -52,7 +52,7 @@ static int i40evf_send_pf_msg(struct i40evf_adapter *adapter,
 
 	err = i40e_aq_send_msg_to_pf(hw, op, 0, msg, len, NULL);
 	if (err)
-		dev_err(&adapter->pdev->dev, "Unable to send opcode %d to PF, err %s, aq_err %s\n",
+		dev_dbg(&adapter->pdev->dev, "Unable to send opcode %d to PF, err %s, aq_err %s\n",
 			op, i40evf_stat_str(hw, err),
 			i40evf_aq_str(hw, hw->aq.asq_last_status));
 	return err;
@@ -160,7 +160,8 @@ int i40evf_send_vf_config_msg(struct i40evf_adapter *adapter)
 	       VIRTCHNL_VF_OFFLOAD_WB_ON_ITR |
 	       VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 |
 	       VIRTCHNL_VF_OFFLOAD_ENCAP |
-	       VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM;
+	       VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM |
+	       VIRTCHNL_VF_OFFLOAD_REQ_QUEUES;
 
 	adapter->current_op = VIRTCHNL_OP_GET_VF_RESOURCES;
 	adapter->aq_required &= ~I40EVF_FLAG_AQ_GET_CONFIG;
@@ -385,6 +386,32 @@ void i40evf_map_queues(struct i40evf_adapter *adapter)
 }
 
 /**
+ * i40evf_request_queues
+ * @adapter: adapter structure
+ * @num: number of requested queues
+ *
+ * We get a default number of queues from the PF.  This enables us to request a
+ * different number.  Returns 0 on success, negative on failure
+ **/
+int i40evf_request_queues(struct i40evf_adapter *adapter, int num)
+{
+	struct virtchnl_vf_res_request vfres;
+
+	if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) {
+		/* bail because we already have a command pending */
+		dev_err(&adapter->pdev->dev, "Cannot request queues, command %d pending\n",
+			adapter->current_op);
+		return -EBUSY;
+	}
+
+	vfres.num_queue_pairs = num;
+
+	adapter->current_op = VIRTCHNL_OP_REQUEST_QUEUES;
+	return i40evf_send_pf_msg(adapter, VIRTCHNL_OP_REQUEST_QUEUES,
+				  (u8 *)&vfres, sizeof(vfres));
+}
+
+/**
  * i40evf_add_ether_addrs
  * @adapter: adapter structure
  * @addrs: the MAC address filters to add (contiguous)
@@ -1068,6 +1095,21 @@ void i40evf_virtchnl_completion(struct i40evf_adapter *adapter,
 				 "Invalid message %d from PF\n", v_opcode);
 		}
 		break;
+	case VIRTCHNL_OP_REQUEST_QUEUES: {
+		struct virtchnl_vf_res_request *vfres =
+			(struct virtchnl_vf_res_request *)msg;
+		if (vfres->num_queue_pairs == adapter->num_req_queues) {
+			adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED;
+			i40evf_schedule_reset(adapter);
+		} else {
+			dev_info(&adapter->pdev->dev,
+				 "Requested %d queues, PF can support %d\n",
+				 adapter->num_req_queues,
+				 vfres->num_queue_pairs);
+			adapter->num_req_queues = 0;
+		}
+		}
+		break;
 	default:
 		if (adapter->current_op && (v_opcode != adapter->current_op))
 			dev_warn(&adapter->pdev->dev, "Expected response %d from PF, received %d\n",
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
index 5a713199653c..1e6ec2277d54 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
@@ -508,9 +508,8 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	adapter->part_num = ixgb_get_ee_pba_number(&adapter->hw);
 
-	init_timer(&adapter->watchdog_timer);
-	adapter->watchdog_timer.function = ixgb_watchdog;
-	adapter->watchdog_timer.data = (unsigned long)adapter;
+	setup_timer(&adapter->watchdog_timer, ixgb_watchdog,
+		    (unsigned long)adapter);
 
 	INIT_WORK(&adapter->tx_timeout_task, ixgb_tx_timeout_task);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d962368d08d0..3942c6208745 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2133,6 +2133,21 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
 #if L1_CACHE_BYTES < 128
 	prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
+	/* Note, we get here by enabling legacy-rx via:
+	 *
+	 *    ethtool --set-priv-flags <dev> legacy-rx on
+	 *
+	 * In this mode, we currently get 0 extra XDP headroom as
+	 * opposed to having legacy-rx off, where we process XDP
+	 * packets going to stack via ixgbe_build_skb(). The latter
+	 * provides us currently with 192 bytes of headroom.
+	 *
+	 * For ixgbe_construct_skb() mode it means that the
+	 * xdp->data_meta will always point to xdp->data, since
+	 * the helper cannot expand the head. Should this ever
+	 * change in future for legacy-rx mode on, then lets also
+	 * add xdp->data_meta handling here.
+	 */
 
 	/* allocate a skb to store the frags */
 	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBE_RX_HDR_SIZE);
@@ -2165,6 +2180,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 				       struct xdp_buff *xdp,
 				       union ixgbe_adv_rx_desc *rx_desc)
 {
+	unsigned int metasize = xdp->data - xdp->data_meta;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
@@ -2174,10 +2190,14 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 #endif
 	struct sk_buff *skb;
 
-	/* prefetch first cache line of first page */
-	prefetch(xdp->data);
+	/* Prefetch first cache line of first page. If xdp->data_meta
+	 * is unused, this points extactly as xdp->data, otherwise we
+	 * likely have a consumer accessing first few bytes of meta
+	 * data, and then actual data.
+	 */
+	prefetch(xdp->data_meta);
 #if L1_CACHE_BYTES < 128
-	prefetch(xdp->data + L1_CACHE_BYTES);
+	prefetch(xdp->data_meta + L1_CACHE_BYTES);
 #endif
 
 	/* build an skb to around the page buffer */
@@ -2188,6 +2208,8 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 	/* update pointers within the skb to store the data */
 	skb_reserve(skb, xdp->data - xdp->data_hard_start);
 	__skb_put(skb, xdp->data_end - xdp->data);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
 
 	/* record DMA address if this is the start of a chain of buffers */
 	if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
@@ -2326,6 +2348,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp.data_meta = xdp.data;
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c
index 3c0a6451273d..7cecd9dbc111 100644
--- a/drivers/net/ethernet/korina.c
+++ b/drivers/net/ethernet/korina.c
@@ -4,6 +4,7 @@
  *  Copyright 2004 IDT Inc. (rischelp@idt.com)
  *  Copyright 2006 Felix Fietkau <nbd@openwrt.org>
  *  Copyright 2008 Florian Fainelli <florian@openwrt.org>
+ *  Copyright 2017 Roman Yeryomin <roman@advem.lv>
  *
  *  This program is free software; you can redistribute  it and/or modify it
  *  under  the terms of  the GNU General  Public License as published by the
@@ -64,9 +65,9 @@
 #include <asm/mach-rc32434/eth.h>
 #include <asm/mach-rc32434/dma_v.h>
 
-#define DRV_NAME        "korina"
-#define DRV_VERSION     "0.10"
-#define DRV_RELDATE     "04Mar2008"
+#define DRV_NAME	"korina"
+#define DRV_VERSION	"0.20"
+#define DRV_RELDATE	"15Sep2017"
 
 #define STATION_ADDRESS_HIGH(dev) (((dev)->dev_addr[0] << 8) | \
 				   ((dev)->dev_addr[1]))
@@ -75,7 +76,7 @@
 				   ((dev)->dev_addr[4] << 8)  | \
 				   ((dev)->dev_addr[5]))
 
-#define MII_CLOCK 1250000 	/* no more than 2.5MHz */
+#define MII_CLOCK	1250000 /* no more than 2.5MHz */
 
 /* the following must be powers of two */
 #define KORINA_NUM_RDS	64  /* number of receive descriptors */
@@ -87,15 +88,19 @@
 #define KORINA_RBSIZE	1536 /* size of one resource buffer = Ether MTU */
 #define KORINA_RDS_MASK	(KORINA_NUM_RDS - 1)
 #define KORINA_TDS_MASK	(KORINA_NUM_TDS - 1)
-#define RD_RING_SIZE 	(KORINA_NUM_RDS * sizeof(struct dma_desc))
+#define RD_RING_SIZE	(KORINA_NUM_RDS * sizeof(struct dma_desc))
 #define TD_RING_SIZE	(KORINA_NUM_TDS * sizeof(struct dma_desc))
 
-#define TX_TIMEOUT 	(6000 * HZ / 1000)
+#define TX_TIMEOUT	(6000 * HZ / 1000)
 
-enum chain_status { desc_filled, desc_empty };
-#define IS_DMA_FINISHED(X)   (((X) & (DMA_DESC_FINI)) != 0)
-#define IS_DMA_DONE(X)   (((X) & (DMA_DESC_DONE)) != 0)
-#define RCVPKT_LENGTH(X)     (((X) & ETH_RX_LEN) >> ETH_RX_LEN_BIT)
+enum chain_status {
+	desc_filled,
+	desc_empty
+};
+
+#define IS_DMA_FINISHED(X)	(((X) & (DMA_DESC_FINI)) != 0)
+#define IS_DMA_DONE(X)		(((X) & (DMA_DESC_DONE)) != 0)
+#define RCVPKT_LENGTH(X)	(((X) & ETH_RX_LEN) >> ETH_RX_LEN_BIT)
 
 /* Information that need to be kept for each board. */
 struct korina_private {
@@ -122,10 +127,8 @@ struct korina_private {
 
 	int rx_irq;
 	int tx_irq;
-	int ovr_irq;
-	int und_irq;
 
-	spinlock_t lock;        /* NIC xmit lock */
+	spinlock_t lock;	/* NIC xmit lock */
 
 	int dma_halt_cnt;
 	int dma_run_cnt;
@@ -148,17 +151,17 @@ static inline void korina_start_dma(struct dma_reg *ch, u32 dma_addr)
 static inline void korina_abort_dma(struct net_device *dev,
 					struct dma_reg *ch)
 {
-       if (readl(&ch->dmac) & DMA_CHAN_RUN_BIT) {
-	       writel(0x10, &ch->dmac);
+	if (readl(&ch->dmac) & DMA_CHAN_RUN_BIT) {
+		writel(0x10, &ch->dmac);
 
-	       while (!(readl(&ch->dmas) & DMA_STAT_HALT))
-		       netif_trans_update(dev);
+		while (!(readl(&ch->dmas) & DMA_STAT_HALT))
+			netif_trans_update(dev);
 
-	       writel(0, &ch->dmas);
-       }
+		writel(0, &ch->dmas);
+	}
 
-       writel(0, &ch->dmadptr);
-       writel(0, &ch->dmandptr);
+	writel(0, &ch->dmadptr);
+	writel(0, &ch->dmandptr);
 }
 
 static inline void korina_chain_dma(struct dma_reg *ch, u32 dma_addr)
@@ -365,59 +368,60 @@ static int korina_rx(struct net_device *dev, int limit)
 		if ((KORINA_RBSIZE - (u32)DMA_COUNT(rd->control)) == 0)
 			break;
 
-		/* Update statistics counters */
-		if (devcs & ETH_RX_CRC)
-			dev->stats.rx_crc_errors++;
-		if (devcs & ETH_RX_LOR)
-			dev->stats.rx_length_errors++;
-		if (devcs & ETH_RX_LE)
-			dev->stats.rx_length_errors++;
-		if (devcs & ETH_RX_OVR)
-			dev->stats.rx_fifo_errors++;
-		if (devcs & ETH_RX_CV)
-			dev->stats.rx_frame_errors++;
-		if (devcs & ETH_RX_CES)
-			dev->stats.rx_length_errors++;
-		if (devcs & ETH_RX_MP)
-			dev->stats.multicast++;
+		/* check that this is a whole packet
+		 * WARNING: DMA_FD bit incorrectly set
+		 * in Rc32434 (errata ref #077) */
+		if (!(devcs & ETH_RX_LD))
+			goto next;
 
-		if ((devcs & ETH_RX_LD) != ETH_RX_LD) {
-			/* check that this is a whole packet
-			 * WARNING: DMA_FD bit incorrectly set
-			 * in Rc32434 (errata ref #077) */
+		if (!(devcs & ETH_RX_ROK)) {
+			/* Update statistics counters */
 			dev->stats.rx_errors++;
 			dev->stats.rx_dropped++;
-		} else if ((devcs & ETH_RX_ROK)) {
-			pkt_len = RCVPKT_LENGTH(devcs);
+			if (devcs & ETH_RX_CRC)
+				dev->stats.rx_crc_errors++;
+			if (devcs & ETH_RX_LE)
+				dev->stats.rx_length_errors++;
+			if (devcs & ETH_RX_OVR)
+				dev->stats.rx_fifo_errors++;
+			if (devcs & ETH_RX_CV)
+				dev->stats.rx_frame_errors++;
+			if (devcs & ETH_RX_CES)
+				dev->stats.rx_frame_errors++;
+
+			goto next;
+		}
 
-			/* must be the (first and) last
-			 * descriptor then */
-			pkt_buf = (u8 *)lp->rx_skb[lp->rx_next_done]->data;
+		pkt_len = RCVPKT_LENGTH(devcs);
 
-			/* invalidate the cache */
-			dma_cache_inv((unsigned long)pkt_buf, pkt_len - 4);
+		/* must be the (first and) last
+		 * descriptor then */
+		pkt_buf = (u8 *)lp->rx_skb[lp->rx_next_done]->data;
 
-			/* Malloc up new buffer. */
-			skb_new = netdev_alloc_skb_ip_align(dev, KORINA_RBSIZE);
+		/* invalidate the cache */
+		dma_cache_inv((unsigned long)pkt_buf, pkt_len - 4);
 
-			if (!skb_new)
-				break;
-			/* Do not count the CRC */
-			skb_put(skb, pkt_len - 4);
-			skb->protocol = eth_type_trans(skb, dev);
+		/* Malloc up new buffer. */
+		skb_new = netdev_alloc_skb_ip_align(dev, KORINA_RBSIZE);
 
-			/* Pass the packet to upper layers */
-			netif_receive_skb(skb);
-			dev->stats.rx_packets++;
-			dev->stats.rx_bytes += pkt_len;
+		if (!skb_new)
+			break;
+		/* Do not count the CRC */
+		skb_put(skb, pkt_len - 4);
+		skb->protocol = eth_type_trans(skb, dev);
 
-			/* Update the mcast stats */
-			if (devcs & ETH_RX_MP)
-				dev->stats.multicast++;
+		/* Pass the packet to upper layers */
+		napi_gro_receive(&lp->napi, skb);
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += pkt_len;
 
-			lp->rx_skb[lp->rx_next_done] = skb_new;
-		}
+		/* Update the mcast stats */
+		if (devcs & ETH_RX_MP)
+			dev->stats.multicast++;
+
+		lp->rx_skb[lp->rx_next_done] = skb_new;
 
+next:
 		rd->devcs = 0;
 
 		/* Restore descriptor's curr_addr */
@@ -686,7 +690,7 @@ static int korina_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 
 /* ethtool helpers */
 static void netdev_get_drvinfo(struct net_device *dev,
-			struct ethtool_drvinfo *info)
+				struct ethtool_drvinfo *info)
 {
 	struct korina_private *lp = netdev_priv(dev);
 
@@ -729,10 +733,10 @@ static u32 netdev_get_link(struct net_device *dev)
 }
 
 static const struct ethtool_ops netdev_ethtool_ops = {
-	.get_drvinfo            = netdev_get_drvinfo,
-	.get_link               = netdev_get_link,
-	.get_link_ksettings     = netdev_get_link_ksettings,
-	.set_link_ksettings     = netdev_set_link_ksettings,
+	.get_drvinfo		= netdev_get_drvinfo,
+	.get_link		= netdev_get_link,
+	.get_link_ksettings	= netdev_get_link_ksettings,
+	.set_link_ksettings	= netdev_set_link_ksettings,
 };
 
 static int korina_alloc_ring(struct net_device *dev)
@@ -864,7 +868,7 @@ static int korina_init(struct net_device *dev)
 	/* Management Clock Prescaler Divisor
 	 * Clock independent setting */
 	writel(((idt_cpu_freq) / MII_CLOCK + 1) & ~1,
-		       &lp->eth_regs->ethmcp);
+			&lp->eth_regs->ethmcp);
 
 	/* don't transmit until fifo contains 48b */
 	writel(48, &lp->eth_regs->ethfifott);
@@ -891,8 +895,6 @@ static void korina_restart_task(struct work_struct *work)
 	 */
 	disable_irq(lp->rx_irq);
 	disable_irq(lp->tx_irq);
-	disable_irq(lp->ovr_irq);
-	disable_irq(lp->und_irq);
 
 	writel(readl(&lp->tx_dma_regs->dmasm) |
 				DMA_STAT_FINI | DMA_STAT_ERR,
@@ -911,40 +913,10 @@ static void korina_restart_task(struct work_struct *work)
 	}
 	korina_multicast_list(dev);
 
-	enable_irq(lp->und_irq);
-	enable_irq(lp->ovr_irq);
 	enable_irq(lp->tx_irq);
 	enable_irq(lp->rx_irq);
 }
 
-static void korina_clear_and_restart(struct net_device *dev, u32 value)
-{
-	struct korina_private *lp = netdev_priv(dev);
-
-	netif_stop_queue(dev);
-	writel(value, &lp->eth_regs->ethintfc);
-	schedule_work(&lp->restart_task);
-}
-
-/* Ethernet Tx Underflow interrupt */
-static irqreturn_t korina_und_interrupt(int irq, void *dev_id)
-{
-	struct net_device *dev = dev_id;
-	struct korina_private *lp = netdev_priv(dev);
-	unsigned int und;
-
-	spin_lock(&lp->lock);
-
-	und = readl(&lp->eth_regs->ethintfc);
-
-	if (und & ETH_INT_FC_UND)
-		korina_clear_and_restart(dev, und & ~ETH_INT_FC_UND);
-
-	spin_unlock(&lp->lock);
-
-	return IRQ_HANDLED;
-}
-
 static void korina_tx_timeout(struct net_device *dev)
 {
 	struct korina_private *lp = netdev_priv(dev);
@@ -952,25 +924,6 @@ static void korina_tx_timeout(struct net_device *dev)
 	schedule_work(&lp->restart_task);
 }
 
-/* Ethernet Rx Overflow interrupt */
-static irqreturn_t
-korina_ovr_interrupt(int irq, void *dev_id)
-{
-	struct net_device *dev = dev_id;
-	struct korina_private *lp = netdev_priv(dev);
-	unsigned int ovr;
-
-	spin_lock(&lp->lock);
-	ovr = readl(&lp->eth_regs->ethintfc);
-
-	if (ovr & ETH_INT_FC_OVR)
-		korina_clear_and_restart(dev, ovr & ~ETH_INT_FC_OVR);
-
-	spin_unlock(&lp->lock);
-
-	return IRQ_HANDLED;
-}
-
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static void korina_poll_controller(struct net_device *dev)
 {
@@ -993,48 +946,26 @@ static int korina_open(struct net_device *dev)
 	}
 
 	/* Install the interrupt handler
-	 * that handles the Done Finished
-	 * Ovr and Und Events */
+	 * that handles the Done Finished */
 	ret = request_irq(lp->rx_irq, korina_rx_dma_interrupt,
 			0, "Korina ethernet Rx", dev);
 	if (ret < 0) {
 		printk(KERN_ERR "%s: unable to get Rx DMA IRQ %d\n",
-		    dev->name, lp->rx_irq);
+			dev->name, lp->rx_irq);
 		goto err_release;
 	}
 	ret = request_irq(lp->tx_irq, korina_tx_dma_interrupt,
 			0, "Korina ethernet Tx", dev);
 	if (ret < 0) {
 		printk(KERN_ERR "%s: unable to get Tx DMA IRQ %d\n",
-		    dev->name, lp->tx_irq);
+			dev->name, lp->tx_irq);
 		goto err_free_rx_irq;
 	}
 
-	/* Install handler for overrun error. */
-	ret = request_irq(lp->ovr_irq, korina_ovr_interrupt,
-			0, "Ethernet Overflow", dev);
-	if (ret < 0) {
-		printk(KERN_ERR "%s: unable to get OVR IRQ %d\n",
-		    dev->name, lp->ovr_irq);
-		goto err_free_tx_irq;
-	}
-
-	/* Install handler for underflow error. */
-	ret = request_irq(lp->und_irq, korina_und_interrupt,
-			0, "Ethernet Underflow", dev);
-	if (ret < 0) {
-		printk(KERN_ERR "%s: unable to get UND IRQ %d\n",
-		    dev->name, lp->und_irq);
-		goto err_free_ovr_irq;
-	}
 	mod_timer(&lp->media_check_timer, jiffies + 1);
 out:
 	return ret;
 
-err_free_ovr_irq:
-	free_irq(lp->ovr_irq, dev);
-err_free_tx_irq:
-	free_irq(lp->tx_irq, dev);
 err_free_rx_irq:
 	free_irq(lp->rx_irq, dev);
 err_release:
@@ -1052,8 +983,6 @@ static int korina_close(struct net_device *dev)
 	/* Disable interrupts */
 	disable_irq(lp->rx_irq);
 	disable_irq(lp->tx_irq);
-	disable_irq(lp->ovr_irq);
-	disable_irq(lp->und_irq);
 
 	korina_abort_tx(dev);
 	tmp = readl(&lp->tx_dma_regs->dmasm);
@@ -1073,8 +1002,6 @@ static int korina_close(struct net_device *dev)
 
 	free_irq(lp->rx_irq, dev);
 	free_irq(lp->tx_irq, dev);
-	free_irq(lp->ovr_irq, dev);
-	free_irq(lp->und_irq, dev);
 
 	return 0;
 }
@@ -1113,8 +1040,6 @@ static int korina_probe(struct platform_device *pdev)
 
 	lp->rx_irq = platform_get_irq_byname(pdev, "korina_rx");
 	lp->tx_irq = platform_get_irq_byname(pdev, "korina_tx");
-	lp->ovr_irq = platform_get_irq_byname(pdev, "korina_ovr");
-	lp->und_irq = platform_get_irq_byname(pdev, "korina_und");
 
 	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "korina_regs");
 	dev->base_addr = r->start;
@@ -1162,7 +1087,7 @@ static int korina_probe(struct platform_device *pdev)
 	dev->netdev_ops = &korina_netdev_ops;
 	dev->ethtool_ops = &netdev_ethtool_ops;
 	dev->watchdog_timeo = TX_TIMEOUT;
-	netif_napi_add(dev, &lp->napi, korina_poll, 64);
+	netif_napi_add(dev, &lp->napi, korina_poll, NAPI_POLL_WEIGHT);
 
 	lp->phy_addr = (((lp->rx_irq == 0x2c? 1:0) << 8) | 0x05);
 	lp->mii_if.dev = dev;
@@ -1226,5 +1151,6 @@ module_platform_driver(korina_driver);
 MODULE_AUTHOR("Philip Rischel <rischelp@idt.com>");
 MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
 MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>");
+MODULE_AUTHOR("Roman Yeryomin <roman@advem.lv>");
 MODULE_DESCRIPTION("IDT RC32434 (Korina) Ethernet driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 9c86cb7cb988..97efe4733661 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -6460,7 +6460,6 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
 		if (mvpp2_tx_frag_process(port, skb, aggr_txq, txq)) {
 			tx_desc_unmap_put(port, txq, tx_desc);
 			frags = 0;
-			goto out;
 		}
 	}
 
diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index 993724959a7c..91b1c154fd29 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -1496,9 +1496,8 @@ static int pxa168_eth_probe(struct platform_device *pdev)
 	netif_napi_add(dev, &pep->napi, pxa168_rx_poll, pep->rx_ring_size);
 
 	memset(&pep->timeout, 0, sizeof(struct timer_list));
-	init_timer(&pep->timeout);
-	pep->timeout.function = rxq_refill_timer_wrapper;
-	pep->timeout.data = (unsigned long)pep;
+	setup_timer(&pep->timeout, rxq_refill_timer_wrapper,
+		    (unsigned long)pep);
 
 	pep->smi_bus = mdiobus_alloc();
 	if (!pep->smi_bus) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c
index 53daa6ca5d83..de0f9e5e42ec 100644
--- a/drivers/net/ethernet/mellanox/mlx4/catas.c
+++ b/drivers/net/ethernet/mellanox/mlx4/catas.c
@@ -277,7 +277,7 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev)
 	phys_addr_t addr;
 
 	INIT_LIST_HEAD(&priv->catas_err.list);
-	init_timer(&priv->catas_err.timer);
+	setup_timer(&priv->catas_err.timer, poll_catas, (unsigned long)dev);
 	priv->catas_err.map = NULL;
 
 	if (!mlx4_is_slave(dev)) {
@@ -293,8 +293,6 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev)
 		}
 	}
 
-	priv->catas_err.timer.data     = (unsigned long) dev;
-	priv->catas_err.timer.function = poll_catas;
 	priv->catas_err.timer.expires  =
 		round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
 	add_timer(&priv->catas_err.timer);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b97a55c827eb..8f9cb8abc497 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 			xdp.data_hard_start = va - frags[0].page_offset;
 			xdp.data = va;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_end = xdp.data + length;
 			orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c
index 4614ddfa91eb..6a7c8b04447e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c
@@ -256,7 +256,7 @@ struct sk_buff *mlx5e_ipsec_handle_tx_skb(struct net_device *netdev,
 			goto drop;
 		}
 	mdata = mlx5e_ipsec_add_metadata(skb);
-	if (unlikely(IS_ERR(mdata))) {
+	if (IS_ERR(mdata)) {
 		atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_metadata);
 		goto drop;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 15a1687483cc..bc654aeae5d4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -797,6 +797,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		return false;
 
 	xdp.data = va + *rx_headroom;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 	xdp.data_hard_start = va;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 8aea0a065e56..a89a68ce53ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -320,15 +320,13 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 
-	init_timer(&health->timer);
+	setup_timer(&health->timer, poll_health, (unsigned long)dev);
 	health->sick = 0;
 	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
 	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
 	health->health = &dev->iseg->health;
 	health->health_counter = &dev->iseg->health_counter;
 
-	health->timer.data = (unsigned long)dev;
-	health->timer.function = poll_health;
 	health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL);
 	add_timer(&health->timer);
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile
index 891ff418bb5e..4816504419fc 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Makefile
+++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile
@@ -17,7 +17,8 @@ mlxsw_spectrum-objs		:= spectrum.o spectrum_buffers.o \
 				   spectrum_kvdl.o spectrum_acl_tcam.o \
 				   spectrum_acl.o spectrum_flower.o \
 				   spectrum_cnt.o spectrum_fid.o \
-				   spectrum_ipip.o
+				   spectrum_ipip.o spectrum_acl_flex_actions.o \
+				   spectrum_mr.o spectrum_mr_tcam.o
 mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB)	+= spectrum_dcb.o
 mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o
 obj-$(CONFIG_MLXSW_MINIMAL)	+= mlxsw_minimal.o
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
index 5ae110172c22..6a979a09ab72 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
@@ -399,23 +399,25 @@ u32 mlxsw_afa_block_first_set_kvdl_index(struct mlxsw_afa_block *block)
 }
 EXPORT_SYMBOL(mlxsw_afa_block_first_set_kvdl_index);
 
-void mlxsw_afa_block_continue(struct mlxsw_afa_block *block)
+int mlxsw_afa_block_continue(struct mlxsw_afa_block *block)
 {
-	if (WARN_ON(block->finished))
-		return;
+	if (block->finished)
+		return -EINVAL;
 	mlxsw_afa_set_goto_set(block->cur_set,
 			       MLXSW_AFA_SET_GOTO_BINDING_CMD_NONE, 0);
 	block->finished = true;
+	return 0;
 }
 EXPORT_SYMBOL(mlxsw_afa_block_continue);
 
-void mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id)
+int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id)
 {
-	if (WARN_ON(block->finished))
-		return;
+	if (block->finished)
+		return -EINVAL;
 	mlxsw_afa_set_goto_set(block->cur_set,
 			       MLXSW_AFA_SET_GOTO_BINDING_CMD_JUMP, group_id);
 	block->finished = true;
+	return 0;
 }
 EXPORT_SYMBOL(mlxsw_afa_block_jump);
 
@@ -674,6 +676,7 @@ enum mlxsw_afa_trapdisc_trap_action {
 MLXSW_ITEM32(afa, trapdisc, trap_action, 0x00, 24, 4);
 
 enum mlxsw_afa_trapdisc_forward_action {
+	MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD = 1,
 	MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD = 3,
 };
 
@@ -712,7 +715,7 @@ int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block)
 }
 EXPORT_SYMBOL(mlxsw_afa_block_append_drop);
 
-int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block)
+int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id)
 {
 	char *act = mlxsw_afa_block_append_action(block,
 						  MLXSW_AFA_TRAPDISC_CODE,
@@ -722,11 +725,27 @@ int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block)
 		return -ENOBUFS;
 	mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP,
 				MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD,
-				MLXSW_TRAP_ID_ACL0);
+				trap_id);
 	return 0;
 }
 EXPORT_SYMBOL(mlxsw_afa_block_append_trap);
 
+int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block,
+					    u16 trap_id)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_TRAPDISC_CODE,
+						  MLXSW_AFA_TRAPDISC_SIZE);
+
+	if (!act)
+		return -ENOBUFS;
+	mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP,
+				MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD,
+				trap_id);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_trap_and_forward);
+
 /* Forwarding Action
  * -----------------
  * Forwarding Action can be used to implement Policy Based Switching (PBS)
@@ -891,3 +910,74 @@ int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid)
 	return 0;
 }
 EXPORT_SYMBOL(mlxsw_afa_block_append_fid_set);
+
+/* MC Routing Action
+ * -----------------
+ * The Multicast router action. Can be used by RMFT_V2 - Router Multicast
+ * Forwarding Table Version 2 Register.
+ */
+
+#define MLXSW_AFA_MCROUTER_CODE 0x10
+#define MLXSW_AFA_MCROUTER_SIZE 2
+
+enum mlxsw_afa_mcrouter_rpf_action {
+	MLXSW_AFA_MCROUTER_RPF_ACTION_NOP,
+	MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP,
+	MLXSW_AFA_MCROUTER_RPF_ACTION_DISCARD_ERROR,
+};
+
+/* afa_mcrouter_rpf_action */
+MLXSW_ITEM32(afa, mcrouter, rpf_action, 0x00, 28, 3);
+
+/* afa_mcrouter_expected_irif */
+MLXSW_ITEM32(afa, mcrouter, expected_irif, 0x00, 0, 16);
+
+/* afa_mcrouter_min_mtu */
+MLXSW_ITEM32(afa, mcrouter, min_mtu, 0x08, 0, 16);
+
+enum mlxsw_afa_mrouter_vrmid {
+	MLXSW_AFA_MCROUTER_VRMID_INVALID,
+	MLXSW_AFA_MCROUTER_VRMID_VALID
+};
+
+/* afa_mcrouter_vrmid
+ * Valid RMID: rigr_rmid_index is used as RMID
+ */
+MLXSW_ITEM32(afa, mcrouter, vrmid, 0x0C, 31, 1);
+
+/* afa_mcrouter_rigr_rmid_index
+ * When the vrmid field is set to invalid, the field is used as pointer to
+ * Router Interface Group (RIGR) Table in the KVD linear.
+ * When the vrmid is set to valid, the field is used as RMID index, ranged
+ * from 0 to max_mid - 1. The index is to the Port Group Table.
+ */
+MLXSW_ITEM32(afa, mcrouter, rigr_rmid_index, 0x0C, 0, 24);
+
+static inline void
+mlxsw_afa_mcrouter_pack(char *payload,
+			enum mlxsw_afa_mcrouter_rpf_action rpf_action,
+			u16 expected_irif, u16 min_mtu,
+			enum mlxsw_afa_mrouter_vrmid vrmid, u32 rigr_rmid_index)
+
+{
+	mlxsw_afa_mcrouter_rpf_action_set(payload, rpf_action);
+	mlxsw_afa_mcrouter_expected_irif_set(payload, expected_irif);
+	mlxsw_afa_mcrouter_min_mtu_set(payload, min_mtu);
+	mlxsw_afa_mcrouter_vrmid_set(payload, vrmid);
+	mlxsw_afa_mcrouter_rigr_rmid_index_set(payload, rigr_rmid_index);
+}
+
+int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block,
+				    u16 expected_irif, u16 min_mtu,
+				    bool rmid_valid, u32 kvdl_index)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_MCROUTER_CODE,
+						  MLXSW_AFA_MCROUTER_SIZE);
+	if (!act)
+		return -ENOBUFS;
+	mlxsw_afa_mcrouter_pack(act, MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP,
+				expected_irif, min_mtu, rmid_valid, kvdl_index);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_mcrouter);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h
index f99c341b2497..a8d3314c3a24 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h
@@ -57,10 +57,12 @@ void mlxsw_afa_block_destroy(struct mlxsw_afa_block *block);
 int mlxsw_afa_block_commit(struct mlxsw_afa_block *block);
 char *mlxsw_afa_block_first_set(struct mlxsw_afa_block *block);
 u32 mlxsw_afa_block_first_set_kvdl_index(struct mlxsw_afa_block *block);
-void mlxsw_afa_block_continue(struct mlxsw_afa_block *block);
-void mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id);
+int mlxsw_afa_block_continue(struct mlxsw_afa_block *block);
+int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id);
 int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block);
-int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block);
+int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id);
+int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block,
+					    u16 trap_id);
 int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block,
 			       u8 local_port, bool in_port);
 int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block,
@@ -68,5 +70,8 @@ int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block,
 int mlxsw_afa_block_append_counter(struct mlxsw_afa_block *block,
 				   u32 counter_index);
 int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid);
+int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block,
+				    u16 expected_irif, u16 min_mtu,
+				    bool rmid_valid, u32 kvdl_index);
 
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index cc27c5de5a1d..d44e673a4c4e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -2142,15 +2142,14 @@ MLXSW_REG_DEFINE(pefa, MLXSW_REG_PEFA_ID, MLXSW_REG_PEFA_LEN);
  */
 MLXSW_ITEM32(reg, pefa, index, 0x00, 0, 24);
 
-#define MLXSW_REG_PXXX_FLEX_ACTION_SET_LEN 0xA8
+#define MLXSW_REG_FLEX_ACTION_SET_LEN 0xA8
 
 /* reg_pefa_flex_action_set
  * Action-set to perform when rule is matched.
  * Must be zero padded if action set is shorter.
  * Access: RW
  */
-MLXSW_ITEM_BUF(reg, pefa, flex_action_set, 0x08,
-	       MLXSW_REG_PXXX_FLEX_ACTION_SET_LEN);
+MLXSW_ITEM_BUF(reg, pefa, flex_action_set, 0x08, MLXSW_REG_FLEX_ACTION_SET_LEN);
 
 static inline void mlxsw_reg_pefa_pack(char *payload, u32 index,
 				       const char *flex_action_set)
@@ -2243,7 +2242,7 @@ MLXSW_ITEM_BUF(reg, ptce2, mask, 0x80,
  * Access: RW
  */
 MLXSW_ITEM_BUF(reg, ptce2, flex_action_set, 0xE0,
-	       MLXSW_REG_PXXX_FLEX_ACTION_SET_LEN);
+	       MLXSW_REG_FLEX_ACTION_SET_LEN);
 
 static inline void mlxsw_reg_ptce2_pack(char *payload, bool valid,
 					enum mlxsw_reg_ptce2_op op,
@@ -3682,12 +3681,15 @@ enum mlxsw_reg_htgt_trap_group {
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD,
 	MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND,
@@ -3992,6 +3994,12 @@ MLXSW_ITEM32(reg, ritr, ipv4, 0x00, 29, 1);
  */
 MLXSW_ITEM32(reg, ritr, ipv6, 0x00, 28, 1);
 
+/* reg_ritr_ipv4_mc
+ * IPv4 multicast routing enable.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv4_mc, 0x00, 27, 1);
+
 enum mlxsw_reg_ritr_if_type {
 	/* VLAN interface. */
 	MLXSW_REG_RITR_VLAN_IF,
@@ -4049,6 +4057,14 @@ MLXSW_ITEM32(reg, ritr, ipv4_fe, 0x04, 29, 1);
  */
 MLXSW_ITEM32(reg, ritr, ipv6_fe, 0x04, 28, 1);
 
+/* reg_ritr_ipv4_mc_fe
+ * IPv4 Multicast Forwarding Enable.
+ * When disabled, forwarding is blocked but local traffic (traps and IP to me)
+ * will be enabled.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv4_mc_fe, 0x04, 27, 1);
+
 /* reg_ritr_lb_en
  * Loop-back filter enable for unicast packets.
  * If the flag is set then loop-back filter for unicast packets is
@@ -4271,11 +4287,13 @@ static inline void mlxsw_reg_ritr_pack(char *payload, bool enable,
 	mlxsw_reg_ritr_enable_set(payload, enable);
 	mlxsw_reg_ritr_ipv4_set(payload, 1);
 	mlxsw_reg_ritr_ipv6_set(payload, 1);
+	mlxsw_reg_ritr_ipv4_mc_set(payload, 1);
 	mlxsw_reg_ritr_type_set(payload, type);
 	mlxsw_reg_ritr_op_set(payload, op);
 	mlxsw_reg_ritr_rif_set(payload, rif);
 	mlxsw_reg_ritr_ipv4_fe_set(payload, 1);
 	mlxsw_reg_ritr_ipv6_fe_set(payload, 1);
+	mlxsw_reg_ritr_ipv4_mc_fe_set(payload, 1);
 	mlxsw_reg_ritr_lb_en_set(payload, 1);
 	mlxsw_reg_ritr_virtual_router_set(payload, vr_id);
 	mlxsw_reg_ritr_mtu_set(payload, mtu);
@@ -4311,6 +4329,57 @@ mlxsw_reg_ritr_loopback_ipip4_pack(char *payload,
 	mlxsw_reg_ritr_loopback_ipip_usip4_set(payload, usip);
 }
 
+/* RTAR - Router TCAM Allocation Register
+ * --------------------------------------
+ * This register is used for allocation of regions in the TCAM table.
+ */
+#define MLXSW_REG_RTAR_ID 0x8004
+#define MLXSW_REG_RTAR_LEN 0x20
+
+MLXSW_REG_DEFINE(rtar, MLXSW_REG_RTAR_ID, MLXSW_REG_RTAR_LEN);
+
+enum mlxsw_reg_rtar_op {
+	MLXSW_REG_RTAR_OP_ALLOCATE,
+	MLXSW_REG_RTAR_OP_RESIZE,
+	MLXSW_REG_RTAR_OP_DEALLOCATE,
+};
+
+/* reg_rtar_op
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rtar, op, 0x00, 28, 4);
+
+enum mlxsw_reg_rtar_key_type {
+	MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST = 1,
+	MLXSW_REG_RTAR_KEY_TYPE_IPV6_MULTICAST = 3
+};
+
+/* reg_rtar_key_type
+ * TCAM key type for the region.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rtar, key_type, 0x00, 0, 8);
+
+/* reg_rtar_region_size
+ * TCAM region size. When allocating/resizing this is the requested
+ * size, the response is the actual size.
+ * Note: Actual size may be larger than requested.
+ * Reserved for op = Deallocate
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rtar, region_size, 0x04, 0, 16);
+
+static inline void mlxsw_reg_rtar_pack(char *payload,
+				       enum mlxsw_reg_rtar_op op,
+				       enum mlxsw_reg_rtar_key_type key_type,
+				       u16 region_size)
+{
+	MLXSW_REG_ZERO(rtar, payload);
+	mlxsw_reg_rtar_op_set(payload, op);
+	mlxsw_reg_rtar_key_type_set(payload, key_type);
+	mlxsw_reg_rtar_region_size_set(payload, region_size);
+}
+
 /* RATR - Router Adjacency Table Register
  * --------------------------------------
  * The RATR register is used to configure the Router Adjacency (next-hop)
@@ -4480,6 +4549,27 @@ MLXSW_ITEM32(reg, ratr, ipip_ipv4_udip, 0x18, 0, 32);
  */
 MLXSW_ITEM32(reg, ratr, ipip_ipv6_ptr, 0x1C, 0, 24);
 
+enum mlxsw_reg_flow_counter_set_type {
+	/* No count */
+	MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT = 0x00,
+	/* Count packets and bytes */
+	MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03,
+	/* Count only packets */
+	MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS = 0x05,
+};
+
+/* reg_ratr_counter_set_type
+ * Counter set type for flow counters
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, counter_set_type, 0x28, 24, 8);
+
+/* reg_ratr_counter_index
+ * Counter index for flow counters
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, counter_index, 0x28, 0, 24);
+
 static inline void
 mlxsw_reg_ratr_pack(char *payload,
 		    enum mlxsw_reg_ratr_op op, bool valid,
@@ -4507,6 +4597,20 @@ static inline void mlxsw_reg_ratr_ipip4_entry_pack(char *payload, u32 ipv4_udip)
 	mlxsw_reg_ratr_ipip_ipv4_udip_set(payload, ipv4_udip);
 }
 
+static inline void mlxsw_reg_ratr_counter_pack(char *payload, u64 counter_index,
+					       bool counter_enable)
+{
+	enum mlxsw_reg_flow_counter_set_type set_type;
+
+	if (counter_enable)
+		set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES;
+	else
+		set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT;
+
+	mlxsw_reg_ratr_counter_index_set(payload, counter_index);
+	mlxsw_reg_ratr_counter_set_type_set(payload, set_type);
+}
+
 /* RICNT - Router Interface Counter Register
  * -----------------------------------------
  * The RICNT register retrieves per port performance counters
@@ -4630,6 +4734,65 @@ static inline void mlxsw_reg_ricnt_pack(char *payload, u32 index,
 					     MLXSW_REG_RICNT_COUNTER_SET_TYPE_BASIC);
 }
 
+/* RRCR - Router Rules Copy Register Layout
+ * ----------------------------------------
+ * This register is used for moving and copying route entry rules.
+ */
+#define MLXSW_REG_RRCR_ID 0x800F
+#define MLXSW_REG_RRCR_LEN 0x24
+
+MLXSW_REG_DEFINE(rrcr, MLXSW_REG_RRCR_ID, MLXSW_REG_RRCR_LEN);
+
+enum mlxsw_reg_rrcr_op {
+	/* Move rules */
+	MLXSW_REG_RRCR_OP_MOVE,
+	/* Copy rules */
+	MLXSW_REG_RRCR_OP_COPY,
+};
+
+/* reg_rrcr_op
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rrcr, op, 0x00, 28, 4);
+
+/* reg_rrcr_offset
+ * Offset within the region from which to copy/move.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rrcr, offset, 0x00, 0, 16);
+
+/* reg_rrcr_size
+ * The number of rules to copy/move.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rrcr, size, 0x04, 0, 16);
+
+/* reg_rrcr_table_id
+ * Identifier of the table on which to perform the operation. Encoding is the
+ * same as in RTAR.key_type
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rrcr, table_id, 0x10, 0, 4);
+
+/* reg_rrcr_dest_offset
+ * Offset within the region to which to copy/move
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rrcr, dest_offset, 0x20, 0, 16);
+
+static inline void mlxsw_reg_rrcr_pack(char *payload, enum mlxsw_reg_rrcr_op op,
+				       u16 offset, u16 size,
+				       enum mlxsw_reg_rtar_key_type table_id,
+				       u16 dest_offset)
+{
+	MLXSW_REG_ZERO(rrcr, payload);
+	mlxsw_reg_rrcr_op_set(payload, op);
+	mlxsw_reg_rrcr_offset_set(payload, offset);
+	mlxsw_reg_rrcr_size_set(payload, size);
+	mlxsw_reg_rrcr_table_id_set(payload, table_id);
+	mlxsw_reg_rrcr_dest_offset_set(payload, dest_offset);
+}
+
 /* RALTA - Router Algorithmic LPM Tree Allocation Register
  * -------------------------------------------------------
  * RALTA is used to allocate the LPM trees of the SHSPM method.
@@ -5169,15 +5332,6 @@ enum mlxsw_reg_rauht_trap_id {
  */
 MLXSW_ITEM32(reg, rauht, trap_id, 0x60, 0, 9);
 
-enum mlxsw_reg_flow_counter_set_type {
-	/* No count */
-	MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT = 0x00,
-	/* Count packets and bytes */
-	MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03,
-	/* Count only packets */
-	MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS = 0x05,
-};
-
 /* reg_rauht_counter_set_type
  * Counter set type for flow counters
  * Access: RW
@@ -5596,6 +5750,229 @@ mlxsw_reg_rtdp_ipip4_pack(char *payload, u16 irif,
 	mlxsw_reg_rtdp_ipip_expected_gre_key_set(payload, expected_gre_key);
 }
 
+/* RIGR-V2 - Router Interface Group Register Version 2
+ * ---------------------------------------------------
+ * The RIGR_V2 register is used to add, remove and query egress interface list
+ * of a multicast forwarding entry.
+ */
+#define MLXSW_REG_RIGR2_ID 0x8023
+#define MLXSW_REG_RIGR2_LEN 0xB0
+
+#define MLXSW_REG_RIGR2_MAX_ERIFS 32
+
+MLXSW_REG_DEFINE(rigr2, MLXSW_REG_RIGR2_ID, MLXSW_REG_RIGR2_LEN);
+
+/* reg_rigr2_rigr_index
+ * KVD Linear index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rigr2, rigr_index, 0x04, 0, 24);
+
+/* reg_rigr2_vnext
+ * Next RIGR Index is valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, vnext, 0x08, 31, 1);
+
+/* reg_rigr2_next_rigr_index
+ * Next RIGR Index. The index is to the KVD linear.
+ * Reserved when vnxet = '0'.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, next_rigr_index, 0x08, 0, 24);
+
+/* reg_rigr2_vrmid
+ * RMID Index is valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, vrmid, 0x20, 31, 1);
+
+/* reg_rigr2_rmid_index
+ * RMID Index.
+ * Range 0 .. max_mid - 1
+ * Reserved when vrmid = '0'.
+ * The index is to the Port Group Table (PGT)
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, rmid_index, 0x20, 0, 16);
+
+/* reg_rigr2_erif_entry_v
+ * Egress Router Interface is valid.
+ * Note that low-entries must be set if high-entries are set. For
+ * example: if erif_entry[2].v is set then erif_entry[1].v and
+ * erif_entry[0].v must be set.
+ * Index can be from 0 to cap_mc_erif_list_entries-1
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_v, 0x24, 31, 1, 4, 0, false);
+
+/* reg_rigr2_erif_entry_erif
+ * Egress Router Interface.
+ * Valid range is from 0 to cap_max_router_interfaces - 1
+ * Index can be from 0 to MLXSW_REG_RIGR2_MAX_ERIFS - 1
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_erif, 0x24, 0, 16, 4, 0, false);
+
+static inline void mlxsw_reg_rigr2_pack(char *payload, u32 rigr_index,
+					bool vnext, u32 next_rigr_index)
+{
+	MLXSW_REG_ZERO(rigr2, payload);
+	mlxsw_reg_rigr2_rigr_index_set(payload, rigr_index);
+	mlxsw_reg_rigr2_vnext_set(payload, vnext);
+	mlxsw_reg_rigr2_next_rigr_index_set(payload, next_rigr_index);
+	mlxsw_reg_rigr2_vrmid_set(payload, 0);
+	mlxsw_reg_rigr2_rmid_index_set(payload, 0);
+}
+
+static inline void mlxsw_reg_rigr2_erif_entry_pack(char *payload, int index,
+						   bool v, u16 erif)
+{
+	mlxsw_reg_rigr2_erif_entry_v_set(payload, index, v);
+	mlxsw_reg_rigr2_erif_entry_erif_set(payload, index, erif);
+}
+
+/* RMFT-V2 - Router Multicast Forwarding Table Version 2 Register
+ * --------------------------------------------------------------
+ * The RMFT_V2 register is used to configure and query the multicast table.
+ */
+#define MLXSW_REG_RMFT2_ID 0x8027
+#define MLXSW_REG_RMFT2_LEN 0x174
+
+MLXSW_REG_DEFINE(rmft2, MLXSW_REG_RMFT2_ID, MLXSW_REG_RMFT2_LEN);
+
+/* reg_rmft2_v
+ * Valid
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, v, 0x00, 31, 1);
+
+enum mlxsw_reg_rmft2_type {
+	MLXSW_REG_RMFT2_TYPE_IPV4,
+	MLXSW_REG_RMFT2_TYPE_IPV6
+};
+
+/* reg_rmft2_type
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rmft2, type, 0x00, 28, 2);
+
+enum mlxsw_sp_reg_rmft2_op {
+	/* For Write:
+	 * Write operation. Used to write a new entry to the table. All RW
+	 * fields are relevant for new entry. Activity bit is set for new
+	 * entries - Note write with v (Valid) 0 will delete the entry.
+	 * For Query:
+	 * Read operation
+	 */
+	MLXSW_REG_RMFT2_OP_READ_WRITE,
+};
+
+/* reg_rmft2_op
+ * Operation.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, rmft2, op, 0x00, 20, 2);
+
+/* reg_rmft2_a
+ * Activity. Set for new entries. Set if a packet lookup has hit on the specific
+ * entry.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, rmft2, a, 0x00, 16, 1);
+
+/* reg_rmft2_offset
+ * Offset within the multicast forwarding table to write to.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rmft2, offset, 0x00, 0, 16);
+
+/* reg_rmft2_virtual_router
+ * Virtual Router ID. Range from 0..cap_max_virtual_routers-1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, virtual_router, 0x04, 0, 16);
+
+enum mlxsw_reg_rmft2_irif_mask {
+	MLXSW_REG_RMFT2_IRIF_MASK_IGNORE,
+	MLXSW_REG_RMFT2_IRIF_MASK_COMPARE
+};
+
+/* reg_rmft2_irif_mask
+ * Ingress RIF mask.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, irif_mask, 0x08, 24, 1);
+
+/* reg_rmft2_irif
+ * Ingress RIF index.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, irif, 0x08, 0, 16);
+
+/* reg_rmft2_dip4
+ * Destination IPv4 address
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, dip4, 0x1C, 0, 32);
+
+/* reg_rmft2_dip4_mask
+ * A bit that is set directs the TCAM to compare the corresponding bit in key. A
+ * bit that is clear directs the TCAM to ignore the corresponding bit in key.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, dip4_mask, 0x2C, 0, 32);
+
+/* reg_rmft2_sip4
+ * Source IPv4 address
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, sip4, 0x3C, 0, 32);
+
+/* reg_rmft2_sip4_mask
+ * A bit that is set directs the TCAM to compare the corresponding bit in key. A
+ * bit that is clear directs the TCAM to ignore the corresponding bit in key.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, sip4_mask, 0x4C, 0, 32);
+
+/* reg_rmft2_flexible_action_set
+ * ACL action set. The only supported action types in this field and in any
+ * action-set pointed from here are as follows:
+ * 00h: ACTION_NULL
+ * 01h: ACTION_MAC_TTL, only TTL configuration is supported.
+ * 03h: ACTION_TRAP
+ * 06h: ACTION_QOS
+ * 08h: ACTION_POLICING_MONITORING
+ * 10h: ACTION_ROUTER_MC
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, rmft2, flexible_action_set, 0x80,
+	       MLXSW_REG_FLEX_ACTION_SET_LEN);
+
+static inline void
+mlxsw_reg_rmft2_ipv4_pack(char *payload, bool v, u16 offset, u16 virtual_router,
+			  enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif,
+			  u32 dip4, u32 dip4_mask, u32 sip4, u32 sip4_mask,
+			  const char *flexible_action_set)
+{
+	MLXSW_REG_ZERO(rmft2, payload);
+	mlxsw_reg_rmft2_v_set(payload, v);
+	mlxsw_reg_rmft2_type_set(payload, MLXSW_REG_RMFT2_TYPE_IPV4);
+	mlxsw_reg_rmft2_op_set(payload, MLXSW_REG_RMFT2_OP_READ_WRITE);
+	mlxsw_reg_rmft2_offset_set(payload, offset);
+	mlxsw_reg_rmft2_virtual_router_set(payload, virtual_router);
+	mlxsw_reg_rmft2_irif_mask_set(payload, irif_mask);
+	mlxsw_reg_rmft2_irif_set(payload, irif);
+	mlxsw_reg_rmft2_dip4_set(payload, dip4);
+	mlxsw_reg_rmft2_dip4_mask_set(payload, dip4_mask);
+	mlxsw_reg_rmft2_sip4_set(payload, sip4);
+	mlxsw_reg_rmft2_sip4_mask_set(payload, sip4_mask);
+	if (flexible_action_set)
+		mlxsw_reg_rmft2_flexible_action_set_memcpy_to(payload,
+							      flexible_action_set);
+}
+
 /* MFCR - Management Fan Control Register
  * --------------------------------------
  * This register controls the settings of the Fan Speed PWM mechanism.
@@ -6856,9 +7233,11 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(hpkt),
 	MLXSW_REG(rgcr),
 	MLXSW_REG(ritr),
+	MLXSW_REG(rtar),
 	MLXSW_REG(ratr),
 	MLXSW_REG(rtdp),
 	MLXSW_REG(ricnt),
+	MLXSW_REG(rrcr),
 	MLXSW_REG(ralta),
 	MLXSW_REG(ralst),
 	MLXSW_REG(raltb),
@@ -6866,6 +7245,8 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(rauht),
 	MLXSW_REG(raleu),
 	MLXSW_REG(rauhtd),
+	MLXSW_REG(rigr2),
+	MLXSW_REG(rmft2),
 	MLXSW_REG(mfcr),
 	MLXSW_REG(mfsc),
 	MLXSW_REG(mfsm),
diff --git a/drivers/net/ethernet/mellanox/mlxsw/resources.h b/drivers/net/ethernet/mellanox/mlxsw/resources.h
index 9556d934714b..087aad52c195 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/resources.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/resources.h
@@ -63,6 +63,7 @@ enum mlxsw_res_id {
 	MLXSW_RES_ID_MAX_CPU_POLICERS,
 	MLXSW_RES_ID_MAX_VRS,
 	MLXSW_RES_ID_MAX_RIFS,
+	MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES,
 	MLXSW_RES_ID_MAX_LPM_TREES,
 
 	/* Internal resources.
@@ -100,6 +101,7 @@ static u16 mlxsw_res_ids[] = {
 	[MLXSW_RES_ID_MAX_CPU_POLICERS] = 0x2A13,
 	[MLXSW_RES_ID_MAX_VRS] = 0x2C01,
 	[MLXSW_RES_ID_MAX_RIFS] = 0x2C02,
+	[MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES] = 0x2C10,
 	[MLXSW_RES_ID_MAX_LPM_TREES] = 0x2C30,
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 696b99e65a5a..5cd4df08ce97 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -69,6 +69,7 @@
 #include "txheader.h"
 #include "spectrum_cnt.h"
 #include "spectrum_dpipe.h"
+#include "spectrum_acl_flex_actions.h"
 #include "../mlxfw/mlxfw.h"
 
 #define MLXSW_FWREV_MAJOR 13
@@ -3311,6 +3312,14 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port,
 	return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
 }
 
+static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb,
+					      u8 local_port, void *priv)
+{
+	skb->offload_mr_fwd_mark = 1;
+	skb->offload_fwd_mark = 1;
+	return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
+}
+
 static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port,
 					     void *priv)
 {
@@ -3354,6 +3363,10 @@ out:
 	MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action,	\
 		_is_ctrl, SP_##_trap_group, DISCARD)
 
+#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl)	\
+	MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action,	\
+		_is_ctrl, SP_##_trap_group, DISCARD)
+
 #define MLXSW_SP_EVENTL(_func, _trap_id)		\
 	MLXSW_EVENTL(_func, _trap_id, SP_EVENT)
 
@@ -3420,6 +3433,11 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = {
 		  false, SP_IP2ME, DISCARD),
 	/* ACL trap */
 	MLXSW_SP_RXL_NO_MARK(ACL0, TRAP_TO_CPU, IP2ME, false),
+	/* Multicast Router Traps */
+	MLXSW_SP_RXL_MARK(IPV4_PIM, TRAP_TO_CPU, PIM, false),
+	MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false),
+	MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false),
+	MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
 };
 
 static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core)
@@ -3445,6 +3463,8 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core)
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF:
 			rate = 128;
 			burst_size = 7;
 			break;
@@ -3460,6 +3480,7 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core)
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST:
 			rate = 1024;
 			burst_size = 7;
 			break;
@@ -3505,6 +3526,7 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core)
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM:
 			priority = 5;
 			tc = 5;
 			break;
@@ -3521,12 +3543,14 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core)
 			break;
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF:
 			priority = 2;
 			tc = 2;
 			break;
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
 		case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST:
 			priority = 1;
 			tc = 1;
 			break;
@@ -3693,6 +3717,18 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 		goto err_switchdev_init;
 	}
 
+	err = mlxsw_sp_counter_pool_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to init counter pool\n");
+		goto err_counter_pool_init;
+	}
+
+	err = mlxsw_sp_afa_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize ACL actions\n");
+		goto err_afa_init;
+	}
+
 	err = mlxsw_sp_router_init(mlxsw_sp);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize router\n");
@@ -3711,12 +3747,6 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 		goto err_acl_init;
 	}
 
-	err = mlxsw_sp_counter_pool_init(mlxsw_sp);
-	if (err) {
-		dev_err(mlxsw_sp->bus_info->dev, "Failed to init counter pool\n");
-		goto err_counter_pool_init;
-	}
-
 	err = mlxsw_sp_dpipe_init(mlxsw_sp);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Failed to init pipeline debug\n");
@@ -3734,14 +3764,16 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 err_ports_create:
 	mlxsw_sp_dpipe_fini(mlxsw_sp);
 err_dpipe_init:
-	mlxsw_sp_counter_pool_fini(mlxsw_sp);
-err_counter_pool_init:
 	mlxsw_sp_acl_fini(mlxsw_sp);
 err_acl_init:
 	mlxsw_sp_span_fini(mlxsw_sp);
 err_span_init:
 	mlxsw_sp_router_fini(mlxsw_sp);
 err_router_init:
+	mlxsw_sp_afa_fini(mlxsw_sp);
+err_afa_init:
+	mlxsw_sp_counter_pool_fini(mlxsw_sp);
+err_counter_pool_init:
 	mlxsw_sp_switchdev_fini(mlxsw_sp);
 err_switchdev_init:
 	mlxsw_sp_lag_fini(mlxsw_sp);
@@ -3760,10 +3792,11 @@ static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core)
 
 	mlxsw_sp_ports_remove(mlxsw_sp);
 	mlxsw_sp_dpipe_fini(mlxsw_sp);
-	mlxsw_sp_counter_pool_fini(mlxsw_sp);
 	mlxsw_sp_acl_fini(mlxsw_sp);
 	mlxsw_sp_span_fini(mlxsw_sp);
 	mlxsw_sp_router_fini(mlxsw_sp);
+	mlxsw_sp_afa_fini(mlxsw_sp);
+	mlxsw_sp_counter_pool_fini(mlxsw_sp);
 	mlxsw_sp_switchdev_fini(mlxsw_sp);
 	mlxsw_sp_lag_fini(mlxsw_sp);
 	mlxsw_sp_buffers_fini(mlxsw_sp);
@@ -3986,14 +4019,21 @@ static int mlxsw_sp_lag_index_get(struct mlxsw_sp *mlxsw_sp,
 static bool
 mlxsw_sp_master_lag_check(struct mlxsw_sp *mlxsw_sp,
 			  struct net_device *lag_dev,
-			  struct netdev_lag_upper_info *lag_upper_info)
+			  struct netdev_lag_upper_info *lag_upper_info,
+			  struct netlink_ext_ack *extack)
 {
 	u16 lag_id;
 
-	if (mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id) != 0)
+	if (mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id) != 0) {
+		NL_SET_ERR_MSG(extack,
+			       "spectrum: Exceeded number of supported LAG devices");
 		return false;
-	if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
+	}
+	if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) {
+		NL_SET_ERR_MSG(extack,
+			       "spectrum: LAG device using unsupported Tx type");
 		return false;
+	}
 	return true;
 }
 
@@ -4198,6 +4238,7 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
 {
 	struct netdev_notifier_changeupper_info *info;
 	struct mlxsw_sp_port *mlxsw_sp_port;
+	struct netlink_ext_ack *extack;
 	struct net_device *upper_dev;
 	struct mlxsw_sp *mlxsw_sp;
 	int err = 0;
@@ -4205,6 +4246,7 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
 	mlxsw_sp_port = netdev_priv(dev);
 	mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	info = ptr;
+	extack = netdev_notifier_info_to_extack(&info->info);
 
 	switch (event) {
 	case NETDEV_PRECHANGEUPPER:
@@ -4212,25 +4254,43 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
 		if (!is_vlan_dev(upper_dev) &&
 		    !netif_is_lag_master(upper_dev) &&
 		    !netif_is_bridge_master(upper_dev) &&
-		    !netif_is_ovs_master(upper_dev))
+		    !netif_is_ovs_master(upper_dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "spectrum: Unknown upper device type");
 			return -EINVAL;
+		}
 		if (!info->linking)
 			break;
-		if (netdev_has_any_upper_dev(upper_dev))
+		if (netdev_has_any_upper_dev(upper_dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "spectrum: Enslaving a port to a device that already has an upper device is not supported");
 			return -EINVAL;
+		}
 		if (netif_is_lag_master(upper_dev) &&
 		    !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev,
-					       info->upper_info))
+					       info->upper_info, extack))
 			return -EINVAL;
-		if (netif_is_lag_master(upper_dev) && vlan_uses_dev(dev))
+		if (netif_is_lag_master(upper_dev) && vlan_uses_dev(dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "spectrum: Master device is a LAG master and this device has a VLAN");
 			return -EINVAL;
+		}
 		if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) &&
-		    !netif_is_lag_master(vlan_dev_real_dev(upper_dev)))
+		    !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) {
+			NL_SET_ERR_MSG(extack,
+				       "spectrum: Can not put a VLAN on a LAG port");
 			return -EINVAL;
-		if (netif_is_ovs_master(upper_dev) && vlan_uses_dev(dev))
+		}
+		if (netif_is_ovs_master(upper_dev) && vlan_uses_dev(dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "spectrum: Master device is an OVS master and this device has a VLAN");
 			return -EINVAL;
-		if (netif_is_ovs_port(dev) && is_vlan_dev(upper_dev))
+		}
+		if (netif_is_ovs_port(dev) && is_vlan_dev(upper_dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "spectrum: Can not put a VLAN on an OVS port");
 			return -EINVAL;
+		}
 		break;
 	case NETDEV_CHANGEUPPER:
 		upper_dev = info->upper_dev;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 84ce83acdc19..ae67e6046098 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -94,7 +94,8 @@ struct mlxsw_sp_mid {
 	unsigned char addr[ETH_ALEN];
 	u16 fid;
 	u16 mid;
-	unsigned int ref_count;
+	bool in_hw;
+	unsigned long *ports_in_mid; /* bits array */
 };
 
 enum mlxsw_sp_span_type {
@@ -138,6 +139,7 @@ struct mlxsw_sp_port_mall_tc_entry {
 struct mlxsw_sp_sb;
 struct mlxsw_sp_bridge;
 struct mlxsw_sp_router;
+struct mlxsw_sp_mr;
 struct mlxsw_sp_acl;
 struct mlxsw_sp_counter_pool;
 struct mlxsw_sp_fid_core;
@@ -152,6 +154,8 @@ struct mlxsw_sp {
 	struct mlxsw_sp_sb *sb;
 	struct mlxsw_sp_bridge *bridge;
 	struct mlxsw_sp_router *router;
+	struct mlxsw_sp_mr *mr;
+	struct mlxsw_afa *afa;
 	struct mlxsw_sp_acl *acl;
 	struct mlxsw_sp_fid_core *fid_core;
 	struct {
@@ -466,9 +470,9 @@ void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei,
 				    enum mlxsw_afk_element element,
 				    const char *key_value,
 				    const char *mask_value, unsigned int len);
-void mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei);
-void mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei,
-				 u16 group_id);
+int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei);
+int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei,
+				u16 group_id);
 int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei);
 int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei);
 int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c
index 4b2455e3e079..93dcd315f7d6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c
@@ -52,7 +52,6 @@
 struct mlxsw_sp_acl {
 	struct mlxsw_sp *mlxsw_sp;
 	struct mlxsw_afk *afk;
-	struct mlxsw_afa *afa;
 	struct mlxsw_sp_fid *dummy_fid;
 	const struct mlxsw_sp_acl_ops *ops;
 	struct rhashtable ruleset_ht;
@@ -333,7 +332,7 @@ mlxsw_sp_acl_rulei_create(struct mlxsw_sp_acl *acl)
 	rulei = kzalloc(sizeof(*rulei), GFP_KERNEL);
 	if (!rulei)
 		return NULL;
-	rulei->act_block = mlxsw_afa_block_create(acl->afa);
+	rulei->act_block = mlxsw_afa_block_create(acl->mlxsw_sp->afa);
 	if (IS_ERR(rulei->act_block)) {
 		err = PTR_ERR(rulei->act_block);
 		goto err_afa_block_create;
@@ -379,15 +378,15 @@ void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei,
 				 key_value, mask_value, len);
 }
 
-void mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei)
+int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei)
 {
-	mlxsw_afa_block_continue(rulei->act_block);
+	return mlxsw_afa_block_continue(rulei->act_block);
 }
 
-void mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei,
-				 u16 group_id)
+int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei,
+				u16 group_id)
 {
-	mlxsw_afa_block_jump(rulei->act_block, group_id);
+	return mlxsw_afa_block_jump(rulei->act_block, group_id);
 }
 
 int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei)
@@ -397,7 +396,8 @@ int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei)
 
 int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei)
 {
-	return mlxsw_afa_block_append_trap(rulei->act_block);
+	return mlxsw_afa_block_append_trap(rulei->act_block,
+					   MLXSW_TRAP_ID_ACL0);
 }
 
 int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp,
@@ -653,85 +653,6 @@ int mlxsw_sp_acl_rule_get_stats(struct mlxsw_sp *mlxsw_sp,
 	return 0;
 }
 
-#define MLXSW_SP_KDVL_ACT_EXT_SIZE 1
-
-static int mlxsw_sp_act_kvdl_set_add(void *priv, u32 *p_kvdl_index,
-				     char *enc_actions, bool is_first)
-{
-	struct mlxsw_sp *mlxsw_sp = priv;
-	char pefa_pl[MLXSW_REG_PEFA_LEN];
-	u32 kvdl_index;
-	int err;
-
-	/* The first action set of a TCAM entry is stored directly in TCAM,
-	 * not KVD linear area.
-	 */
-	if (is_first)
-		return 0;
-
-	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KDVL_ACT_EXT_SIZE,
-				  &kvdl_index);
-	if (err)
-		return err;
-	mlxsw_reg_pefa_pack(pefa_pl, kvdl_index, enc_actions);
-	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl);
-	if (err)
-		goto err_pefa_write;
-	*p_kvdl_index = kvdl_index;
-	return 0;
-
-err_pefa_write:
-	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
-	return err;
-}
-
-static void mlxsw_sp_act_kvdl_set_del(void *priv, u32 kvdl_index,
-				      bool is_first)
-{
-	struct mlxsw_sp *mlxsw_sp = priv;
-
-	if (is_first)
-		return;
-	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
-}
-
-static int mlxsw_sp_act_kvdl_fwd_entry_add(void *priv, u32 *p_kvdl_index,
-					   u8 local_port)
-{
-	struct mlxsw_sp *mlxsw_sp = priv;
-	char ppbs_pl[MLXSW_REG_PPBS_LEN];
-	u32 kvdl_index;
-	int err;
-
-	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, 1, &kvdl_index);
-	if (err)
-		return err;
-	mlxsw_reg_ppbs_pack(ppbs_pl, kvdl_index, local_port);
-	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbs), ppbs_pl);
-	if (err)
-		goto err_ppbs_write;
-	*p_kvdl_index = kvdl_index;
-	return 0;
-
-err_ppbs_write:
-	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
-	return err;
-}
-
-static void mlxsw_sp_act_kvdl_fwd_entry_del(void *priv, u32 kvdl_index)
-{
-	struct mlxsw_sp *mlxsw_sp = priv;
-
-	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
-}
-
-static const struct mlxsw_afa_ops mlxsw_sp_act_afa_ops = {
-	.kvdl_set_add		= mlxsw_sp_act_kvdl_set_add,
-	.kvdl_set_del		= mlxsw_sp_act_kvdl_set_del,
-	.kvdl_fwd_entry_add	= mlxsw_sp_act_kvdl_fwd_entry_add,
-	.kvdl_fwd_entry_del	= mlxsw_sp_act_kvdl_fwd_entry_del,
-};
-
 int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp)
 {
 	const struct mlxsw_sp_acl_ops *acl_ops = &mlxsw_sp_acl_tcam_ops;
@@ -753,14 +674,6 @@ int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp)
 		goto err_afk_create;
 	}
 
-	acl->afa = mlxsw_afa_create(MLXSW_CORE_RES_GET(mlxsw_sp->core,
-						       ACL_ACTIONS_PER_SET),
-				    &mlxsw_sp_act_afa_ops, mlxsw_sp);
-	if (IS_ERR(acl->afa)) {
-		err = PTR_ERR(acl->afa);
-		goto err_afa_create;
-	}
-
 	err = rhashtable_init(&acl->ruleset_ht,
 			      &mlxsw_sp_acl_ruleset_ht_params);
 	if (err)
@@ -792,8 +705,6 @@ err_acl_ops_init:
 err_fid_get:
 	rhashtable_destroy(&acl->ruleset_ht);
 err_rhashtable_init:
-	mlxsw_afa_destroy(acl->afa);
-err_afa_create:
 	mlxsw_afk_destroy(acl->afk);
 err_afk_create:
 	kfree(acl);
@@ -810,7 +721,6 @@ void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp)
 	WARN_ON(!list_empty(&acl->rules));
 	mlxsw_sp_fid_put(acl->dummy_fid);
 	rhashtable_destroy(&acl->ruleset_ht);
-	mlxsw_afa_destroy(acl->afa);
 	mlxsw_afk_destroy(acl->afk);
 	kfree(acl);
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c
new file mode 100644
index 000000000000..4d3340ed0291
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c
@@ -0,0 +1,129 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com>
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spectrum_acl_flex_actions.h"
+#include "core_acl_flex_actions.h"
+
+#define MLXSW_SP_KVDL_ACT_EXT_SIZE 1
+
+static int mlxsw_sp_act_kvdl_set_add(void *priv, u32 *p_kvdl_index,
+				     char *enc_actions, bool is_first)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	char pefa_pl[MLXSW_REG_PEFA_LEN];
+	u32 kvdl_index;
+	int err;
+
+	/* The first action set of a TCAM entry is stored directly in TCAM,
+	 * not KVD linear area.
+	 */
+	if (is_first)
+		return 0;
+
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ACT_EXT_SIZE,
+				  &kvdl_index);
+	if (err)
+		return err;
+	mlxsw_reg_pefa_pack(pefa_pl, kvdl_index, enc_actions);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl);
+	if (err)
+		goto err_pefa_write;
+	*p_kvdl_index = kvdl_index;
+	return 0;
+
+err_pefa_write:
+	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
+	return err;
+}
+
+static void mlxsw_sp_act_kvdl_set_del(void *priv, u32 kvdl_index,
+				      bool is_first)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	if (is_first)
+		return;
+	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
+}
+
+static int mlxsw_sp_act_kvdl_fwd_entry_add(void *priv, u32 *p_kvdl_index,
+					   u8 local_port)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	char ppbs_pl[MLXSW_REG_PPBS_LEN];
+	u32 kvdl_index;
+	int err;
+
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, 1, &kvdl_index);
+	if (err)
+		return err;
+	mlxsw_reg_ppbs_pack(ppbs_pl, kvdl_index, local_port);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbs), ppbs_pl);
+	if (err)
+		goto err_ppbs_write;
+	*p_kvdl_index = kvdl_index;
+	return 0;
+
+err_ppbs_write:
+	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
+	return err;
+}
+
+static void mlxsw_sp_act_kvdl_fwd_entry_del(void *priv, u32 kvdl_index)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index);
+}
+
+static const struct mlxsw_afa_ops mlxsw_sp_act_afa_ops = {
+	.kvdl_set_add		= mlxsw_sp_act_kvdl_set_add,
+	.kvdl_set_del		= mlxsw_sp_act_kvdl_set_del,
+	.kvdl_fwd_entry_add	= mlxsw_sp_act_kvdl_fwd_entry_add,
+	.kvdl_fwd_entry_del	= mlxsw_sp_act_kvdl_fwd_entry_del,
+};
+
+int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp)
+{
+	mlxsw_sp->afa = mlxsw_afa_create(MLXSW_CORE_RES_GET(mlxsw_sp->core,
+							    ACL_ACTIONS_PER_SET),
+					 &mlxsw_sp_act_afa_ops, mlxsw_sp);
+	return PTR_ERR_OR_ZERO(mlxsw_sp->afa);
+}
+
+void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	mlxsw_afa_destroy(mlxsw_sp->afa);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
new file mode 100644
index 000000000000..2726192836ad
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
@@ -0,0 +1,44 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com>
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H
+#define _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H
+
+#include "spectrum.h"
+
+int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
index 50b40de1fb91..7e8284b46968 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
@@ -608,7 +608,10 @@ mlxsw_sp_acl_tcam_region_catchall_add(struct mlxsw_sp *mlxsw_sp,
 		goto err_rulei_create;
 	}
 
-	mlxsw_sp_acl_rulei_act_continue(rulei);
+	err = mlxsw_sp_acl_rulei_act_continue(rulei);
+	if (WARN_ON(err))
+		goto err_rulei_act_continue;
+
 	err = mlxsw_sp_acl_rulei_commit(rulei);
 	if (err)
 		goto err_rulei_commit;
@@ -623,6 +626,7 @@ mlxsw_sp_acl_tcam_region_catchall_add(struct mlxsw_sp *mlxsw_sp,
 
 err_rule_insert:
 err_rulei_commit:
+err_rulei_act_continue:
 	mlxsw_sp_acl_rulei_destroy(rulei);
 err_rulei_create:
 	parman_item_remove(region->parman, parman_prio, parman_item);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
index 51e6846da72b..a056f23d3a0e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
@@ -43,21 +43,36 @@ enum mlxsw_sp_field_metadata_id {
 	MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT,
 	MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD,
 	MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP,
+	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX,
+	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX,
 };
 
 static struct devlink_dpipe_field mlxsw_sp_dpipe_fields_metadata[] = {
-	{ .name = "erif_port",
-	  .id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT,
-	  .bitwidth = 32,
-	  .mapping_type = DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX,
+	{
+		.name = "erif_port",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT,
+		.bitwidth = 32,
+		.mapping_type = DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX,
 	},
-	{ .name = "l3_forward",
-	  .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD,
-	  .bitwidth = 1,
+	{
+		.name = "l3_forward",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD,
+		.bitwidth = 1,
 	},
-	{ .name = "l3_drop",
-	  .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP,
-	  .bitwidth = 1,
+	{
+		.name = "l3_drop",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP,
+		.bitwidth = 1,
+	},
+	{
+		.name = "adj_index",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX,
+		.bitwidth = 32,
+	},
+	{
+		.name = "adj_hash_index",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX,
+		.bitwidth = 32,
 	},
 };
 
@@ -826,6 +841,359 @@ static void mlxsw_sp_dpipe_host6_table_fini(struct mlxsw_sp *mlxsw_sp)
 				       MLXSW_SP_DPIPE_TABLE_NAME_HOST6);
 }
 
+static int mlxsw_sp_dpipe_table_adj_matches_dump(void *priv,
+						 struct sk_buff *skb)
+{
+	struct devlink_dpipe_match match = {0};
+	int err;
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX;
+
+	err = devlink_dpipe_match_put(skb, &match);
+	if (err)
+		return err;
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX;
+
+	return devlink_dpipe_match_put(skb, &match);
+}
+
+static int mlxsw_sp_dpipe_table_adj_actions_dump(void *priv,
+						 struct sk_buff *skb)
+{
+	struct devlink_dpipe_action action = {0};
+	int err;
+
+	action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action.header = &devlink_dpipe_header_ethernet;
+	action.field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC;
+
+	err = devlink_dpipe_action_put(skb, &action);
+	if (err)
+		return err;
+
+	action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action.header = &mlxsw_sp_dpipe_header_metadata;
+	action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+
+	return devlink_dpipe_action_put(skb, &action);
+}
+
+static u64 mlxsw_sp_dpipe_table_adj_size(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_nexthop *nh;
+	u64 size = 0;
+
+	mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router)
+		if (mlxsw_sp_nexthop_offload(nh) &&
+		    !mlxsw_sp_nexthop_group_has_ipip(nh))
+			size++;
+	return size;
+}
+
+enum mlxsw_sp_dpipe_table_adj_match {
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX,
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX,
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT,
+};
+
+enum mlxsw_sp_dpipe_table_adj_action {
+	MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC,
+	MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT,
+	MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT,
+};
+
+static void
+mlxsw_sp_dpipe_table_adj_match_action_prepare(struct devlink_dpipe_match *matches,
+					      struct devlink_dpipe_action *actions)
+{
+	struct devlink_dpipe_action *action;
+	struct devlink_dpipe_match *match;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+	action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action->header = &devlink_dpipe_header_ethernet;
+	action->field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+	action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action->header = &mlxsw_sp_dpipe_header_metadata;
+	action->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+}
+
+static int
+mlxsw_sp_dpipe_table_adj_entry_prepare(struct devlink_dpipe_entry *entry,
+				       struct devlink_dpipe_value *match_values,
+				       struct devlink_dpipe_match *matches,
+				       struct devlink_dpipe_value *action_values,
+				       struct devlink_dpipe_action *actions)
+{	struct devlink_dpipe_value *action_value;
+	struct devlink_dpipe_value *match_value;
+	struct devlink_dpipe_action *action;
+	struct devlink_dpipe_match *match;
+
+	entry->match_values = match_values;
+	entry->match_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT;
+
+	entry->action_values = action_values;
+	entry->action_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+
+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+
+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+	action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+
+	action_value->action = action;
+	action_value->value_size = sizeof(u64);
+	action_value->value = kmalloc(action_value->value_size, GFP_KERNEL);
+	if (!action_value->value)
+		return -ENOMEM;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+	action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+
+	action_value->action = action;
+	action_value->value_size = sizeof(u32);
+	action_value->value = kmalloc(action_value->value_size, GFP_KERNEL);
+	if (!action_value->value)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void
+__mlxsw_sp_dpipe_table_adj_entry_fill(struct devlink_dpipe_entry *entry,
+				      u32 adj_index, u32 adj_hash_index,
+				      unsigned char *ha,
+				      struct mlxsw_sp_rif *rif)
+{
+	struct devlink_dpipe_value *value;
+	u32 *p_rif_value;
+	u32 *p_index;
+
+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+	p_index = value->value;
+	*p_index = adj_index;
+
+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+	p_index = value->value;
+	*p_index = adj_hash_index;
+
+	value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+	ether_addr_copy(value->value, ha);
+
+	value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+	p_rif_value = value->value;
+	*p_rif_value = mlxsw_sp_rif_index(rif);
+	value->mapping_value = mlxsw_sp_rif_dev_ifindex(rif);
+	value->mapping_valid = true;
+}
+
+static void mlxsw_sp_dpipe_table_adj_entry_fill(struct mlxsw_sp *mlxsw_sp,
+						struct mlxsw_sp_nexthop *nh,
+						struct devlink_dpipe_entry *entry)
+{
+	struct mlxsw_sp_rif *rif = mlxsw_sp_nexthop_rif(nh);
+	unsigned char *ha = mlxsw_sp_nexthop_ha(nh);
+	u32 adj_hash_index = 0;
+	u32 adj_index = 0;
+	int err;
+
+	mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_hash_index);
+	__mlxsw_sp_dpipe_table_adj_entry_fill(entry, adj_index,
+					      adj_hash_index, ha, rif);
+	err = mlxsw_sp_nexthop_counter_get(mlxsw_sp, nh, &entry->counter);
+	if (!err)
+		entry->counter_valid = true;
+}
+
+static int
+mlxsw_sp_dpipe_table_adj_entries_get(struct mlxsw_sp *mlxsw_sp,
+				     struct devlink_dpipe_entry *entry,
+				     bool counters_enabled,
+				     struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct mlxsw_sp_nexthop *nh;
+	int entry_index = 0;
+	int nh_count_max;
+	int nh_count = 0;
+	int nh_skip;
+	int j;
+	int err;
+
+	rtnl_lock();
+	nh_count_max = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp);
+start_again:
+	err = devlink_dpipe_entry_ctx_prepare(dump_ctx);
+	if (err)
+		goto err_ctx_prepare;
+	j = 0;
+	nh_skip = nh_count;
+	mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) {
+		if (!mlxsw_sp_nexthop_offload(nh) ||
+		    mlxsw_sp_nexthop_group_has_ipip(nh))
+			continue;
+
+		if (nh_count < nh_skip)
+			goto skip;
+
+		mlxsw_sp_dpipe_table_adj_entry_fill(mlxsw_sp, nh, entry);
+		entry->index = entry_index;
+		err = devlink_dpipe_entry_ctx_append(dump_ctx, entry);
+		if (err) {
+			if (err == -EMSGSIZE) {
+				if (!j)
+					goto err_entry_append;
+				break;
+			}
+			goto err_entry_append;
+		}
+		entry_index++;
+		j++;
+skip:
+		nh_count++;
+	}
+
+	devlink_dpipe_entry_ctx_close(dump_ctx);
+	if (nh_count != nh_count_max)
+		goto start_again;
+	rtnl_unlock();
+
+	return 0;
+
+err_ctx_prepare:
+err_entry_append:
+	rtnl_unlock();
+	return err;
+}
+
+static int
+mlxsw_sp_dpipe_table_adj_entries_dump(void *priv, bool counters_enabled,
+				      struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct devlink_dpipe_value action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT];
+	struct devlink_dpipe_value match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT];
+	struct devlink_dpipe_action actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT];
+	struct devlink_dpipe_match matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT];
+	struct devlink_dpipe_entry entry = {0};
+	struct mlxsw_sp *mlxsw_sp = priv;
+	int err;
+
+	memset(matches, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT *
+			   sizeof(matches[0]));
+	memset(match_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT *
+				sizeof(match_values[0]));
+	memset(actions, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT *
+			   sizeof(actions[0]));
+	memset(action_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT *
+				 sizeof(action_values[0]));
+
+	mlxsw_sp_dpipe_table_adj_match_action_prepare(matches, actions);
+	err = mlxsw_sp_dpipe_table_adj_entry_prepare(&entry,
+						     match_values, matches,
+						     action_values, actions);
+	if (err)
+		goto out;
+
+	err = mlxsw_sp_dpipe_table_adj_entries_get(mlxsw_sp, &entry,
+						   counters_enabled, dump_ctx);
+out:
+	devlink_dpipe_entry_clear(&entry);
+	return err;
+}
+
+static int mlxsw_sp_dpipe_table_adj_counters_update(void *priv, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	struct mlxsw_sp_nexthop *nh;
+	u32 adj_hash_index = 0;
+	u32 adj_index = 0;
+
+	mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) {
+		if (!mlxsw_sp_nexthop_offload(nh) ||
+		    mlxsw_sp_nexthop_group_has_ipip(nh))
+			continue;
+
+		mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_hash_index);
+		if (enable)
+			mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
+		else
+			mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh);
+		mlxsw_sp_nexthop_update(mlxsw_sp,
+					adj_index + adj_hash_index, nh);
+	}
+	return 0;
+}
+
+static u64
+mlxsw_sp_dpipe_table_adj_size_get(void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	u64 size;
+
+	rtnl_lock();
+	size = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp);
+	rtnl_unlock();
+
+	return size;
+}
+
+static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = {
+	.matches_dump = mlxsw_sp_dpipe_table_adj_matches_dump,
+	.actions_dump = mlxsw_sp_dpipe_table_adj_actions_dump,
+	.entries_dump = mlxsw_sp_dpipe_table_adj_entries_dump,
+	.counters_set_update = mlxsw_sp_dpipe_table_adj_counters_update,
+	.size_get = mlxsw_sp_dpipe_table_adj_size_get,
+};
+
+static int mlxsw_sp_dpipe_adj_table_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	return devlink_dpipe_table_register(devlink,
+					    MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
+					    &mlxsw_sp_dpipe_table_adj_ops,
+					    mlxsw_sp, false);
+}
+
+static void mlxsw_sp_dpipe_adj_table_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_ADJ);
+}
+
 int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
@@ -846,8 +1214,14 @@ int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp)
 	err = mlxsw_sp_dpipe_host6_table_init(mlxsw_sp);
 	if (err)
 		goto err_host6_table_init;
-	return 0;
 
+	err = mlxsw_sp_dpipe_adj_table_init(mlxsw_sp);
+	if (err)
+		goto err_adj_table_init;
+
+	return 0;
+err_adj_table_init:
+	mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp);
 err_host6_table_init:
 	mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp);
 err_host4_table_init:
@@ -861,6 +1235,7 @@ void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
 
+	mlxsw_sp_dpipe_adj_table_fini(mlxsw_sp);
 	mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp);
 	mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp);
 	mlxsw_sp_dpipe_erif_table_fini(mlxsw_sp);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h
index 283fde4e6783..815d543cf114 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h
@@ -56,5 +56,6 @@ static inline void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp)
 #define MLXSW_SP_DPIPE_TABLE_NAME_ERIF "mlxsw_erif"
 #define MLXSW_SP_DPIPE_TABLE_NAME_HOST4 "mlxsw_host4"
 #define MLXSW_SP_DPIPE_TABLE_NAME_HOST6 "mlxsw_host6"
+#define MLXSW_SP_DPIPE_TABLE_NAME_ADJ "mlxsw_adj"
 
 #endif /* _MLXSW_PIPELINE_H_*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 8aace9a06a5d..2f0e57857ea4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -63,7 +63,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
 
 	tcf_exts_to_list(exts, &actions);
 	list_for_each_entry(a, &actions, list) {
-		if (is_tcf_gact_shot(a)) {
+		if (is_tcf_gact_ok(a)) {
+			err = mlxsw_sp_acl_rulei_act_continue(rulei);
+			if (err)
+				return err;
+		} else if (is_tcf_gact_shot(a)) {
 			err = mlxsw_sp_acl_rulei_act_drop(rulei);
 			if (err)
 				return err;
@@ -84,7 +88,9 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
 				return PTR_ERR(ruleset);
 
 			group_id = mlxsw_sp_acl_ruleset_group_id(ruleset);
-			mlxsw_sp_acl_rulei_act_jump(rulei, group_id);
+			err = mlxsw_sp_acl_rulei_act_jump(rulei, group_id);
+			if (err)
+				return err;
 		} else if (is_tcf_mirred_egress_redirect(a)) {
 			int ifindex = tcf_mirred_ifindex(a);
 			struct net_device *out_dev;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
new file mode 100644
index 000000000000..1f84bb8e9135
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
@@ -0,0 +1,1011 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/rhashtable.h>
+
+#include "spectrum_mr.h"
+#include "spectrum_router.h"
+
+struct mlxsw_sp_mr {
+	const struct mlxsw_sp_mr_ops *mr_ops;
+	void *catchall_route_priv;
+	struct delayed_work stats_update_dw;
+	struct list_head table_list;
+#define MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL 5000 /* ms */
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+struct mlxsw_sp_mr_vif {
+	struct net_device *dev;
+	const struct mlxsw_sp_rif *rif;
+	unsigned long vif_flags;
+
+	/* A list of route_vif_entry structs that point to routes that the VIF
+	 * instance is used as one of the egress VIFs
+	 */
+	struct list_head route_evif_list;
+
+	/* A list of route_vif_entry structs that point to routes that the VIF
+	 * instance is used as an ingress VIF
+	 */
+	struct list_head route_ivif_list;
+};
+
+struct mlxsw_sp_mr_route_vif_entry {
+	struct list_head vif_node;
+	struct list_head route_node;
+	struct mlxsw_sp_mr_vif *mr_vif;
+	struct mlxsw_sp_mr_route *mr_route;
+};
+
+struct mlxsw_sp_mr_table {
+	struct list_head node;
+	enum mlxsw_sp_l3proto proto;
+	struct mlxsw_sp *mlxsw_sp;
+	u32 vr_id;
+	struct mlxsw_sp_mr_vif vifs[MAXVIFS];
+	struct list_head route_list;
+	struct rhashtable route_ht;
+	char catchall_route_priv[0];
+	/* catchall_route_priv has to be always the last item */
+};
+
+struct mlxsw_sp_mr_route {
+	struct list_head node;
+	struct rhash_head ht_node;
+	struct mlxsw_sp_mr_route_key key;
+	enum mlxsw_sp_mr_route_action route_action;
+	u16 min_mtu;
+	struct mfc_cache *mfc4;
+	void *route_priv;
+	const struct mlxsw_sp_mr_table *mr_table;
+	/* A list of route_vif_entry structs that point to the egress VIFs */
+	struct list_head evif_list;
+	/* A route_vif_entry struct that point to the ingress VIF */
+	struct mlxsw_sp_mr_route_vif_entry ivif;
+};
+
+static const struct rhashtable_params mlxsw_sp_mr_route_ht_params = {
+	.key_len = sizeof(struct mlxsw_sp_mr_route_key),
+	.key_offset = offsetof(struct mlxsw_sp_mr_route, key),
+	.head_offset = offsetof(struct mlxsw_sp_mr_route, ht_node),
+	.automatic_shrinking = true,
+};
+
+static bool mlxsw_sp_mr_vif_regular(const struct mlxsw_sp_mr_vif *vif)
+{
+	return !(vif->vif_flags & (VIFF_TUNNEL | VIFF_REGISTER));
+}
+
+static bool mlxsw_sp_mr_vif_valid(const struct mlxsw_sp_mr_vif *vif)
+{
+	return mlxsw_sp_mr_vif_regular(vif) && vif->dev && vif->rif;
+}
+
+static bool mlxsw_sp_mr_vif_exists(const struct mlxsw_sp_mr_vif *vif)
+{
+	return vif->dev;
+}
+
+static bool
+mlxsw_sp_mr_route_ivif_in_evifs(const struct mlxsw_sp_mr_route *mr_route)
+{
+	vifi_t ivif;
+
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		ivif = mr_route->mfc4->mfc_parent;
+		return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+	return false;
+}
+
+static int
+mlxsw_sp_mr_route_valid_evifs_num(const struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	int valid_evifs;
+
+	valid_evifs = 0;
+	list_for_each_entry(rve, &mr_route->evif_list, route_node)
+		if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+			valid_evifs++;
+	return valid_evifs;
+}
+
+static bool mlxsw_sp_mr_route_starg(const struct mlxsw_sp_mr_route *mr_route)
+{
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		return mr_route->key.source_mask.addr4 == INADDR_ANY;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+	return false;
+}
+
+static enum mlxsw_sp_mr_route_action
+mlxsw_sp_mr_route_action(const struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	/* If the ingress port is not regular and resolved, trap the route */
+	if (!mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* The kernel does not match a (*,G) route that the ingress interface is
+	 * not one of the egress interfaces, so trap these kind of routes.
+	 */
+	if (mlxsw_sp_mr_route_starg(mr_route) &&
+	    !mlxsw_sp_mr_route_ivif_in_evifs(mr_route))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* If the route has no valid eVIFs, trap it. */
+	if (!mlxsw_sp_mr_route_valid_evifs_num(mr_route))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* If one of the eVIFs has no RIF, trap-and-forward the route as there
+	 * is some more routing to do in software too.
+	 */
+	list_for_each_entry(rve, &mr_route->evif_list, route_node)
+		if (mlxsw_sp_mr_vif_exists(rve->mr_vif) && !rve->mr_vif->rif)
+			return MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD;
+
+	return MLXSW_SP_MR_ROUTE_ACTION_FORWARD;
+}
+
+static enum mlxsw_sp_mr_route_prio
+mlxsw_sp_mr_route_prio(const struct mlxsw_sp_mr_route *mr_route)
+{
+	return mlxsw_sp_mr_route_starg(mr_route) ?
+		MLXSW_SP_MR_ROUTE_PRIO_STARG : MLXSW_SP_MR_ROUTE_PRIO_SG;
+}
+
+static void mlxsw_sp_mr_route4_key(struct mlxsw_sp_mr_table *mr_table,
+				   struct mlxsw_sp_mr_route_key *key,
+				   const struct mfc_cache *mfc)
+{
+	bool starg = (mfc->mfc_origin == INADDR_ANY);
+
+	memset(key, 0, sizeof(*key));
+	key->vrid = mr_table->vr_id;
+	key->proto = mr_table->proto;
+	key->group.addr4 = mfc->mfc_mcastgrp;
+	key->group_mask.addr4 = 0xffffffff;
+	key->source.addr4 = mfc->mfc_origin;
+	key->source_mask.addr4 = starg ? 0 : 0xffffffff;
+}
+
+static int mlxsw_sp_mr_route_evif_link(struct mlxsw_sp_mr_route *mr_route,
+				       struct mlxsw_sp_mr_vif *mr_vif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	rve = kzalloc(sizeof(*rve), GFP_KERNEL);
+	if (!rve)
+		return -ENOMEM;
+	rve->mr_route = mr_route;
+	rve->mr_vif = mr_vif;
+	list_add_tail(&rve->route_node, &mr_route->evif_list);
+	list_add_tail(&rve->vif_node, &mr_vif->route_evif_list);
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_evif_unlink(struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	list_del(&rve->route_node);
+	list_del(&rve->vif_node);
+	kfree(rve);
+}
+
+static void mlxsw_sp_mr_route_ivif_link(struct mlxsw_sp_mr_route *mr_route,
+					struct mlxsw_sp_mr_vif *mr_vif)
+{
+	mr_route->ivif.mr_route = mr_route;
+	mr_route->ivif.mr_vif = mr_vif;
+	list_add_tail(&mr_route->ivif.vif_node, &mr_vif->route_ivif_list);
+}
+
+static void mlxsw_sp_mr_route_ivif_unlink(struct mlxsw_sp_mr_route *mr_route)
+{
+	list_del(&mr_route->ivif.vif_node);
+}
+
+static int
+mlxsw_sp_mr_route_info_create(struct mlxsw_sp_mr_table *mr_table,
+			      struct mlxsw_sp_mr_route *mr_route,
+			      struct mlxsw_sp_mr_route_info *route_info)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	u16 *erif_indices;
+	u16 irif_index;
+	u16 erif = 0;
+
+	erif_indices = kmalloc_array(MAXVIFS, sizeof(*erif_indices),
+				     GFP_KERNEL);
+	if (!erif_indices)
+		return -ENOMEM;
+
+	list_for_each_entry(rve, &mr_route->evif_list, route_node) {
+		if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+			u16 rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
+
+			erif_indices[erif++] = rifi;
+		}
+	}
+
+	if (mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
+		irif_index = mlxsw_sp_rif_index(mr_route->ivif.mr_vif->rif);
+	else
+		irif_index = 0;
+
+	route_info->irif_index = irif_index;
+	route_info->erif_indices = erif_indices;
+	route_info->min_mtu = mr_route->min_mtu;
+	route_info->route_action = mr_route->route_action;
+	route_info->erif_num = erif;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_info_destroy(struct mlxsw_sp_mr_route_info *route_info)
+{
+	kfree(route_info->erif_indices);
+}
+
+static int mlxsw_sp_mr_route_write(struct mlxsw_sp_mr_table *mr_table,
+				   struct mlxsw_sp_mr_route *mr_route,
+				   bool replace)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr_route_info route_info;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	int err;
+
+	err = mlxsw_sp_mr_route_info_create(mr_table, mr_route, &route_info);
+	if (err)
+		return err;
+
+	if (!replace) {
+		struct mlxsw_sp_mr_route_params route_params;
+
+		mr_route->route_priv = kzalloc(mr->mr_ops->route_priv_size,
+					       GFP_KERNEL);
+		if (!mr_route->route_priv) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		route_params.key = mr_route->key;
+		route_params.value = route_info;
+		route_params.prio = mlxsw_sp_mr_route_prio(mr_route);
+		err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
+					       mr_route->route_priv,
+					       &route_params);
+		if (err)
+			kfree(mr_route->route_priv);
+	} else {
+		err = mr->mr_ops->route_update(mlxsw_sp, mr_route->route_priv,
+					       &route_info);
+	}
+out:
+	mlxsw_sp_mr_route_info_destroy(&route_info);
+	return err;
+}
+
+static void mlxsw_sp_mr_route_erase(struct mlxsw_sp_mr_table *mr_table,
+				    struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, mr_route->route_priv);
+	kfree(mr_route->route_priv);
+}
+
+static struct mlxsw_sp_mr_route *
+mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table,
+			  struct mfc_cache *mfc)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
+	struct mlxsw_sp_mr_route *mr_route;
+	int err = 0;
+	int i;
+
+	/* Allocate and init a new route and fill it with parameters */
+	mr_route = kzalloc(sizeof(*mr_route), GFP_KERNEL);
+	if (!mr_route)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&mr_route->evif_list);
+	mlxsw_sp_mr_route4_key(mr_table, &mr_route->key, mfc);
+
+	/* Find min_mtu and link iVIF and eVIFs */
+	mr_route->min_mtu = ETH_MAX_MTU;
+	ipmr_cache_hold(mfc);
+	mr_route->mfc4 = mfc;
+	mr_route->mr_table = mr_table;
+	for (i = 0; i < MAXVIFS; i++) {
+		if (mfc->mfc_un.res.ttls[i] != 255) {
+			err = mlxsw_sp_mr_route_evif_link(mr_route,
+							  &mr_table->vifs[i]);
+			if (err)
+				goto err;
+			if (mr_table->vifs[i].dev &&
+			    mr_table->vifs[i].dev->mtu < mr_route->min_mtu)
+				mr_route->min_mtu = mr_table->vifs[i].dev->mtu;
+		}
+	}
+	mlxsw_sp_mr_route_ivif_link(mr_route, &mr_table->vifs[mfc->mfc_parent]);
+
+	mr_route->route_action = mlxsw_sp_mr_route_action(mr_route);
+	return mr_route;
+err:
+	ipmr_cache_put(mfc);
+	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
+		mlxsw_sp_mr_route_evif_unlink(rve);
+	kfree(mr_route);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_mr_route4_destroy(struct mlxsw_sp_mr_table *mr_table,
+				       struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
+
+	mlxsw_sp_mr_route_ivif_unlink(mr_route);
+	ipmr_cache_put(mr_route->mfc4);
+	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
+		mlxsw_sp_mr_route_evif_unlink(rve);
+	kfree(mr_route);
+}
+
+static void mlxsw_sp_mr_route_destroy(struct mlxsw_sp_mr_table *mr_table,
+				      struct mlxsw_sp_mr_route *mr_route)
+{
+	switch (mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_sp_mr_route4_destroy(mr_table, mr_route);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void mlxsw_sp_mr_mfc_offload_set(struct mlxsw_sp_mr_route *mr_route,
+					bool offload)
+{
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		if (offload)
+			mr_route->mfc4->mfc_flags |= MFC_OFFLOAD;
+		else
+			mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void mlxsw_sp_mr_mfc_offload_update(struct mlxsw_sp_mr_route *mr_route)
+{
+	bool offload;
+
+	offload = mr_route->route_action != MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	mlxsw_sp_mr_mfc_offload_set(mr_route, offload);
+}
+
+static void __mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table,
+				    struct mlxsw_sp_mr_route *mr_route)
+{
+	mlxsw_sp_mr_mfc_offload_set(mr_route, false);
+	mlxsw_sp_mr_route_erase(mr_table, mr_route);
+	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
+			       mlxsw_sp_mr_route_ht_params);
+	list_del(&mr_route->node);
+	mlxsw_sp_mr_route_destroy(mr_table, mr_route);
+}
+
+int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table,
+			   struct mfc_cache *mfc, bool replace)
+{
+	struct mlxsw_sp_mr_route *mr_orig_route = NULL;
+	struct mlxsw_sp_mr_route *mr_route;
+	int err;
+
+	/* If the route is a (*,*) route, abort, as these kind of routes are
+	 * used for proxy routes.
+	 */
+	if (mfc->mfc_origin == INADDR_ANY && mfc->mfc_mcastgrp == INADDR_ANY) {
+		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
+			 "Offloading proxy routes is not supported.\n");
+		return -EINVAL;
+	}
+
+	/* Create a new route */
+	mr_route = mlxsw_sp_mr_route4_create(mr_table, mfc);
+	if (IS_ERR(mr_route))
+		return PTR_ERR(mr_route);
+
+	/* Find any route with a matching key */
+	mr_orig_route = rhashtable_lookup_fast(&mr_table->route_ht,
+					       &mr_route->key,
+					       mlxsw_sp_mr_route_ht_params);
+	if (replace) {
+		/* On replace case, make the route point to the new route_priv.
+		 */
+		if (WARN_ON(!mr_orig_route)) {
+			err = -ENOENT;
+			goto err_no_orig_route;
+		}
+		mr_route->route_priv = mr_orig_route->route_priv;
+	} else if (mr_orig_route) {
+		/* On non replace case, if another route with the same key was
+		 * found, abort, as duplicate routes are used for proxy routes.
+		 */
+		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
+			 "Offloading proxy routes is not supported.\n");
+		err = -EINVAL;
+		goto err_duplicate_route;
+	}
+
+	/* Put it in the table data-structures */
+	list_add_tail(&mr_route->node, &mr_table->route_list);
+	err = rhashtable_insert_fast(&mr_table->route_ht,
+				     &mr_route->ht_node,
+				     mlxsw_sp_mr_route_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	/* Write the route to the hardware */
+	err = mlxsw_sp_mr_route_write(mr_table, mr_route, replace);
+	if (err)
+		goto err_mr_route_write;
+
+	/* Destroy the original route */
+	if (replace) {
+		rhashtable_remove_fast(&mr_table->route_ht,
+				       &mr_orig_route->ht_node,
+				       mlxsw_sp_mr_route_ht_params);
+		list_del(&mr_orig_route->node);
+		mlxsw_sp_mr_route4_destroy(mr_table, mr_orig_route);
+	}
+
+	mlxsw_sp_mr_mfc_offload_update(mr_route);
+	return 0;
+
+err_mr_route_write:
+	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
+			       mlxsw_sp_mr_route_ht_params);
+err_rhashtable_insert:
+	list_del(&mr_route->node);
+err_no_orig_route:
+err_duplicate_route:
+	mlxsw_sp_mr_route4_destroy(mr_table, mr_route);
+	return err;
+}
+
+void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table,
+			    struct mfc_cache *mfc)
+{
+	struct mlxsw_sp_mr_route *mr_route;
+	struct mlxsw_sp_mr_route_key key;
+
+	mlxsw_sp_mr_route4_key(mr_table, &key, mfc);
+	mr_route = rhashtable_lookup_fast(&mr_table->route_ht, &key,
+					  mlxsw_sp_mr_route_ht_params);
+	if (mr_route)
+		__mlxsw_sp_mr_route_del(mr_table, mr_route);
+}
+
+/* Should be called after the VIF struct is updated */
+static int
+mlxsw_sp_mr_route_ivif_resolve(struct mlxsw_sp_mr_table *mr_table,
+			       struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 irif_index;
+	int err;
+
+	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return 0;
+
+	/* rve->mr_vif->rif is guaranteed to be valid at this stage */
+	irif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+	err = mr->mr_ops->route_irif_update(mlxsw_sp, rve->mr_route->route_priv,
+					    irif_index);
+	if (err)
+		return err;
+
+	err = mr->mr_ops->route_action_update(mlxsw_sp,
+					      rve->mr_route->route_priv,
+					      route_action);
+	if (err)
+		/* No need to rollback here because the iRIF change only takes
+		 * place after the action has been updated.
+		 */
+		return err;
+
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_ivif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				 struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	mr->mr_ops->route_action_update(mlxsw_sp, rve->mr_route->route_priv,
+					MLXSW_SP_MR_ROUTE_ACTION_TRAP);
+	rve->mr_route->route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+}
+
+/* Should be called after the RIF struct is updated */
+static int
+mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
+			       struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 erif_index = 0;
+	int err;
+
+	/* Update the route action, as the new eVIF can be a tunnel or a pimreg
+	 * device which will require updating the action.
+	 */
+	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action != rve->mr_route->route_action) {
+		err = mr->mr_ops->route_action_update(mlxsw_sp,
+						      rve->mr_route->route_priv,
+						      route_action);
+		if (err)
+			return err;
+	}
+
+	/* Add the eRIF */
+	if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+		erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+		err = mr->mr_ops->route_erif_add(mlxsw_sp,
+						 rve->mr_route->route_priv,
+						 erif_index);
+		if (err)
+			goto err_route_erif_add;
+	}
+
+	/* Update the minimum MTU */
+	if (rve->mr_vif->dev->mtu < rve->mr_route->min_mtu) {
+		rve->mr_route->min_mtu = rve->mr_vif->dev->mtu;
+		err = mr->mr_ops->route_min_mtu_update(mlxsw_sp,
+						       rve->mr_route->route_priv,
+						       rve->mr_route->min_mtu);
+		if (err)
+			goto err_route_min_mtu_update;
+	}
+
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+	return 0;
+
+err_route_min_mtu_update:
+	if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+		mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
+					   erif_index);
+err_route_erif_add:
+	if (route_action != rve->mr_route->route_action)
+		mr->mr_ops->route_action_update(mlxsw_sp,
+						rve->mr_route->route_priv,
+						rve->mr_route->route_action);
+	return err;
+}
+
+/* Should be called before the RIF struct is updated */
+static void
+mlxsw_sp_mr_route_evif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				 struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 rifi;
+
+	/* If the unresolved RIF was not valid, no need to delete it */
+	if (!mlxsw_sp_mr_vif_valid(rve->mr_vif))
+		return;
+
+	/* Update the route action: if there is only one valid eVIF in the
+	 * route, set the action to trap as the VIF deletion will lead to zero
+	 * valid eVIFs. On any other case, use the mlxsw_sp_mr_route_action to
+	 * determine the route action.
+	 */
+	if (mlxsw_sp_mr_route_valid_evifs_num(rve->mr_route) == 1)
+		route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	else
+		route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action != rve->mr_route->route_action)
+		mr->mr_ops->route_action_update(mlxsw_sp,
+						rve->mr_route->route_priv,
+						route_action);
+
+	/* Delete the erif from the route */
+	rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
+	mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, rifi);
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+}
+
+static int mlxsw_sp_mr_vif_resolve(struct mlxsw_sp_mr_table *mr_table,
+				   struct net_device *dev,
+				   struct mlxsw_sp_mr_vif *mr_vif,
+				   unsigned long vif_flags,
+				   const struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *irve, *erve;
+	int err;
+
+	/* Update the VIF */
+	mr_vif->dev = dev;
+	mr_vif->rif = rif;
+	mr_vif->vif_flags = vif_flags;
+
+	/* Update all routes where this VIF is used as an unresolved iRIF */
+	list_for_each_entry(irve, &mr_vif->route_ivif_list, vif_node) {
+		err = mlxsw_sp_mr_route_ivif_resolve(mr_table, irve);
+		if (err)
+			goto err_irif_unresolve;
+	}
+
+	/* Update all routes where this VIF is used as an unresolved eRIF */
+	list_for_each_entry(erve, &mr_vif->route_evif_list, vif_node) {
+		err = mlxsw_sp_mr_route_evif_resolve(mr_table, erve);
+		if (err)
+			goto err_erif_unresolve;
+	}
+	return 0;
+
+err_erif_unresolve:
+	list_for_each_entry_from_reverse(erve, &mr_vif->route_evif_list,
+					 vif_node)
+		mlxsw_sp_mr_route_evif_unresolve(mr_table, erve);
+err_irif_unresolve:
+	list_for_each_entry_from_reverse(irve, &mr_vif->route_ivif_list,
+					 vif_node)
+		mlxsw_sp_mr_route_ivif_unresolve(mr_table, irve);
+	mr_vif->rif = NULL;
+	return err;
+}
+
+static void mlxsw_sp_mr_vif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				      struct net_device *dev,
+				      struct mlxsw_sp_mr_vif *mr_vif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	/* Update all routes where this VIF is used as an unresolved eRIF */
+	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node)
+		mlxsw_sp_mr_route_evif_unresolve(mr_table, rve);
+
+	/* Update all routes where this VIF is used as an unresolved iRIF */
+	list_for_each_entry(rve, &mr_vif->route_ivif_list, vif_node)
+		mlxsw_sp_mr_route_ivif_unresolve(mr_table, rve);
+
+	/* Update the VIF */
+	mr_vif->dev = dev;
+	mr_vif->rif = NULL;
+}
+
+int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
+			struct net_device *dev, vifi_t vif_index,
+			unsigned long vif_flags, const struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
+
+	if (WARN_ON(vif_index >= MAXVIFS))
+		return -EINVAL;
+	if (mr_vif->dev)
+		return -EEXIST;
+	return mlxsw_sp_mr_vif_resolve(mr_table, dev, mr_vif, vif_flags, rif);
+}
+
+void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index)
+{
+	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
+
+	if (WARN_ON(vif_index >= MAXVIFS))
+		return;
+	if (WARN_ON(!mr_vif->dev))
+		return;
+	mlxsw_sp_mr_vif_unresolve(mr_table, NULL, mr_vif);
+}
+
+struct mlxsw_sp_mr_vif *
+mlxsw_sp_mr_dev_vif_lookup(struct mlxsw_sp_mr_table *mr_table,
+			   const struct net_device *dev)
+{
+	vifi_t vif_index;
+
+	for (vif_index = 0; vif_index < MAXVIFS; vif_index++)
+		if (mr_table->vifs[vif_index].dev == dev)
+			return &mr_table->vifs[vif_index];
+	return NULL;
+}
+
+int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
+			const struct mlxsw_sp_rif *rif)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return 0;
+
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return 0;
+	return mlxsw_sp_mr_vif_resolve(mr_table, mr_vif->dev, mr_vif,
+				       mr_vif->vif_flags, rif);
+}
+
+void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
+			 const struct mlxsw_sp_rif *rif)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return;
+
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return;
+	mlxsw_sp_mr_vif_unresolve(mr_table, mr_vif->dev, mr_vif);
+}
+
+void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
+				const struct mlxsw_sp_rif *rif, int mtu)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return;
+
+	/* Search for a VIF that use that RIF */
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return;
+
+	/* Update all the routes that uses that VIF as eVIF */
+	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) {
+		if (mtu < rve->mr_route->min_mtu) {
+			rve->mr_route->min_mtu = mtu;
+			mr->mr_ops->route_min_mtu_update(mlxsw_sp,
+							 rve->mr_route->route_priv,
+							 mtu);
+		}
+	}
+}
+
+struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
+						   u32 vr_id,
+						   enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_mr_route_params catchall_route_params = {
+		.prio = MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
+		.key = {
+			.vrid = vr_id,
+		},
+		.value = {
+			.route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP,
+		}
+	};
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	struct mlxsw_sp_mr_table *mr_table;
+	int err;
+	int i;
+
+	mr_table = kzalloc(sizeof(*mr_table) + mr->mr_ops->route_priv_size,
+			   GFP_KERNEL);
+	if (!mr_table)
+		return ERR_PTR(-ENOMEM);
+
+	mr_table->vr_id = vr_id;
+	mr_table->mlxsw_sp = mlxsw_sp;
+	mr_table->proto = proto;
+	INIT_LIST_HEAD(&mr_table->route_list);
+
+	err = rhashtable_init(&mr_table->route_ht,
+			      &mlxsw_sp_mr_route_ht_params);
+	if (err)
+		goto err_route_rhashtable_init;
+
+	for (i = 0; i < MAXVIFS; i++) {
+		INIT_LIST_HEAD(&mr_table->vifs[i].route_evif_list);
+		INIT_LIST_HEAD(&mr_table->vifs[i].route_ivif_list);
+	}
+
+	err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
+				       mr_table->catchall_route_priv,
+				       &catchall_route_params);
+	if (err)
+		goto err_ops_route_create;
+	list_add_tail(&mr_table->node, &mr->table_list);
+	return mr_table;
+
+err_ops_route_create:
+	rhashtable_destroy(&mr_table->route_ht);
+err_route_rhashtable_init:
+	kfree(mr_table);
+	return ERR_PTR(err);
+}
+
+void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	WARN_ON(!mlxsw_sp_mr_table_empty(mr_table));
+	list_del(&mr_table->node);
+	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv,
+				  &mr_table->catchall_route_priv);
+	rhashtable_destroy(&mr_table->route_ht);
+	kfree(mr_table);
+}
+
+void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table)
+{
+	struct mlxsw_sp_mr_route *mr_route, *tmp;
+	int i;
+
+	list_for_each_entry_safe(mr_route, tmp, &mr_table->route_list, node)
+		__mlxsw_sp_mr_route_del(mr_table, mr_route);
+
+	for (i = 0; i < MAXVIFS; i++) {
+		mr_table->vifs[i].dev = NULL;
+		mr_table->vifs[i].rif = NULL;
+	}
+}
+
+bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table)
+{
+	int i;
+
+	for (i = 0; i < MAXVIFS; i++)
+		if (mr_table->vifs[i].dev)
+			return false;
+	return list_empty(&mr_table->route_list);
+}
+
+static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp,
+					   struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u64 packets, bytes;
+
+	if (mr_route->route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return;
+
+	mr->mr_ops->route_stats(mlxsw_sp, mr_route->route_priv, &packets,
+				&bytes);
+
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		if (mr_route->mfc4->mfc_un.res.pkt != packets)
+			mr_route->mfc4->mfc_un.res.lastuse = jiffies;
+		mr_route->mfc4->mfc_un.res.pkt = packets;
+		mr_route->mfc4->mfc_un.res.bytes = bytes;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void mlxsw_sp_mr_stats_update(struct work_struct *work)
+{
+	struct mlxsw_sp_mr *mr = container_of(work, struct mlxsw_sp_mr,
+					      stats_update_dw.work);
+	struct mlxsw_sp_mr_table *mr_table;
+	struct mlxsw_sp_mr_route *mr_route;
+	unsigned long interval;
+
+	rtnl_lock();
+	list_for_each_entry(mr_table, &mr->table_list, node)
+		list_for_each_entry(mr_route, &mr_table->route_list, node)
+			mlxsw_sp_mr_route_stats_update(mr_table->mlxsw_sp,
+						       mr_route);
+	rtnl_unlock();
+
+	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
+	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
+}
+
+int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
+		     const struct mlxsw_sp_mr_ops *mr_ops)
+{
+	struct mlxsw_sp_mr *mr;
+	unsigned long interval;
+	int err;
+
+	mr = kzalloc(sizeof(*mr) + mr_ops->priv_size, GFP_KERNEL);
+	if (!mr)
+		return -ENOMEM;
+	mr->mr_ops = mr_ops;
+	mlxsw_sp->mr = mr;
+	INIT_LIST_HEAD(&mr->table_list);
+
+	err = mr_ops->init(mlxsw_sp, mr->priv);
+	if (err)
+		goto err;
+
+	/* Create the delayed work for counter updates */
+	INIT_DELAYED_WORK(&mr->stats_update_dw, mlxsw_sp_mr_stats_update);
+	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
+	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
+	return 0;
+err:
+	kfree(mr);
+	return err;
+}
+
+void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	cancel_delayed_work_sync(&mr->stats_update_dw);
+	mr->mr_ops->fini(mr->priv);
+	kfree(mr);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
new file mode 100644
index 000000000000..5d26a122af49
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
@@ -0,0 +1,134 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXSW_SPECTRUM_MCROUTER_H
+#define _MLXSW_SPECTRUM_MCROUTER_H
+
+#include <linux/mroute.h>
+#include "spectrum_router.h"
+#include "spectrum.h"
+
+enum mlxsw_sp_mr_route_action {
+	MLXSW_SP_MR_ROUTE_ACTION_FORWARD,
+	MLXSW_SP_MR_ROUTE_ACTION_TRAP,
+	MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD,
+};
+
+enum mlxsw_sp_mr_route_prio {
+	MLXSW_SP_MR_ROUTE_PRIO_SG,
+	MLXSW_SP_MR_ROUTE_PRIO_STARG,
+	MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
+	__MLXSW_SP_MR_ROUTE_PRIO_MAX
+};
+
+#define MLXSW_SP_MR_ROUTE_PRIO_MAX (__MLXSW_SP_MR_ROUTE_PRIO_MAX - 1)
+
+struct mlxsw_sp_mr_route_key {
+	int vrid;
+	enum mlxsw_sp_l3proto proto;
+	union mlxsw_sp_l3addr group;
+	union mlxsw_sp_l3addr group_mask;
+	union mlxsw_sp_l3addr source;
+	union mlxsw_sp_l3addr source_mask;
+};
+
+struct mlxsw_sp_mr_route_info {
+	enum mlxsw_sp_mr_route_action route_action;
+	u16 irif_index;
+	u16 *erif_indices;
+	size_t erif_num;
+	u16 min_mtu;
+};
+
+struct mlxsw_sp_mr_route_params {
+	struct mlxsw_sp_mr_route_key key;
+	struct mlxsw_sp_mr_route_info value;
+	enum mlxsw_sp_mr_route_prio prio;
+};
+
+struct mlxsw_sp_mr_ops {
+	int priv_size;
+	int route_priv_size;
+	int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv);
+	int (*route_create)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			    void *route_priv,
+			    struct mlxsw_sp_mr_route_params *route_params);
+	int (*route_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			    struct mlxsw_sp_mr_route_info *route_info);
+	int (*route_stats)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			   u64 *packets, u64 *bytes);
+	int (*route_action_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				   enum mlxsw_sp_mr_route_action route_action);
+	int (*route_min_mtu_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				    u16 min_mtu);
+	int (*route_irif_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				 u16 irif_index);
+	int (*route_erif_add)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      u16 erif_index);
+	int (*route_erif_del)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      u16 erif_index);
+	void (*route_destroy)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			      void *route_priv);
+	void (*fini)(void *priv);
+};
+
+struct mlxsw_sp_mr;
+struct mlxsw_sp_mr_table;
+
+int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
+		     const struct mlxsw_sp_mr_ops *mr_ops);
+void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table,
+			   struct mfc_cache *mfc, bool replace);
+void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table,
+			    struct mfc_cache *mfc);
+int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
+			struct net_device *dev, vifi_t vif_index,
+			unsigned long vif_flags,
+			const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index);
+int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
+			const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
+			 const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
+				const struct mlxsw_sp_rif *rif, int mtu);
+struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
+						   u32 tb_id,
+						   enum mlxsw_sp_l3proto proto);
+void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table);
+void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table);
+bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
new file mode 100644
index 000000000000..39c21c70ac32
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
@@ -0,0 +1,838 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/parman.h>
+
+#include "reg.h"
+#include "spectrum.h"
+#include "core_acl_flex_actions.h"
+#include "spectrum_mr.h"
+
+struct mlxsw_sp_mr_tcam_region {
+	struct mlxsw_sp *mlxsw_sp;
+	enum mlxsw_reg_rtar_key_type rtar_key_type;
+	struct parman *parman;
+	struct parman_prio *parman_prios;
+};
+
+struct mlxsw_sp_mr_tcam {
+	struct mlxsw_sp_mr_tcam_region ipv4_tcam_region;
+};
+
+/* This struct maps to one RIGR2 register entry */
+struct mlxsw_sp_mr_erif_sublist {
+	struct list_head list;
+	u32 rigr2_kvdl_index;
+	int num_erifs;
+	u16 erif_indices[MLXSW_REG_RIGR2_MAX_ERIFS];
+	bool synced;
+};
+
+struct mlxsw_sp_mr_tcam_erif_list {
+	struct list_head erif_sublists;
+	u32 kvdl_index;
+};
+
+static bool
+mlxsw_sp_mr_erif_sublist_full(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_mr_erif_sublist *erif_sublist)
+{
+	int erif_list_entries = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						   MC_ERIF_LIST_ENTRIES);
+
+	return erif_sublist->num_erifs == erif_list_entries;
+}
+
+static void
+mlxsw_sp_mr_erif_list_init(struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	INIT_LIST_HEAD(&erif_list->erif_sublists);
+}
+
+#define MLXSW_SP_KVDL_RIGR2_SIZE 1
+
+static struct mlxsw_sp_mr_erif_sublist *
+mlxsw_sp_mr_erif_sublist_create(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist;
+	int err;
+
+	erif_sublist = kzalloc(sizeof(*erif_sublist), GFP_KERNEL);
+	if (!erif_sublist)
+		return ERR_PTR(-ENOMEM);
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_RIGR2_SIZE,
+				  &erif_sublist->rigr2_kvdl_index);
+	if (err) {
+		kfree(erif_sublist);
+		return ERR_PTR(err);
+	}
+
+	list_add_tail(&erif_sublist->list, &erif_list->erif_sublists);
+	return erif_sublist;
+}
+
+static void
+mlxsw_sp_mr_erif_sublist_destroy(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_mr_erif_sublist *erif_sublist)
+{
+	list_del(&erif_sublist->list);
+	mlxsw_sp_kvdl_free(mlxsw_sp, erif_sublist->rigr2_kvdl_index);
+	kfree(erif_sublist);
+}
+
+static int
+mlxsw_sp_mr_erif_list_add(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_mr_tcam_erif_list *erif_list,
+			  u16 erif_index)
+{
+	struct mlxsw_sp_mr_erif_sublist *sublist;
+
+	/* If either there is no erif_entry or the last one is full, allocate a
+	 * new one.
+	 */
+	if (list_empty(&erif_list->erif_sublists)) {
+		sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp, erif_list);
+		if (IS_ERR(sublist))
+			return PTR_ERR(sublist);
+		erif_list->kvdl_index = sublist->rigr2_kvdl_index;
+	} else {
+		sublist = list_last_entry(&erif_list->erif_sublists,
+					  struct mlxsw_sp_mr_erif_sublist,
+					  list);
+		sublist->synced = false;
+		if (mlxsw_sp_mr_erif_sublist_full(mlxsw_sp, sublist)) {
+			sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp,
+								  erif_list);
+			if (IS_ERR(sublist))
+				return PTR_ERR(sublist);
+		}
+	}
+
+	/* Add the eRIF to the last entry's last index */
+	sublist->erif_indices[sublist->num_erifs++] = erif_index;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_erif_list_flush(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist, *tmp;
+
+	list_for_each_entry_safe(erif_sublist, tmp, &erif_list->erif_sublists,
+				 list)
+		mlxsw_sp_mr_erif_sublist_destroy(mlxsw_sp, erif_sublist);
+}
+
+static int
+mlxsw_sp_mr_erif_list_commit(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *curr_sublist;
+	char rigr2_pl[MLXSW_REG_RIGR2_LEN];
+	int err;
+	int i;
+
+	list_for_each_entry(curr_sublist, &erif_list->erif_sublists, list) {
+		if (curr_sublist->synced)
+			continue;
+
+		/* If the sublist is not the last one, pack the next index */
+		if (list_is_last(&curr_sublist->list,
+				 &erif_list->erif_sublists)) {
+			mlxsw_reg_rigr2_pack(rigr2_pl,
+					     curr_sublist->rigr2_kvdl_index,
+					     false, 0);
+		} else {
+			struct mlxsw_sp_mr_erif_sublist *next_sublist;
+
+			next_sublist = list_next_entry(curr_sublist, list);
+			mlxsw_reg_rigr2_pack(rigr2_pl,
+					     curr_sublist->rigr2_kvdl_index,
+					     true,
+					     next_sublist->rigr2_kvdl_index);
+		}
+
+		/* Pack all the erifs */
+		for (i = 0; i < curr_sublist->num_erifs; i++) {
+			u16 erif_index = curr_sublist->erif_indices[i];
+
+			mlxsw_reg_rigr2_erif_entry_pack(rigr2_pl, i, true,
+							erif_index);
+		}
+
+		/* Write the entry */
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rigr2),
+				      rigr2_pl);
+		if (err)
+			/* No need of a rollback here because this
+			 * hardware entry should not be pointed yet.
+			 */
+			return err;
+		curr_sublist->synced = true;
+	}
+	return 0;
+}
+
+static void mlxsw_sp_mr_erif_list_move(struct mlxsw_sp_mr_tcam_erif_list *to,
+				       struct mlxsw_sp_mr_tcam_erif_list *from)
+{
+	list_splice(&from->erif_sublists, &to->erif_sublists);
+	to->kvdl_index = from->kvdl_index;
+}
+
+struct mlxsw_sp_mr_tcam_route {
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	u32 counter_index;
+	struct parman_item parman_item;
+	struct parman_prio *parman_prio;
+	enum mlxsw_sp_mr_route_action action;
+	struct mlxsw_sp_mr_route_key key;
+	u16 irif_index;
+	u16 min_mtu;
+};
+
+static struct mlxsw_afa_block *
+mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp,
+				  enum mlxsw_sp_mr_route_action route_action,
+				  u16 irif_index, u32 counter_index,
+				  u16 min_mtu,
+				  struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	afa_block = mlxsw_afa_block_create(mlxsw_sp->afa);
+	if (!afa_block)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlxsw_afa_block_append_counter(afa_block, counter_index);
+	if (err)
+		goto err;
+
+	switch (route_action) {
+	case MLXSW_SP_MR_ROUTE_ACTION_TRAP:
+		err = mlxsw_afa_block_append_trap(afa_block,
+						  MLXSW_TRAP_ID_ACL1);
+		if (err)
+			goto err;
+		break;
+	case MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD:
+	case MLXSW_SP_MR_ROUTE_ACTION_FORWARD:
+		/* If we are about to append a multicast router action, commit
+		 * the erif_list.
+		 */
+		err = mlxsw_sp_mr_erif_list_commit(mlxsw_sp, erif_list);
+		if (err)
+			goto err;
+
+		err = mlxsw_afa_block_append_mcrouter(afa_block, irif_index,
+						      min_mtu, false,
+						      erif_list->kvdl_index);
+		if (err)
+			goto err;
+
+		if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD) {
+			err = mlxsw_afa_block_append_trap_and_forward(afa_block,
+								      MLXSW_TRAP_ID_ACL2);
+			if (err)
+				goto err;
+		}
+		break;
+	default:
+		err = -EINVAL;
+		goto err;
+	}
+
+	err = mlxsw_afa_block_commit(afa_block);
+	if (err)
+		goto err;
+	return afa_block;
+err:
+	mlxsw_afa_block_destroy(afa_block);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_mr_tcam_afa_block_destroy(struct mlxsw_afa_block *afa_block)
+{
+	mlxsw_afa_block_destroy(afa_block);
+}
+
+static int mlxsw_sp_mr_tcam_route_replace(struct mlxsw_sp *mlxsw_sp,
+					  struct parman_item *parman_item,
+					  struct mlxsw_sp_mr_route_key *key,
+					  struct mlxsw_afa_block *afa_block)
+{
+	char rmft2_pl[MLXSW_REG_RMFT2_LEN];
+
+	switch (key->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, true, parman_item->index,
+					  key->vrid,
+					  MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, 0,
+					  ntohl(key->group.addr4),
+					  ntohl(key->group_mask.addr4),
+					  ntohl(key->source.addr4),
+					  ntohl(key->source_mask.addr4),
+					  mlxsw_afa_block_first_set(afa_block));
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+	default:
+		WARN_ON_ONCE(1);
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl);
+}
+
+static int mlxsw_sp_mr_tcam_route_remove(struct mlxsw_sp *mlxsw_sp, int vrid,
+					 struct parman_item *parman_item)
+{
+	char rmft2_pl[MLXSW_REG_RMFT2_LEN];
+
+	mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, false, parman_item->index, vrid,
+				  0, 0, 0, 0, 0, 0, NULL);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl);
+}
+
+static int
+mlxsw_sp_mr_tcam_erif_populate(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_mr_tcam_erif_list *erif_list,
+			       struct mlxsw_sp_mr_route_info *route_info)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < route_info->erif_num; i++) {
+		u16 erif_index = route_info->erif_indices[i];
+
+		err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, erif_list,
+						erif_index);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int
+mlxsw_sp_mr_tcam_route_parman_item_add(struct mlxsw_sp_mr_tcam *mr_tcam,
+				       struct mlxsw_sp_mr_tcam_route *route,
+				       enum mlxsw_sp_mr_route_prio prio)
+{
+	struct parman_prio *parman_prio = NULL;
+	int err;
+
+	switch (route->key.proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		parman_prio = &mr_tcam->ipv4_tcam_region.parman_prios[prio];
+		err = parman_item_add(mr_tcam->ipv4_tcam_region.parman,
+				      parman_prio, &route->parman_item);
+		if (err)
+			return err;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+	default:
+		WARN_ON_ONCE(1);
+	}
+	route->parman_prio = parman_prio;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_tcam_route_parman_item_remove(struct mlxsw_sp_mr_tcam *mr_tcam,
+					  struct mlxsw_sp_mr_tcam_route *route)
+{
+	switch (route->key.proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		parman_item_remove(mr_tcam->ipv4_tcam_region.parman,
+				   route->parman_prio, &route->parman_item);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static int
+mlxsw_sp_mr_tcam_route_create(struct mlxsw_sp *mlxsw_sp, void *priv,
+			      void *route_priv,
+			      struct mlxsw_sp_mr_route_params *route_params)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+	int err;
+
+	route->key = route_params->key;
+	route->irif_index = route_params->value.irif_index;
+	route->min_mtu = route_params->value.min_mtu;
+	route->action = route_params->value.route_action;
+
+	/* Create the egress RIFs list */
+	mlxsw_sp_mr_erif_list_init(&route->erif_list);
+	err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &route->erif_list,
+					     &route_params->value);
+	if (err)
+		goto err_erif_populate;
+
+	/* Create the flow counter */
+	err = mlxsw_sp_flow_counter_alloc(mlxsw_sp, &route->counter_index);
+	if (err)
+		goto err_counter_alloc;
+
+	/* Create the flexible action block */
+	route->afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+							     route->action,
+							     route->irif_index,
+							     route->counter_index,
+							     route->min_mtu,
+							     &route->erif_list);
+	if (IS_ERR(route->afa_block)) {
+		err = PTR_ERR(route->afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Allocate place in the TCAM */
+	err = mlxsw_sp_mr_tcam_route_parman_item_add(mr_tcam, route,
+						     route_params->prio);
+	if (err)
+		goto err_parman_item_add;
+
+	/* Write the route to the TCAM */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, route->afa_block);
+	if (err)
+		goto err_route_replace;
+	return 0;
+
+err_route_replace:
+	mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route);
+err_parman_item_add:
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+err_afa_block_create:
+	mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index);
+err_erif_populate:
+err_counter_alloc:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	return err;
+}
+
+static void mlxsw_sp_mr_tcam_route_destroy(struct mlxsw_sp *mlxsw_sp,
+					   void *priv, void *route_priv)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	mlxsw_sp_mr_tcam_route_remove(mlxsw_sp, route->key.vrid,
+				      &route->parman_item);
+	mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route);
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+}
+
+static int mlxsw_sp_mr_tcam_route_stats(struct mlxsw_sp *mlxsw_sp,
+					void *route_priv, u64 *packets,
+					u64 *bytes)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+
+	return mlxsw_sp_flow_counter_get(mlxsw_sp, route->counter_index,
+					 packets, bytes);
+}
+
+static int
+mlxsw_sp_mr_tcam_route_action_update(struct mlxsw_sp *mlxsw_sp,
+				     void *route_priv,
+				     enum mlxsw_sp_mr_route_action route_action)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new flexible action block */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route_action,
+						      route->irif_index,
+						      route->counter_index,
+						      route->min_mtu,
+						      &route->erif_list);
+	if (IS_ERR(afa_block))
+		return PTR_ERR(afa_block);
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err;
+
+	/* Delete the old one */
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	route->afa_block = afa_block;
+	route->action = route_action;
+	return 0;
+err:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+	return err;
+}
+
+static int mlxsw_sp_mr_tcam_route_min_mtu_update(struct mlxsw_sp *mlxsw_sp,
+						 void *route_priv, u16 min_mtu)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new flexible action block */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+						      route->action,
+						      route->irif_index,
+						      route->counter_index,
+						      min_mtu,
+						      &route->erif_list);
+	if (IS_ERR(afa_block))
+		return PTR_ERR(afa_block);
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err;
+
+	/* Delete the old one */
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	route->afa_block = afa_block;
+	route->min_mtu = min_mtu;
+	return 0;
+err:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+	return err;
+}
+
+static int mlxsw_sp_mr_tcam_route_irif_update(struct mlxsw_sp *mlxsw_sp,
+					      void *route_priv, u16 irif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+
+	if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return -EINVAL;
+	route->irif_index = irif_index;
+	return 0;
+}
+
+static int mlxsw_sp_mr_tcam_route_erif_add(struct mlxsw_sp *mlxsw_sp,
+					   void *route_priv, u16 erif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	int err;
+
+	err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &route->erif_list,
+					erif_index);
+	if (err)
+		return err;
+
+	/* Commit the action only if the route action is not TRAP */
+	if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return mlxsw_sp_mr_erif_list_commit(mlxsw_sp,
+						    &route->erif_list);
+	return 0;
+}
+
+static int mlxsw_sp_mr_tcam_route_erif_del(struct mlxsw_sp *mlxsw_sp,
+					   void *route_priv, u16 erif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist;
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+	int i;
+
+	/* Create a copy of the original erif_list without the deleted entry */
+	mlxsw_sp_mr_erif_list_init(&erif_list);
+	list_for_each_entry(erif_sublist, &route->erif_list.erif_sublists, list) {
+		for (i = 0; i < erif_sublist->num_erifs; i++) {
+			u16 curr_erif = erif_sublist->erif_indices[i];
+
+			if (curr_erif == erif_index)
+				continue;
+			err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &erif_list,
+							curr_erif);
+			if (err)
+				goto err_erif_list_add;
+		}
+	}
+
+	/* Create the flexible action block pointing to the new erif_list */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route->action,
+						      route->irif_index,
+						      route->counter_index,
+						      route->min_mtu,
+						      &erif_list);
+	if (IS_ERR(afa_block)) {
+		err = PTR_ERR(afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err_route_write;
+
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	route->afa_block = afa_block;
+	mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list);
+	return 0;
+
+err_route_write:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+err_afa_block_create:
+err_erif_list_add:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list);
+	return err;
+}
+
+static int
+mlxsw_sp_mr_tcam_route_update(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      struct mlxsw_sp_mr_route_info *route_info)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new erif_list */
+	mlxsw_sp_mr_erif_list_init(&erif_list);
+	err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &erif_list, route_info);
+	if (err)
+		goto err_erif_populate;
+
+	/* Create the flexible action block pointing to the new erif_list */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+						      route_info->route_action,
+						      route_info->irif_index,
+						      route->counter_index,
+						      route_info->min_mtu,
+						      &erif_list);
+	if (IS_ERR(afa_block)) {
+		err = PTR_ERR(afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err_route_write;
+
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	route->afa_block = afa_block;
+	mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list);
+	route->action = route_info->route_action;
+	route->irif_index = route_info->irif_index;
+	route->min_mtu = route_info->min_mtu;
+	return 0;
+
+err_route_write:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+err_afa_block_create:
+err_erif_populate:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list);
+	return err;
+}
+
+#define MLXSW_SP_MR_TCAM_REGION_BASE_COUNT 16
+#define MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP 16
+
+static int
+mlxsw_sp_mr_tcam_region_alloc(struct mlxsw_sp_mr_tcam_region *mr_tcam_region)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_ALLOCATE,
+			    mr_tcam_region->rtar_key_type,
+			    MLXSW_SP_MR_TCAM_REGION_BASE_COUNT);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static void
+mlxsw_sp_mr_tcam_region_free(struct mlxsw_sp_mr_tcam_region *mr_tcam_region)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_DEALLOCATE,
+			    mr_tcam_region->rtar_key_type, 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static int mlxsw_sp_mr_tcam_region_parman_resize(void *priv,
+						 unsigned long new_count)
+{
+	struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv;
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+	u64 max_tcam_rules;
+
+	max_tcam_rules = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_TCAM_RULES);
+	if (new_count > max_tcam_rules)
+		return -EINVAL;
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_RESIZE,
+			    mr_tcam_region->rtar_key_type, new_count);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static void mlxsw_sp_mr_tcam_region_parman_move(void *priv,
+						unsigned long from_index,
+						unsigned long to_index,
+						unsigned long count)
+{
+	struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv;
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rrcr_pl[MLXSW_REG_RRCR_LEN];
+
+	mlxsw_reg_rrcr_pack(rrcr_pl, MLXSW_REG_RRCR_OP_MOVE,
+			    from_index, count,
+			    mr_tcam_region->rtar_key_type, to_index);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rrcr), rrcr_pl);
+}
+
+static const struct parman_ops mlxsw_sp_mr_tcam_region_parman_ops = {
+	.base_count	= MLXSW_SP_MR_TCAM_REGION_BASE_COUNT,
+	.resize_step	= MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP,
+	.resize		= mlxsw_sp_mr_tcam_region_parman_resize,
+	.move		= mlxsw_sp_mr_tcam_region_parman_move,
+	.algo		= PARMAN_ALGO_TYPE_LSORT,
+};
+
+static int
+mlxsw_sp_mr_tcam_region_init(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_mr_tcam_region *mr_tcam_region,
+			     enum mlxsw_reg_rtar_key_type rtar_key_type)
+{
+	struct parman_prio *parman_prios;
+	struct parman *parman;
+	int err;
+	int i;
+
+	mr_tcam_region->rtar_key_type = rtar_key_type;
+	mr_tcam_region->mlxsw_sp = mlxsw_sp;
+
+	err = mlxsw_sp_mr_tcam_region_alloc(mr_tcam_region);
+	if (err)
+		return err;
+
+	parman = parman_create(&mlxsw_sp_mr_tcam_region_parman_ops,
+			       mr_tcam_region);
+	if (!parman) {
+		err = -ENOMEM;
+		goto err_parman_create;
+	}
+	mr_tcam_region->parman = parman;
+
+	parman_prios = kmalloc_array(MLXSW_SP_MR_ROUTE_PRIO_MAX + 1,
+				     sizeof(*parman_prios), GFP_KERNEL);
+	if (!parman_prios) {
+		err = -ENOMEM;
+		goto err_parman_prios_alloc;
+	}
+	mr_tcam_region->parman_prios = parman_prios;
+
+	for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++)
+		parman_prio_init(mr_tcam_region->parman,
+				 &mr_tcam_region->parman_prios[i], i);
+	return 0;
+
+err_parman_prios_alloc:
+	parman_destroy(parman);
+err_parman_create:
+	mlxsw_sp_mr_tcam_region_free(mr_tcam_region);
+	return err;
+}
+
+static void
+mlxsw_sp_mr_tcam_region_fini(struct mlxsw_sp_mr_tcam_region *mr_tcam_region)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++)
+		parman_prio_fini(&mr_tcam_region->parman_prios[i]);
+	kfree(mr_tcam_region->parman_prios);
+	parman_destroy(mr_tcam_region->parman);
+	mlxsw_sp_mr_tcam_region_free(mr_tcam_region);
+}
+
+static int mlxsw_sp_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MC_ERIF_LIST_ENTRIES) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_TCAM_RULES))
+		return -EIO;
+
+	return mlxsw_sp_mr_tcam_region_init(mlxsw_sp,
+					    &mr_tcam->ipv4_tcam_region,
+					    MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST);
+}
+
+static void mlxsw_sp_mr_tcam_fini(void *priv)
+{
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	mlxsw_sp_mr_tcam_region_fini(&mr_tcam->ipv4_tcam_region);
+}
+
+const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops = {
+	.priv_size = sizeof(struct mlxsw_sp_mr_tcam),
+	.route_priv_size = sizeof(struct mlxsw_sp_mr_tcam_route),
+	.init = mlxsw_sp_mr_tcam_init,
+	.route_create = mlxsw_sp_mr_tcam_route_create,
+	.route_update = mlxsw_sp_mr_tcam_route_update,
+	.route_stats = mlxsw_sp_mr_tcam_route_stats,
+	.route_action_update = mlxsw_sp_mr_tcam_route_action_update,
+	.route_min_mtu_update = mlxsw_sp_mr_tcam_route_min_mtu_update,
+	.route_irif_update = mlxsw_sp_mr_tcam_route_irif_update,
+	.route_erif_add = mlxsw_sp_mr_tcam_route_erif_add,
+	.route_erif_del = mlxsw_sp_mr_tcam_route_erif_del,
+	.route_destroy = mlxsw_sp_mr_tcam_route_destroy,
+	.fini = mlxsw_sp_mr_tcam_fini,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
new file mode 100644
index 000000000000..f9b59ee25406
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
@@ -0,0 +1,43 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXSW_SPECTRUM_MCROUTER_TCAM_H
+#define _MLXSW_SPECTRUM_MCROUTER_TCAM_H
+
+#include "spectrum.h"
+#include "spectrum_mr.h"
+
+extern const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops;
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 032089efc1a0..e0f8ea4ed7af 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -65,6 +65,8 @@
 #include "spectrum_cnt.h"
 #include "spectrum_dpipe.h"
 #include "spectrum_ipip.h"
+#include "spectrum_mr.h"
+#include "spectrum_mr_tcam.h"
 #include "spectrum_router.h"
 
 struct mlxsw_sp_vr;
@@ -78,6 +80,7 @@ struct mlxsw_sp_router {
 	struct rhashtable neigh_ht;
 	struct rhashtable nexthop_group_ht;
 	struct rhashtable nexthop_ht;
+	struct list_head nexthop_list;
 	struct {
 		struct mlxsw_sp_lpm_tree *trees;
 		unsigned int tree_count;
@@ -458,6 +461,7 @@ struct mlxsw_sp_vr {
 	unsigned int rif_count;
 	struct mlxsw_sp_fib *fib4;
 	struct mlxsw_sp_fib *fib6;
+	struct mlxsw_sp_mr_table *mr4_table;
 };
 
 static const struct rhashtable_params mlxsw_sp_fib_ht_params;
@@ -652,7 +656,7 @@ static void mlxsw_sp_lpm_fini(struct mlxsw_sp *mlxsw_sp)
 
 static bool mlxsw_sp_vr_is_used(const struct mlxsw_sp_vr *vr)
 {
-	return !!vr->fib4 || !!vr->fib6;
+	return !!vr->fib4 || !!vr->fib6 || !!vr->mr4_table;
 }
 
 static struct mlxsw_sp_vr *mlxsw_sp_vr_find_unused(struct mlxsw_sp *mlxsw_sp)
@@ -692,8 +696,8 @@ static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp,
 
 static u32 mlxsw_sp_fix_tb_id(u32 tb_id)
 {
-	/* For our purpose, squash main and local table into one */
-	if (tb_id == RT_TABLE_LOCAL)
+	/* For our purpose, squash main, default and local tables into one */
+	if (tb_id == RT_TABLE_LOCAL || tb_id == RT_TABLE_DEFAULT)
 		tb_id = RT_TABLE_MAIN;
 	return tb_id;
 }
@@ -743,9 +747,18 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp,
 		err = PTR_ERR(vr->fib6);
 		goto err_fib6_create;
 	}
+	vr->mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id,
+						 MLXSW_SP_L3_PROTO_IPV4);
+	if (IS_ERR(vr->mr4_table)) {
+		err = PTR_ERR(vr->mr4_table);
+		goto err_mr_table_create;
+	}
 	vr->tb_id = tb_id;
 	return vr;
 
+err_mr_table_create:
+	mlxsw_sp_fib_destroy(vr->fib6);
+	vr->fib6 = NULL;
 err_fib6_create:
 	mlxsw_sp_fib_destroy(vr->fib4);
 	vr->fib4 = NULL;
@@ -754,6 +767,8 @@ err_fib6_create:
 
 static void mlxsw_sp_vr_destroy(struct mlxsw_sp_vr *vr)
 {
+	mlxsw_sp_mr_table_destroy(vr->mr4_table);
+	vr->mr4_table = NULL;
 	mlxsw_sp_fib_destroy(vr->fib6);
 	vr->fib6 = NULL;
 	mlxsw_sp_fib_destroy(vr->fib4);
@@ -774,7 +789,8 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id)
 static void mlxsw_sp_vr_put(struct mlxsw_sp_vr *vr)
 {
 	if (!vr->rif_count && list_empty(&vr->fib4->node_list) &&
-	    list_empty(&vr->fib6->node_list))
+	    list_empty(&vr->fib6->node_list) &&
+	    mlxsw_sp_mr_table_empty(vr->mr4_table))
 		mlxsw_sp_vr_destroy(vr);
 }
 
@@ -1316,7 +1332,7 @@ mlxsw_sp_rif_neigh_next(struct mlxsw_sp_rif *rif,
 						typeof(*neigh_entry),
 						rif_list_node);
 	}
-	if (neigh_entry->rif_list_node.next == &rif->neigh_list)
+	if (list_is_last(&neigh_entry->rif_list_node, &rif->neigh_list))
 		return NULL;
 	return list_next_entry(neigh_entry, rif_list_node);
 }
@@ -1664,7 +1680,7 @@ __mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp,
 		err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(rauhtd),
 				      rauhtd_pl);
 		if (err) {
-			dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour talbe\n");
+			dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour table\n");
 			break;
 		}
 		num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl);
@@ -2028,6 +2044,7 @@ struct mlxsw_sp_nexthop_key {
 struct mlxsw_sp_nexthop {
 	struct list_head neigh_list_node; /* member of neigh entry list */
 	struct list_head rif_list_node;
+	struct list_head router_list_node;
 	struct mlxsw_sp_nexthop_group *nh_grp; /* pointer back to the group
 						* this belongs to
 						*/
@@ -2050,6 +2067,8 @@ struct mlxsw_sp_nexthop {
 		struct mlxsw_sp_neigh_entry *neigh_entry;
 		struct mlxsw_sp_ipip_entry *ipip_entry;
 	};
+	unsigned int counter_index;
+	bool counter_valid;
 };
 
 struct mlxsw_sp_nexthop_group {
@@ -2066,6 +2085,112 @@ struct mlxsw_sp_nexthop_group {
 #define nh_rif	nexthops[0].rif
 };
 
+void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_nexthop *nh)
+{
+	struct devlink *devlink;
+
+	devlink = priv_to_devlink(mlxsw_sp->core);
+	if (!devlink_dpipe_table_counter_enabled(devlink,
+						 MLXSW_SP_DPIPE_TABLE_NAME_ADJ))
+		return;
+
+	if (mlxsw_sp_flow_counter_alloc(mlxsw_sp, &nh->counter_index))
+		return;
+
+	nh->counter_valid = true;
+}
+
+void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_nexthop *nh)
+{
+	if (!nh->counter_valid)
+		return;
+	mlxsw_sp_flow_counter_free(mlxsw_sp, nh->counter_index);
+	nh->counter_valid = false;
+}
+
+int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_nexthop *nh, u64 *p_counter)
+{
+	if (!nh->counter_valid)
+		return -EINVAL;
+
+	return mlxsw_sp_flow_counter_get(mlxsw_sp, nh->counter_index,
+					 p_counter, NULL);
+}
+
+struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router,
+					       struct mlxsw_sp_nexthop *nh)
+{
+	if (!nh) {
+		if (list_empty(&router->nexthop_list))
+			return NULL;
+		else
+			return list_first_entry(&router->nexthop_list,
+						typeof(*nh), router_list_node);
+	}
+	if (list_is_last(&nh->router_list_node, &router->nexthop_list))
+		return NULL;
+	return list_next_entry(nh, router_list_node);
+}
+
+bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh)
+{
+	return nh->offloaded;
+}
+
+unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh)
+{
+	if (!nh->offloaded)
+		return NULL;
+	return nh->neigh_entry->ha;
+}
+
+int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
+			     u32 *p_adj_hash_index)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp;
+	u32 adj_hash_index = 0;
+	int i;
+
+	if (!nh->offloaded || !nh_grp->adj_index_valid)
+		return -EINVAL;
+
+	*p_adj_index = nh_grp->adj_index;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i];
+
+		if (nh_iter == nh)
+			break;
+		if (nh_iter->offloaded)
+			adj_hash_index++;
+	}
+
+	*p_adj_hash_index = adj_hash_index;
+	return 0;
+}
+
+struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh)
+{
+	return nh->rif;
+}
+
+bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp;
+	int i;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i];
+
+		if (nh_iter->type == MLXSW_SP_NEXTHOP_TYPE_IPIP)
+			return true;
+	}
+	return false;
+}
+
 static struct fib_info *
 mlxsw_sp_nexthop4_group_fi(const struct mlxsw_sp_nexthop_group *nh_grp)
 {
@@ -2323,8 +2448,8 @@ static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp,
 	return 0;
 }
 
-static int mlxsw_sp_nexthop_mac_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
-				       struct mlxsw_sp_nexthop *nh)
+int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+			    struct mlxsw_sp_nexthop *nh)
 {
 	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
 	char ratr_pl[MLXSW_REG_RATR_LEN];
@@ -2333,6 +2458,11 @@ static int mlxsw_sp_nexthop_mac_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
 			    true, MLXSW_REG_RATR_TYPE_ETHERNET,
 			    adj_index, neigh_entry->rif);
 	mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_entry->ha);
+	if (nh->counter_valid)
+		mlxsw_reg_ratr_counter_pack(ratr_pl, nh->counter_index, true);
+	else
+		mlxsw_reg_ratr_counter_pack(ratr_pl, 0, false);
+
 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl);
 }
 
@@ -2367,7 +2497,7 @@ mlxsw_sp_nexthop_group_update(struct mlxsw_sp *mlxsw_sp,
 		if (nh->update || reallocate) {
 			switch (nh->type) {
 			case MLXSW_SP_NEXTHOP_TYPE_ETH:
-				err = mlxsw_sp_nexthop_mac_update
+				err = mlxsw_sp_nexthop_update
 					    (mlxsw_sp, adj_index, nh);
 				break;
 			case MLXSW_SP_NEXTHOP_TYPE_IPIP:
@@ -2789,6 +2919,9 @@ static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp,
 	if (err)
 		return err;
 
+	mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
+	list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
+
 	if (!dev)
 		return 0;
 
@@ -2812,6 +2945,8 @@ static void mlxsw_sp_nexthop4_fini(struct mlxsw_sp *mlxsw_sp,
 				   struct mlxsw_sp_nexthop *nh)
 {
 	mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh);
+	list_del(&nh->router_list_node);
+	mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh);
 	mlxsw_sp_nexthop_remove(mlxsw_sp, nh);
 }
 
@@ -3121,7 +3256,7 @@ mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry,
 			return;
 		if (mlxsw_sp_fib_entry_should_offload(fib_entry))
 			mlxsw_sp_fib_entry_offload_set(fib_entry);
-		else if (!mlxsw_sp_fib_entry_should_offload(fib_entry))
+		else
 			mlxsw_sp_fib_entry_offload_unset(fib_entry);
 		return;
 	default:
@@ -4053,6 +4188,9 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
 
 	nh->nh_grp = nh_grp;
 	memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr));
+	mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
+
+	list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
 
 	if (!dev)
 		return 0;
@@ -4065,6 +4203,8 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp,
 				   struct mlxsw_sp_nexthop *nh)
 {
 	mlxsw_sp_nexthop6_type_fini(mlxsw_sp, nh);
+	list_del(&nh->router_list_node);
+	mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh);
 }
 
 static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp,
@@ -4615,6 +4755,75 @@ static int __mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp,
 	return 0;
 }
 
+static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp,
+				     struct mfc_entry_notifier_info *men_info,
+				     bool replace)
+{
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id);
+	if (IS_ERR(vr))
+		return PTR_ERR(vr);
+
+	return mlxsw_sp_mr_route4_add(vr->mr4_table, men_info->mfc, replace);
+}
+
+static void mlxsw_sp_router_fibmr_del(struct mlxsw_sp *mlxsw_sp,
+				      struct mfc_entry_notifier_info *men_info)
+{
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, men_info->tb_id);
+	if (WARN_ON(!vr))
+		return;
+
+	mlxsw_sp_mr_route4_del(vr->mr4_table, men_info->mfc);
+	mlxsw_sp_vr_put(vr);
+}
+
+static int
+mlxsw_sp_router_fibmr_vif_add(struct mlxsw_sp *mlxsw_sp,
+			      struct vif_entry_notifier_info *ven_info)
+{
+	struct mlxsw_sp_rif *rif;
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id);
+	if (IS_ERR(vr))
+		return PTR_ERR(vr);
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, ven_info->dev);
+	return mlxsw_sp_mr_vif_add(vr->mr4_table, ven_info->dev,
+				   ven_info->vif_index,
+				   ven_info->vif_flags, rif);
+}
+
+static void
+mlxsw_sp_router_fibmr_vif_del(struct mlxsw_sp *mlxsw_sp,
+			      struct vif_entry_notifier_info *ven_info)
+{
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, ven_info->tb_id);
+	if (WARN_ON(!vr))
+		return;
+
+	mlxsw_sp_mr_vif_del(vr->mr4_table, ven_info->vif_index);
+	mlxsw_sp_vr_put(vr);
+}
+
 static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp)
 {
 	enum mlxsw_reg_ralxx_protocol proto = MLXSW_REG_RALXX_PROTOCOL_IPV4;
@@ -4625,6 +4834,10 @@ static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp)
 	if (err)
 		return err;
 
+	/* The multicast router code does not need an abort trap as by default,
+	 * packets that don't match any routes are trapped to the CPU.
+	 */
+
 	proto = MLXSW_REG_RALXX_PROTOCOL_IPV6;
 	return __mlxsw_sp_router_set_abort_trap(mlxsw_sp, proto,
 						MLXSW_SP_LPM_TREE_MIN + 1);
@@ -4706,6 +4919,8 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp)
 
 		if (!mlxsw_sp_vr_is_used(vr))
 			continue;
+
+		mlxsw_sp_mr_table_flush(vr->mr4_table);
 		mlxsw_sp_vr_fib_flush(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4);
 
 		/* If virtual router was only used for IPv4, then it's no
@@ -4738,6 +4953,8 @@ struct mlxsw_sp_fib_event_work {
 		struct fib_entry_notifier_info fen_info;
 		struct fib_rule_notifier_info fr_info;
 		struct fib_nh_notifier_info fnh_info;
+		struct mfc_entry_notifier_info men_info;
+		struct vif_entry_notifier_info ven_info;
 	};
 	struct mlxsw_sp *mlxsw_sp;
 	unsigned long event;
@@ -4824,6 +5041,55 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	kfree(fib_work);
 }
 
+static void mlxsw_sp_router_fibmr_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_fib_event_work *fib_work =
+		container_of(work, struct mlxsw_sp_fib_event_work, work);
+	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
+	struct fib_rule *rule;
+	bool replace;
+	int err;
+
+	rtnl_lock();
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_ADD:
+		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
+
+		err = mlxsw_sp_router_fibmr_add(mlxsw_sp, &fib_work->men_info,
+						replace);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		ipmr_cache_put(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_ENTRY_DEL:
+		mlxsw_sp_router_fibmr_del(mlxsw_sp, &fib_work->men_info);
+		ipmr_cache_put(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_VIF_ADD:
+		err = mlxsw_sp_router_fibmr_vif_add(mlxsw_sp,
+						    &fib_work->ven_info);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		dev_put(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_VIF_DEL:
+		mlxsw_sp_router_fibmr_vif_del(mlxsw_sp,
+					      &fib_work->ven_info);
+		dev_put(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_RULE_ADD: /* fall through */
+	case FIB_EVENT_RULE_DEL:
+		rule = fib_work->fr_info.rule;
+		if (!ipmr_rule_default(rule) && !rule->l3mdev)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		fib_rule_put(rule);
+		break;
+	}
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
 static void mlxsw_sp_router_fib4_event(struct mlxsw_sp_fib_event_work *fib_work,
 				       struct fib_notifier_info *info)
 {
@@ -4869,6 +5135,30 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
 	}
 }
 
+static void
+mlxsw_sp_router_fibmr_event(struct mlxsw_sp_fib_event_work *fib_work,
+			    struct fib_notifier_info *info)
+{
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		memcpy(&fib_work->men_info, info, sizeof(fib_work->men_info));
+		ipmr_cache_hold(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_VIF_ADD: /* fall through */
+	case FIB_EVENT_VIF_DEL:
+		memcpy(&fib_work->ven_info, info, sizeof(fib_work->ven_info));
+		dev_hold(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_RULE_ADD: /* fall through */
+	case FIB_EVENT_RULE_DEL:
+		memcpy(&fib_work->fr_info, info, sizeof(fib_work->fr_info));
+		fib_rule_get(fib_work->fr_info.rule);
+		break;
+	}
+}
+
 /* Called with rcu_read_lock() */
 static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 				     unsigned long event, void *ptr)
@@ -4878,7 +5168,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	struct mlxsw_sp_router *router;
 
 	if (!net_eq(info->net, &init_net) ||
-	    (info->family != AF_INET && info->family != AF_INET6))
+	    (info->family != AF_INET && info->family != AF_INET6 &&
+	     info->family != RTNL_FAMILY_IPMR))
 		return NOTIFY_DONE;
 
 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
@@ -4898,6 +5189,10 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 		INIT_WORK(&fib_work->work, mlxsw_sp_router_fib6_event_work);
 		mlxsw_sp_router_fib6_event(fib_work, info);
 		break;
+	case RTNL_FAMILY_IPMR:
+		INIT_WORK(&fib_work->work, mlxsw_sp_router_fibmr_event_work);
+		mlxsw_sp_router_fibmr_event(fib_work, info);
+		break;
 	}
 
 	mlxsw_core_schedule_work(&fib_work->work);
@@ -5058,6 +5353,11 @@ int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif)
 	return rif->dev->ifindex;
 }
 
+const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif)
+{
+	return rif->dev;
+}
+
 static struct mlxsw_sp_rif *
 mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
 		    const struct mlxsw_sp_rif_params *params)
@@ -5107,11 +5407,17 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
 	if (err)
 		goto err_configure;
 
+	err = mlxsw_sp_mr_rif_add(vr->mr4_table, rif);
+	if (err)
+		goto err_mr_rif_add;
+
 	mlxsw_sp_rif_counters_alloc(rif);
 	mlxsw_sp->router->rifs[rif_index] = rif;
 
 	return rif;
 
+err_mr_rif_add:
+	ops->deconfigure(rif);
 err_configure:
 	if (fid)
 		mlxsw_sp_fid_put(fid);
@@ -5136,6 +5442,7 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif)
 
 	mlxsw_sp->router->rifs[rif->rif_index] = NULL;
 	mlxsw_sp_rif_counters_free(rif);
+	mlxsw_sp_mr_rif_del(vr->mr4_table, rif);
 	ops->deconfigure(rif);
 	if (fid)
 		/* Loopback RIFs are not associated with a FID. */
@@ -5477,6 +5784,17 @@ int mlxsw_sp_netdevice_router_port_event(struct net_device *dev)
 	if (err)
 		goto err_rif_fdb_op;
 
+	if (rif->mtu != dev->mtu) {
+		struct mlxsw_sp_vr *vr;
+
+		/* The RIF is relevant only to its mr_table instance, as unlike
+		 * unicast routing, in multicast routing a RIF cannot be shared
+		 * between several multicast routing tables.
+		 */
+		vr = &mlxsw_sp->router->vrs[rif->vr_id];
+		mlxsw_sp_mr_rif_mtu_update(vr->mr4_table, rif, dev->mtu);
+	}
+
 	ether_addr_copy(rif->addr, dev->dev_addr);
 	rif->mtu = dev->mtu;
 
@@ -5995,10 +6313,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 	if (err)
 		goto err_nexthop_group_ht_init;
 
+	INIT_LIST_HEAD(&mlxsw_sp->router->nexthop_list);
 	err = mlxsw_sp_lpm_init(mlxsw_sp);
 	if (err)
 		goto err_lpm_init;
 
+	err = mlxsw_sp_mr_init(mlxsw_sp, &mlxsw_sp_mr_tcam_ops);
+	if (err)
+		goto err_mr_init;
+
 	err = mlxsw_sp_vrs_init(mlxsw_sp);
 	if (err)
 		goto err_vrs_init;
@@ -6020,6 +6343,8 @@ err_register_fib_notifier:
 err_neigh_init:
 	mlxsw_sp_vrs_fini(mlxsw_sp);
 err_vrs_init:
+	mlxsw_sp_mr_fini(mlxsw_sp);
+err_mr_init:
 	mlxsw_sp_lpm_fini(mlxsw_sp);
 err_lpm_init:
 	rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht);
@@ -6041,6 +6366,7 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
 	unregister_fib_notifier(&mlxsw_sp->router->fib_nb);
 	mlxsw_sp_neigh_fini(mlxsw_sp);
 	mlxsw_sp_vrs_fini(mlxsw_sp);
+	mlxsw_sp_mr_fini(mlxsw_sp);
 	mlxsw_sp_lpm_fini(mlxsw_sp);
 	rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht);
 	rhashtable_destroy(&mlxsw_sp->router->nexthop_ht);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
index 345fcc4f38e9..3d449180b035 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
@@ -62,6 +62,7 @@ enum mlxsw_sp_rif_counter_dir {
 };
 
 struct mlxsw_sp_neigh_entry;
+struct mlxsw_sp_nexthop;
 
 struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp,
 					   u16 rif_index);
@@ -69,6 +70,7 @@ u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif);
 u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif);
 u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif);
 int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif);
+const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif);
 int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp,
 				   struct mlxsw_sp_rif *rif,
 				   enum mlxsw_sp_rif_counter_dir dir,
@@ -107,5 +109,24 @@ union mlxsw_sp_l3addr
 mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto,
 			   const struct net_device *ol_dev);
 __be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev);
+struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router,
+					       struct mlxsw_sp_nexthop *nh);
+bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh);
+unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh);
+int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
+			     u32 *p_adj_hash_index);
+struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh);
+bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh);
+#define mlxsw_sp_nexthop_for_each(nh, router)				\
+	for (nh = mlxsw_sp_nexthop_next(router, NULL); nh;		\
+	     nh = mlxsw_sp_nexthop_next(router, nh))
+int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_nexthop *nh, u64 *p_counter);
+int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+			    struct mlxsw_sp_nexthop *nh);
+void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_nexthop *nh);
+void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_nexthop *nh);
 
 #endif /* _MLXSW_ROUTER_H_*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index d39ffbfcc436..0f9eac5f4ebf 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -67,7 +67,6 @@ struct mlxsw_sp_bridge {
 	u32 ageing_time;
 	bool vlan_enabled_exists;
 	struct list_head bridges_list;
-	struct list_head mids_list;
 	DECLARE_BITMAP(mids_bitmap, MLXSW_SP_MID_MAX);
 	const struct mlxsw_sp_bridge_ops *bridge_8021q_ops;
 	const struct mlxsw_sp_bridge_ops *bridge_8021d_ops;
@@ -77,6 +76,7 @@ struct mlxsw_sp_bridge_device {
 	struct net_device *dev;
 	struct list_head list;
 	struct list_head ports_list;
+	struct list_head mids_list;
 	u8 vlan_enabled:1,
 	   multicast_enabled:1;
 	const struct mlxsw_sp_bridge_ops *ops;
@@ -121,6 +121,20 @@ mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp,
 			       struct mlxsw_sp_bridge_port *bridge_port,
 			       u16 fid_index);
 
+static void
+mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port,
+			       struct mlxsw_sp_bridge_port *bridge_port);
+
+static void
+mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct mlxsw_sp_bridge_device
+				   *bridge_device);
+
+static void
+mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 bool add);
+
 static struct mlxsw_sp_bridge_device *
 mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge,
 			    const struct net_device *br_dev)
@@ -161,6 +175,7 @@ mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
 	} else {
 		bridge_device->ops = bridge->bridge_8021d_ops;
 	}
+	INIT_LIST_HEAD(&bridge_device->mids_list);
 	list_add(&bridge_device->list, &bridge->bridges_list);
 
 	return bridge_device;
@@ -174,6 +189,7 @@ mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge,
 	if (bridge_device->vlan_enabled)
 		bridge->vlan_enabled_exists = false;
 	WARN_ON(!list_empty(&bridge_device->ports_list));
+	WARN_ON(!list_empty(&bridge_device->mids_list));
 	kfree(bridge_device);
 }
 
@@ -249,7 +265,8 @@ mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device,
 	bridge_port->dev = brport_dev;
 	bridge_port->bridge_device = bridge_device;
 	bridge_port->stp_state = BR_STATE_DISABLED;
-	bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC;
+	bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC |
+			     BR_MCAST_FLOOD;
 	INIT_LIST_HEAD(&bridge_port->vlans_list);
 	list_add(&bridge_port->list, &bridge_device->ports_list);
 	bridge_port->ref_count = 1;
@@ -455,7 +472,8 @@ static int mlxsw_sp_port_attr_get(struct net_device *dev,
 					       &attr->u.brport_flags);
 		break;
 	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
-		attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD;
+		attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD |
+					       BR_MCAST_FLOOD;
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -640,8 +658,18 @@ static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port,
 	if (err)
 		return err;
 
-	memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags));
+	if (bridge_port->bridge_device->multicast_enabled)
+		goto out;
 
+	err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port,
+						   MLXSW_SP_FLOOD_TYPE_MC,
+						   brport_flags &
+						   BR_MCAST_FLOOD);
+	if (err)
+		return err;
+
+out:
+	memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags));
 	return 0;
 }
 
@@ -699,10 +727,10 @@ static int mlxsw_sp_port_attr_br_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port,
 	return -EINVAL;
 }
 
-static int mlxsw_sp_port_attr_mc_router_set(struct mlxsw_sp_port *mlxsw_sp_port,
-					    struct switchdev_trans *trans,
-					    struct net_device *orig_dev,
-					    bool is_port_mc_router)
+static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					  struct switchdev_trans *trans,
+					  struct net_device *orig_dev,
+					  bool is_port_mrouter)
 {
 	struct mlxsw_sp_bridge_port *bridge_port;
 	int err;
@@ -720,15 +748,26 @@ static int mlxsw_sp_port_attr_mc_router_set(struct mlxsw_sp_port *mlxsw_sp_port,
 
 	err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port,
 						   MLXSW_SP_FLOOD_TYPE_MC,
-						   is_port_mc_router);
+						   is_port_mrouter);
 	if (err)
 		return err;
 
+	mlxsw_sp_port_mrouter_update_mdb(mlxsw_sp_port, bridge_port,
+					 is_port_mrouter);
 out:
-	bridge_port->mrouter = is_port_mc_router;
+	bridge_port->mrouter = is_port_mrouter;
 	return 0;
 }
 
+static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port)
+{
+	const struct mlxsw_sp_bridge_device *bridge_device;
+
+	bridge_device = bridge_port->bridge_device;
+	return bridge_device->multicast_enabled ? bridge_port->mrouter :
+					bridge_port->flags & BR_MCAST_FLOOD;
+}
+
 static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
 					 struct switchdev_trans *trans,
 					 struct net_device *orig_dev,
@@ -749,9 +788,15 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
 	if (!bridge_device)
 		return 0;
 
+	if (bridge_device->multicast_enabled != !mc_disabled) {
+		bridge_device->multicast_enabled = !mc_disabled;
+		mlxsw_sp_bridge_mdb_mc_enable_sync(mlxsw_sp_port,
+						   bridge_device);
+	}
+
 	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
 		enum mlxsw_sp_flood_type packet_type = MLXSW_SP_FLOOD_TYPE_MC;
-		bool member = mc_disabled ? true : bridge_port->mrouter;
+		bool member = mlxsw_sp_mc_flood(bridge_port);
 
 		err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port,
 							   bridge_port,
@@ -793,9 +838,9 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
 						     attr->u.vlan_filtering);
 		break;
 	case SWITCHDEV_ATTR_ID_PORT_MROUTER:
-		err = mlxsw_sp_port_attr_mc_router_set(mlxsw_sp_port, trans,
-						       attr->orig_dev,
-						       attr->u.mrouter);
+		err = mlxsw_sp_port_attr_mrouter_set(mlxsw_sp_port, trans,
+						     attr->orig_dev,
+						     attr->u.mrouter);
 		break;
 	case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED:
 		err = mlxsw_sp_port_mc_disabled_set(mlxsw_sp_port, trans,
@@ -810,14 +855,6 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
 	return err;
 }
 
-static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port)
-{
-	const struct mlxsw_sp_bridge_device *bridge_device;
-
-	bridge_device = bridge_port->bridge_device;
-	return !bridge_device->multicast_enabled ? true : bridge_port->mrouter;
-}
-
 static int
 mlxsw_sp_port_vlan_fid_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
 			    struct mlxsw_sp_bridge_port *bridge_port)
@@ -955,24 +992,28 @@ mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
 	struct mlxsw_sp_bridge_vlan *bridge_vlan;
 	struct mlxsw_sp_bridge_port *bridge_port;
 	u16 vid = mlxsw_sp_port_vlan->vid;
-	bool last;
+	bool last_port, last_vlan;
 
 	if (WARN_ON(mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021Q &&
 		    mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021D))
 		return;
 
 	bridge_port = mlxsw_sp_port_vlan->bridge_port;
+	last_vlan = list_is_singular(&bridge_port->vlans_list);
 	bridge_vlan = mlxsw_sp_bridge_vlan_find(bridge_port, vid);
-	last = list_is_singular(&bridge_vlan->port_vlan_list);
+	last_port = list_is_singular(&bridge_vlan->port_vlan_list);
 
 	list_del(&mlxsw_sp_port_vlan->bridge_vlan_node);
 	mlxsw_sp_bridge_vlan_put(bridge_vlan);
 	mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_DISABLED);
 	mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false);
-	if (last)
+	if (last_port)
 		mlxsw_sp_bridge_port_fdb_flush(mlxsw_sp_port->mlxsw_sp,
 					       bridge_port,
 					       mlxsw_sp_fid_index(fid));
+	if (last_vlan)
+		mlxsw_sp_bridge_port_mdb_flush(mlxsw_sp_port, bridge_port);
+
 	mlxsw_sp_port_vlan_fid_leave(mlxsw_sp_port_vlan);
 
 	mlxsw_sp_bridge_port_put(mlxsw_sp_port->mlxsw_sp->bridge, bridge_port);
@@ -1182,7 +1223,7 @@ mlxsw_sp_port_fdb_set(struct mlxsw_sp_port *mlxsw_sp_port,
 }
 
 static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr,
-				u16 fid, u16 mid, bool adding)
+				u16 fid, u16 mid_idx, bool adding)
 {
 	char *sfd_pl;
 	int err;
@@ -1193,16 +1234,15 @@ static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr,
 
 	mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0);
 	mlxsw_reg_sfd_mc_pack(sfd_pl, 0, addr, fid,
-			      MLXSW_REG_SFD_REC_ACTION_NOP, mid);
+			      MLXSW_REG_SFD_REC_ACTION_NOP, mid_idx);
 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl);
 	kfree(sfd_pl);
 	return err;
 }
 
-static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mid,
-				  bool add, bool clear_all_ports)
+static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx,
+					 long *ports_bitmap)
 {
-	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	char *smid_pl;
 	int err, i;
 
@@ -1210,66 +1250,201 @@ static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mid,
 	if (!smid_pl)
 		return -ENOMEM;
 
-	mlxsw_reg_smid_pack(smid_pl, mid, mlxsw_sp_port->local_port, add);
-	if (clear_all_ports) {
-		for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++)
-			if (mlxsw_sp->ports[i])
-				mlxsw_reg_smid_port_mask_set(smid_pl, i, 1);
+	mlxsw_reg_smid_pack(smid_pl, mid_idx, 0, false);
+	for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++) {
+		if (mlxsw_sp->ports[i])
+			mlxsw_reg_smid_port_mask_set(smid_pl, i, 1);
 	}
+
+	for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core))
+		mlxsw_reg_smid_port_set(smid_pl, i, 1);
+
 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
 	kfree(smid_pl);
 	return err;
 }
 
-static struct mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp *mlxsw_sp,
-					      const unsigned char *addr,
-					      u16 fid)
+static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				  u16 mid_idx, bool add)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char *smid_pl;
+	int err;
+
+	smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL);
+	if (!smid_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_smid_pack(smid_pl, mid_idx, mlxsw_sp_port->local_port, add);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
+	kfree(smid_pl);
+	return err;
+}
+
+static struct
+mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp_bridge_device *bridge_device,
+				const unsigned char *addr,
+				u16 fid)
 {
 	struct mlxsw_sp_mid *mid;
 
-	list_for_each_entry(mid, &mlxsw_sp->bridge->mids_list, list) {
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
 		if (ether_addr_equal(mid->addr, addr) && mid->fid == fid)
 			return mid;
 	}
 	return NULL;
 }
 
-static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp,
-						const unsigned char *addr,
-						u16 fid)
+static void
+mlxsw_sp_bridge_port_get_ports_bitmap(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_bridge_port *bridge_port,
+				      unsigned long *ports_bitmap)
 {
-	struct mlxsw_sp_mid *mid;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u64 max_lag_members, i;
+	int lag_id;
+
+	if (!bridge_port->lagged) {
+		set_bit(bridge_port->system_port, ports_bitmap);
+	} else {
+		max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						     MAX_LAG_MEMBERS);
+		lag_id = bridge_port->lag_id;
+		for (i = 0; i < max_lag_members; i++) {
+			mlxsw_sp_port = mlxsw_sp_port_lagged_get(mlxsw_sp,
+								 lag_id, i);
+			if (mlxsw_sp_port)
+				set_bit(mlxsw_sp_port->local_port,
+					ports_bitmap);
+		}
+	}
+}
+
+static void
+mlxsw_sp_mc_get_mrouters_bitmap(unsigned long *flood_bitmap,
+				struct mlxsw_sp_bridge_device *bridge_device,
+				struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+
+	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
+		if (bridge_port->mrouter) {
+			mlxsw_sp_bridge_port_get_ports_bitmap(mlxsw_sp,
+							      bridge_port,
+							      flood_bitmap);
+		}
+	}
+}
+
+static bool
+mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_mid *mid,
+			    struct mlxsw_sp_bridge_device *bridge_device)
+{
+	long *flood_bitmap;
+	int num_of_ports;
+	int alloc_size;
 	u16 mid_idx;
+	int err;
 
 	mid_idx = find_first_zero_bit(mlxsw_sp->bridge->mids_bitmap,
 				      MLXSW_SP_MID_MAX);
 	if (mid_idx == MLXSW_SP_MID_MAX)
-		return NULL;
+		return false;
+
+	num_of_ports = mlxsw_core_max_ports(mlxsw_sp->core);
+	alloc_size = sizeof(long) * BITS_TO_LONGS(num_of_ports);
+	flood_bitmap = kzalloc(alloc_size, GFP_KERNEL);
+	if (!flood_bitmap)
+		return false;
+
+	bitmap_copy(flood_bitmap,  mid->ports_in_mid, num_of_ports);
+	mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp);
+
+	mid->mid = mid_idx;
+	err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap);
+	kfree(flood_bitmap);
+	if (err)
+		return false;
+
+	err = mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid_idx,
+				   true);
+	if (err)
+		return false;
+
+	set_bit(mid_idx, mlxsw_sp->bridge->mids_bitmap);
+	mid->in_hw = true;
+	return true;
+}
+
+static int mlxsw_sp_mc_remove_mdb_entry(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_mid *mid)
+{
+	if (!mid->in_hw)
+		return 0;
+
+	clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap);
+	mid->in_hw = false;
+	return mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid->mid,
+				    false);
+}
+
+static struct
+mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_bridge_device *bridge_device,
+				  const unsigned char *addr,
+				  u16 fid)
+{
+	struct mlxsw_sp_mid *mid;
+	size_t alloc_size;
 
 	mid = kzalloc(sizeof(*mid), GFP_KERNEL);
 	if (!mid)
 		return NULL;
 
-	set_bit(mid_idx, mlxsw_sp->bridge->mids_bitmap);
+	alloc_size = sizeof(unsigned long) *
+		     BITS_TO_LONGS(mlxsw_core_max_ports(mlxsw_sp->core));
+
+	mid->ports_in_mid = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mid->ports_in_mid)
+		goto err_ports_in_mid_alloc;
+
 	ether_addr_copy(mid->addr, addr);
 	mid->fid = fid;
-	mid->mid = mid_idx;
-	mid->ref_count = 0;
-	list_add_tail(&mid->list, &mlxsw_sp->bridge->mids_list);
+	mid->in_hw = false;
+
+	if (!bridge_device->multicast_enabled)
+		goto out;
+
+	if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid, bridge_device))
+		goto err_write_mdb_entry;
 
+out:
+	list_add_tail(&mid->list, &bridge_device->mids_list);
 	return mid;
+
+err_write_mdb_entry:
+	kfree(mid->ports_in_mid);
+err_ports_in_mid_alloc:
+	kfree(mid);
+	return NULL;
 }
 
-static int __mlxsw_sp_mc_dec_ref(struct mlxsw_sp *mlxsw_sp,
-				 struct mlxsw_sp_mid *mid)
+static int mlxsw_sp_port_remove_from_mid(struct mlxsw_sp_port *mlxsw_sp_port,
+					 struct mlxsw_sp_mid *mid)
 {
-	if (--mid->ref_count == 0) {
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	int err = 0;
+
+	clear_bit(mlxsw_sp_port->local_port, mid->ports_in_mid);
+	if (bitmap_empty(mid->ports_in_mid,
+			 mlxsw_core_max_ports(mlxsw_sp->core))) {
+		err = mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid);
 		list_del(&mid->list);
-		clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap);
+		kfree(mid->ports_in_mid);
 		kfree(mid);
-		return 1;
 	}
-	return 0;
+	return err;
 }
 
 static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port,
@@ -1302,39 +1477,72 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port,
 
 	fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid);
 
-	mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, fid_index);
+	mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index);
 	if (!mid) {
-		mid = __mlxsw_sp_mc_alloc(mlxsw_sp, mdb->addr, fid_index);
+		mid = __mlxsw_sp_mc_alloc(mlxsw_sp, bridge_device, mdb->addr,
+					  fid_index);
 		if (!mid) {
 			netdev_err(dev, "Unable to allocate MC group\n");
 			return -ENOMEM;
 		}
 	}
-	mid->ref_count++;
+	set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid);
+
+	if (!bridge_device->multicast_enabled)
+		return 0;
+
+	if (bridge_port->mrouter)
+		return 0;
 
-	err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true,
-				     mid->ref_count == 1);
+	err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true);
 	if (err) {
 		netdev_err(dev, "Unable to set SMID\n");
 		goto err_out;
 	}
 
-	if (mid->ref_count == 1) {
-		err = mlxsw_sp_port_mdb_op(mlxsw_sp, mdb->addr, fid_index,
-					   mid->mid, true);
-		if (err) {
-			netdev_err(dev, "Unable to set MC SFD\n");
-			goto err_out;
-		}
-	}
-
 	return 0;
 
 err_out:
-	__mlxsw_sp_mc_dec_ref(mlxsw_sp, mid);
+	mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid);
 	return err;
 }
 
+static void
+mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct mlxsw_sp_bridge_device
+				   *bridge_device)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_mid *mid;
+	bool mc_enabled;
+
+	mc_enabled = bridge_device->multicast_enabled;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
+		if (mc_enabled)
+			mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid,
+						    bridge_device);
+		else
+			mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid);
+	}
+}
+
+static void
+mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 bool add)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_mid *mid;
+
+	bridge_device = bridge_port->bridge_device;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
+		if (!test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid))
+			mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, add);
+	}
+}
+
 static int mlxsw_sp_port_obj_add(struct net_device *dev,
 				 const struct switchdev_obj *obj,
 				 struct switchdev_trans *trans)
@@ -1399,6 +1607,30 @@ static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port,
 	return 0;
 }
 
+static int
+__mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
+			struct mlxsw_sp_bridge_port *bridge_port,
+			struct mlxsw_sp_mid *mid)
+{
+	struct net_device *dev = mlxsw_sp_port->dev;
+	int err;
+
+	if (bridge_port->bridge_device->multicast_enabled) {
+		if (bridge_port->bridge_device->multicast_enabled) {
+			err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid,
+						     false);
+			if (err)
+				netdev_err(dev, "Unable to remove port from SMID\n");
+		}
+	}
+
+	err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid);
+	if (err)
+		netdev_err(dev, "Unable to remove MC SFD\n");
+
+	return err;
+}
+
 static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
 				 const struct switchdev_obj_port_mdb *mdb)
 {
@@ -1410,8 +1642,6 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
 	struct mlxsw_sp_bridge_port *bridge_port;
 	struct mlxsw_sp_mid *mid;
 	u16 fid_index;
-	u16 mid_idx;
-	int err = 0;
 
 	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev);
 	if (!bridge_port)
@@ -1426,25 +1656,33 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
 
 	fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid);
 
-	mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, fid_index);
+	mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index);
 	if (!mid) {
 		netdev_err(dev, "Unable to remove port from MC DB\n");
 		return -EINVAL;
 	}
 
-	err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false, false);
-	if (err)
-		netdev_err(dev, "Unable to remove port from SMID\n");
+	return __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, mid);
+}
 
-	mid_idx = mid->mid;
-	if (__mlxsw_sp_mc_dec_ref(mlxsw_sp, mid)) {
-		err = mlxsw_sp_port_mdb_op(mlxsw_sp, mdb->addr, fid_index,
-					   mid_idx, false);
-		if (err)
-			netdev_err(dev, "Unable to remove MC SFD\n");
-	}
+static void
+mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port,
+			       struct mlxsw_sp_bridge_port *bridge_port)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_mid *mid, *tmp;
 
-	return err;
+	bridge_device = bridge_port->bridge_device;
+
+	list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) {
+		if (test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) {
+			__mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port,
+						mid);
+		} else if (bridge_device->multicast_enabled &&
+			   bridge_port->mrouter) {
+			mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false);
+		}
+	}
 }
 
 static int mlxsw_sp_port_obj_del(struct net_device *dev,
@@ -1981,17 +2219,6 @@ static void mlxsw_sp_fdb_fini(struct mlxsw_sp *mlxsw_sp)
 
 }
 
-static void mlxsw_sp_mids_fini(struct mlxsw_sp *mlxsw_sp)
-{
-	struct mlxsw_sp_mid *mid, *tmp;
-
-	list_for_each_entry_safe(mid, tmp, &mlxsw_sp->bridge->mids_list, list) {
-		list_del(&mid->list);
-		clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap);
-		kfree(mid);
-	}
-}
-
 int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct mlxsw_sp_bridge *bridge;
@@ -2003,7 +2230,6 @@ int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp)
 	bridge->mlxsw_sp = mlxsw_sp;
 
 	INIT_LIST_HEAD(&mlxsw_sp->bridge->bridges_list);
-	INIT_LIST_HEAD(&mlxsw_sp->bridge->mids_list);
 
 	bridge->bridge_8021q_ops = &mlxsw_sp_bridge_8021q_ops;
 	bridge->bridge_8021d_ops = &mlxsw_sp_bridge_8021d_ops;
@@ -2014,7 +2240,6 @@ int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp)
 void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp)
 {
 	mlxsw_sp_fdb_fini(mlxsw_sp);
-	mlxsw_sp_mids_fini(mlxsw_sp);
 	WARN_ON(!list_empty(&mlxsw_sp->bridge->bridges_list));
 	kfree(mlxsw_sp->bridge);
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h
index f396a1fef633..ec6cef8267ae 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/trap.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h
@@ -62,6 +62,8 @@ enum {
 	MLXSW_TRAP_ID_TTLERROR = 0x53,
 	MLXSW_TRAP_ID_LBERROR = 0x54,
 	MLXSW_TRAP_ID_IPV4_OSPF = 0x55,
+	MLXSW_TRAP_ID_IPV4_PIM = 0x58,
+	MLXSW_TRAP_ID_RPF = 0x5C,
 	MLXSW_TRAP_ID_IP2ME = 0x5F,
 	MLXSW_TRAP_ID_IPV6_UNSPECIFIED_ADDRESS = 0x60,
 	MLXSW_TRAP_ID_IPV6_LINK_LOCAL_DEST = 0x61,
@@ -89,6 +91,10 @@ enum {
 	MLXSW_TRAP_ID_ROUTER_ALERT_IPV4 = 0xD6,
 	MLXSW_TRAP_ID_ROUTER_ALERT_IPV6 = 0xD7,
 	MLXSW_TRAP_ID_ACL0 = 0x1C0,
+	/* Multicast trap used for routes with trap action */
+	MLXSW_TRAP_ID_ACL1 = 0x1C1,
+	/* Multicast trap used for routes with trap-and-forward action */
+	MLXSW_TRAP_ID_ACL2 = 0x1C2,
 
 	MLXSW_TRAP_ID_MAX = 0x1FF
 };
diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c
index 18af2a23a933..dedeacd0bbca 100644
--- a/drivers/net/ethernet/natsemi/natsemi.c
+++ b/drivers/net/ethernet/natsemi/natsemi.c
@@ -1571,10 +1571,8 @@ static int netdev_open(struct net_device *dev)
 			dev->name, (int)readl(ioaddr + ChipCmd));
 
 	/* Set the timer to check for link beat. */
-	init_timer(&np->timer);
+	setup_timer(&np->timer, netdev_timer, (unsigned long)dev);
 	np->timer.expires = round_jiffies(jiffies + NATSEMI_TIMER_FREQ);
-	np->timer.data = (unsigned long)dev;
-	np->timer.function = netdev_timer; /* timer handler */
 	add_timer(&np->timer);
 
 	return 0;
diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c
index 729095db3e08..99d3c7884a4a 100644
--- a/drivers/net/ethernet/natsemi/ns83820.c
+++ b/drivers/net/ethernet/natsemi/ns83820.c
@@ -1652,9 +1652,7 @@ static int ns83820_open(struct net_device *ndev)
 	writel(0, dev->base + TXDP_HI);
 	writel(desc, dev->base + TXDP);
 
-	init_timer(&dev->tx_watchdog);
-	dev->tx_watchdog.data = (unsigned long)ndev;
-	dev->tx_watchdog.function = ns83820_tx_watch;
+	setup_timer(&dev->tx_watchdog, ns83820_tx_watch, (unsigned long)ndev);
 	mod_timer(&dev->tx_watchdog, jiffies + 2*HZ);
 
 	netif_start_queue(ndev);	/* FIXME: wait for phy to come up */
diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile
index 96e579a15cbe..becaacf1554d 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -37,7 +37,8 @@ nfp-objs += \
 	    flower/main.o \
 	    flower/match.o \
 	    flower/metadata.o \
-	    flower/offload.o
+	    flower/offload.o \
+	    flower/tunnel_conf.o
 endif
 
 ifeq ($(CONFIG_BPF_SYSCALL),y)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index db9750695dc7..38f3835ae176 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -37,6 +37,7 @@
 #include <net/tc_act/tc_gact.h>
 #include <net/tc_act/tc_mirred.h>
 #include <net/tc_act/tc_vlan.h>
+#include <net/tc_act/tc_tunnel_key.h>
 
 #include "cmsg.h"
 #include "main.h"
@@ -80,14 +81,27 @@ nfp_fl_push_vlan(struct nfp_fl_push_vlan *push_vlan,
 	push_vlan->vlan_tci = cpu_to_be16(tmp_push_vlan_tci);
 }
 
+static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev,
+					 enum nfp_flower_tun_type tun_type)
+{
+	if (!out_dev->rtnl_link_ops)
+		return false;
+
+	if (!strcmp(out_dev->rtnl_link_ops->kind, "vxlan"))
+		return tun_type == NFP_FL_TUNNEL_VXLAN;
+
+	return false;
+}
+
 static int
 nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
 	      struct nfp_fl_payload *nfp_flow, bool last,
-	      struct net_device *in_dev)
+	      struct net_device *in_dev, enum nfp_flower_tun_type tun_type,
+	      int *tun_out_cnt)
 {
 	size_t act_size = sizeof(struct nfp_fl_output);
+	u16 tmp_output_op, tmp_flags;
 	struct net_device *out_dev;
-	u16 tmp_output_op;
 	int ifindex;
 
 	/* Set action opcode to output action. */
@@ -97,25 +111,114 @@ nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
 
 	output->a_op = cpu_to_be16(tmp_output_op);
 
-	/* Set action output parameters. */
-	output->flags = cpu_to_be16(last ? NFP_FL_OUT_FLAGS_LAST : 0);
-
 	ifindex = tcf_mirred_ifindex(action);
 	out_dev = __dev_get_by_index(dev_net(in_dev), ifindex);
 	if (!out_dev)
 		return -EOPNOTSUPP;
 
-	/* Only offload egress ports are on the same device as the ingress
-	 * port.
+	tmp_flags = last ? NFP_FL_OUT_FLAGS_LAST : 0;
+
+	if (tun_type) {
+		/* Verify the egress netdev matches the tunnel type. */
+		if (!nfp_fl_netdev_is_tunnel_type(out_dev, tun_type))
+			return -EOPNOTSUPP;
+
+		if (*tun_out_cnt)
+			return -EOPNOTSUPP;
+		(*tun_out_cnt)++;
+
+		output->flags = cpu_to_be16(tmp_flags |
+					    NFP_FL_OUT_FLAGS_USE_TUN);
+		output->port = cpu_to_be32(NFP_FL_PORT_TYPE_TUN | tun_type);
+	} else {
+		/* Set action output parameters. */
+		output->flags = cpu_to_be16(tmp_flags);
+
+		/* Only offload if egress ports are on the same device as the
+		 * ingress port.
+		 */
+		if (!switchdev_port_same_parent_id(in_dev, out_dev))
+			return -EOPNOTSUPP;
+
+		output->port = cpu_to_be32(nfp_repr_get_port_id(out_dev));
+		if (!output->port)
+			return -EOPNOTSUPP;
+	}
+	nfp_flow->meta.shortcut = output->port;
+
+	return 0;
+}
+
+static bool nfp_fl_supported_tun_port(const struct tc_action *action)
+{
+	struct ip_tunnel_info *tun = tcf_tunnel_info(action);
+
+	return tun->key.tp_dst == htons(NFP_FL_VXLAN_PORT);
+}
+
+static struct nfp_fl_pre_tunnel *nfp_fl_pre_tunnel(char *act_data, int act_len)
+{
+	size_t act_size = sizeof(struct nfp_fl_pre_tunnel);
+	struct nfp_fl_pre_tunnel *pre_tun_act;
+	u16 tmp_pre_tun_op;
+
+	/* Pre_tunnel action must be first on action list.
+	 * If other actions already exist they need pushed forward.
 	 */
-	if (!switchdev_port_same_parent_id(in_dev, out_dev))
-		return -EOPNOTSUPP;
+	if (act_len)
+		memmove(act_data + act_size, act_data, act_len);
+
+	pre_tun_act = (struct nfp_fl_pre_tunnel *)act_data;
+
+	memset(pre_tun_act, 0, act_size);
+
+	tmp_pre_tun_op =
+		FIELD_PREP(NFP_FL_ACT_LEN_LW, act_size >> NFP_FL_LW_SIZ) |
+		FIELD_PREP(NFP_FL_ACT_JMP_ID, NFP_FL_ACTION_OPCODE_PRE_TUNNEL);
+
+	pre_tun_act->a_op = cpu_to_be16(tmp_pre_tun_op);
 
-	output->port = cpu_to_be32(nfp_repr_get_port_id(out_dev));
-	if (!output->port)
+	return pre_tun_act;
+}
+
+static int
+nfp_fl_set_vxlan(struct nfp_fl_set_vxlan *set_vxlan,
+		 const struct tc_action *action,
+		 struct nfp_fl_pre_tunnel *pre_tun)
+{
+	struct ip_tunnel_info *vxlan = tcf_tunnel_info(action);
+	size_t act_size = sizeof(struct nfp_fl_set_vxlan);
+	u32 tmp_set_vxlan_type_index = 0;
+	u16 tmp_set_vxlan_op;
+	/* Currently support one pre-tunnel so index is always 0. */
+	int pretun_idx = 0;
+
+	if (vxlan->options_len) {
+		/* Do not support options e.g. vxlan gpe. */
 		return -EOPNOTSUPP;
+	}
 
-	nfp_flow->meta.shortcut = output->port;
+	tmp_set_vxlan_op =
+		FIELD_PREP(NFP_FL_ACT_LEN_LW, act_size >> NFP_FL_LW_SIZ) |
+		FIELD_PREP(NFP_FL_ACT_JMP_ID,
+			   NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL);
+
+	set_vxlan->a_op = cpu_to_be16(tmp_set_vxlan_op);
+
+	/* Set tunnel type and pre-tunnel index. */
+	tmp_set_vxlan_type_index |=
+		FIELD_PREP(NFP_FL_IPV4_TUNNEL_TYPE, NFP_FL_TUNNEL_VXLAN) |
+		FIELD_PREP(NFP_FL_IPV4_PRE_TUN_INDEX, pretun_idx);
+
+	set_vxlan->tun_type_index = cpu_to_be32(tmp_set_vxlan_type_index);
+
+	set_vxlan->tun_id = vxlan->key.tun_id;
+	set_vxlan->tun_flags = vxlan->key.tun_flags;
+	set_vxlan->ipv4_ttl = vxlan->key.ttl;
+	set_vxlan->ipv4_tos = vxlan->key.tos;
+
+	/* Complete pre_tunnel action. */
+	pre_tun->ipv4_dst = vxlan->key.u.ipv4.dst;
 
 	return 0;
 }
@@ -123,8 +226,11 @@ nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
 static int
 nfp_flower_loop_action(const struct tc_action *a,
 		       struct nfp_fl_payload *nfp_fl, int *a_len,
-		       struct net_device *netdev)
+		       struct net_device *netdev,
+		       enum nfp_flower_tun_type *tun_type, int *tun_out_cnt)
 {
+	struct nfp_fl_pre_tunnel *pre_tun;
+	struct nfp_fl_set_vxlan *s_vxl;
 	struct nfp_fl_push_vlan *psh_v;
 	struct nfp_fl_pop_vlan *pop_v;
 	struct nfp_fl_output *output;
@@ -137,7 +243,8 @@ nfp_flower_loop_action(const struct tc_action *a,
 			return -EOPNOTSUPP;
 
 		output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
-		err = nfp_fl_output(output, a, nfp_fl, true, netdev);
+		err = nfp_fl_output(output, a, nfp_fl, true, netdev, *tun_type,
+				    tun_out_cnt);
 		if (err)
 			return err;
 
@@ -147,7 +254,8 @@ nfp_flower_loop_action(const struct tc_action *a,
 			return -EOPNOTSUPP;
 
 		output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
-		err = nfp_fl_output(output, a, nfp_fl, false, netdev);
+		err = nfp_fl_output(output, a, nfp_fl, false, netdev, *tun_type,
+				    tun_out_cnt);
 		if (err)
 			return err;
 
@@ -170,6 +278,29 @@ nfp_flower_loop_action(const struct tc_action *a,
 
 		nfp_fl_push_vlan(psh_v, a);
 		*a_len += sizeof(struct nfp_fl_push_vlan);
+	} else if (is_tcf_tunnel_set(a) && nfp_fl_supported_tun_port(a)) {
+		/* Pre-tunnel action is required for tunnel encap.
+		 * This checks for next hop entries on NFP.
+		 * If none, the packet falls back before applying other actions.
+		 */
+		if (*a_len + sizeof(struct nfp_fl_pre_tunnel) +
+		    sizeof(struct nfp_fl_set_vxlan) > NFP_FL_MAX_A_SIZ)
+			return -EOPNOTSUPP;
+
+		*tun_type = NFP_FL_TUNNEL_VXLAN;
+		pre_tun = nfp_fl_pre_tunnel(nfp_fl->action_data, *a_len);
+		nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL);
+		*a_len += sizeof(struct nfp_fl_pre_tunnel);
+
+		s_vxl = (struct nfp_fl_set_vxlan *)&nfp_fl->action_data[*a_len];
+		err = nfp_fl_set_vxlan(s_vxl, a, pre_tun);
+		if (err)
+			return err;
+
+		*a_len += sizeof(struct nfp_fl_set_vxlan);
+	} else if (is_tcf_tunnel_release(a)) {
+		/* Tunnel decap is handled by default so accept action. */
+		return 0;
 	} else {
 		/* Currently we do not handle any other actions. */
 		return -EOPNOTSUPP;
@@ -182,18 +313,22 @@ int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
 			      struct net_device *netdev,
 			      struct nfp_fl_payload *nfp_flow)
 {
-	int act_len, act_cnt, err;
+	int act_len, act_cnt, err, tun_out_cnt;
+	enum nfp_flower_tun_type tun_type;
 	const struct tc_action *a;
 	LIST_HEAD(actions);
 
 	memset(nfp_flow->action_data, 0, NFP_FL_MAX_A_SIZ);
 	nfp_flow->meta.act_len = 0;
+	tun_type = NFP_FL_TUNNEL_NONE;
 	act_len = 0;
 	act_cnt = 0;
+	tun_out_cnt = 0;
 
 	tcf_exts_to_list(flow->exts, &actions);
 	list_for_each_entry(a, &actions, list) {
-		err = nfp_flower_loop_action(a, nfp_flow, &act_len, netdev);
+		err = nfp_flower_loop_action(a, nfp_flow, &act_len, netdev,
+					     &tun_type, &tun_out_cnt);
 		if (err)
 			return err;
 		act_cnt++;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index c3ca05d10fe1..6b71c719deba 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -38,17 +38,10 @@
 #include <net/dst_metadata.h>
 
 #include "main.h"
-#include "../nfpcore/nfp_cpp.h"
 #include "../nfp_net.h"
 #include "../nfp_net_repr.h"
 #include "./cmsg.h"
 
-#define nfp_flower_cmsg_warn(app, fmt, args...)				\
-	do {								\
-		if (net_ratelimit())					\
-			nfp_warn((app)->cpp, fmt, ## args);		\
-	} while (0)
-
 static struct nfp_flower_cmsg_hdr *
 nfp_flower_cmsg_get_hdr(struct sk_buff *skb)
 {
@@ -188,6 +181,15 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 	case NFP_FLOWER_CMSG_TYPE_FLOW_STATS:
 		nfp_flower_rx_flow_stats(app, skb);
 		break;
+	case NFP_FLOWER_CMSG_TYPE_NO_NEIGH:
+		nfp_tunnel_request_route(app, skb);
+		break;
+	case NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS:
+		nfp_tunnel_keep_alive(app, skb);
+		break;
+	case NFP_FLOWER_CMSG_TYPE_TUN_NEIGH:
+		/* Acks from the NFP that the route is added - ignore. */
+		break;
 	default:
 		nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n",
 				     type);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index a2ec60344236..504ddaa21701 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -39,6 +39,7 @@
 #include <linux/types.h>
 
 #include "../nfp_app.h"
+#include "../nfpcore/nfp_cpp.h"
 
 #define NFP_FLOWER_LAYER_META		BIT(0)
 #define NFP_FLOWER_LAYER_PORT		BIT(1)
@@ -67,10 +68,12 @@
 #define NFP_FL_LW_SIZ			2
 
 /* Action opcodes */
-#define NFP_FL_ACTION_OPCODE_OUTPUT	0
-#define NFP_FL_ACTION_OPCODE_PUSH_VLAN	1
-#define NFP_FL_ACTION_OPCODE_POP_VLAN	2
-#define NFP_FL_ACTION_OPCODE_NUM	32
+#define NFP_FL_ACTION_OPCODE_OUTPUT		0
+#define NFP_FL_ACTION_OPCODE_PUSH_VLAN		1
+#define NFP_FL_ACTION_OPCODE_POP_VLAN		2
+#define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL	6
+#define NFP_FL_ACTION_OPCODE_PRE_TUNNEL		17
+#define NFP_FL_ACTION_OPCODE_NUM		32
 
 #define NFP_FL_ACT_JMP_ID		GENMASK(15, 8)
 #define NFP_FL_ACT_LEN_LW		GENMASK(7, 0)
@@ -83,6 +86,22 @@
 #define NFP_FL_PUSH_VLAN_CFI		BIT(12)
 #define NFP_FL_PUSH_VLAN_VID		GENMASK(11, 0)
 
+/* Tunnel ports */
+#define NFP_FL_PORT_TYPE_TUN		0x50000000
+#define NFP_FL_IPV4_TUNNEL_TYPE		GENMASK(7, 4)
+#define NFP_FL_IPV4_PRE_TUN_INDEX	GENMASK(2, 0)
+
+#define nfp_flower_cmsg_warn(app, fmt, args...)                         \
+	do {                                                            \
+		if (net_ratelimit())                                    \
+			nfp_warn((app)->cpp, fmt, ## args);             \
+	} while (0)
+
+enum nfp_flower_tun_type {
+	NFP_FL_TUNNEL_NONE =	0,
+	NFP_FL_TUNNEL_VXLAN =	2,
+};
+
 struct nfp_fl_output {
 	__be16 a_op;
 	__be16 flags;
@@ -115,6 +134,25 @@ struct nfp_flower_meta_one {
 	u16 reserved;
 };
 
+struct nfp_fl_pre_tunnel {
+	__be16 a_op;
+	__be16 reserved;
+	__be32 ipv4_dst;
+	/* reserved for use with IPv6 addresses */
+	__be32 extra[3];
+};
+
+struct nfp_fl_set_vxlan {
+	__be16 a_op;
+	__be16 reserved;
+	__be64 tun_id;
+	__be32 tun_type_index;
+	__be16 tun_flags;
+	u8 ipv4_ttl;
+	u8 ipv4_tos;
+	__be32 extra[2];
+} __packed;
+
 /* Metadata with L2 (1W/4B)
  * ----------------------------------------------------------------
  *    3                   2                   1
@@ -230,6 +268,36 @@ struct nfp_flower_ipv6 {
 	struct in6_addr ipv6_dst;
 };
 
+/* Flow Frame VXLAN --> Tunnel details (4W/16B)
+ * -----------------------------------------------------------------
+ *    3                   2                   1
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                         ipv4_addr_src                         |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                         ipv4_addr_dst                         |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |           tun_flags           |       tos     |       ttl     |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |   gpe_flags   |            Reserved           | Next Protocol |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                     VNI                       |   Reserved    |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct nfp_flower_vxlan {
+	__be32 ip_src;
+	__be32 ip_dst;
+	__be16 tun_flags;
+	u8 tos;
+	u8 ttl;
+	u8 gpe_flags;
+	u8 reserved[2];
+	u8 nxt_proto;
+	__be32 tun_id;
+};
+
+#define NFP_FL_TUN_VNI_OFFSET 8
+
 /* The base header for a control message packet.
  * Defines an 8-bit version, and an 8-bit type, padded
  * to a 32-bit word. Rest of the packet is type-specific.
@@ -249,6 +317,11 @@ enum nfp_flower_cmsg_type_port {
 	NFP_FLOWER_CMSG_TYPE_FLOW_DEL =		2,
 	NFP_FLOWER_CMSG_TYPE_MAC_REPR =		7,
 	NFP_FLOWER_CMSG_TYPE_PORT_MOD =		8,
+	NFP_FLOWER_CMSG_TYPE_NO_NEIGH =		10,
+	NFP_FLOWER_CMSG_TYPE_TUN_MAC =		11,
+	NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS =	12,
+	NFP_FLOWER_CMSG_TYPE_TUN_NEIGH =	13,
+	NFP_FLOWER_CMSG_TYPE_TUN_IPS =		14,
 	NFP_FLOWER_CMSG_TYPE_FLOW_STATS =	15,
 	NFP_FLOWER_CMSG_TYPE_PORT_ECHO =	16,
 	NFP_FLOWER_CMSG_TYPE_MAX =		32,
@@ -282,6 +355,7 @@ enum nfp_flower_cmsg_port_type {
 	NFP_FLOWER_CMSG_PORT_TYPE_UNSPEC =	0x0,
 	NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT =	0x1,
 	NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT =	0x2,
+	NFP_FLOWER_CMSG_PORT_TYPE_OTHER_PORT =  0x3,
 };
 
 enum nfp_flower_cmsg_port_vnic_type {
@@ -323,6 +397,11 @@ static inline void *nfp_flower_cmsg_get_data(struct sk_buff *skb)
 	return (unsigned char *)skb->data + NFP_FLOWER_CMSG_HLEN;
 }
 
+static inline int nfp_flower_cmsg_get_data_len(struct sk_buff *skb)
+{
+	return skb->len - NFP_FLOWER_CMSG_HLEN;
+}
+
 struct sk_buff *
 nfp_flower_cmsg_mac_repr_start(struct nfp_app *app, unsigned int num_ports);
 void
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 91fe03617106..e46e7c60d491 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -436,6 +436,16 @@ static void nfp_flower_clean(struct nfp_app *app)
 	app->priv = NULL;
 }
 
+static int nfp_flower_start(struct nfp_app *app)
+{
+	return nfp_tunnel_config_start(app);
+}
+
+static void nfp_flower_stop(struct nfp_app *app)
+{
+	nfp_tunnel_config_stop(app);
+}
+
 const struct nfp_app_type app_flower = {
 	.id		= NFP_APP_FLOWER_NIC,
 	.name		= "flower",
@@ -453,6 +463,9 @@ const struct nfp_app_type app_flower = {
 	.repr_open	= nfp_flower_repr_netdev_open,
 	.repr_stop	= nfp_flower_repr_netdev_stop,
 
+	.start		= nfp_flower_start,
+	.stop		= nfp_flower_stop,
+
 	.ctrl_msg_rx	= nfp_flower_cmsg_rx,
 
 	.sriov_enable	= nfp_flower_sriov_enable,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index c20dd00a1cae..12c319a219d8 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -58,6 +58,8 @@ struct nfp_app;
 #define NFP_FL_MASK_REUSE_TIME_NS	40000
 #define NFP_FL_MASK_ID_LOCATION		1
 
+#define NFP_FL_VXLAN_PORT		4789
+
 struct nfp_fl_mask_id {
 	struct circ_buf mask_id_free_list;
 	struct timespec64 *last_used;
@@ -82,6 +84,18 @@ struct nfp_fl_stats_id {
  * @flow_table:		Hash table used to store flower rules
  * @cmsg_work:		Workqueue for control messages processing
  * @cmsg_skbs:		List of skbs for control message processing
+ * @nfp_mac_off_list:	List of MAC addresses to offload
+ * @nfp_mac_index_list:	List of unique 8-bit indexes for non NFP netdevs
+ * @nfp_ipv4_off_list:	List of IPv4 addresses to offload
+ * @nfp_neigh_off_list:	List of neighbour offloads
+ * @nfp_mac_off_lock:	Lock for the MAC address list
+ * @nfp_mac_index_lock:	Lock for the MAC index list
+ * @nfp_ipv4_off_lock:	Lock for the IPv4 address list
+ * @nfp_neigh_off_lock:	Lock for the neighbour address list
+ * @nfp_mac_off_ids:	IDA to manage id assignment for offloaded macs
+ * @nfp_mac_off_count:	Number of MACs in address list
+ * @nfp_tun_mac_nb:	Notifier to monitor link state
+ * @nfp_tun_neigh_nb:	Notifier to monitor neighbour state
  */
 struct nfp_flower_priv {
 	struct nfp_app *app;
@@ -94,6 +108,18 @@ struct nfp_flower_priv {
 	DECLARE_HASHTABLE(flow_table, NFP_FLOWER_HASH_BITS);
 	struct work_struct cmsg_work;
 	struct sk_buff_head cmsg_skbs;
+	struct list_head nfp_mac_off_list;
+	struct list_head nfp_mac_index_list;
+	struct list_head nfp_ipv4_off_list;
+	struct list_head nfp_neigh_off_list;
+	struct mutex nfp_mac_off_lock;
+	struct mutex nfp_mac_index_lock;
+	struct mutex nfp_ipv4_off_lock;
+	struct mutex nfp_neigh_off_lock;
+	struct ida nfp_mac_off_ids;
+	int nfp_mac_off_count;
+	struct notifier_block nfp_tun_mac_nb;
+	struct notifier_block nfp_tun_neigh_nb;
 };
 
 struct nfp_fl_key_ls {
@@ -126,6 +152,7 @@ struct nfp_fl_payload {
 	struct rcu_head rcu;
 	spinlock_t lock; /* lock stats */
 	struct nfp_fl_stats stats;
+	__be32 nfp_tun_ipv4_addr;
 	char *unmasked_data;
 	char *mask_data;
 	char *action_data;
@@ -163,4 +190,12 @@ nfp_flower_remove_fl_table(struct nfp_app *app, unsigned long tc_flower_cookie);
 
 void nfp_flower_rx_flow_stats(struct nfp_app *app, struct sk_buff *skb);
 
+int nfp_tunnel_config_start(struct nfp_app *app);
+void nfp_tunnel_config_stop(struct nfp_app *app);
+void nfp_tunnel_write_macs(struct nfp_app *app);
+void nfp_tunnel_del_ipv4_off(struct nfp_app *app, __be32 ipv4);
+void nfp_tunnel_add_ipv4_off(struct nfp_app *app, __be32 ipv4);
+void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb);
+void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb);
+
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c
index d25b5038c3a2..865a815ab92a 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/match.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/match.c
@@ -77,14 +77,17 @@ nfp_flower_compile_meta(struct nfp_flower_meta_one *frame, u8 key_type)
 
 static int
 nfp_flower_compile_port(struct nfp_flower_in_port *frame, u32 cmsg_port,
-			bool mask_version)
+			bool mask_version, enum nfp_flower_tun_type tun_type)
 {
 	if (mask_version) {
 		frame->in_port = cpu_to_be32(~0);
 		return 0;
 	}
 
-	frame->in_port = cpu_to_be32(cmsg_port);
+	if (tun_type)
+		frame->in_port = cpu_to_be32(NFP_FL_PORT_TYPE_TUN | tun_type);
+	else
+		frame->in_port = cpu_to_be32(cmsg_port);
 
 	return 0;
 }
@@ -189,15 +192,56 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame,
 	}
 }
 
+static void
+nfp_flower_compile_vxlan(struct nfp_flower_vxlan *frame,
+			 struct tc_cls_flower_offload *flow,
+			 bool mask_version, __be32 *tun_dst)
+{
+	struct fl_flow_key *target = mask_version ? flow->mask : flow->key;
+	struct flow_dissector_key_ipv4_addrs *vxlan_ips;
+	struct flow_dissector_key_keyid *vni;
+
+	/* Wildcard TOS/TTL/GPE_FLAGS/NXT_PROTO for now. */
+	memset(frame, 0, sizeof(struct nfp_flower_vxlan));
+
+	if (dissector_uses_key(flow->dissector,
+			       FLOW_DISSECTOR_KEY_ENC_KEYID)) {
+		u32 temp_vni;
+
+		vni = skb_flow_dissector_target(flow->dissector,
+						FLOW_DISSECTOR_KEY_ENC_KEYID,
+						target);
+		temp_vni = be32_to_cpu(vni->keyid) << NFP_FL_TUN_VNI_OFFSET;
+		frame->tun_id = cpu_to_be32(temp_vni);
+	}
+
+	if (dissector_uses_key(flow->dissector,
+			       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
+		vxlan_ips =
+		   skb_flow_dissector_target(flow->dissector,
+					     FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+					     target);
+		frame->ip_src = vxlan_ips->src;
+		frame->ip_dst = vxlan_ips->dst;
+		*tun_dst = vxlan_ips->dst;
+	}
+}
+
 int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow,
 				  struct nfp_fl_key_ls *key_ls,
 				  struct net_device *netdev,
 				  struct nfp_fl_payload *nfp_flow)
 {
+	enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE;
+	__be32 tun_dst, tun_dst_mask = 0;
+	struct nfp_repr *netdev_repr;
 	int err;
 	u8 *ext;
 	u8 *msk;
 
+	if (key_ls->key_layer & NFP_FLOWER_LAYER_VXLAN)
+		tun_type = NFP_FL_TUNNEL_VXLAN;
+
 	memset(nfp_flow->unmasked_data, 0, key_ls->key_size);
 	memset(nfp_flow->mask_data, 0, key_ls->key_size);
 
@@ -216,14 +260,14 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow,
 		/* Populate Exact Port data. */
 		err = nfp_flower_compile_port((struct nfp_flower_in_port *)ext,
 					      nfp_repr_get_port_id(netdev),
-					      false);
+					      false, tun_type);
 		if (err)
 			return err;
 
 		/* Populate Mask Port Data. */
 		err = nfp_flower_compile_port((struct nfp_flower_in_port *)msk,
 					      nfp_repr_get_port_id(netdev),
-					      true);
+					      true, tun_type);
 		if (err)
 			return err;
 
@@ -291,5 +335,28 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow,
 		msk += sizeof(struct nfp_flower_ipv6);
 	}
 
+	if (key_ls->key_layer & NFP_FLOWER_LAYER_VXLAN) {
+		/* Populate Exact VXLAN Data. */
+		nfp_flower_compile_vxlan((struct nfp_flower_vxlan *)ext,
+					 flow, false, &tun_dst);
+		/* Populate Mask VXLAN Data. */
+		nfp_flower_compile_vxlan((struct nfp_flower_vxlan *)msk,
+					 flow, true, &tun_dst_mask);
+		ext += sizeof(struct nfp_flower_vxlan);
+		msk += sizeof(struct nfp_flower_vxlan);
+
+		/* Configure tunnel end point MAC. */
+		if (nfp_netdev_is_nfp_repr(netdev)) {
+			netdev_repr = netdev_priv(netdev);
+			nfp_tunnel_write_macs(netdev_repr->app);
+
+			/* Store the tunnel destination in the rule data.
+			 * This must be present and be an exact match.
+			 */
+			nfp_flow->nfp_tun_ipv4_addr = tun_dst;
+			nfp_tunnel_add_ipv4_off(netdev_repr->app, tun_dst);
+		}
+	}
+
 	return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
index 3226ddc55f99..193520ef23f0 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/metadata.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
@@ -140,7 +140,7 @@ exit_rcu_unlock:
 
 void nfp_flower_rx_flow_stats(struct nfp_app *app, struct sk_buff *skb)
 {
-	unsigned int msg_len = skb->len - NFP_FLOWER_CMSG_HLEN;
+	unsigned int msg_len = nfp_flower_cmsg_get_data_len(skb);
 	struct nfp_fl_stats_frame *stats_frame;
 	unsigned char *msg;
 	int i;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index a18b4d2b1d3e..3d9537ebdea4 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -52,8 +52,25 @@
 	 BIT(FLOW_DISSECTOR_KEY_PORTS) | \
 	 BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | \
 	 BIT(FLOW_DISSECTOR_KEY_VLAN) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | \
 	 BIT(FLOW_DISSECTOR_KEY_IP))
 
+#define NFP_FLOWER_WHITELIST_TUN_DISSECTOR \
+	(BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_PORTS))
+
+#define NFP_FLOWER_WHITELIST_TUN_DISSECTOR_R \
+	(BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \
+	 BIT(FLOW_DISSECTOR_KEY_ENC_PORTS))
+
 static int
 nfp_flower_xmit_flow(struct net_device *netdev,
 		     struct nfp_fl_payload *nfp_flow, u8 mtype)
@@ -125,15 +142,58 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
 	if (flow->dissector->used_keys & ~NFP_FLOWER_WHITELIST_DISSECTOR)
 		return -EOPNOTSUPP;
 
+	/* If any tun dissector is used then the required set must be used. */
+	if (flow->dissector->used_keys & NFP_FLOWER_WHITELIST_TUN_DISSECTOR &&
+	    (flow->dissector->used_keys & NFP_FLOWER_WHITELIST_TUN_DISSECTOR_R)
+	    != NFP_FLOWER_WHITELIST_TUN_DISSECTOR_R)
+		return -EOPNOTSUPP;
+
+	key_layer_two = 0;
+	key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC;
+	key_size = sizeof(struct nfp_flower_meta_one) +
+		   sizeof(struct nfp_flower_in_port) +
+		   sizeof(struct nfp_flower_mac_mpls);
+
 	if (dissector_uses_key(flow->dissector,
 			       FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
+		struct flow_dissector_key_ipv4_addrs *mask_ipv4 = NULL;
+		struct flow_dissector_key_ports *mask_enc_ports = NULL;
+		struct flow_dissector_key_ports *enc_ports = NULL;
 		struct flow_dissector_key_control *mask_enc_ctl =
 			skb_flow_dissector_target(flow->dissector,
 						  FLOW_DISSECTOR_KEY_ENC_CONTROL,
 						  flow->mask);
-		/* We are expecting a tunnel. For now we ignore offloading. */
-		if (mask_enc_ctl->addr_type)
+		struct flow_dissector_key_control *enc_ctl =
+			skb_flow_dissector_target(flow->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_CONTROL,
+						  flow->key);
+		if (mask_enc_ctl->addr_type != 0xffff ||
+		    enc_ctl->addr_type != FLOW_DISSECTOR_KEY_IPV4_ADDRS)
+			return -EOPNOTSUPP;
+
+		/* These fields are already verified as used. */
+		mask_ipv4 =
+			skb_flow_dissector_target(flow->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+						  flow->mask);
+		if (mask_ipv4->dst != cpu_to_be32(~0))
+			return -EOPNOTSUPP;
+
+		mask_enc_ports =
+			skb_flow_dissector_target(flow->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_PORTS,
+						  flow->mask);
+		enc_ports =
+			skb_flow_dissector_target(flow->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_PORTS,
+						  flow->key);
+
+		if (mask_enc_ports->dst != cpu_to_be16(~0) ||
+		    enc_ports->dst != htons(NFP_FL_VXLAN_PORT))
 			return -EOPNOTSUPP;
+
+		key_layer |= NFP_FLOWER_LAYER_VXLAN;
+		key_size += sizeof(struct nfp_flower_vxlan);
 	}
 
 	if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
@@ -151,12 +211,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
 						    FLOW_DISSECTOR_KEY_IP,
 						    flow->mask);
 
-	key_layer_two = 0;
-	key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC;
-	key_size = sizeof(struct nfp_flower_meta_one) +
-		   sizeof(struct nfp_flower_in_port) +
-		   sizeof(struct nfp_flower_mac_mpls);
-
 	if (mask_basic && mask_basic->n_proto) {
 		/* Ethernet type is present in the key. */
 		switch (key_basic->n_proto) {
@@ -252,6 +306,7 @@ nfp_flower_allocate_new(struct nfp_fl_key_ls *key_layer)
 	if (!flow_pay->action_data)
 		goto err_free_mask;
 
+	flow_pay->nfp_tun_ipv4_addr = 0;
 	flow_pay->meta.flags = 0;
 	spin_lock_init(&flow_pay->lock);
 
@@ -361,6 +416,9 @@ nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
 	if (err)
 		goto err_free_flow;
 
+	if (nfp_flow->nfp_tun_ipv4_addr)
+		nfp_tunnel_del_ipv4_off(app, nfp_flow->nfp_tun_ipv4_addr);
+
 	err = nfp_flower_xmit_flow(netdev, nfp_flow,
 				   NFP_FLOWER_CMSG_TYPE_FLOW_DEL);
 	if (err)
diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
new file mode 100644
index 000000000000..c495f8f38506
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <net/netevent.h>
+#include <linux/idr.h>
+#include <net/dst_metadata.h>
+#include <net/arp.h>
+
+#include "cmsg.h"
+#include "main.h"
+#include "../nfp_net_repr.h"
+#include "../nfp_net.h"
+
+#define NFP_FL_MAX_ROUTES               32
+
+/**
+ * struct nfp_tun_active_tuns - periodic message of active tunnels
+ * @seq:		sequence number of the message
+ * @count:		number of tunnels report in message
+ * @flags:		options part of the request
+ * @ipv4:		dest IPv4 address of active route
+ * @egress_port:	port the encapsulated packet egressed
+ * @extra:		reserved for future use
+ * @tun_info:		tunnels that have sent traffic in reported period
+ */
+struct nfp_tun_active_tuns {
+	__be32 seq;
+	__be32 count;
+	__be32 flags;
+	struct route_ip_info {
+		__be32 ipv4;
+		__be32 egress_port;
+		__be32 extra[2];
+	} tun_info[];
+};
+
+/**
+ * struct nfp_tun_neigh - neighbour/route entry on the NFP
+ * @dst_ipv4:	destination IPv4 address
+ * @src_ipv4:	source IPv4 address
+ * @dst_addr:	destination MAC address
+ * @src_addr:	source MAC address
+ * @port_id:	NFP port to output packet on - associated with source IPv4
+ */
+struct nfp_tun_neigh {
+	__be32 dst_ipv4;
+	__be32 src_ipv4;
+	u8 dst_addr[ETH_ALEN];
+	u8 src_addr[ETH_ALEN];
+	__be32 port_id;
+};
+
+/**
+ * struct nfp_tun_req_route_ipv4 - NFP requests a route/neighbour lookup
+ * @ingress_port:	ingress port of packet that signalled request
+ * @ipv4_addr:		destination ipv4 address for route
+ * @reserved:		reserved for future use
+ */
+struct nfp_tun_req_route_ipv4 {
+	__be32 ingress_port;
+	__be32 ipv4_addr;
+	__be32 reserved[2];
+};
+
+/**
+ * struct nfp_ipv4_route_entry - routes that are offloaded to the NFP
+ * @ipv4_addr:	destination of route
+ * @list:	list pointer
+ */
+struct nfp_ipv4_route_entry {
+	__be32 ipv4_addr;
+	struct list_head list;
+};
+
+#define NFP_FL_IPV4_ADDRS_MAX        32
+
+/**
+ * struct nfp_tun_ipv4_addr - set the IP address list on the NFP
+ * @count:	number of IPs populated in the array
+ * @ipv4_addr:	array of IPV4_ADDRS_MAX 32 bit IPv4 addresses
+ */
+struct nfp_tun_ipv4_addr {
+	__be32 count;
+	__be32 ipv4_addr[NFP_FL_IPV4_ADDRS_MAX];
+};
+
+/**
+ * struct nfp_ipv4_addr_entry - cached IPv4 addresses
+ * @ipv4_addr:	IP address
+ * @ref_count:	number of rules currently using this IP
+ * @list:	list pointer
+ */
+struct nfp_ipv4_addr_entry {
+	__be32 ipv4_addr;
+	int ref_count;
+	struct list_head list;
+};
+
+/**
+ * struct nfp_tun_mac_addr - configure MAC address of tunnel EP on NFP
+ * @reserved:	reserved for future use
+ * @count:	number of MAC addresses in the message
+ * @index:	index of MAC address in the lookup table
+ * @addr:	interface MAC address
+ * @addresses:	series of MACs to offload
+ */
+struct nfp_tun_mac_addr {
+	__be16 reserved;
+	__be16 count;
+	struct index_mac_addr {
+		__be16 index;
+		u8 addr[ETH_ALEN];
+	} addresses[];
+};
+
+/**
+ * struct nfp_tun_mac_offload_entry - list of MACs to offload
+ * @index:	index of MAC address for offloading
+ * @addr:	interface MAC address
+ * @list:	list pointer
+ */
+struct nfp_tun_mac_offload_entry {
+	__be16 index;
+	u8 addr[ETH_ALEN];
+	struct list_head list;
+};
+
+#define NFP_MAX_MAC_INDEX       0xff
+
+/**
+ * struct nfp_tun_mac_non_nfp_idx - converts non NFP netdev ifindex to 8-bit id
+ * @ifindex:	netdev ifindex of the device
+ * @index:	index of netdevs mac on NFP
+ * @list:	list pointer
+ */
+struct nfp_tun_mac_non_nfp_idx {
+	int ifindex;
+	u8 index;
+	struct list_head list;
+};
+
+void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb)
+{
+	struct nfp_tun_active_tuns *payload;
+	struct net_device *netdev;
+	int count, i, pay_len;
+	struct neighbour *n;
+	__be32 ipv4_addr;
+	u32 port;
+
+	payload = nfp_flower_cmsg_get_data(skb);
+	count = be32_to_cpu(payload->count);
+	if (count > NFP_FL_MAX_ROUTES) {
+		nfp_flower_cmsg_warn(app, "Tunnel keep-alive request exceeds max routes.\n");
+		return;
+	}
+
+	pay_len = nfp_flower_cmsg_get_data_len(skb);
+	if (pay_len != sizeof(struct nfp_tun_active_tuns) +
+	    sizeof(struct route_ip_info) * count) {
+		nfp_flower_cmsg_warn(app, "Corruption in tunnel keep-alive message.\n");
+		return;
+	}
+
+	for (i = 0; i < count; i++) {
+		ipv4_addr = payload->tun_info[i].ipv4;
+		port = be32_to_cpu(payload->tun_info[i].egress_port);
+		netdev = nfp_app_repr_get(app, port);
+		if (!netdev)
+			continue;
+
+		n = neigh_lookup(&arp_tbl, &ipv4_addr, netdev);
+		if (!n)
+			continue;
+
+		/* Update the used timestamp of neighbour */
+		neigh_event_send(n, NULL);
+		neigh_release(n);
+	}
+}
+
+static bool nfp_tun_is_netdev_to_offload(struct net_device *netdev)
+{
+	if (!netdev->rtnl_link_ops)
+		return false;
+	if (!strcmp(netdev->rtnl_link_ops->kind, "openvswitch"))
+		return true;
+	if (!strcmp(netdev->rtnl_link_ops->kind, "vxlan"))
+		return true;
+
+	return false;
+}
+
+static int
+nfp_flower_xmit_tun_conf(struct nfp_app *app, u8 mtype, u16 plen, void *pdata)
+{
+	struct sk_buff *skb;
+	unsigned char *msg;
+
+	skb = nfp_flower_cmsg_alloc(app, plen, mtype);
+	if (!skb)
+		return -ENOMEM;
+
+	msg = nfp_flower_cmsg_get_data(skb);
+	memcpy(msg, pdata, nfp_flower_cmsg_get_data_len(skb));
+
+	nfp_ctrl_tx(app->ctrl, skb);
+	return 0;
+}
+
+static bool nfp_tun_has_route(struct nfp_app *app, __be32 ipv4_addr)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_ipv4_route_entry *entry;
+	struct list_head *ptr, *storage;
+
+	mutex_lock(&priv->nfp_neigh_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) {
+		entry = list_entry(ptr, struct nfp_ipv4_route_entry, list);
+		if (entry->ipv4_addr == ipv4_addr) {
+			mutex_unlock(&priv->nfp_neigh_off_lock);
+			return true;
+		}
+	}
+	mutex_unlock(&priv->nfp_neigh_off_lock);
+	return false;
+}
+
+static void nfp_tun_add_route_to_cache(struct nfp_app *app, __be32 ipv4_addr)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_ipv4_route_entry *entry;
+	struct list_head *ptr, *storage;
+
+	mutex_lock(&priv->nfp_neigh_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) {
+		entry = list_entry(ptr, struct nfp_ipv4_route_entry, list);
+		if (entry->ipv4_addr == ipv4_addr) {
+			mutex_unlock(&priv->nfp_neigh_off_lock);
+			return;
+		}
+	}
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		mutex_unlock(&priv->nfp_neigh_off_lock);
+		nfp_flower_cmsg_warn(app, "Mem error when storing new route.\n");
+		return;
+	}
+
+	entry->ipv4_addr = ipv4_addr;
+	list_add_tail(&entry->list, &priv->nfp_neigh_off_list);
+	mutex_unlock(&priv->nfp_neigh_off_lock);
+}
+
+static void nfp_tun_del_route_from_cache(struct nfp_app *app, __be32 ipv4_addr)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_ipv4_route_entry *entry;
+	struct list_head *ptr, *storage;
+
+	mutex_lock(&priv->nfp_neigh_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) {
+		entry = list_entry(ptr, struct nfp_ipv4_route_entry, list);
+		if (entry->ipv4_addr == ipv4_addr) {
+			list_del(&entry->list);
+			kfree(entry);
+			break;
+		}
+	}
+	mutex_unlock(&priv->nfp_neigh_off_lock);
+}
+
+static void
+nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app,
+		    struct flowi4 *flow, struct neighbour *neigh)
+{
+	struct nfp_tun_neigh payload;
+
+	/* Only offload representor IPv4s for now. */
+	if (!nfp_netdev_is_nfp_repr(netdev))
+		return;
+
+	memset(&payload, 0, sizeof(struct nfp_tun_neigh));
+	payload.dst_ipv4 = flow->daddr;
+
+	/* If entry has expired send dst IP with all other fields 0. */
+	if (!(neigh->nud_state & NUD_VALID)) {
+		nfp_tun_del_route_from_cache(app, payload.dst_ipv4);
+		/* Trigger ARP to verify invalid neighbour state. */
+		neigh_event_send(neigh, NULL);
+		goto send_msg;
+	}
+
+	/* Have a valid neighbour so populate rest of entry. */
+	payload.src_ipv4 = flow->saddr;
+	ether_addr_copy(payload.src_addr, netdev->dev_addr);
+	neigh_ha_snapshot(payload.dst_addr, neigh, netdev);
+	payload.port_id = cpu_to_be32(nfp_repr_get_port_id(netdev));
+	/* Add destination of new route to NFP cache. */
+	nfp_tun_add_route_to_cache(app, payload.dst_ipv4);
+
+send_msg:
+	nfp_flower_xmit_tun_conf(app, NFP_FLOWER_CMSG_TYPE_TUN_NEIGH,
+				 sizeof(struct nfp_tun_neigh),
+				 (unsigned char *)&payload);
+}
+
+static int
+nfp_tun_neigh_event_handler(struct notifier_block *nb, unsigned long event,
+			    void *ptr)
+{
+	struct nfp_flower_priv *app_priv;
+	struct netevent_redirect *redir;
+	struct flowi4 flow = {};
+	struct neighbour *n;
+	struct nfp_app *app;
+	struct rtable *rt;
+	int err;
+
+	switch (event) {
+	case NETEVENT_REDIRECT:
+		redir = (struct netevent_redirect *)ptr;
+		n = redir->neigh;
+		break;
+	case NETEVENT_NEIGH_UPDATE:
+		n = (struct neighbour *)ptr;
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	flow.daddr = *(__be32 *)n->primary_key;
+
+	/* Only concerned with route changes for representors. */
+	if (!nfp_netdev_is_nfp_repr(n->dev))
+		return NOTIFY_DONE;
+
+	app_priv = container_of(nb, struct nfp_flower_priv, nfp_tun_neigh_nb);
+	app = app_priv->app;
+
+	/* Only concerned with changes to routes already added to NFP. */
+	if (!nfp_tun_has_route(app, flow.daddr))
+		return NOTIFY_DONE;
+
+#if IS_ENABLED(CONFIG_INET)
+	/* Do a route lookup to populate flow data. */
+	rt = ip_route_output_key(dev_net(n->dev), &flow);
+	err = PTR_ERR_OR_ZERO(rt);
+	if (err)
+		return NOTIFY_DONE;
+#else
+	return NOTIFY_DONE;
+#endif
+
+	flow.flowi4_proto = IPPROTO_UDP;
+	nfp_tun_write_neigh(n->dev, app, &flow, n);
+
+	return NOTIFY_OK;
+}
+
+void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb)
+{
+	struct nfp_tun_req_route_ipv4 *payload;
+	struct net_device *netdev;
+	struct flowi4 flow = {};
+	struct neighbour *n;
+	struct rtable *rt;
+	int err;
+
+	payload = nfp_flower_cmsg_get_data(skb);
+
+	netdev = nfp_app_repr_get(app, be32_to_cpu(payload->ingress_port));
+	if (!netdev)
+		goto route_fail_warning;
+
+	flow.daddr = payload->ipv4_addr;
+	flow.flowi4_proto = IPPROTO_UDP;
+
+#if IS_ENABLED(CONFIG_INET)
+	/* Do a route lookup on same namespace as ingress port. */
+	rt = ip_route_output_key(dev_net(netdev), &flow);
+	err = PTR_ERR_OR_ZERO(rt);
+	if (err)
+		goto route_fail_warning;
+#else
+	goto route_fail_warning;
+#endif
+
+	/* Get the neighbour entry for the lookup */
+	n = dst_neigh_lookup(&rt->dst, &flow.daddr);
+	ip_rt_put(rt);
+	if (!n)
+		goto route_fail_warning;
+	nfp_tun_write_neigh(n->dev, app, &flow, n);
+	neigh_release(n);
+	return;
+
+route_fail_warning:
+	nfp_flower_cmsg_warn(app, "Requested route not found.\n");
+}
+
+static void nfp_tun_write_ipv4_list(struct nfp_app *app)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_ipv4_addr_entry *entry;
+	struct nfp_tun_ipv4_addr payload;
+	struct list_head *ptr, *storage;
+	int count;
+
+	memset(&payload, 0, sizeof(struct nfp_tun_ipv4_addr));
+	mutex_lock(&priv->nfp_ipv4_off_lock);
+	count = 0;
+	list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) {
+		if (count >= NFP_FL_IPV4_ADDRS_MAX) {
+			mutex_unlock(&priv->nfp_ipv4_off_lock);
+			nfp_flower_cmsg_warn(app, "IPv4 offload exceeds limit.\n");
+			return;
+		}
+		entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list);
+		payload.ipv4_addr[count++] = entry->ipv4_addr;
+	}
+	payload.count = cpu_to_be32(count);
+	mutex_unlock(&priv->nfp_ipv4_off_lock);
+
+	nfp_flower_xmit_tun_conf(app, NFP_FLOWER_CMSG_TYPE_TUN_IPS,
+				 sizeof(struct nfp_tun_ipv4_addr),
+				 &payload);
+}
+
+void nfp_tunnel_add_ipv4_off(struct nfp_app *app, __be32 ipv4)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_ipv4_addr_entry *entry;
+	struct list_head *ptr, *storage;
+
+	mutex_lock(&priv->nfp_ipv4_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) {
+		entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list);
+		if (entry->ipv4_addr == ipv4) {
+			entry->ref_count++;
+			mutex_unlock(&priv->nfp_ipv4_off_lock);
+			return;
+		}
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		mutex_unlock(&priv->nfp_ipv4_off_lock);
+		nfp_flower_cmsg_warn(app, "Mem error when offloading IP address.\n");
+		return;
+	}
+	entry->ipv4_addr = ipv4;
+	entry->ref_count = 1;
+	list_add_tail(&entry->list, &priv->nfp_ipv4_off_list);
+	mutex_unlock(&priv->nfp_ipv4_off_lock);
+
+	nfp_tun_write_ipv4_list(app);
+}
+
+void nfp_tunnel_del_ipv4_off(struct nfp_app *app, __be32 ipv4)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_ipv4_addr_entry *entry;
+	struct list_head *ptr, *storage;
+
+	mutex_lock(&priv->nfp_ipv4_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) {
+		entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list);
+		if (entry->ipv4_addr == ipv4) {
+			entry->ref_count--;
+			if (!entry->ref_count) {
+				list_del(&entry->list);
+				kfree(entry);
+			}
+			break;
+		}
+	}
+	mutex_unlock(&priv->nfp_ipv4_off_lock);
+
+	nfp_tun_write_ipv4_list(app);
+}
+
+void nfp_tunnel_write_macs(struct nfp_app *app)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_tun_mac_offload_entry *entry;
+	struct nfp_tun_mac_addr *payload;
+	struct list_head *ptr, *storage;
+	int mac_count, err, pay_size;
+
+	mutex_lock(&priv->nfp_mac_off_lock);
+	if (!priv->nfp_mac_off_count) {
+		mutex_unlock(&priv->nfp_mac_off_lock);
+		return;
+	}
+
+	pay_size = sizeof(struct nfp_tun_mac_addr) +
+		   sizeof(struct index_mac_addr) * priv->nfp_mac_off_count;
+
+	payload = kzalloc(pay_size, GFP_KERNEL);
+	if (!payload) {
+		mutex_unlock(&priv->nfp_mac_off_lock);
+		return;
+	}
+
+	payload->count = cpu_to_be16(priv->nfp_mac_off_count);
+
+	mac_count = 0;
+	list_for_each_safe(ptr, storage, &priv->nfp_mac_off_list) {
+		entry = list_entry(ptr, struct nfp_tun_mac_offload_entry,
+				   list);
+		payload->addresses[mac_count].index = entry->index;
+		ether_addr_copy(payload->addresses[mac_count].addr,
+				entry->addr);
+		mac_count++;
+	}
+
+	err = nfp_flower_xmit_tun_conf(app, NFP_FLOWER_CMSG_TYPE_TUN_MAC,
+				       pay_size, payload);
+
+	kfree(payload);
+
+	if (err) {
+		mutex_unlock(&priv->nfp_mac_off_lock);
+		/* Write failed so retain list for future retry. */
+		return;
+	}
+
+	/* If list was successfully offloaded, flush it. */
+	list_for_each_safe(ptr, storage, &priv->nfp_mac_off_list) {
+		entry = list_entry(ptr, struct nfp_tun_mac_offload_entry,
+				   list);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	priv->nfp_mac_off_count = 0;
+	mutex_unlock(&priv->nfp_mac_off_lock);
+}
+
+static int nfp_tun_get_mac_idx(struct nfp_app *app, int ifindex)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_tun_mac_non_nfp_idx *entry;
+	struct list_head *ptr, *storage;
+	int idx;
+
+	mutex_lock(&priv->nfp_mac_index_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_mac_index_list) {
+		entry = list_entry(ptr, struct nfp_tun_mac_non_nfp_idx, list);
+		if (entry->ifindex == ifindex) {
+			idx = entry->index;
+			mutex_unlock(&priv->nfp_mac_index_lock);
+			return idx;
+		}
+	}
+
+	idx = ida_simple_get(&priv->nfp_mac_off_ids, 0,
+			     NFP_MAX_MAC_INDEX, GFP_KERNEL);
+	if (idx < 0) {
+		mutex_unlock(&priv->nfp_mac_index_lock);
+		return idx;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		mutex_unlock(&priv->nfp_mac_index_lock);
+		return -ENOMEM;
+	}
+	entry->ifindex = ifindex;
+	entry->index = idx;
+	list_add_tail(&entry->list, &priv->nfp_mac_index_list);
+	mutex_unlock(&priv->nfp_mac_index_lock);
+
+	return idx;
+}
+
+static void nfp_tun_del_mac_idx(struct nfp_app *app, int ifindex)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_tun_mac_non_nfp_idx *entry;
+	struct list_head *ptr, *storage;
+
+	mutex_lock(&priv->nfp_mac_index_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_mac_index_list) {
+		entry = list_entry(ptr, struct nfp_tun_mac_non_nfp_idx, list);
+		if (entry->ifindex == ifindex) {
+			ida_simple_remove(&priv->nfp_mac_off_ids,
+					  entry->index);
+			list_del(&entry->list);
+			kfree(entry);
+			break;
+		}
+	}
+	mutex_unlock(&priv->nfp_mac_index_lock);
+}
+
+static void nfp_tun_add_to_mac_offload_list(struct net_device *netdev,
+					    struct nfp_app *app)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_tun_mac_offload_entry *entry;
+	u16 nfp_mac_idx;
+	int port = 0;
+
+	/* Check if MAC should be offloaded. */
+	if (!is_valid_ether_addr(netdev->dev_addr))
+		return;
+
+	if (nfp_netdev_is_nfp_repr(netdev))
+		port = nfp_repr_get_port_id(netdev);
+	else if (!nfp_tun_is_netdev_to_offload(netdev))
+		return;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		nfp_flower_cmsg_warn(app, "Mem fail when offloading MAC.\n");
+		return;
+	}
+
+	if (FIELD_GET(NFP_FLOWER_CMSG_PORT_TYPE, port) ==
+	    NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT) {
+		nfp_mac_idx = port << 8 | NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT;
+	} else if (FIELD_GET(NFP_FLOWER_CMSG_PORT_TYPE, port) ==
+		   NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT) {
+		port = FIELD_GET(NFP_FLOWER_CMSG_PORT_VNIC, port);
+		nfp_mac_idx = port << 8 | NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT;
+	} else {
+		/* Must assign our own unique 8-bit index. */
+		int idx = nfp_tun_get_mac_idx(app, netdev->ifindex);
+
+		if (idx < 0) {
+			nfp_flower_cmsg_warn(app, "Can't assign non-repr MAC index.\n");
+			kfree(entry);
+			return;
+		}
+		nfp_mac_idx = idx << 8 | NFP_FLOWER_CMSG_PORT_TYPE_OTHER_PORT;
+	}
+
+	entry->index = cpu_to_be16(nfp_mac_idx);
+	ether_addr_copy(entry->addr, netdev->dev_addr);
+
+	mutex_lock(&priv->nfp_mac_off_lock);
+	priv->nfp_mac_off_count++;
+	list_add_tail(&entry->list, &priv->nfp_mac_off_list);
+	mutex_unlock(&priv->nfp_mac_off_lock);
+}
+
+static int nfp_tun_mac_event_handler(struct notifier_block *nb,
+				     unsigned long event, void *ptr)
+{
+	struct nfp_flower_priv *app_priv;
+	struct net_device *netdev;
+	struct nfp_app *app;
+
+	if (event == NETDEV_DOWN || event == NETDEV_UNREGISTER) {
+		app_priv = container_of(nb, struct nfp_flower_priv,
+					nfp_tun_mac_nb);
+		app = app_priv->app;
+		netdev = netdev_notifier_info_to_dev(ptr);
+
+		/* If non-nfp netdev then free its offload index. */
+		if (nfp_tun_is_netdev_to_offload(netdev))
+			nfp_tun_del_mac_idx(app, netdev->ifindex);
+	} else if (event == NETDEV_UP || event == NETDEV_CHANGEADDR ||
+		   event == NETDEV_REGISTER) {
+		app_priv = container_of(nb, struct nfp_flower_priv,
+					nfp_tun_mac_nb);
+		app = app_priv->app;
+		netdev = netdev_notifier_info_to_dev(ptr);
+
+		nfp_tun_add_to_mac_offload_list(netdev, app);
+
+		/* Force a list write to keep NFP up to date. */
+		nfp_tunnel_write_macs(app);
+	}
+	return NOTIFY_OK;
+}
+
+int nfp_tunnel_config_start(struct nfp_app *app)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct net_device *netdev;
+	int err;
+
+	/* Initialise priv data for MAC offloading. */
+	priv->nfp_mac_off_count = 0;
+	mutex_init(&priv->nfp_mac_off_lock);
+	INIT_LIST_HEAD(&priv->nfp_mac_off_list);
+	priv->nfp_tun_mac_nb.notifier_call = nfp_tun_mac_event_handler;
+	mutex_init(&priv->nfp_mac_index_lock);
+	INIT_LIST_HEAD(&priv->nfp_mac_index_list);
+	ida_init(&priv->nfp_mac_off_ids);
+
+	/* Initialise priv data for IPv4 offloading. */
+	mutex_init(&priv->nfp_ipv4_off_lock);
+	INIT_LIST_HEAD(&priv->nfp_ipv4_off_list);
+
+	/* Initialise priv data for neighbour offloading. */
+	mutex_init(&priv->nfp_neigh_off_lock);
+	INIT_LIST_HEAD(&priv->nfp_neigh_off_list);
+	priv->nfp_tun_neigh_nb.notifier_call = nfp_tun_neigh_event_handler;
+
+	err = register_netdevice_notifier(&priv->nfp_tun_mac_nb);
+	if (err)
+		goto err_free_mac_ida;
+
+	err = register_netevent_notifier(&priv->nfp_tun_neigh_nb);
+	if (err)
+		goto err_unreg_mac_nb;
+
+	/* Parse netdevs already registered for MACs that need offloaded. */
+	rtnl_lock();
+	for_each_netdev(&init_net, netdev)
+		nfp_tun_add_to_mac_offload_list(netdev, app);
+	rtnl_unlock();
+
+	return 0;
+
+err_unreg_mac_nb:
+	unregister_netdevice_notifier(&priv->nfp_tun_mac_nb);
+err_free_mac_ida:
+	ida_destroy(&priv->nfp_mac_off_ids);
+	return err;
+}
+
+void nfp_tunnel_config_stop(struct nfp_app *app)
+{
+	struct nfp_tun_mac_offload_entry *mac_entry;
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_ipv4_route_entry *route_entry;
+	struct nfp_tun_mac_non_nfp_idx *mac_idx;
+	struct nfp_ipv4_addr_entry *ip_entry;
+	struct list_head *ptr, *storage;
+
+	unregister_netdevice_notifier(&priv->nfp_tun_mac_nb);
+	unregister_netevent_notifier(&priv->nfp_tun_neigh_nb);
+
+	/* Free any memory that may be occupied by MAC list. */
+	mutex_lock(&priv->nfp_mac_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_mac_off_list) {
+		mac_entry = list_entry(ptr, struct nfp_tun_mac_offload_entry,
+				       list);
+		list_del(&mac_entry->list);
+		kfree(mac_entry);
+	}
+	mutex_unlock(&priv->nfp_mac_off_lock);
+
+	/* Free any memory that may be occupied by MAC index list. */
+	mutex_lock(&priv->nfp_mac_index_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_mac_index_list) {
+		mac_idx = list_entry(ptr, struct nfp_tun_mac_non_nfp_idx,
+				     list);
+		list_del(&mac_idx->list);
+		kfree(mac_idx);
+	}
+	mutex_unlock(&priv->nfp_mac_index_lock);
+
+	ida_destroy(&priv->nfp_mac_off_ids);
+
+	/* Free any memory that may be occupied by ipv4 list. */
+	mutex_lock(&priv->nfp_ipv4_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) {
+		ip_entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list);
+		list_del(&ip_entry->list);
+		kfree(ip_entry);
+	}
+	mutex_unlock(&priv->nfp_ipv4_off_lock);
+
+	/* Free any memory that may be occupied by the route list. */
+	mutex_lock(&priv->nfp_neigh_off_lock);
+	list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) {
+		route_entry = list_entry(ptr, struct nfp_ipv4_route_entry,
+					 list);
+		list_del(&route_entry->list);
+		kfree(route_entry);
+	}
+	mutex_unlock(&priv->nfp_neigh_off_lock);
+}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1c0187f0af51..d2f73feb8497 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1574,26 +1574,6 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
 	return true;
 }
 
-static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
-			   unsigned int *off, unsigned int *len)
-{
-	struct xdp_buff xdp;
-	void *orig_data;
-	int ret;
-
-	xdp.data_hard_start = hard_start;
-	xdp.data = data + *off;
-	xdp.data_end = data + *off + *len;
-
-	orig_data = xdp.data;
-	ret = bpf_prog_run_xdp(prog, &xdp);
-
-	*len -= xdp.data - orig_data;
-	*off += xdp.data - orig_data;
-
-	return ret;
-}
-
 /**
  * nfp_net_rx() - receive up to @budget packets on @rx_ring
  * @rx_ring:   RX ring to receive from
@@ -1629,6 +1609,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 		struct nfp_meta_parsed meta;
 		struct net_device *netdev;
 		dma_addr_t new_dma_addr;
+		u32 meta_len_xdp = 0;
 		void *new_frag;
 
 		idx = D_IDX(rx_ring, rx_ring->rd_p);
@@ -1707,16 +1688,24 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 
 		if (xdp_prog && !(rxd->rxd.flags & PCIE_DESC_RX_BPF &&
 				  dp->bpf_offload_xdp) && !meta.portid) {
+			void *orig_data = rxbuf->frag + pkt_off;
 			unsigned int dma_off;
-			void *hard_start;
+			struct xdp_buff xdp;
 			int act;
 
-			hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
+			xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
+			xdp.data = orig_data;
+			xdp.data_meta = orig_data;
+			xdp.data_end = orig_data + pkt_len;
+
+			act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+			pkt_len -= xdp.data - orig_data;
+			pkt_off += xdp.data - orig_data;
 
-			act = nfp_net_run_xdp(xdp_prog, rxbuf->frag, hard_start,
-					      &pkt_off, &pkt_len);
 			switch (act) {
 			case XDP_PASS:
+				meta_len_xdp = xdp.data - xdp.data_meta;
 				break;
 			case XDP_TX:
 				dma_off = pkt_off - NFP_NET_RX_BUF_HEADROOM;
@@ -1784,6 +1773,8 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 		if (rxd->rxd.flags & PCIE_DESC_RX_VLAN)
 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
 					       le16_to_cpu(rxd->rxd.vlan));
+		if (meta_len_xdp)
+			skb_metadata_set(skb, meta_len_xdp);
 
 		napi_gro_receive(&rx_ring->r_vec->napi, skb);
 	}
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 994a83a1f0a5..a235e8881af9 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -1813,12 +1813,12 @@ static int nv_alloc_rx(struct net_device *dev)
 		struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + NV_RX_ALLOC_PAD);
 		if (skb) {
 			np->put_rx_ctx->skb = skb;
-			np->put_rx_ctx->dma = pci_map_single(np->pci_dev,
+			np->put_rx_ctx->dma = dma_map_single(&np->pci_dev->dev,
 							     skb->data,
 							     skb_tailroom(skb),
-							     PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(np->pci_dev,
-						  np->put_rx_ctx->dma)) {
+							     DMA_FROM_DEVICE);
+			if (unlikely(dma_mapping_error(&np->pci_dev->dev,
+						       np->put_rx_ctx->dma))) {
 				kfree_skb(skb);
 				goto packet_dropped;
 			}
@@ -1854,12 +1854,12 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
 		struct sk_buff *skb = netdev_alloc_skb(dev, np->rx_buf_sz + NV_RX_ALLOC_PAD);
 		if (skb) {
 			np->put_rx_ctx->skb = skb;
-			np->put_rx_ctx->dma = pci_map_single(np->pci_dev,
+			np->put_rx_ctx->dma = dma_map_single(&np->pci_dev->dev,
 							     skb->data,
 							     skb_tailroom(skb),
-							     PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(np->pci_dev,
-						  np->put_rx_ctx->dma)) {
+							     DMA_FROM_DEVICE);
+			if (unlikely(dma_mapping_error(&np->pci_dev->dev,
+						       np->put_rx_ctx->dma))) {
 				kfree_skb(skb);
 				goto packet_dropped;
 			}
@@ -1977,9 +1977,9 @@ static void nv_unmap_txskb(struct fe_priv *np, struct nv_skb_map *tx_skb)
 {
 	if (tx_skb->dma) {
 		if (tx_skb->dma_single)
-			pci_unmap_single(np->pci_dev, tx_skb->dma,
+			dma_unmap_single(&np->pci_dev->dev, tx_skb->dma,
 					 tx_skb->dma_len,
-					 PCI_DMA_TODEVICE);
+					 DMA_TO_DEVICE);
 		else
 			pci_unmap_page(np->pci_dev, tx_skb->dma,
 				       tx_skb->dma_len,
@@ -2047,10 +2047,10 @@ static void nv_drain_rx(struct net_device *dev)
 		}
 		wmb();
 		if (np->rx_skb[i].skb) {
-			pci_unmap_single(np->pci_dev, np->rx_skb[i].dma,
+			dma_unmap_single(&np->pci_dev->dev, np->rx_skb[i].dma,
 					 (skb_end_pointer(np->rx_skb[i].skb) -
-					  np->rx_skb[i].skb->data),
-					 PCI_DMA_FROMDEVICE);
+					 np->rx_skb[i].skb->data),
+					 DMA_FROM_DEVICE);
 			dev_kfree_skb(np->rx_skb[i].skb);
 			np->rx_skb[i].skb = NULL;
 		}
@@ -2224,10 +2224,11 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		prev_tx = put_tx;
 		prev_tx_ctx = np->put_tx_ctx;
 		bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
-		np->put_tx_ctx->dma = pci_map_single(np->pci_dev, skb->data + offset, bcnt,
-						PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(np->pci_dev,
-					  np->put_tx_ctx->dma)) {
+		np->put_tx_ctx->dma = dma_map_single(&np->pci_dev->dev,
+						     skb->data + offset, bcnt,
+						     DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(&np->pci_dev->dev,
+					       np->put_tx_ctx->dma))) {
 			/* on DMA mapping error - drop the packet */
 			dev_kfree_skb_any(skb);
 			u64_stats_update_begin(&np->swstats_tx_syncp);
@@ -2267,7 +2268,8 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 							frag, offset,
 							bcnt,
 							DMA_TO_DEVICE);
-			if (dma_mapping_error(&np->pci_dev->dev, np->put_tx_ctx->dma)) {
+			if (unlikely(dma_mapping_error(&np->pci_dev->dev,
+						       np->put_tx_ctx->dma))) {
 
 				/* Unwind the mapped fragments */
 				do {
@@ -2373,10 +2375,11 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 		prev_tx = put_tx;
 		prev_tx_ctx = np->put_tx_ctx;
 		bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
-		np->put_tx_ctx->dma = pci_map_single(np->pci_dev, skb->data + offset, bcnt,
-						PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(np->pci_dev,
-					  np->put_tx_ctx->dma)) {
+		np->put_tx_ctx->dma = dma_map_single(&np->pci_dev->dev,
+						     skb->data + offset, bcnt,
+						     DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(&np->pci_dev->dev,
+					       np->put_tx_ctx->dma))) {
 			/* on DMA mapping error - drop the packet */
 			dev_kfree_skb_any(skb);
 			u64_stats_update_begin(&np->swstats_tx_syncp);
@@ -2417,7 +2420,8 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 							bcnt,
 							DMA_TO_DEVICE);
 
-			if (dma_mapping_error(&np->pci_dev->dev, np->put_tx_ctx->dma)) {
+			if (unlikely(dma_mapping_error(&np->pci_dev->dev,
+						       np->put_tx_ctx->dma))) {
 
 				/* Unwind the mapped fragments */
 				do {
@@ -2810,9 +2814,9 @@ static int nv_rx_process(struct net_device *dev, int limit)
 		 * TODO: check if a prefetch of the first cacheline improves
 		 * the performance.
 		 */
-		pci_unmap_single(np->pci_dev, np->get_rx_ctx->dma,
-				np->get_rx_ctx->dma_len,
-				PCI_DMA_FROMDEVICE);
+		dma_unmap_single(&np->pci_dev->dev, np->get_rx_ctx->dma,
+				 np->get_rx_ctx->dma_len,
+				 DMA_FROM_DEVICE);
 		skb = np->get_rx_ctx->skb;
 		np->get_rx_ctx->skb = NULL;
 
@@ -2916,9 +2920,9 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 		 * TODO: check if a prefetch of the first cacheline improves
 		 * the performance.
 		 */
-		pci_unmap_single(np->pci_dev, np->get_rx_ctx->dma,
-				np->get_rx_ctx->dma_len,
-				PCI_DMA_FROMDEVICE);
+		dma_unmap_single(&np->pci_dev->dev, np->get_rx_ctx->dma,
+				 np->get_rx_ctx->dma_len,
+				 DMA_FROM_DEVICE);
 		skb = np->get_rx_ctx->skb;
 		np->get_rx_ctx->skb = NULL;
 
@@ -5070,11 +5074,11 @@ static int nv_loopback_test(struct net_device *dev)
 		ret = 0;
 		goto out;
 	}
-	test_dma_addr = pci_map_single(np->pci_dev, tx_skb->data,
+	test_dma_addr = dma_map_single(&np->pci_dev->dev, tx_skb->data,
 				       skb_tailroom(tx_skb),
-				       PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(np->pci_dev,
-				  test_dma_addr)) {
+				       DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(&np->pci_dev->dev,
+				       test_dma_addr))) {
 		dev_kfree_skb_any(tx_skb);
 		goto out;
 	}
@@ -5129,9 +5133,9 @@ static int nv_loopback_test(struct net_device *dev)
 		}
 	}
 
-	pci_unmap_single(np->pci_dev, test_dma_addr,
-		       (skb_end_pointer(tx_skb) - tx_skb->data),
-		       PCI_DMA_TODEVICE);
+	dma_unmap_single(&np->pci_dev->dev, test_dma_addr,
+			 (skb_end_pointer(tx_skb) - tx_skb->data),
+			 DMA_TO_DEVICE);
 	dev_kfree_skb_any(tx_skb);
  out:
 	/* stop engines */
diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c
index 482b85e4d665..77bc7cca8980 100644
--- a/drivers/net/ethernet/packetengines/hamachi.c
+++ b/drivers/net/ethernet/packetengines/hamachi.c
@@ -979,10 +979,8 @@ static int hamachi_open(struct net_device *dev)
 			   dev->name, readw(ioaddr + RxStatus), readw(ioaddr + TxStatus));
 	}
 	/* Set the timer to check for link beat. */
-	init_timer(&hmp->timer);
+	setup_timer(&hmp->timer, hamachi_timer, (unsigned long)dev);
 	hmp->timer.expires = RUN_AT((24*HZ)/10);			/* 2.4 sec. */
-	hmp->timer.data = (unsigned long)dev;
-	hmp->timer.function = hamachi_timer;				/* timer handler */
 	add_timer(&hmp->timer);
 
 	return 0;
diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c
index fa7770da6ef8..33c241f52a71 100644
--- a/drivers/net/ethernet/packetengines/yellowfin.c
+++ b/drivers/net/ethernet/packetengines/yellowfin.c
@@ -632,10 +632,8 @@ static int yellowfin_open(struct net_device *dev)
 	}
 
 	/* Set the timer to check for link beat. */
-	init_timer(&yp->timer);
+	setup_timer(&yp->timer, yellowfin_timer, (unsigned long)dev);
 	yp->timer.expires = jiffies + 3*HZ;
-	yp->timer.data = (unsigned long)dev;
-	yp->timer.function = yellowfin_timer;				/* timer handler */
 	add_timer(&yp->timer);
 out:
 	return rc;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index af106be8cc08..afd07ad91631 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -2069,6 +2069,12 @@ static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn,
 
 	num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs);
 
+	if (p_hwfn->mcp_info->func_info.protocol == QED_PCI_ETH_RDMA) {
+		DP_NOTICE(p_hwfn,
+			  "Current day drivers don't support RoCE & iWARP simultaneously on the same PF. Default to RoCE-only\n");
+		p_hwfn->hw_info.personality = QED_PCI_ETH_ROCE;
+	}
+
 	switch (p_hwfn->hw_info.personality) {
 	case QED_PCI_ETH_IWARP:
 		/* Each QP requires one connection */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 9d989c96278c..8fc9c811f6e3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -41,6 +41,7 @@
 #include "qed_rdma.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
+#include "qed_ooo.h"
 
 #define QED_IWARP_ORD_DEFAULT		32
 #define QED_IWARP_IRD_DEFAULT		32
@@ -119,6 +120,13 @@ static void qed_iwarp_cid_cleaned(struct qed_hwfn *p_hwfn, u32 cid)
 	spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
 }
 
+void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn,
+			      struct iwarp_init_func_params *p_ramrod)
+{
+	p_ramrod->ll2_ooo_q_index = RESC_START(p_hwfn, QED_LL2_QUEUE) +
+				    p_hwfn->p_rdma_info->iwarp.ll2_ooo_handle;
+}
+
 static int qed_iwarp_alloc_cid(struct qed_hwfn *p_hwfn, u32 *cid)
 {
 	int rc;
@@ -1725,6 +1733,14 @@ qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
 
 	memset(&cm_info, 0, sizeof(cm_info));
 	ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle;
+
+	/* Check if packet was received with errors... */
+	if (data->err_flags) {
+		DP_NOTICE(p_hwfn, "Error received on SYN packet: 0x%x\n",
+			  data->err_flags);
+		goto err;
+	}
+
 	if (GET_FIELD(data->parse_flags,
 		      PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED) &&
 	    GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMERROR)) {
@@ -1876,6 +1892,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
 	}
 
+	if (iwarp_info->ll2_ooo_handle != QED_IWARP_HANDLE_INVAL) {
+		rc = qed_ll2_terminate_connection(p_hwfn,
+						  iwarp_info->ll2_ooo_handle);
+		if (rc)
+			DP_INFO(p_hwfn, "Failed to terminate ooo connection\n");
+
+		qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_ooo_handle);
+		iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
+	}
+
 	qed_llh_remove_mac_filter(p_hwfn,
 				  p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr);
 	return rc;
@@ -1927,10 +1953,12 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn,
 	struct qed_iwarp_info *iwarp_info;
 	struct qed_ll2_acquire_data data;
 	struct qed_ll2_cbs cbs;
+	u16 n_ooo_bufs;
 	int rc = 0;
 
 	iwarp_info = &p_hwfn->p_rdma_info->iwarp;
 	iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
+	iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
 
 	iwarp_info->max_mtu = params->max_mtu;
 
@@ -1978,6 +2006,29 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn,
 	if (rc)
 		goto err;
 
+	/* Start OOO connection */
+	data.input.conn_type = QED_LL2_TYPE_OOO;
+	data.input.mtu = params->max_mtu;
+
+	n_ooo_bufs = (QED_IWARP_MAX_OOO * QED_IWARP_RCV_WND_SIZE_DEF) /
+		     iwarp_info->max_mtu;
+	n_ooo_bufs = min_t(u32, n_ooo_bufs, QED_IWARP_LL2_OOO_MAX_RX_SIZE);
+
+	data.input.rx_num_desc = n_ooo_bufs;
+	data.input.rx_num_ooo_buffers = n_ooo_bufs;
+
+	data.input.tx_max_bds_per_packet = 1;	/* will never be fragmented */
+	data.input.tx_num_desc = QED_IWARP_LL2_OOO_DEF_TX_SIZE;
+	data.p_connection_handle = &iwarp_info->ll2_ooo_handle;
+
+	rc = qed_ll2_acquire_connection(p_hwfn, &data);
+	if (rc)
+		goto err;
+
+	rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_ooo_handle);
+	if (rc)
+		goto err;
+
 	return rc;
 err:
 	qed_iwarp_ll2_stop(p_hwfn, p_ptt);
@@ -2014,6 +2065,7 @@ int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 
 	qed_spq_register_async_cb(p_hwfn, PROTOCOLID_IWARP,
 				  qed_iwarp_async_event);
+	qed_ooo_setup(p_hwfn);
 
 	return qed_iwarp_ll2_start(p_hwfn, params, p_ptt);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
index 148ef3c33a5d..9e2bfde894df 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h
@@ -47,7 +47,12 @@ enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state);
 #define QED_IWARP_LL2_SYN_TX_SIZE       (128)
 #define QED_IWARP_LL2_SYN_RX_SIZE       (256)
 #define QED_IWARP_MAX_SYN_PKT_SIZE      (128)
-#define QED_IWARP_HANDLE_INVAL			(0xff)
+
+#define QED_IWARP_LL2_OOO_DEF_TX_SIZE   (256)
+#define QED_IWARP_MAX_OOO		(16)
+#define QED_IWARP_LL2_OOO_MAX_RX_SIZE   (16384)
+
+#define QED_IWARP_HANDLE_INVAL		(0xff)
 
 struct qed_iwarp_ll2_buff {
 	void *data;
@@ -67,6 +72,7 @@ struct qed_iwarp_info {
 	u8 crc_needed;
 	u8 tcp_flags;
 	u8 ll2_syn_handle;
+	u8 ll2_ooo_handle;
 	u8 peer2peer;
 	enum mpa_negotiation_mode mpa_rev;
 	enum mpa_rtr_type rtr_type;
@@ -147,6 +153,9 @@ int qed_iwarp_alloc(struct qed_hwfn *p_hwfn);
 int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 		    struct qed_rdma_start_in_params *params);
 
+void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn,
+			      struct iwarp_init_func_params *p_ramrod);
+
 int qed_iwarp_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 
 void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index c06ad4f0758e..250afa5486cf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -413,6 +413,7 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn,
 				  struct qed_ll2_comp_rx_data *data)
 {
 	data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_fp.parse_flags.flags);
+	data->err_flags = le16_to_cpu(p_cqe->rx_cqe_fp.err_flags.flags);
 	data->length.packet_length =
 	    le16_to_cpu(p_cqe->rx_cqe_fp.packet_length);
 	data->vlan = le16_to_cpu(p_cqe->rx_cqe_fp.vlan);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 376485d99357..8b99c7d26f34 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1691,12 +1691,12 @@ qed_mcp_get_shmem_proto_mfw(struct qed_hwfn *p_hwfn,
 	case FW_MB_PARAM_GET_PF_RDMA_ROCE:
 		*p_proto = QED_PCI_ETH_ROCE;
 		break;
+	case FW_MB_PARAM_GET_PF_RDMA_IWARP:
+		*p_proto = QED_PCI_ETH_IWARP;
+		break;
 	case FW_MB_PARAM_GET_PF_RDMA_BOTH:
-		DP_NOTICE(p_hwfn,
-			  "Current day drivers don't support RoCE & iWARP. Default to RoCE-only\n");
-		*p_proto = QED_PCI_ETH_ROCE;
+		*p_proto = QED_PCI_ETH_RDMA;
 		break;
-	case FW_MB_PARAM_GET_PF_RDMA_IWARP:
 	default:
 		DP_NOTICE(p_hwfn,
 			  "MFW answers GET_PF_RDMA_PROTOCOL but param is %08x\n",
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index 6fb99518a61f..c8c4b3940564 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -156,7 +156,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn,
 		return rc;
 
 	p_hwfn->p_rdma_info = p_rdma_info;
-	p_rdma_info->proto = PROTOCOLID_ROCE;
+	if (QED_IS_IWARP_PERSONALITY(p_hwfn))
+		p_rdma_info->proto = PROTOCOLID_IWARP;
+	else
+		p_rdma_info->proto = PROTOCOLID_ROCE;
 
 	num_cons = qed_cxt_get_proto_cid_count(p_hwfn, p_rdma_info->proto,
 					       NULL);
@@ -206,11 +209,11 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn,
 		goto free_pd_map;
 	}
 
-	/* Allocate bitmap for cq's. The maximum number of CQs is bounded to
-	 * twice the number of QPs.
+	/* Allocate bitmap for cq's. The maximum number of CQs is bound to
+	 * the number of connections we support. (num_qps in iWARP or
+	 * num_qps/2 in RoCE).
 	 */
-	rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cq_map,
-				 p_rdma_info->num_qps * 2, "CQ");
+	rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cq_map, num_cons, "CQ");
 	if (rc) {
 		DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
 			   "Failed to allocate cq bitmap, rc = %d\n", rc);
@@ -219,10 +222,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn,
 
 	/* Allocate bitmap for toggle bit for cq icids
 	 * We toggle the bit every time we create or resize cq for a given icid.
-	 * The maximum number of CQs is bounded to  twice the number of QPs.
+	 * Size needs to equal the size of the cq bmap.
 	 */
 	rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->toggle_bits,
-				 p_rdma_info->num_qps * 2, "Toggle");
+				 num_cons, "Toggle");
 	if (rc) {
 		DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
 			   "Failed to allocate toogle bits, rc = %d\n", rc);
@@ -548,10 +551,13 @@ static int qed_rdma_start_fw(struct qed_hwfn *p_hwfn,
 	if (rc)
 		return rc;
 
-	if (QED_IS_IWARP_PERSONALITY(p_hwfn))
+	if (QED_IS_IWARP_PERSONALITY(p_hwfn)) {
+		qed_iwarp_init_fw_ramrod(p_hwfn,
+					 &p_ent->ramrod.iwarp_init_func.iwarp);
 		p_ramrod = &p_ent->ramrod.iwarp_init_func.rdma;
-	else
+	} else {
 		p_ramrod = &p_ent->ramrod.roce_init_func.rdma;
+	}
 
 	p_params_header = &p_ramrod->params_header;
 	p_params_header->cnq_start_offset = (u8)RESC_START(p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 46d0c3cb83a5..a1d33f35aad3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -377,6 +377,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		p_ramrod->personality = PERSONALITY_ISCSI;
 		break;
 	case QED_PCI_ETH_ROCE:
+	case QED_PCI_ETH_IWARP:
 		p_ramrod->personality = PERSONALITY_RDMA_AND_ETH;
 		break;
 	default:
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 6fc854b120b0..48ec4c56cddf 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 
 	xdp.data_hard_start = page_address(bd->data);
 	xdp.data = xdp.data_hard_start + *data_offset;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 
 	/* Queues always have a full reset currently, so for the time
diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c
index 2991179c2fd0..05479d435469 100644
--- a/drivers/net/ethernet/qlogic/qla3xxx.c
+++ b/drivers/net/ethernet/qlogic/qla3xxx.c
@@ -3891,10 +3891,8 @@ static int ql3xxx_probe(struct pci_dev *pdev,
 	INIT_DELAYED_WORK(&qdev->tx_timeout_work, ql_tx_timeout_work);
 	INIT_DELAYED_WORK(&qdev->link_state_work, ql_link_state_machine_work);
 
-	init_timer(&qdev->adapter_timer);
-	qdev->adapter_timer.function = ql3xxx_timer;
+	setup_timer(&qdev->adapter_timer, ql3xxx_timer, (unsigned long)qdev);
 	qdev->adapter_timer.expires = jiffies + HZ * 2;	/* two second delay */
-	qdev->adapter_timer.data = (unsigned long)qdev;
 
 	if (!cards_found) {
 		pr_alert("%s\n", DRV_STRING);
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 1e33aea59f50..8403eea08d0e 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -175,7 +175,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev,
 	if (err)
 		goto err1;
 
-	err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL);
+	err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL, extack);
 	if (err)
 		goto err2;
 
diff --git a/drivers/net/ethernet/realtek/atp.c b/drivers/net/ethernet/realtek/atp.c
index bed34684994f..bdc3833fab7e 100644
--- a/drivers/net/ethernet/realtek/atp.c
+++ b/drivers/net/ethernet/realtek/atp.c
@@ -438,10 +438,8 @@ static int net_open(struct net_device *dev)
 
 	hardware_init(dev);
 
-	init_timer(&lp->timer);
+	setup_timer(&lp->timer, atp_timed_checker, (unsigned long)dev);
 	lp->timer.expires = jiffies + TIMED_CHECKER;
-	lp->timer.data = (unsigned long)dev;
-	lp->timer.function = atp_timed_checker;    /* timer handler */
 	add_timer(&lp->timer);
 
 	netif_start_queue(dev);
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index fdf30bfa403b..a8822a756e08 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -403,8 +403,9 @@ static void ravb_emac_init(struct net_device *ndev)
 	/* Receive frame limit set register */
 	ravb_write(ndev, ndev->mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN, RFLR);
 
-	/* PAUSE prohibition */
+	/* EMAC Mode: PAUSE prohibition; Duplex; RX Checksum; TX; RX */
 	ravb_write(ndev, ECMR_ZPF | (priv->duplex ? ECMR_DM : 0) |
+		   (ndev->features & NETIF_F_RXCSUM ? ECMR_RCSC : 0) |
 		   ECMR_TE | ECMR_RE, ECMR);
 
 	ravb_set_rate(ndev);
@@ -520,6 +521,19 @@ static void ravb_get_tx_tstamp(struct net_device *ndev)
 	}
 }
 
+static void ravb_rx_csum(struct sk_buff *skb)
+{
+	u8 *hw_csum;
+
+	/* The hardware checksum is 2 bytes appended to packet data */
+	if (unlikely(skb->len < 2))
+		return;
+	hw_csum = skb_tail_pointer(skb) - 2;
+	skb->csum = csum_unfold((__force __sum16)get_unaligned_le16(hw_csum));
+	skb->ip_summed = CHECKSUM_COMPLETE;
+	skb_trim(skb, skb->len - 2);
+}
+
 /* Packet receive function for Ethernet AVB */
 static bool ravb_rx(struct net_device *ndev, int *quota, int q)
 {
@@ -587,8 +601,11 @@ static bool ravb_rx(struct net_device *ndev, int *quota, int q)
 				ts.tv_nsec = le32_to_cpu(desc->ts_n);
 				shhwtstamps->hwtstamp = timespec64_to_ktime(ts);
 			}
+
 			skb_put(skb, pkt_len);
 			skb->protocol = eth_type_trans(skb, ndev);
+			if (ndev->features & NETIF_F_RXCSUM)
+				ravb_rx_csum(skb);
 			napi_gro_receive(&priv->napi[q], skb);
 			stats->rx_packets++;
 			stats->rx_bytes += pkt_len;
@@ -1842,6 +1859,38 @@ static int ravb_do_ioctl(struct net_device *ndev, struct ifreq *req, int cmd)
 	return phy_mii_ioctl(phydev, req, cmd);
 }
 
+static void ravb_set_rx_csum(struct net_device *ndev, bool enable)
+{
+	struct ravb_private *priv = netdev_priv(ndev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	/* Disable TX and RX */
+	ravb_rcv_snd_disable(ndev);
+
+	/* Modify RX Checksum setting */
+	ravb_modify(ndev, ECMR, ECMR_RCSC, enable ? ECMR_RCSC : 0);
+
+	/* Enable TX and RX */
+	ravb_rcv_snd_enable(ndev);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int ravb_set_features(struct net_device *ndev,
+			     netdev_features_t features)
+{
+	netdev_features_t changed = ndev->features ^ features;
+
+	if (changed & NETIF_F_RXCSUM)
+		ravb_set_rx_csum(ndev, features & NETIF_F_RXCSUM);
+
+	ndev->features = features;
+
+	return 0;
+}
+
 static const struct net_device_ops ravb_netdev_ops = {
 	.ndo_open		= ravb_open,
 	.ndo_stop		= ravb_close,
@@ -1853,6 +1902,7 @@ static const struct net_device_ops ravb_netdev_ops = {
 	.ndo_do_ioctl		= ravb_do_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_set_features	= ravb_set_features,
 };
 
 /* MDIO bus init function */
@@ -2004,6 +2054,9 @@ static int ravb_probe(struct platform_device *pdev)
 	if (!ndev)
 		return -ENOMEM;
 
+	ndev->features = NETIF_F_RXCSUM;
+	ndev->hw_features = NETIF_F_RXCSUM;
+
 	pm_runtime_enable(&pdev->dev);
 	pm_runtime_get_sync(&pdev->dev);
 
diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c
index 40bd88362e3d..cb61247b0526 100644
--- a/drivers/net/ethernet/sis/sis900.c
+++ b/drivers/net/ethernet/sis/sis900.c
@@ -1065,10 +1065,8 @@ sis900_open(struct net_device *net_dev)
 
 	/* Set the timer to switch to check for link beat and perhaps switch
 	   to an alternate media type. */
-	init_timer(&sis_priv->timer);
+	setup_timer(&sis_priv->timer, sis900_timer, (unsigned long)net_dev);
 	sis_priv->timer.expires = jiffies + HZ;
-	sis_priv->timer.data = (unsigned long)net_dev;
-	sis_priv->timer.function = sis900_timer;
 	add_timer(&sis_priv->timer);
 
 	return 0;
diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c
index 6a0e1d4b597c..2a9724898fcf 100644
--- a/drivers/net/ethernet/smsc/epic100.c
+++ b/drivers/net/ethernet/smsc/epic100.c
@@ -739,10 +739,8 @@ static int epic_open(struct net_device *dev)
 
 	/* Set the timer to switch to check for link beat and perhaps switch
 	   to an alternate media type. */
-	init_timer(&ep->timer);
+	setup_timer(&ep->timer, epic_timer, (unsigned long)dev);
 	ep->timer.expires = jiffies + 3*HZ;
-	ep->timer.data = (unsigned long)dev;
-	ep->timer.function = epic_timer;				/* timer handler */
 	add_timer(&ep->timer);
 
 	return rc;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 1763e48c84e2..f41661a04f23 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2217,10 +2217,8 @@ static void stmmac_init_tx_coalesce(struct stmmac_priv *priv)
 {
 	priv->tx_coal_frames = STMMAC_TX_FRAMES;
 	priv->tx_coal_timer = STMMAC_COAL_TX_TIMER;
-	init_timer(&priv->txtimer);
+	setup_timer(&priv->txtimer, stmmac_tx_timer, (unsigned long)priv);
 	priv->txtimer.expires = STMMAC_COAL_TIMER(priv->tx_coal_timer);
-	priv->txtimer.data = (unsigned long)priv;
-	priv->txtimer.function = stmmac_tx_timer;
 	add_timer(&priv->txtimer);
 }
 
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index 382993c1561c..a74d78f64af9 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -5039,10 +5039,7 @@ static int cas_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	spin_lock_init(&cp->stat_lock[N_TX_RINGS]);
 	mutex_init(&cp->pm_mutex);
 
-	init_timer(&cp->link_timer);
-	cp->link_timer.function = cas_link_timer;
-	cp->link_timer.data = (unsigned long) cp;
-
+	setup_timer(&cp->link_timer, cas_link_timer, (unsigned long)cp);
 #if 1
 	/* Just in case the implementation of atomic operations
 	 * change so that an explicit initialization is necessary.
diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c
index 5b56c24b6ed2..5feeaa9f0a9e 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -307,7 +307,7 @@ static int vsw_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 
 	/* Get (or create) the vnet associated with this port */
 	vp = vsw_get_vnet(hp, vdev->mp, &handle);
-	if (unlikely(IS_ERR(vp))) {
+	if (IS_ERR(vp)) {
 		err = PTR_ERR(vp);
 		pr_err("Failed to get vnet for vsw-port\n");
 		mdesc_release(hp);
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 6a4e8e1bbd90..bde19b307d0d 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -6123,10 +6123,8 @@ static int niu_open(struct net_device *dev)
 
 	err = niu_init_hw(np);
 	if (!err) {
-		init_timer(&np->timer);
+		setup_timer(&np->timer, niu_timer, (unsigned long)np);
 		np->timer.expires = jiffies + HZ;
-		np->timer.data = (unsigned long) np;
-		np->timer.function = niu_timer;
 
 		err = niu_enable_interrupts(np, 1);
 		if (err)
diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c
index fa607d062cb3..b75ab8f44968 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -2910,9 +2910,7 @@ static int gem_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	gp->msg_enable = DEFAULT_MSG;
 
-	init_timer(&gp->link_timer);
-	gp->link_timer.function = gem_link_timer;
-	gp->link_timer.data = (unsigned long) gp;
+	setup_timer(&gp->link_timer, gem_link_timer, (unsigned long)gp);
 
 	INIT_WORK(&gp->reset_task, gem_reset_task);
 
diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index ddd43e09111e..cd1185e66133 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -859,9 +859,7 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 	cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
 	cpsw_ale_control_set(ale, 0, ALE_CLEAR, 1);
 
-	init_timer(&ale->timer);
-	ale->timer.data	    = (unsigned long)ale;
-	ale->timer.function = cpsw_ale_timer;
+	setup_timer(&ale->timer, cpsw_ale_timer, (unsigned long)ale);
 	if (ale->ageout) {
 		ale->timer.expires = jiffies + ale->ageout;
 		add_timer(&ale->timer);
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 28cb38af1a34..4ad821655e51 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -3616,9 +3616,8 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev,
 	}
 	spin_unlock_bh(&gbe_dev->hw_stats_lock);
 
-	init_timer(&gbe_dev->timer);
-	gbe_dev->timer.data	 = (unsigned long)gbe_dev;
-	gbe_dev->timer.function = netcp_ethss_timer;
+	setup_timer(&gbe_dev->timer, netcp_ethss_timer,
+		    (unsigned long)gbe_dev);
 	gbe_dev->timer.expires	 = jiffies + GBE_TIMER_INTERVAL;
 	add_timer(&gbe_dev->timer);
 	*inst_priv = gbe_dev;
diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c
index cec9e70ab995..a913538d3213 100644
--- a/drivers/net/ethernet/toshiba/spider_net.c
+++ b/drivers/net/ethernet/toshiba/spider_net.c
@@ -2256,16 +2256,14 @@ spider_net_setup_netdev(struct spider_net_card *card)
 
 	pci_set_drvdata(card->pdev, netdev);
 
-	init_timer(&card->tx_timer);
-	card->tx_timer.function =
-		(void (*)(unsigned long)) spider_net_cleanup_tx_ring;
-	card->tx_timer.data = (unsigned long) card;
+	setup_timer(&card->tx_timer,
+		    (void(*)(unsigned long))spider_net_cleanup_tx_ring,
+		    (unsigned long)card);
 	netdev->irq = card->pdev->irq;
 
 	card->aneg_count = 0;
-	init_timer(&card->aneg_timer);
-	card->aneg_timer.function = spider_net_link_phy;
-	card->aneg_timer.data = (unsigned long) card;
+	setup_timer(&card->aneg_timer, spider_net_link_phy,
+		    (unsigned long)card);
 
 	netif_napi_add(netdev, &card->napi,
 		       spider_net_poll, SPIDER_NET_NAPI_WEIGHT);
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 021a8ec411ab..97fe8dfb602d 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -623,9 +623,7 @@ static int sixpack_open(struct tty_struct *tty)
 
 	netif_start_queue(dev);
 
-	init_timer(&sp->tx_t);
-	sp->tx_t.function = sp_xmit_on_air;
-	sp->tx_t.data = (unsigned long) sp;
+	setup_timer(&sp->tx_t, sp_xmit_on_air, (unsigned long)sp);
 
 	init_timer(&sp->resync_t);
 
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 1503f10122f7..1e62d00732f2 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -840,6 +840,7 @@ static int epp_open(struct net_device *dev)
 	unsigned char tmp[128];
 	unsigned char stat;
 	unsigned long tstart;
+	struct pardev_cb par_cb;
 	
         if (!pp) {
                 printk(KERN_ERR "%s: parport at 0x%lx unknown\n", bc_drvname, dev->base_addr);
@@ -859,8 +860,21 @@ static int epp_open(struct net_device *dev)
                 return -EIO;
 	}
 	memset(&bc->modem, 0, sizeof(bc->modem));
-        bc->pdev = parport_register_device(pp, dev->name, NULL, epp_wakeup, 
-					   NULL, PARPORT_DEV_EXCL, dev);
+	memset(&par_cb, 0, sizeof(par_cb));
+	par_cb.wakeup = epp_wakeup;
+	par_cb.private = (void *)dev;
+	par_cb.flags = PARPORT_DEV_EXCL;
+	for (i = 0; i < NR_PORTS; i++)
+		if (baycom_device[i] == dev)
+			break;
+
+	if (i == NR_PORTS) {
+		pr_err("%s: no device found\n", bc_drvname);
+		parport_put_port(pp);
+		return -ENODEV;
+	}
+
+	bc->pdev = parport_register_dev_model(pp, dev->name, &par_cb, i);
 	parport_put_port(pp);
         if (!bc->pdev) {
                 printk(KERN_ERR "%s: cannot register parport at 0x%lx\n", bc_drvname, pp->base);
@@ -1185,6 +1199,23 @@ MODULE_LICENSE("GPL");
 
 /* --------------------------------------------------------------------- */
 
+static int baycom_epp_par_probe(struct pardevice *par_dev)
+{
+	struct device_driver *drv = par_dev->dev.driver;
+	int len = strlen(drv->name);
+
+	if (strncmp(par_dev->name, drv->name, len))
+		return -ENODEV;
+
+	return 0;
+}
+
+static struct parport_driver baycom_epp_par_driver = {
+	.name = "bce",
+	.probe = baycom_epp_par_probe,
+	.devmodel = true,
+};
+
 static void __init baycom_epp_dev_setup(struct net_device *dev)
 {
 	struct baycom_state *bc = netdev_priv(dev);
@@ -1204,10 +1235,15 @@ static void __init baycom_epp_dev_setup(struct net_device *dev)
 
 static int __init init_baycomepp(void)
 {
-	int i, found = 0;
+	int i, found = 0, ret;
 	char set_hw = 1;
 
 	printk(bc_drvinfo);
+
+	ret = parport_register_driver(&baycom_epp_par_driver);
+	if (ret)
+		return ret;
+
 	/*
 	 * register net devices
 	 */
@@ -1241,7 +1277,12 @@ static int __init init_baycomepp(void)
 		found++;
 	}
 
-	return found ? 0 : -ENXIO;
+	if (found == 0) {
+		parport_unregister_driver(&baycom_epp_par_driver);
+		return -ENXIO;
+	}
+
+	return 0;
 }
 
 static void __exit cleanup_baycomepp(void)
@@ -1260,6 +1301,7 @@ static void __exit cleanup_baycomepp(void)
 				printk(paranoia_str, "cleanup_module");
 		}
 	}
+	parport_unregister_driver(&baycom_epp_par_driver);
 }
 
 module_init(init_baycomepp);
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index aec6c26563cf..54bf8e6e4a09 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -477,7 +477,8 @@ static void ax_encaps(struct net_device *dev, unsigned char *icp, int len)
 				  cmd = 0;
 				}
 				ax->crcauto = (cmd ? 0 : 1);
-				printk(KERN_INFO "mkiss: %s: crc mode %s %d\n", ax->dev->name, (len) ? "set to" : "is", cmd);
+				printk(KERN_INFO "mkiss: %s: crc mode set to %d\n",
+				       ax->dev->name, cmd);
 			}
 			spin_unlock_bh(&ax->buflock);
 			netif_start_queue(dev);
diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c
index 71ddadbf2368..76cc140774a2 100644
--- a/drivers/net/hippi/rrunner.c
+++ b/drivers/net/hippi/rrunner.c
@@ -1229,10 +1229,8 @@ static int rr_open(struct net_device *dev)
 
 	/* Set the timer to switch to check for link beat and perhaps switch
 	   to an alternate media type. */
-	init_timer(&rrpriv->timer);
+	setup_timer(&rrpriv->timer, rr_timer, (unsigned long)dev);
 	rrpriv->timer.expires = RUN_AT(5*HZ);           /* 5 sec. watchdog */
-	rrpriv->timer.data = (unsigned long)dev;
-	rrpriv->timer.function = rr_timer;               /* timer handler */
 	add_timer(&rrpriv->timer);
 
 	netif_start_queue(dev);
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 5176be76ca7d..6f550e15a41c 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -686,6 +686,8 @@ struct netvsc_ethtool_stats {
 	unsigned long tx_busy;
 	unsigned long tx_send_full;
 	unsigned long rx_comp_busy;
+	unsigned long stop_queue;
+	unsigned long wake_queue;
 };
 
 struct netvsc_vf_pcpu_stats {
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 8d5077fb0492..6e5194916bbe 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -484,7 +484,7 @@ static int netvsc_connect_vsp(struct hv_device *device,
 			      struct netvsc_device *net_device,
 			      const struct netvsc_device_info *device_info)
 {
-	const u32 ver_list[] = {
+	static const u32 ver_list[] = {
 		NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2,
 		NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5
 	};
@@ -609,6 +609,7 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long)desc->trans_id;
 	struct net_device *ndev = hv_get_drvdata(device);
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
 	struct vmbus_channel *channel = device->channel;
 	u16 q_idx = 0;
 	int queue_sends;
@@ -643,8 +644,10 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
 
 	if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) &&
 	    (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER ||
-	     queue_sends < 1))
+	     queue_sends < 1)) {
 		netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx));
+		ndev_ctx->eth_stats.wake_queue++;
+	}
 }
 
 static void netvsc_send_completion(struct netvsc_device *net_device,
@@ -749,6 +752,7 @@ static inline int netvsc_send_pkt(
 		&net_device->chan_table[packet->q_idx];
 	struct vmbus_channel *out_channel = nvchan->channel;
 	struct net_device *ndev = hv_get_drvdata(device);
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
 	struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
 	u64 req_id;
 	int ret;
@@ -789,12 +793,16 @@ static inline int netvsc_send_pkt(
 	if (ret == 0) {
 		atomic_inc_return(&nvchan->queue_sends);
 
-		if (ring_avail < RING_AVAIL_PERCENT_LOWATER)
+		if (ring_avail < RING_AVAIL_PERCENT_LOWATER) {
 			netif_tx_stop_queue(txq);
+			ndev_ctx->eth_stats.stop_queue++;
+		}
 	} else if (ret == -EAGAIN) {
 		netif_tx_stop_queue(txq);
+		ndev_ctx->eth_stats.stop_queue++;
 		if (atomic_read(&nvchan->queue_sends) < 1) {
 			netif_tx_wake_queue(txq);
+			ndev_ctx->eth_stats.wake_queue++;
 			ret = -ENOSPC;
 		}
 	} else {
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index a32ae02e1b6c..dfb986421ec6 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1126,6 +1126,8 @@ static const struct {
 	{ "tx_busy",	  offsetof(struct netvsc_ethtool_stats, tx_busy) },
 	{ "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) },
 	{ "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) },
+	{ "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) },
+	{ "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) },
 }, vf_stats[] = {
 	{ "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) },
 	{ "vf_rx_bytes",   offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) },
@@ -1746,7 +1748,7 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
 		goto rx_handler_failed;
 	}
 
-	ret = netdev_upper_dev_link(vf_netdev, ndev);
+	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
 	if (ret != 0) {
 		netdev_err(vf_netdev,
 			   "can not set master device %s (err = %d)\n",
@@ -1935,6 +1937,12 @@ static int netvsc_probe(struct hv_device *dev,
 	/* We always need headroom for rndis header */
 	net->needed_headroom = RNDIS_AND_PPI_SIZE;
 
+	/* Initialize the number of queues to be 1, we may change it if more
+	 * channels are offered later.
+	 */
+	netif_set_real_num_tx_queues(net, 1);
+	netif_set_real_num_rx_queues(net, 1);
+
 	/* Notify the netvsc driver of the new device */
 	memset(&device_info, 0, sizeof(device_info));
 	device_info.ring_size = ring_size;
diff --git a/drivers/net/ieee802154/atusb.c b/drivers/net/ieee802154/atusb.c
index ef688518ad77..115fa3f37a86 100644
--- a/drivers/net/ieee802154/atusb.c
+++ b/drivers/net/ieee802154/atusb.c
@@ -21,6 +21,9 @@
  *
  * USB initialization is
  * Copyright (c) 2013 Alexander Aring <alex.aring@gmail.com>
+ *
+ * Busware HUL support is
+ * Copyright (c) 2017 Josef Filzmaier <j.filzmaier@gmx.at>
  */
 
 #include <linux/kernel.h>
@@ -42,9 +45,12 @@
 #define ATUSB_ALLOC_DELAY_MS	100	/* delay after failed allocation */
 #define ATUSB_TX_TIMEOUT_MS	200	/* on the air timeout */
 
+struct atusb_chip_data;
+
 struct atusb {
 	struct ieee802154_hw *hw;
 	struct usb_device *usb_dev;
+	struct atusb_chip_data *data;
 	int shutdown;			/* non-zero if shutting down */
 	int err;			/* set by first error */
 
@@ -65,6 +71,14 @@ struct atusb {
 	unsigned char fw_hw_type;	/* Firmware hardware type */
 };
 
+struct atusb_chip_data {
+	u16 t_channel_switch;
+	int rssi_base_val;
+
+	int (*set_channel)(struct ieee802154_hw*, u8, u8);
+	int (*set_txpower)(struct ieee802154_hw*, s32);
+};
+
 /* ----- USB commands without data ----------------------------------------- */
 
 /* To reduce the number of error checks in the code, we record the first error
@@ -163,6 +177,18 @@ static int atusb_write_subreg(struct atusb *atusb, uint8_t reg, uint8_t mask,
 	return ret;
 }
 
+static int atusb_read_subreg(struct atusb *lp,
+			     unsigned int addr, unsigned int mask,
+			     unsigned int shift)
+{
+	int rc;
+
+	rc = atusb_read_reg(lp, addr);
+	rc = (rc & mask) >> shift;
+
+	return rc;
+}
+
 static int atusb_get_and_clear_error(struct atusb *atusb)
 {
 	int err = atusb->err;
@@ -379,18 +405,6 @@ static int atusb_xmit(struct ieee802154_hw *hw, struct sk_buff *skb)
 	return ret;
 }
 
-static int atusb_channel(struct ieee802154_hw *hw, u8 page, u8 channel)
-{
-	struct atusb *atusb = hw->priv;
-	int ret;
-
-	ret = atusb_write_subreg(atusb, SR_CHANNEL, channel);
-	if (ret < 0)
-		return ret;
-	msleep(1);	/* @@@ ugly synchronization */
-	return 0;
-}
-
 static int atusb_ed(struct ieee802154_hw *hw, u8 *level)
 {
 	BUG_ON(!level);
@@ -475,6 +489,17 @@ static const s32 atusb_powers[ATUSB_MAX_TX_POWERS + 1] = {
 };
 
 static int
+atusb_txpower(struct ieee802154_hw *hw, s32 mbm)
+{
+	struct atusb *atusb = hw->priv;
+
+	if (atusb->data)
+		return atusb->data->set_txpower(hw, mbm);
+	else
+		return -ENOTSUPP;
+}
+
+static int
 atusb_set_txpower(struct ieee802154_hw *hw, s32 mbm)
 {
 	struct atusb *atusb = hw->priv;
@@ -488,12 +513,43 @@ atusb_set_txpower(struct ieee802154_hw *hw, s32 mbm)
 	return -EINVAL;
 }
 
+static int
+hulusb_set_txpower(struct ieee802154_hw *hw, s32 mbm)
+{
+	u32 i;
+
+	for (i = 0; i < hw->phy->supported.tx_powers_size; i++) {
+		if (hw->phy->supported.tx_powers[i] == mbm)
+			return atusb_write_subreg(hw->priv, SR_TX_PWR_212, i);
+	}
+
+	return -EINVAL;
+}
+
 #define ATUSB_MAX_ED_LEVELS 0xF
 static const s32 atusb_ed_levels[ATUSB_MAX_ED_LEVELS + 1] = {
 	-9100, -8900, -8700, -8500, -8300, -8100, -7900, -7700, -7500, -7300,
 	-7100, -6900, -6700, -6500, -6300, -6100,
 };
 
+#define AT86RF212_MAX_TX_POWERS 0x1F
+static const s32 at86rf212_powers[AT86RF212_MAX_TX_POWERS + 1] = {
+	500, 400, 300, 200, 100, 0, -100, -200, -300, -400, -500, -600, -700,
+	-800, -900, -1000, -1100, -1200, -1300, -1400, -1500, -1600, -1700,
+	-1800, -1900, -2000, -2100, -2200, -2300, -2400, -2500, -2600,
+};
+
+#define AT86RF2XX_MAX_ED_LEVELS 0xF
+static const s32 at86rf212_ed_levels_100[AT86RF2XX_MAX_ED_LEVELS + 1] = {
+	-10000, -9800, -9600, -9400, -9200, -9000, -8800, -8600, -8400, -8200,
+	-8000, -7800, -7600, -7400, -7200, -7000,
+};
+
+static const s32 at86rf212_ed_levels_98[AT86RF2XX_MAX_ED_LEVELS + 1] = {
+	-9800, -9600, -9400, -9200, -9000, -8800, -8600, -8400, -8200, -8000,
+	-7800, -7600, -7400, -7200, -7000, -6800,
+};
+
 static int
 atusb_set_cca_mode(struct ieee802154_hw *hw, const struct wpan_phy_cca *cca)
 {
@@ -527,6 +583,30 @@ atusb_set_cca_mode(struct ieee802154_hw *hw, const struct wpan_phy_cca *cca)
 	return atusb_write_subreg(atusb, SR_CCA_MODE, val);
 }
 
+static int hulusb_set_cca_ed_level(struct atusb *lp, int rssi_base_val)
+{
+	unsigned int cca_ed_thres;
+
+	cca_ed_thres = atusb_read_subreg(lp, SR_CCA_ED_THRES);
+
+	switch (rssi_base_val) {
+	case -98:
+		lp->hw->phy->supported.cca_ed_levels = at86rf212_ed_levels_98;
+		lp->hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(at86rf212_ed_levels_98);
+		lp->hw->phy->cca_ed_level = at86rf212_ed_levels_98[cca_ed_thres];
+		break;
+	case -100:
+		lp->hw->phy->supported.cca_ed_levels = at86rf212_ed_levels_100;
+		lp->hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(at86rf212_ed_levels_100);
+		lp->hw->phy->cca_ed_level = at86rf212_ed_levels_100[cca_ed_thres];
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	return 0;
+}
+
 static int
 atusb_set_cca_ed_level(struct ieee802154_hw *hw, s32 mbm)
 {
@@ -541,6 +621,92 @@ atusb_set_cca_ed_level(struct ieee802154_hw *hw, s32 mbm)
 	return -EINVAL;
 }
 
+static int atusb_channel(struct ieee802154_hw *hw, u8 page, u8 channel)
+{
+	struct atusb *atusb = hw->priv;
+	int ret = -ENOTSUPP;
+
+	if (atusb->data) {
+		ret = atusb->data->set_channel(hw, page, channel);
+		/* @@@ ugly synchronization */
+		msleep(atusb->data->t_channel_switch);
+	}
+
+	return ret;
+}
+
+static int atusb_set_channel(struct ieee802154_hw *hw, u8 page, u8 channel)
+{
+	struct atusb *atusb = hw->priv;
+	int ret;
+
+	ret = atusb_write_subreg(atusb, SR_CHANNEL, channel);
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
+static int hulusb_set_channel(struct ieee802154_hw *hw, u8 page, u8 channel)
+{
+	int rc;
+	int rssi_base_val;
+
+	struct atusb *lp = hw->priv;
+
+	if (channel == 0)
+		rc = atusb_write_subreg(lp, SR_SUB_MODE, 0);
+	else
+		rc = atusb_write_subreg(lp, SR_SUB_MODE, 1);
+	if (rc < 0)
+		return rc;
+
+	if (page == 0) {
+		rc = atusb_write_subreg(lp, SR_BPSK_QPSK, 0);
+		rssi_base_val = -100;
+	} else {
+		rc = atusb_write_subreg(lp, SR_BPSK_QPSK, 1);
+		rssi_base_val = -98;
+	}
+	if (rc < 0)
+		return rc;
+
+	rc = hulusb_set_cca_ed_level(lp, rssi_base_val);
+	if (rc < 0)
+		return rc;
+
+	/* This sets the symbol_duration according frequency on the 212.
+	 * TODO move this handling while set channel and page in cfg802154.
+	 * We can do that, this timings are according 802.15.4 standard.
+	 * If we do that in cfg802154, this is a more generic calculation.
+	 *
+	 * This should also protected from ifs_timer. Means cancel timer and
+	 * init with a new value. For now, this is okay.
+	 */
+	if (channel == 0) {
+		if (page == 0) {
+			/* SUB:0 and BPSK:0 -> BPSK-20 */
+			lp->hw->phy->symbol_duration = 50;
+		} else {
+			/* SUB:1 and BPSK:0 -> BPSK-40 */
+			lp->hw->phy->symbol_duration = 25;
+		}
+	} else {
+		if (page == 0)
+			/* SUB:0 and BPSK:1 -> OQPSK-100/200/400 */
+			lp->hw->phy->symbol_duration = 40;
+		else
+			/* SUB:1 and BPSK:1 -> OQPSK-250/500/1000 */
+			lp->hw->phy->symbol_duration = 16;
+	}
+
+	lp->hw->phy->lifs_period = IEEE802154_LIFS_PERIOD *
+				   lp->hw->phy->symbol_duration;
+	lp->hw->phy->sifs_period = IEEE802154_SIFS_PERIOD *
+				   lp->hw->phy->symbol_duration;
+
+	return atusb_write_subreg(lp, SR_CHANNEL, channel);
+}
+
 static int
 atusb_set_csma_params(struct ieee802154_hw *hw, u8 min_be, u8 max_be, u8 retries)
 {
@@ -559,6 +725,14 @@ atusb_set_csma_params(struct ieee802154_hw *hw, u8 min_be, u8 max_be, u8 retries
 }
 
 static int
+hulusb_set_lbt(struct ieee802154_hw *hw, bool on)
+{
+	struct atusb *atusb = hw->priv;
+
+	return atusb_write_subreg(atusb, SR_CSMA_LBT_MODE, on);
+}
+
+static int
 atusb_set_frame_retries(struct ieee802154_hw *hw, s8 retries)
 {
 	struct atusb *atusb = hw->priv;
@@ -593,6 +767,20 @@ atusb_set_promiscuous_mode(struct ieee802154_hw *hw, const bool on)
 	return 0;
 }
 
+struct atusb_chip_data atusb_chip_data = {
+	.t_channel_switch = 1,
+	.rssi_base_val = -91,
+	.set_txpower = atusb_set_txpower,
+	.set_channel = atusb_set_channel,
+};
+
+struct atusb_chip_data hulusb_chip_data = {
+	.t_channel_switch = 11,
+	.rssi_base_val = -100,
+	.set_txpower = hulusb_set_txpower,
+	.set_channel = hulusb_set_channel,
+};
+
 static const struct ieee802154_ops atusb_ops = {
 	.owner			= THIS_MODULE,
 	.xmit_async		= atusb_xmit,
@@ -601,7 +789,8 @@ static const struct ieee802154_ops atusb_ops = {
 	.start			= atusb_start,
 	.stop			= atusb_stop,
 	.set_hw_addr_filt	= atusb_set_hw_addr_filt,
-	.set_txpower		= atusb_set_txpower,
+	.set_txpower		= atusb_txpower,
+	.set_lbt		= hulusb_set_lbt,
 	.set_cca_mode		= atusb_set_cca_mode,
 	.set_cca_ed_level	= atusb_set_cca_ed_level,
 	.set_csma_params	= atusb_set_csma_params,
@@ -614,6 +803,7 @@ static const struct ieee802154_ops atusb_ops = {
 static int atusb_get_and_show_revision(struct atusb *atusb)
 {
 	struct usb_device *usb_dev = atusb->usb_dev;
+	char *hw_name;
 	unsigned char *buffer;
 	int ret;
 
@@ -630,9 +820,31 @@ static int atusb_get_and_show_revision(struct atusb *atusb)
 		atusb->fw_ver_min = buffer[1];
 		atusb->fw_hw_type = buffer[2];
 
+		switch (atusb->fw_hw_type) {
+		case ATUSB_HW_TYPE_100813:
+		case ATUSB_HW_TYPE_101216:
+		case ATUSB_HW_TYPE_110131:
+			hw_name = "ATUSB";
+			atusb->data = &atusb_chip_data;
+			break;
+		case ATUSB_HW_TYPE_RZUSB:
+			hw_name = "RZUSB";
+			atusb->data = &atusb_chip_data;
+			break;
+		case ATUSB_HW_TYPE_HULUSB:
+			hw_name = "HULUSB";
+			atusb->data = &hulusb_chip_data;
+			break;
+		default:
+			hw_name = "UNKNOWN";
+			atusb->err = -ENOTSUPP;
+			ret = -ENOTSUPP;
+			break;
+		}
+
 		dev_info(&usb_dev->dev,
-			 "Firmware: major: %u, minor: %u, hardware type: %u\n",
-			 atusb->fw_ver_maj, atusb->fw_ver_min, atusb->fw_hw_type);
+			 "Firmware: major: %u, minor: %u, hardware type: %s (%d)\n",
+			 atusb->fw_ver_maj, atusb->fw_ver_min, hw_name, atusb->fw_hw_type);
 	}
 	if (atusb->fw_ver_maj == 0 && atusb->fw_ver_min < 2) {
 		dev_info(&usb_dev->dev,
@@ -667,11 +879,12 @@ static int atusb_get_and_show_build(struct atusb *atusb)
 	return ret;
 }
 
-static int atusb_get_and_show_chip(struct atusb *atusb)
+static int atusb_get_and_conf_chip(struct atusb *atusb)
 {
 	struct usb_device *usb_dev = atusb->usb_dev;
 	uint8_t man_id_0, man_id_1, part_num, version_num;
 	const char *chip;
+	struct ieee802154_hw *hw = atusb->hw;
 
 	man_id_0 = atusb_read_reg(atusb, RG_MAN_ID_0);
 	man_id_1 = atusb_read_reg(atusb, RG_MAN_ID_1);
@@ -681,6 +894,22 @@ static int atusb_get_and_show_chip(struct atusb *atusb)
 	if (atusb->err)
 		return atusb->err;
 
+	hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AFILT |
+		    IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_CSMA_PARAMS;
+
+	hw->phy->flags = WPAN_PHY_FLAG_TXPOWER | WPAN_PHY_FLAG_CCA_ED_LEVEL |
+			 WPAN_PHY_FLAG_CCA_MODE;
+
+	hw->phy->supported.cca_modes = BIT(NL802154_CCA_ENERGY) |
+				       BIT(NL802154_CCA_CARRIER) |
+				       BIT(NL802154_CCA_ENERGY_CARRIER);
+	hw->phy->supported.cca_opts = BIT(NL802154_CCA_OPT_ENERGY_CARRIER_AND) |
+				      BIT(NL802154_CCA_OPT_ENERGY_CARRIER_OR);
+
+	hw->phy->cca.mode = NL802154_CCA_ENERGY;
+
+	hw->phy->current_page = 0;
+
 	if ((man_id_1 << 8 | man_id_0) != ATUSB_JEDEC_ATMEL) {
 		dev_err(&usb_dev->dev,
 			"non-Atmel transceiver xxxx%02x%02x\n",
@@ -691,9 +920,36 @@ static int atusb_get_and_show_chip(struct atusb *atusb)
 	switch (part_num) {
 	case 2:
 		chip = "AT86RF230";
+		atusb->hw->phy->supported.channels[0] = 0x7FFF800;
+		atusb->hw->phy->current_channel = 11;	/* reset default */
+		atusb->hw->phy->symbol_duration = 16;
+		atusb->hw->phy->supported.tx_powers = atusb_powers;
+		atusb->hw->phy->supported.tx_powers_size = ARRAY_SIZE(atusb_powers);
+		hw->phy->supported.cca_ed_levels = atusb_ed_levels;
+		hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(atusb_ed_levels);
 		break;
 	case 3:
 		chip = "AT86RF231";
+		atusb->hw->phy->supported.channels[0] = 0x7FFF800;
+		atusb->hw->phy->current_channel = 11;	/* reset default */
+		atusb->hw->phy->symbol_duration = 16;
+		atusb->hw->phy->supported.tx_powers = atusb_powers;
+		atusb->hw->phy->supported.tx_powers_size = ARRAY_SIZE(atusb_powers);
+		hw->phy->supported.cca_ed_levels = atusb_ed_levels;
+		hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(atusb_ed_levels);
+		break;
+	case 7:
+		chip = "AT86RF212";
+		atusb->hw->flags |= IEEE802154_HW_LBT;
+		atusb->hw->phy->supported.channels[0] = 0x00007FF;
+		atusb->hw->phy->supported.channels[2] = 0x00007FF;
+		atusb->hw->phy->current_channel = 5;
+		atusb->hw->phy->symbol_duration = 25;
+		atusb->hw->phy->supported.lbt = NL802154_SUPPORTED_BOOL_BOTH;
+		atusb->hw->phy->supported.tx_powers = at86rf212_powers;
+		atusb->hw->phy->supported.tx_powers_size = ARRAY_SIZE(at86rf212_powers);
+		atusb->hw->phy->supported.cca_ed_levels = at86rf212_ed_levels_100;
+		atusb->hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(at86rf212_ed_levels_100);
 		break;
 	default:
 		dev_err(&usb_dev->dev,
@@ -702,6 +958,9 @@ static int atusb_get_and_show_chip(struct atusb *atusb)
 		goto fail;
 	}
 
+	hw->phy->transmit_power = hw->phy->supported.tx_powers[0];
+	hw->phy->cca_ed_level = hw->phy->supported.cca_ed_levels[7];
+
 	dev_info(&usb_dev->dev, "ATUSB: %s version %d\n", chip, version_num);
 
 	return 0;
@@ -794,32 +1053,9 @@ static int atusb_probe(struct usb_interface *interface,
 		goto fail;
 
 	hw->parent = &usb_dev->dev;
-	hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AFILT |
-		    IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_CSMA_PARAMS;
-
-	hw->phy->flags = WPAN_PHY_FLAG_TXPOWER | WPAN_PHY_FLAG_CCA_ED_LEVEL |
-			 WPAN_PHY_FLAG_CCA_MODE;
-
-	hw->phy->supported.cca_modes = BIT(NL802154_CCA_ENERGY) |
-		BIT(NL802154_CCA_CARRIER) | BIT(NL802154_CCA_ENERGY_CARRIER);
-	hw->phy->supported.cca_opts = BIT(NL802154_CCA_OPT_ENERGY_CARRIER_AND) |
-		BIT(NL802154_CCA_OPT_ENERGY_CARRIER_OR);
-
-	hw->phy->supported.cca_ed_levels = atusb_ed_levels;
-	hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(atusb_ed_levels);
-
-	hw->phy->cca.mode = NL802154_CCA_ENERGY;
-
-	hw->phy->current_page = 0;
-	hw->phy->current_channel = 11;	/* reset default */
-	hw->phy->supported.channels[0] = 0x7FFF800;
-	hw->phy->supported.tx_powers = atusb_powers;
-	hw->phy->supported.tx_powers_size = ARRAY_SIZE(atusb_powers);
-	hw->phy->transmit_power = hw->phy->supported.tx_powers[0];
-	hw->phy->cca_ed_level = hw->phy->supported.cca_ed_levels[7];
 
 	atusb_command(atusb, ATUSB_RF_RESET, 0);
-	atusb_get_and_show_chip(atusb);
+	atusb_get_and_conf_chip(atusb);
 	atusb_get_and_show_revision(atusb);
 	atusb_get_and_show_build(atusb);
 	atusb_set_extended_addr(atusb);
@@ -941,5 +1177,6 @@ MODULE_AUTHOR("Alexander Aring <alex.aring@gmail.com>");
 MODULE_AUTHOR("Richard Sharpe <realrichardsharpe@gmail.com>");
 MODULE_AUTHOR("Stefan Schmidt <stefan@datenfreihafen.org>");
 MODULE_AUTHOR("Werner Almesberger <werner@almesberger.net>");
+MODULE_AUTHOR("Josef Filzmaier <j.filzmaier@gmx.at>");
 MODULE_DESCRIPTION("ATUSB IEEE 802.15.4 Driver");
 MODULE_LICENSE("GPL");
diff --git a/drivers/net/ieee802154/atusb.h b/drivers/net/ieee802154/atusb.h
index b22bbaa77590..555d14bf14a3 100644
--- a/drivers/net/ieee802154/atusb.h
+++ b/drivers/net/ieee802154/atusb.h
@@ -50,6 +50,14 @@ enum atusb_requests {
 	ATUSB_EUI64_READ,
 };
 
+enum {
+	ATUSB_HW_TYPE_100813,	/* 2010-08-13 */
+	ATUSB_HW_TYPE_101216,	/* 2010-12-16 */
+	ATUSB_HW_TYPE_110131,	/* 2011-01-31, ATmega32U2-based */
+	ATUSB_HW_TYPE_RZUSB,	/* Atmel Raven USB dongle with at86rf230 */
+	ATUSB_HW_TYPE_HULUSB,	/* Busware HUL USB dongle with at86rf212 */
+};
+
 /*
  * Direction	bRequest		wValue		wIndex	wLength
  *
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 8870bd2a2e8a..0008da7e9d4c 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -231,6 +231,9 @@ static void ifb_setup(struct net_device *dev)
 	eth_hw_addr_random(dev);
 	dev->needs_free_netdev = true;
 	dev->priv_destructor = ifb_dev_free;
+
+	dev->min_mtu = 0;
+	dev->max_mtu = 0;
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index c74893c1e620..57c3856bab05 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -584,7 +584,7 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev,
 	if (err < 0)
 		goto remove_ida;
 
-	err = netdev_upper_dev_link(phy_dev, dev);
+	err = netdev_upper_dev_link(phy_dev, dev, extack);
 	if (err) {
 		goto unregister_netdev;
 	}
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 98e4deaa3a6a..ccbe4eaffe4d 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -3244,7 +3244,7 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 				       &macsec_netdev_addr_lock_key,
 				       macsec_get_nest_level(dev));
 
-	err = netdev_upper_dev_link(real_dev, dev);
+	err = netdev_upper_dev_link(real_dev, dev, extack);
 	if (err < 0)
 		goto unregister;
 
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d2aea961e0f4..858bd66511a2 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1231,11 +1231,14 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
 			return -EADDRNOTAVAIL;
 	}
 
-	if (data && data[IFLA_MACVLAN_FLAGS] &&
+	if (!data)
+		return 0;
+
+	if (data[IFLA_MACVLAN_FLAGS] &&
 	    nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC)
 		return -EINVAL;
 
-	if (data && data[IFLA_MACVLAN_MODE]) {
+	if (data[IFLA_MACVLAN_MODE]) {
 		switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) {
 		case MACVLAN_MODE_PRIVATE:
 		case MACVLAN_MODE_VEPA:
@@ -1248,7 +1251,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
-	if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
+	if (data[IFLA_MACVLAN_MACADDR_MODE]) {
 		switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) {
 		case MACVLAN_MACADDR_ADD:
 		case MACVLAN_MACADDR_DEL:
@@ -1260,7 +1263,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
-	if (data && data[IFLA_MACVLAN_MACADDR]) {
+	if (data[IFLA_MACVLAN_MACADDR]) {
 		if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN)
 			return -EINVAL;
 
@@ -1268,7 +1271,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
 			return -EADDRNOTAVAIL;
 	}
 
-	if (data && data[IFLA_MACVLAN_MACADDR_COUNT])
+	if (data[IFLA_MACVLAN_MACADDR_COUNT])
 		return -EINVAL;
 
 	return 0;
@@ -1341,7 +1344,8 @@ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode,
 }
 
 int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
-			   struct nlattr *tb[], struct nlattr *data[])
+			   struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvlan_port *port;
@@ -1430,7 +1434,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 		goto destroy_macvlan_port;
 
 	dev->priv_flags |= IFF_MACVLAN;
-	err = netdev_upper_dev_link(lowerdev, dev);
+	err = netdev_upper_dev_link(lowerdev, dev, extack);
 	if (err)
 		goto unregister_netdev;
 
@@ -1453,7 +1457,7 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
 			   struct nlattr *tb[], struct nlattr *data[],
 			   struct netlink_ext_ack *extack)
 {
-	return macvlan_common_newlink(src_net, dev, tb, data);
+	return macvlan_common_newlink(src_net, dev, tb, data, extack);
 }
 
 void macvlan_dellink(struct net_device *dev, struct list_head *head)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index c2d0ea2fb019..f62aea2fcfa9 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -105,7 +105,7 @@ static int macvtap_newlink(struct net *src_net, struct net_device *dev,
 	/* Don't put anything that may fail after macvlan_common_newlink
 	 * because we can't undo what it does.
 	 */
-	err = macvlan_common_newlink(src_net, dev, tb, data);
+	err = macvlan_common_newlink(src_net, dev, tb, data, extack);
 	if (err) {
 		netdev_rx_handler_unregister(dev);
 		return err;
diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index 9cbe645e3d89..d4670ecdb1ba 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -22,17 +22,29 @@
 #define RTL821x_INER		0x12
 #define RTL821x_INER_INIT	0x6400
 #define RTL821x_INSR		0x13
+#define RTL821x_PAGE_SELECT	0x1f
 #define RTL8211E_INER_LINK_STATUS 0x400
 
 #define RTL8211F_INER_LINK_STATUS 0x0010
 #define RTL8211F_INSR		0x1d
-#define RTL8211F_PAGE_SELECT	0x1f
 #define RTL8211F_TX_DELAY	0x100
 
+#define RTL8201F_ISR		0x1e
+#define RTL8201F_IER		0x13
+
 MODULE_DESCRIPTION("Realtek PHY driver");
 MODULE_AUTHOR("Johnson Leung");
 MODULE_LICENSE("GPL");
 
+static int rtl8201_ack_interrupt(struct phy_device *phydev)
+{
+	int err;
+
+	err = phy_read(phydev, RTL8201F_ISR);
+
+	return (err < 0) ? err : 0;
+}
+
 static int rtl821x_ack_interrupt(struct phy_device *phydev)
 {
 	int err;
@@ -46,14 +58,33 @@ static int rtl8211f_ack_interrupt(struct phy_device *phydev)
 {
 	int err;
 
-	phy_write(phydev, RTL8211F_PAGE_SELECT, 0xa43);
+	phy_write(phydev, RTL821x_PAGE_SELECT, 0xa43);
 	err = phy_read(phydev, RTL8211F_INSR);
 	/* restore to default page 0 */
-	phy_write(phydev, RTL8211F_PAGE_SELECT, 0x0);
+	phy_write(phydev, RTL821x_PAGE_SELECT, 0x0);
 
 	return (err < 0) ? err : 0;
 }
 
+static int rtl8201_config_intr(struct phy_device *phydev)
+{
+	int err;
+
+	/* switch to page 7 */
+	phy_write(phydev, RTL821x_PAGE_SELECT, 0x7);
+
+	if (phydev->interrupts == PHY_INTERRUPT_ENABLED)
+		err = phy_write(phydev, RTL8201F_IER,
+				BIT(13) | BIT(12) | BIT(11));
+	else
+		err = phy_write(phydev, RTL8201F_IER, 0);
+
+	/* restore to default page 0 */
+	phy_write(phydev, RTL821x_PAGE_SELECT, 0x0);
+
+	return err;
+}
+
 static int rtl8211b_config_intr(struct phy_device *phydev)
 {
 	int err;
@@ -102,7 +133,7 @@ static int rtl8211f_config_init(struct phy_device *phydev)
 	if (ret < 0)
 		return ret;
 
-	phy_write(phydev, RTL8211F_PAGE_SELECT, 0xd08);
+	phy_write(phydev, RTL821x_PAGE_SELECT, 0xd08);
 	reg = phy_read(phydev, 0x11);
 
 	/* enable TX-delay for rgmii-id and rgmii-txid, otherwise disable it */
@@ -114,7 +145,7 @@ static int rtl8211f_config_init(struct phy_device *phydev)
 
 	phy_write(phydev, 0x11, reg);
 	/* restore to default page 0 */
-	phy_write(phydev, RTL8211F_PAGE_SELECT, 0x0);
+	phy_write(phydev, RTL821x_PAGE_SELECT, 0x0);
 
 	return 0;
 }
@@ -129,6 +160,18 @@ static struct phy_driver realtek_drvs[] = {
 		.config_aneg    = &genphy_config_aneg,
 		.read_status    = &genphy_read_status,
 	}, {
+		.phy_id		= 0x001cc816,
+		.name		= "RTL8201F 10/100Mbps Ethernet",
+		.phy_id_mask	= 0x001fffff,
+		.features	= PHY_BASIC_FEATURES,
+		.flags		= PHY_HAS_INTERRUPT,
+		.config_aneg	= &genphy_config_aneg,
+		.read_status	= &genphy_read_status,
+		.ack_interrupt	= &rtl8201_ack_interrupt,
+		.config_intr	= &rtl8201_config_intr,
+		.suspend	= genphy_suspend,
+		.resume		= genphy_resume,
+	}, {
 		.phy_id		= 0x001cc912,
 		.name		= "RTL8211B Gigabit Ethernet",
 		.phy_id_mask	= 0x001fffff,
@@ -181,6 +224,7 @@ static struct phy_driver realtek_drvs[] = {
 module_phy_driver(realtek_drvs);
 
 static struct mdio_device_id __maybe_unused realtek_tbl[] = {
+	{ 0x001cc816, 0x001fffff },
 	{ 0x001cc912, 0x001fffff },
 	{ 0x001cc914, 0x001fffff },
 	{ 0x001cc915, 0x001fffff },
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 436dd78c396a..eb8a18991d8c 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -763,12 +763,8 @@ static struct slip *sl_alloc(dev_t line)
 	sl->mode        = SL_MODE_DEFAULT;
 #ifdef CONFIG_SLIP_SMART
 	/* initialize timer_list struct */
-	init_timer(&sl->keepalive_timer);
-	sl->keepalive_timer.data = (unsigned long)sl;
-	sl->keepalive_timer.function = sl_keepalive;
-	init_timer(&sl->outfill_timer);
-	sl->outfill_timer.data = (unsigned long)sl;
-	sl->outfill_timer.function = sl_outfill;
+	setup_timer(&sl->keepalive_timer, sl_keepalive, (unsigned long)sl);
+	setup_timer(&sl->outfill_timer, sl_outfill, (unsigned long)sl);
 #endif
 	slip_devs[i] = dev;
 	return sl;
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index ae53e899259f..a468439969df 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1112,7 +1112,7 @@ static int team_upper_dev_link(struct team *team, struct team_port *port)
 
 	lag_upper_info.tx_type = team->mode->lag_tx_type;
 	err = netdev_master_upper_dev_link(port->dev, team->dev, NULL,
-					   &lag_upper_info);
+					   &lag_upper_info, NULL);
 	if (err)
 		return err;
 	port->dev->priv_flags |= IFF_TEAM_PORT;
@@ -1914,7 +1914,8 @@ static int team_netpoll_setup(struct net_device *dev,
 }
 #endif
 
-static int team_add_slave(struct net_device *dev, struct net_device *port_dev)
+static int team_add_slave(struct net_device *dev, struct net_device *port_dev,
+			  struct netlink_ext_ack *extack)
 {
 	struct team *team = netdev_priv(dev);
 	int err;
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 1468ddf424cc..a5ef97010eb3 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -137,7 +137,13 @@ static struct team_port *lb_htpm_select_tx_port(struct team *team,
 						struct sk_buff *skb,
 						unsigned char hash)
 {
-	return rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash));
+	struct team_port *port;
+
+	port = rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash));
+	if (likely(port))
+		return port;
+	/* If no valid port in the table, fall back to simple hash */
+	return lb_hash_select_tx_port(team, lb_priv, skb, hash);
 }
 
 struct lb_select_tx_port {
diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c
new file mode 100644
index 000000000000..1a7bc0bf4598
--- /dev/null
+++ b/drivers/net/thunderbolt.c
@@ -0,0 +1,1362 @@
+/*
+ * Networking over Thunderbolt cable using Apple ThunderboltIP protocol
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Amir Levy <amir.jer.levy@intel.com>
+ *          Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/highmem.h>
+#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/sizes.h>
+#include <linux/thunderbolt.h>
+#include <linux/uuid.h>
+#include <linux/workqueue.h>
+
+#include <net/ip6_checksum.h>
+
+/* Protocol timeouts in ms */
+#define TBNET_LOGIN_DELAY	4500
+#define TBNET_LOGIN_TIMEOUT	500
+#define TBNET_LOGOUT_TIMEOUT	100
+
+#define TBNET_RING_SIZE		256
+#define TBNET_LOCAL_PATH	0xf
+#define TBNET_LOGIN_RETRIES	60
+#define TBNET_LOGOUT_RETRIES	5
+#define TBNET_MATCH_FRAGS_ID	BIT(1)
+#define TBNET_MAX_MTU		SZ_64K
+#define TBNET_FRAME_SIZE	SZ_4K
+#define TBNET_MAX_PAYLOAD_SIZE	\
+	(TBNET_FRAME_SIZE - sizeof(struct thunderbolt_ip_frame_header))
+/* Rx packets need to hold space for skb_shared_info */
+#define TBNET_RX_MAX_SIZE	\
+	(TBNET_FRAME_SIZE + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+#define TBNET_RX_PAGE_ORDER	get_order(TBNET_RX_MAX_SIZE)
+#define TBNET_RX_PAGE_SIZE	(PAGE_SIZE << TBNET_RX_PAGE_ORDER)
+
+#define TBNET_L0_PORT_NUM(route) ((route) & GENMASK(5, 0))
+
+/**
+ * struct thunderbolt_ip_frame_header - Header for each Thunderbolt frame
+ * @frame_size: size of the data with the frame
+ * @frame_index: running index on the frames
+ * @frame_id: ID of the frame to match frames to specific packet
+ * @frame_count: how many frames assembles a full packet
+ *
+ * Each data frame passed to the high-speed DMA ring has this header. If
+ * the XDomain network directory announces that %TBNET_MATCH_FRAGS_ID is
+ * supported then @frame_id is filled, otherwise it stays %0.
+ */
+struct thunderbolt_ip_frame_header {
+	u32 frame_size;
+	u16 frame_index;
+	u16 frame_id;
+	u32 frame_count;
+};
+
+enum thunderbolt_ip_frame_pdf {
+	TBIP_PDF_FRAME_START = 1,
+	TBIP_PDF_FRAME_END,
+};
+
+enum thunderbolt_ip_type {
+	TBIP_LOGIN,
+	TBIP_LOGIN_RESPONSE,
+	TBIP_LOGOUT,
+	TBIP_STATUS,
+};
+
+struct thunderbolt_ip_header {
+	u32 route_hi;
+	u32 route_lo;
+	u32 length_sn;
+	uuid_t uuid;
+	uuid_t initiator_uuid;
+	uuid_t target_uuid;
+	u32 type;
+	u32 command_id;
+};
+
+#define TBIP_HDR_LENGTH_MASK		GENMASK(5, 0)
+#define TBIP_HDR_SN_MASK		GENMASK(28, 27)
+#define TBIP_HDR_SN_SHIFT		27
+
+struct thunderbolt_ip_login {
+	struct thunderbolt_ip_header hdr;
+	u32 proto_version;
+	u32 transmit_path;
+	u32 reserved[4];
+};
+
+#define TBIP_LOGIN_PROTO_VERSION	1
+
+struct thunderbolt_ip_login_response {
+	struct thunderbolt_ip_header hdr;
+	u32 status;
+	u32 receiver_mac[2];
+	u32 receiver_mac_len;
+	u32 reserved[4];
+};
+
+struct thunderbolt_ip_logout {
+	struct thunderbolt_ip_header hdr;
+};
+
+struct thunderbolt_ip_status {
+	struct thunderbolt_ip_header hdr;
+	u32 status;
+};
+
+struct tbnet_stats {
+	u64 tx_packets;
+	u64 rx_packets;
+	u64 tx_bytes;
+	u64 rx_bytes;
+	u64 rx_errors;
+	u64 tx_errors;
+	u64 rx_length_errors;
+	u64 rx_over_errors;
+	u64 rx_crc_errors;
+	u64 rx_missed_errors;
+};
+
+struct tbnet_frame {
+	struct net_device *dev;
+	struct page *page;
+	struct ring_frame frame;
+};
+
+struct tbnet_ring {
+	struct tbnet_frame frames[TBNET_RING_SIZE];
+	unsigned int cons;
+	unsigned int prod;
+	struct tb_ring *ring;
+};
+
+/**
+ * struct tbnet - ThunderboltIP network driver private data
+ * @svc: XDomain service the driver is bound to
+ * @xd: XDomain the service blongs to
+ * @handler: ThunderboltIP configuration protocol handler
+ * @dev: Networking device
+ * @napi: NAPI structure for Rx polling
+ * @stats: Network statistics
+ * @skb: Network packet that is currently processed on Rx path
+ * @command_id: ID used for next configuration protocol packet
+ * @login_sent: ThunderboltIP login message successfully sent
+ * @login_received: ThunderboltIP login message received from the remote
+ *		    host
+ * @transmit_path: HopID the other end needs to use building the
+ *		   opposite side path.
+ * @connection_lock: Lock serializing access to @login_sent,
+ *		     @login_received and @transmit_path.
+ * @login_retries: Number of login retries currently done
+ * @login_work: Worker to send ThunderboltIP login packets
+ * @connected_work: Worker that finalizes the ThunderboltIP connection
+ *		    setup and enables DMA paths for high speed data
+ *		    transfers
+ * @rx_hdr: Copy of the currently processed Rx frame. Used when a
+ *	    network packet consists of multiple Thunderbolt frames.
+ *	    In host byte order.
+ * @rx_ring: Software ring holding Rx frames
+ * @frame_id: Frame ID use for next Tx packet
+ *            (if %TBNET_MATCH_FRAGS_ID is supported in both ends)
+ * @tx_ring: Software ring holding Tx frames
+ */
+struct tbnet {
+	const struct tb_service *svc;
+	struct tb_xdomain *xd;
+	struct tb_protocol_handler handler;
+	struct net_device *dev;
+	struct napi_struct napi;
+	struct tbnet_stats stats;
+	struct sk_buff *skb;
+	atomic_t command_id;
+	bool login_sent;
+	bool login_received;
+	u32 transmit_path;
+	struct mutex connection_lock;
+	int login_retries;
+	struct delayed_work login_work;
+	struct work_struct connected_work;
+	struct thunderbolt_ip_frame_header rx_hdr;
+	struct tbnet_ring rx_ring;
+	atomic_t frame_id;
+	struct tbnet_ring tx_ring;
+};
+
+/* Network property directory UUID: c66189ca-1cce-4195-bdb8-49592e5f5a4f */
+static const uuid_t tbnet_dir_uuid =
+	UUID_INIT(0xc66189ca, 0x1cce, 0x4195,
+		  0xbd, 0xb8, 0x49, 0x59, 0x2e, 0x5f, 0x5a, 0x4f);
+
+/* ThunderboltIP protocol UUID: 798f589e-3616-8a47-97c6-5664a920c8dd */
+static const uuid_t tbnet_svc_uuid =
+	UUID_INIT(0x798f589e, 0x3616, 0x8a47,
+		  0x97, 0xc6, 0x56, 0x64, 0xa9, 0x20, 0xc8, 0xdd);
+
+static struct tb_property_dir *tbnet_dir;
+
+static void tbnet_fill_header(struct thunderbolt_ip_header *hdr, u64 route,
+	u8 sequence, const uuid_t *initiator_uuid, const uuid_t *target_uuid,
+	enum thunderbolt_ip_type type, size_t size, u32 command_id)
+{
+	u32 length_sn;
+
+	/* Length does not include route_hi/lo and length_sn fields */
+	length_sn = (size - 3 * 4) / 4;
+	length_sn |= (sequence << TBIP_HDR_SN_SHIFT) & TBIP_HDR_SN_MASK;
+
+	hdr->route_hi = upper_32_bits(route);
+	hdr->route_lo = lower_32_bits(route);
+	hdr->length_sn = length_sn;
+	uuid_copy(&hdr->uuid, &tbnet_svc_uuid);
+	uuid_copy(&hdr->initiator_uuid, initiator_uuid);
+	uuid_copy(&hdr->target_uuid, target_uuid);
+	hdr->type = type;
+	hdr->command_id = command_id;
+}
+
+static int tbnet_login_response(struct tbnet *net, u64 route, u8 sequence,
+				u32 command_id)
+{
+	struct thunderbolt_ip_login_response reply;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&reply, 0, sizeof(reply));
+	tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGIN_RESPONSE, sizeof(reply),
+			  command_id);
+	memcpy(reply.receiver_mac, net->dev->dev_addr, ETH_ALEN);
+	reply.receiver_mac_len = ETH_ALEN;
+
+	return tb_xdomain_response(xd, &reply, sizeof(reply),
+				   TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+static int tbnet_login_request(struct tbnet *net, u8 sequence)
+{
+	struct thunderbolt_ip_login_response reply;
+	struct thunderbolt_ip_login request;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&request, 0, sizeof(request));
+	tbnet_fill_header(&request.hdr, xd->route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGIN, sizeof(request),
+			  atomic_inc_return(&net->command_id));
+
+	request.proto_version = TBIP_LOGIN_PROTO_VERSION;
+	request.transmit_path = TBNET_LOCAL_PATH;
+
+	return tb_xdomain_request(xd, &request, sizeof(request),
+				  TB_CFG_PKG_XDOMAIN_RESP, &reply,
+				  sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP,
+				  TBNET_LOGIN_TIMEOUT);
+}
+
+static int tbnet_logout_response(struct tbnet *net, u64 route, u8 sequence,
+				 u32 command_id)
+{
+	struct thunderbolt_ip_status reply;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&reply, 0, sizeof(reply));
+	tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid,
+			  xd->remote_uuid, TBIP_STATUS, sizeof(reply),
+			  atomic_inc_return(&net->command_id));
+	return tb_xdomain_response(xd, &reply, sizeof(reply),
+				   TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+static int tbnet_logout_request(struct tbnet *net)
+{
+	struct thunderbolt_ip_logout request;
+	struct thunderbolt_ip_status reply;
+	struct tb_xdomain *xd = net->xd;
+
+	memset(&request, 0, sizeof(request));
+	tbnet_fill_header(&request.hdr, xd->route, 0, xd->local_uuid,
+			  xd->remote_uuid, TBIP_LOGOUT, sizeof(request),
+			  atomic_inc_return(&net->command_id));
+
+	return tb_xdomain_request(xd, &request, sizeof(request),
+				  TB_CFG_PKG_XDOMAIN_RESP, &reply,
+				  sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP,
+				  TBNET_LOGOUT_TIMEOUT);
+}
+
+static void start_login(struct tbnet *net)
+{
+	mutex_lock(&net->connection_lock);
+	net->login_sent = false;
+	net->login_received = false;
+	mutex_unlock(&net->connection_lock);
+
+	queue_delayed_work(system_long_wq, &net->login_work,
+			   msecs_to_jiffies(1000));
+}
+
+static void stop_login(struct tbnet *net)
+{
+	cancel_delayed_work_sync(&net->login_work);
+	cancel_work_sync(&net->connected_work);
+}
+
+static inline unsigned int tbnet_frame_size(const struct tbnet_frame *tf)
+{
+	return tf->frame.size ? : TBNET_FRAME_SIZE;
+}
+
+static void tbnet_free_buffers(struct tbnet_ring *ring)
+{
+	unsigned int i;
+
+	for (i = 0; i < TBNET_RING_SIZE; i++) {
+		struct device *dma_dev = tb_ring_dma_device(ring->ring);
+		struct tbnet_frame *tf = &ring->frames[i];
+		enum dma_data_direction dir;
+		unsigned int order;
+		size_t size;
+
+		if (!tf->page)
+			continue;
+
+		if (ring->ring->is_tx) {
+			dir = DMA_TO_DEVICE;
+			order = 0;
+			size = tbnet_frame_size(tf);
+		} else {
+			dir = DMA_FROM_DEVICE;
+			order = TBNET_RX_PAGE_ORDER;
+			size = TBNET_RX_PAGE_SIZE;
+		}
+
+		if (tf->frame.buffer_phy)
+			dma_unmap_page(dma_dev, tf->frame.buffer_phy, size,
+				       dir);
+
+		__free_pages(tf->page, order);
+		tf->page = NULL;
+	}
+
+	ring->cons = 0;
+	ring->prod = 0;
+}
+
+static void tbnet_tear_down(struct tbnet *net, bool send_logout)
+{
+	netif_carrier_off(net->dev);
+	netif_stop_queue(net->dev);
+
+	stop_login(net);
+
+	mutex_lock(&net->connection_lock);
+
+	if (net->login_sent && net->login_received) {
+		int retries = TBNET_LOGOUT_RETRIES;
+
+		while (send_logout && retries-- > 0) {
+			int ret = tbnet_logout_request(net);
+			if (ret != -ETIMEDOUT)
+				break;
+		}
+
+		tb_ring_stop(net->rx_ring.ring);
+		tb_ring_stop(net->tx_ring.ring);
+		tbnet_free_buffers(&net->rx_ring);
+		tbnet_free_buffers(&net->tx_ring);
+
+		if (tb_xdomain_disable_paths(net->xd))
+			netdev_warn(net->dev, "failed to disable DMA paths\n");
+	}
+
+	net->login_retries = 0;
+	net->login_sent = false;
+	net->login_received = false;
+
+	mutex_unlock(&net->connection_lock);
+}
+
+static int tbnet_handle_packet(const void *buf, size_t size, void *data)
+{
+	const struct thunderbolt_ip_login *pkg = buf;
+	struct tbnet *net = data;
+	u32 command_id;
+	int ret = 0;
+	u8 sequence;
+	u64 route;
+
+	/* Make sure the packet is for us */
+	if (size < sizeof(struct thunderbolt_ip_header))
+		return 0;
+	if (!uuid_equal(&pkg->hdr.initiator_uuid, net->xd->remote_uuid))
+		return 0;
+	if (!uuid_equal(&pkg->hdr.target_uuid, net->xd->local_uuid))
+		return 0;
+
+	route = ((u64)pkg->hdr.route_hi << 32) | pkg->hdr.route_lo;
+	route &= ~BIT_ULL(63);
+	if (route != net->xd->route)
+		return 0;
+
+	sequence = pkg->hdr.length_sn & TBIP_HDR_SN_MASK;
+	sequence >>= TBIP_HDR_SN_SHIFT;
+	command_id = pkg->hdr.command_id;
+
+	switch (pkg->hdr.type) {
+	case TBIP_LOGIN:
+		if (!netif_running(net->dev))
+			break;
+
+		ret = tbnet_login_response(net, route, sequence,
+					   pkg->hdr.command_id);
+		if (!ret) {
+			mutex_lock(&net->connection_lock);
+			net->login_received = true;
+			net->transmit_path = pkg->transmit_path;
+
+			/* If we reached the number of max retries or
+			 * previous logout, schedule another round of
+			 * login retries
+			 */
+			if (net->login_retries >= TBNET_LOGIN_RETRIES ||
+			    !net->login_sent) {
+				net->login_retries = 0;
+				queue_delayed_work(system_long_wq,
+						   &net->login_work, 0);
+			}
+			mutex_unlock(&net->connection_lock);
+
+			queue_work(system_long_wq, &net->connected_work);
+		}
+		break;
+
+	case TBIP_LOGOUT:
+		ret = tbnet_logout_response(net, route, sequence, command_id);
+		if (!ret)
+			tbnet_tear_down(net, false);
+		break;
+
+	default:
+		return 0;
+	}
+
+	if (ret)
+		netdev_warn(net->dev, "failed to send ThunderboltIP response\n");
+
+	return 1;
+}
+
+static unsigned int tbnet_available_buffers(const struct tbnet_ring *ring)
+{
+	return ring->prod - ring->cons;
+}
+
+static int tbnet_alloc_rx_buffers(struct tbnet *net, unsigned int nbuffers)
+{
+	struct tbnet_ring *ring = &net->rx_ring;
+	int ret;
+
+	while (nbuffers--) {
+		struct device *dma_dev = tb_ring_dma_device(ring->ring);
+		unsigned int index = ring->prod & (TBNET_RING_SIZE - 1);
+		struct tbnet_frame *tf = &ring->frames[index];
+		dma_addr_t dma_addr;
+
+		if (tf->page)
+			break;
+
+		/* Allocate page (order > 0) so that it can hold maximum
+		 * ThunderboltIP frame (4kB) and the additional room for
+		 * SKB shared info required by build_skb().
+		 */
+		tf->page = dev_alloc_pages(TBNET_RX_PAGE_ORDER);
+		if (!tf->page) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		dma_addr = dma_map_page(dma_dev, tf->page, 0,
+					TBNET_RX_PAGE_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(dma_dev, dma_addr)) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		tf->frame.buffer_phy = dma_addr;
+		tf->dev = net->dev;
+
+		tb_ring_rx(ring->ring, &tf->frame);
+
+		ring->prod++;
+	}
+
+	return 0;
+
+err_free:
+	tbnet_free_buffers(ring);
+	return ret;
+}
+
+static struct tbnet_frame *tbnet_get_tx_buffer(struct tbnet *net)
+{
+	struct tbnet_ring *ring = &net->tx_ring;
+	struct tbnet_frame *tf;
+	unsigned int index;
+
+	if (!tbnet_available_buffers(ring))
+		return NULL;
+
+	index = ring->cons++ & (TBNET_RING_SIZE - 1);
+
+	tf = &ring->frames[index];
+	tf->frame.size = 0;
+	tf->frame.buffer_phy = 0;
+
+	return tf;
+}
+
+static void tbnet_tx_callback(struct tb_ring *ring, struct ring_frame *frame,
+			      bool canceled)
+{
+	struct tbnet_frame *tf = container_of(frame, typeof(*tf), frame);
+	struct device *dma_dev = tb_ring_dma_device(ring);
+	struct tbnet *net = netdev_priv(tf->dev);
+
+	dma_unmap_page(dma_dev, tf->frame.buffer_phy, tbnet_frame_size(tf),
+		       DMA_TO_DEVICE);
+
+	/* Return buffer to the ring */
+	net->tx_ring.prod++;
+
+	if (tbnet_available_buffers(&net->tx_ring) >= TBNET_RING_SIZE / 2)
+		netif_wake_queue(net->dev);
+}
+
+static int tbnet_alloc_tx_buffers(struct tbnet *net)
+{
+	struct tbnet_ring *ring = &net->tx_ring;
+	unsigned int i;
+
+	for (i = 0; i < TBNET_RING_SIZE; i++) {
+		struct tbnet_frame *tf = &ring->frames[i];
+
+		tf->page = alloc_page(GFP_KERNEL);
+		if (!tf->page) {
+			tbnet_free_buffers(ring);
+			return -ENOMEM;
+		}
+
+		tf->dev = net->dev;
+		tf->frame.callback = tbnet_tx_callback;
+		tf->frame.sof = TBIP_PDF_FRAME_START;
+		tf->frame.eof = TBIP_PDF_FRAME_END;
+	}
+
+	ring->cons = 0;
+	ring->prod = TBNET_RING_SIZE - 1;
+
+	return 0;
+}
+
+static void tbnet_connected_work(struct work_struct *work)
+{
+	struct tbnet *net = container_of(work, typeof(*net), connected_work);
+	bool connected;
+	int ret;
+
+	if (netif_carrier_ok(net->dev))
+		return;
+
+	mutex_lock(&net->connection_lock);
+	connected = net->login_sent && net->login_received;
+	mutex_unlock(&net->connection_lock);
+
+	if (!connected)
+		return;
+
+	/* Both logins successful so enable the high-speed DMA paths and
+	 * start the network device queue.
+	 */
+	ret = tb_xdomain_enable_paths(net->xd, TBNET_LOCAL_PATH,
+				      net->rx_ring.ring->hop,
+				      net->transmit_path,
+				      net->tx_ring.ring->hop);
+	if (ret) {
+		netdev_err(net->dev, "failed to enable DMA paths\n");
+		return;
+	}
+
+	tb_ring_start(net->tx_ring.ring);
+	tb_ring_start(net->rx_ring.ring);
+
+	ret = tbnet_alloc_rx_buffers(net, TBNET_RING_SIZE);
+	if (ret)
+		goto err_stop_rings;
+
+	ret = tbnet_alloc_tx_buffers(net);
+	if (ret)
+		goto err_free_rx_buffers;
+
+	netif_carrier_on(net->dev);
+	netif_start_queue(net->dev);
+	return;
+
+err_free_rx_buffers:
+	tbnet_free_buffers(&net->rx_ring);
+err_stop_rings:
+	tb_ring_stop(net->rx_ring.ring);
+	tb_ring_stop(net->tx_ring.ring);
+}
+
+static void tbnet_login_work(struct work_struct *work)
+{
+	struct tbnet *net = container_of(work, typeof(*net), login_work.work);
+	unsigned long delay = msecs_to_jiffies(TBNET_LOGIN_DELAY);
+	int ret;
+
+	if (netif_carrier_ok(net->dev))
+		return;
+
+	ret = tbnet_login_request(net, net->login_retries % 4);
+	if (ret) {
+		if (net->login_retries++ < TBNET_LOGIN_RETRIES) {
+			queue_delayed_work(system_long_wq, &net->login_work,
+					   delay);
+		} else {
+			netdev_info(net->dev, "ThunderboltIP login timed out\n");
+		}
+	} else {
+		net->login_retries = 0;
+
+		mutex_lock(&net->connection_lock);
+		net->login_sent = true;
+		mutex_unlock(&net->connection_lock);
+
+		queue_work(system_long_wq, &net->connected_work);
+	}
+}
+
+static bool tbnet_check_frame(struct tbnet *net, const struct tbnet_frame *tf,
+			      const struct thunderbolt_ip_frame_header *hdr)
+{
+	u32 frame_id, frame_count, frame_size, frame_index;
+	unsigned int size;
+
+	if (tf->frame.flags & RING_DESC_CRC_ERROR) {
+		net->stats.rx_crc_errors++;
+		return false;
+	} else if (tf->frame.flags & RING_DESC_BUFFER_OVERRUN) {
+		net->stats.rx_over_errors++;
+		return false;
+	}
+
+	/* Should be greater than just header i.e. contains data */
+	size = tbnet_frame_size(tf);
+	if (size <= sizeof(*hdr)) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	frame_count = le32_to_cpu(hdr->frame_count);
+	frame_size = le32_to_cpu(hdr->frame_size);
+	frame_index = le16_to_cpu(hdr->frame_index);
+	frame_id = le16_to_cpu(hdr->frame_id);
+
+	if ((frame_size > size - sizeof(*hdr)) || !frame_size) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+
+	/* In case we're in the middle of packet, validate the frame
+	 * header based on first fragment of the packet.
+	 */
+	if (net->skb && net->rx_hdr.frame_count) {
+		/* Check the frame count fits the count field */
+		if (frame_count != net->rx_hdr.frame_count) {
+			net->stats.rx_length_errors++;
+			return false;
+		}
+
+		/* Check the frame identifiers are incremented correctly,
+		 * and id is matching.
+		 */
+		if (frame_index != net->rx_hdr.frame_index + 1 ||
+		    frame_id != net->rx_hdr.frame_id) {
+			net->stats.rx_missed_errors++;
+			return false;
+		}
+
+		if (net->skb->len + frame_size > TBNET_MAX_MTU) {
+			net->stats.rx_length_errors++;
+			return false;
+		}
+
+		return true;
+	}
+
+	/* Start of packet, validate the frame header */
+	if (frame_count == 0 || frame_count > TBNET_RING_SIZE / 4) {
+		net->stats.rx_length_errors++;
+		return false;
+	}
+	if (frame_index != 0) {
+		net->stats.rx_missed_errors++;
+		return false;
+	}
+
+	return true;
+}
+
+static int tbnet_poll(struct napi_struct *napi, int budget)
+{
+	struct tbnet *net = container_of(napi, struct tbnet, napi);
+	unsigned int cleaned_count = tbnet_available_buffers(&net->rx_ring);
+	struct device *dma_dev = tb_ring_dma_device(net->rx_ring.ring);
+	unsigned int rx_packets = 0;
+
+	while (rx_packets < budget) {
+		const struct thunderbolt_ip_frame_header *hdr;
+		unsigned int hdr_size = sizeof(*hdr);
+		struct sk_buff *skb = NULL;
+		struct ring_frame *frame;
+		struct tbnet_frame *tf;
+		struct page *page;
+		bool last = true;
+		u32 frame_size;
+
+		/* Return some buffers to hardware, one at a time is too
+		 * slow so allocate MAX_SKB_FRAGS buffers at the same
+		 * time.
+		 */
+		if (cleaned_count >= MAX_SKB_FRAGS) {
+			tbnet_alloc_rx_buffers(net, cleaned_count);
+			cleaned_count = 0;
+		}
+
+		frame = tb_ring_poll(net->rx_ring.ring);
+		if (!frame)
+			break;
+
+		dma_unmap_page(dma_dev, frame->buffer_phy,
+			       TBNET_RX_PAGE_SIZE, DMA_FROM_DEVICE);
+
+		tf = container_of(frame, typeof(*tf), frame);
+
+		page = tf->page;
+		tf->page = NULL;
+		net->rx_ring.cons++;
+		cleaned_count++;
+
+		hdr = page_address(page);
+		if (!tbnet_check_frame(net, tf, hdr)) {
+			__free_pages(page, TBNET_RX_PAGE_ORDER);
+			dev_kfree_skb_any(net->skb);
+			net->skb = NULL;
+			continue;
+		}
+
+		frame_size = le32_to_cpu(hdr->frame_size);
+
+		skb = net->skb;
+		if (!skb) {
+			skb = build_skb(page_address(page),
+					TBNET_RX_PAGE_SIZE);
+			if (!skb) {
+				__free_pages(page, TBNET_RX_PAGE_ORDER);
+				net->stats.rx_errors++;
+				break;
+			}
+
+			skb_reserve(skb, hdr_size);
+			skb_put(skb, frame_size);
+
+			net->skb = skb;
+		} else {
+			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+					page, hdr_size, frame_size,
+					TBNET_RX_PAGE_SIZE - hdr_size);
+		}
+
+		net->rx_hdr.frame_size = frame_size;
+		net->rx_hdr.frame_count = le32_to_cpu(hdr->frame_count);
+		net->rx_hdr.frame_index = le16_to_cpu(hdr->frame_index);
+		net->rx_hdr.frame_id = le16_to_cpu(hdr->frame_id);
+		last = net->rx_hdr.frame_index == net->rx_hdr.frame_count - 1;
+
+		rx_packets++;
+		net->stats.rx_bytes += frame_size;
+
+		if (last) {
+			skb->protocol = eth_type_trans(skb, net->dev);
+			napi_gro_receive(&net->napi, skb);
+			net->skb = NULL;
+		}
+	}
+
+	net->stats.rx_packets += rx_packets;
+
+	if (cleaned_count)
+		tbnet_alloc_rx_buffers(net, cleaned_count);
+
+	if (rx_packets >= budget)
+		return budget;
+
+	napi_complete_done(napi, rx_packets);
+	/* Re-enable the ring interrupt */
+	tb_ring_poll_complete(net->rx_ring.ring);
+
+	return rx_packets;
+}
+
+static void tbnet_start_poll(void *data)
+{
+	struct tbnet *net = data;
+
+	napi_schedule(&net->napi);
+}
+
+static int tbnet_open(struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+	struct tb_xdomain *xd = net->xd;
+	u16 sof_mask, eof_mask;
+	struct tb_ring *ring;
+
+	netif_carrier_off(dev);
+
+	ring = tb_ring_alloc_tx(xd->tb->nhi, -1, TBNET_RING_SIZE,
+				RING_FLAG_FRAME);
+	if (!ring) {
+		netdev_err(dev, "failed to allocate Tx ring\n");
+		return -ENOMEM;
+	}
+	net->tx_ring.ring = ring;
+
+	sof_mask = BIT(TBIP_PDF_FRAME_START);
+	eof_mask = BIT(TBIP_PDF_FRAME_END);
+
+	ring = tb_ring_alloc_rx(xd->tb->nhi, -1, TBNET_RING_SIZE,
+				RING_FLAG_FRAME | RING_FLAG_E2E, sof_mask,
+				eof_mask, tbnet_start_poll, net);
+	if (!ring) {
+		netdev_err(dev, "failed to allocate Rx ring\n");
+		tb_ring_free(net->tx_ring.ring);
+		net->tx_ring.ring = NULL;
+		return -ENOMEM;
+	}
+	net->rx_ring.ring = ring;
+
+	napi_enable(&net->napi);
+	start_login(net);
+
+	return 0;
+}
+
+static int tbnet_stop(struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+
+	napi_disable(&net->napi);
+
+	tbnet_tear_down(net, true);
+
+	tb_ring_free(net->rx_ring.ring);
+	net->rx_ring.ring = NULL;
+	tb_ring_free(net->tx_ring.ring);
+	net->tx_ring.ring = NULL;
+
+	return 0;
+}
+
+static bool tbnet_xmit_map(struct device *dma_dev, struct tbnet_frame *tf)
+{
+	dma_addr_t dma_addr;
+
+	dma_addr = dma_map_page(dma_dev, tf->page, 0, tbnet_frame_size(tf),
+				DMA_TO_DEVICE);
+	if (dma_mapping_error(dma_dev, dma_addr))
+		return false;
+
+	tf->frame.buffer_phy = dma_addr;
+	return true;
+}
+
+static bool tbnet_xmit_csum_and_map(struct tbnet *net, struct sk_buff *skb,
+	struct tbnet_frame **frames, u32 frame_count)
+{
+	struct thunderbolt_ip_frame_header *hdr = page_address(frames[0]->page);
+	struct device *dma_dev = tb_ring_dma_device(net->tx_ring.ring);
+	__wsum wsum = htonl(skb->len - skb_transport_offset(skb));
+	unsigned int i, len, offset = skb_transport_offset(skb);
+	__be16 protocol = skb->protocol;
+	void *data = skb->data;
+	void *dest = hdr + 1;
+	__sum16 *tucso;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		/* No need to calculate checksum so we just update the
+		 * total frame count and map the frames for DMA.
+		 */
+		for (i = 0; i < frame_count; i++) {
+			hdr = page_address(frames[i]->page);
+			hdr->frame_count = cpu_to_le32(frame_count);
+			if (!tbnet_xmit_map(dma_dev, frames[i]))
+				goto err_unmap;
+		}
+
+		return true;
+	}
+
+	if (protocol == htons(ETH_P_8021Q)) {
+		struct vlan_hdr *vhdr, vh;
+
+		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(vh), &vh);
+		if (!vhdr)
+			return false;
+
+		protocol = vhdr->h_vlan_encapsulated_proto;
+	}
+
+	/* Data points on the beginning of packet.
+	 * Check is the checksum absolute place in the packet.
+	 * ipcso will update IP checksum.
+	 * tucso will update TCP/UPD checksum.
+	 */
+	if (protocol == htons(ETH_P_IP)) {
+		__sum16 *ipcso = dest + ((void *)&(ip_hdr(skb)->check) - data);
+
+		*ipcso = 0;
+		*ipcso = ip_fast_csum(dest + skb_network_offset(skb),
+				      ip_hdr(skb)->ihl);
+
+		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+			tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
+		else if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+			tucso = dest + ((void *)&(udp_hdr(skb)->check) - data);
+		else
+			return false;
+
+		*tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					    ip_hdr(skb)->daddr, 0,
+					    ip_hdr(skb)->protocol, 0);
+	} else if (skb_is_gso_v6(skb)) {
+		tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr, 0,
+					  IPPROTO_TCP, 0);
+		return false;
+	} else if (protocol == htons(ETH_P_IPV6)) {
+		tucso = dest + skb_checksum_start_offset(skb) + skb->csum_offset;
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr, 0,
+					  ipv6_hdr(skb)->nexthdr, 0);
+	} else {
+		return false;
+	}
+
+	/* First frame was headers, rest of the frames contain data.
+	 * Calculate checksum over each frame.
+	 */
+	for (i = 0; i < frame_count; i++) {
+		hdr = page_address(frames[i]->page);
+		dest = (void *)(hdr + 1) + offset;
+		len = le32_to_cpu(hdr->frame_size) - offset;
+		wsum = csum_partial(dest, len, wsum);
+		hdr->frame_count = cpu_to_le32(frame_count);
+
+		offset = 0;
+	}
+
+	*tucso = csum_fold(wsum);
+
+	/* Checksum is finally calculated and we don't touch the memory
+	 * anymore, so DMA map the frames now.
+	 */
+	for (i = 0; i < frame_count; i++) {
+		if (!tbnet_xmit_map(dma_dev, frames[i]))
+			goto err_unmap;
+	}
+
+	return true;
+
+err_unmap:
+	while (i--)
+		dma_unmap_page(dma_dev, frames[i]->frame.buffer_phy,
+			       tbnet_frame_size(frames[i]), DMA_TO_DEVICE);
+
+	return false;
+}
+
+static void *tbnet_kmap_frag(struct sk_buff *skb, unsigned int frag_num,
+			     unsigned int *len)
+{
+	const skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_num];
+
+	*len = skb_frag_size(frag);
+	return kmap_atomic(skb_frag_page(frag)) + frag->page_offset;
+}
+
+static netdev_tx_t tbnet_start_xmit(struct sk_buff *skb,
+				    struct net_device *dev)
+{
+	struct tbnet *net = netdev_priv(dev);
+	struct tbnet_frame *frames[MAX_SKB_FRAGS];
+	u16 frame_id = atomic_read(&net->frame_id);
+	struct thunderbolt_ip_frame_header *hdr;
+	unsigned int len = skb_headlen(skb);
+	unsigned int data_len = skb->len;
+	unsigned int nframes, i;
+	unsigned int frag = 0;
+	void *src = skb->data;
+	u32 frame_index = 0;
+	bool unmap = false;
+	void *dest;
+
+	nframes = DIV_ROUND_UP(data_len, TBNET_MAX_PAYLOAD_SIZE);
+	if (tbnet_available_buffers(&net->tx_ring) < nframes) {
+		netif_stop_queue(net->dev);
+		return NETDEV_TX_BUSY;
+	}
+
+	frames[frame_index] = tbnet_get_tx_buffer(net);
+	if (!frames[frame_index])
+		goto err_drop;
+
+	hdr = page_address(frames[frame_index]->page);
+	dest = hdr + 1;
+
+	/* If overall packet is bigger than the frame data size */
+	while (data_len > TBNET_MAX_PAYLOAD_SIZE) {
+		unsigned int size_left = TBNET_MAX_PAYLOAD_SIZE;
+
+		hdr->frame_size = cpu_to_le32(TBNET_MAX_PAYLOAD_SIZE);
+		hdr->frame_index = cpu_to_le16(frame_index);
+		hdr->frame_id = cpu_to_le16(frame_id);
+
+		do {
+			if (len > size_left) {
+				/* Copy data onto Tx buffer data with
+				 * full frame size then break and go to
+				 * next frame
+				 */
+				memcpy(dest, src, size_left);
+				len -= size_left;
+				dest += size_left;
+				src += size_left;
+				break;
+			}
+
+			memcpy(dest, src, len);
+			size_left -= len;
+			dest += len;
+
+			if (unmap) {
+				kunmap_atomic(src);
+				unmap = false;
+			}
+
+			/* Ensure all fragments have been processed */
+			if (frag < skb_shinfo(skb)->nr_frags) {
+				/* Map and then unmap quickly */
+				src = tbnet_kmap_frag(skb, frag++, &len);
+				unmap = true;
+			} else if (unlikely(size_left > 0)) {
+				goto err_drop;
+			}
+		} while (size_left > 0);
+
+		data_len -= TBNET_MAX_PAYLOAD_SIZE;
+		frame_index++;
+
+		frames[frame_index] = tbnet_get_tx_buffer(net);
+		if (!frames[frame_index])
+			goto err_drop;
+
+		hdr = page_address(frames[frame_index]->page);
+		dest = hdr + 1;
+	}
+
+	hdr->frame_size = cpu_to_le32(data_len);
+	hdr->frame_index = cpu_to_le16(frame_index);
+	hdr->frame_id = cpu_to_le16(frame_id);
+
+	frames[frame_index]->frame.size = data_len + sizeof(*hdr);
+
+	/* In case the remaining data_len is smaller than a frame */
+	while (len < data_len) {
+		memcpy(dest, src, len);
+		data_len -= len;
+		dest += len;
+
+		if (unmap) {
+			kunmap_atomic(src);
+			unmap = false;
+		}
+
+		if (frag < skb_shinfo(skb)->nr_frags) {
+			src = tbnet_kmap_frag(skb, frag++, &len);
+			unmap = true;
+		} else if (unlikely(data_len > 0)) {
+			goto err_drop;
+		}
+	}
+
+	memcpy(dest, src, data_len);
+
+	if (unmap)
+		kunmap_atomic(src);
+
+	if (!tbnet_xmit_csum_and_map(net, skb, frames, frame_index + 1))
+		goto err_drop;
+
+	for (i = 0; i < frame_index + 1; i++)
+		tb_ring_tx(net->tx_ring.ring, &frames[i]->frame);
+
+	if (net->svc->prtcstns & TBNET_MATCH_FRAGS_ID)
+		atomic_inc(&net->frame_id);
+
+	net->stats.tx_packets++;
+	net->stats.tx_bytes += skb->len;
+
+	dev_consume_skb_any(skb);
+
+	return NETDEV_TX_OK;
+
+err_drop:
+	/* We can re-use the buffers */
+	net->tx_ring.cons -= frame_index;
+
+	dev_kfree_skb_any(skb);
+	net->stats.tx_errors++;
+
+	return NETDEV_TX_OK;
+}
+
+static void tbnet_get_stats64(struct net_device *dev,
+			      struct rtnl_link_stats64 *stats)
+{
+	struct tbnet *net = netdev_priv(dev);
+
+	stats->tx_packets = net->stats.tx_packets;
+	stats->rx_packets = net->stats.rx_packets;
+	stats->tx_bytes = net->stats.tx_bytes;
+	stats->rx_bytes = net->stats.rx_bytes;
+	stats->rx_errors = net->stats.rx_errors + net->stats.rx_length_errors +
+		net->stats.rx_over_errors + net->stats.rx_crc_errors +
+		net->stats.rx_missed_errors;
+	stats->tx_errors = net->stats.tx_errors;
+	stats->rx_length_errors = net->stats.rx_length_errors;
+	stats->rx_over_errors = net->stats.rx_over_errors;
+	stats->rx_crc_errors = net->stats.rx_crc_errors;
+	stats->rx_missed_errors = net->stats.rx_missed_errors;
+}
+
+static const struct net_device_ops tbnet_netdev_ops = {
+	.ndo_open = tbnet_open,
+	.ndo_stop = tbnet_stop,
+	.ndo_start_xmit = tbnet_start_xmit,
+	.ndo_get_stats64 = tbnet_get_stats64,
+};
+
+static void tbnet_generate_mac(struct net_device *dev)
+{
+	const struct tbnet *net = netdev_priv(dev);
+	const struct tb_xdomain *xd = net->xd;
+	u8 phy_port;
+	u32 hash;
+
+	phy_port = tb_phy_port_from_link(TBNET_L0_PORT_NUM(xd->route));
+
+	/* Unicast and locally administered MAC */
+	dev->dev_addr[0] = phy_port << 4 | 0x02;
+	hash = jhash2((u32 *)xd->local_uuid, 4, 0);
+	memcpy(dev->dev_addr + 1, &hash, sizeof(hash));
+	hash = jhash2((u32 *)xd->local_uuid, 4, hash);
+	dev->dev_addr[5] = hash & 0xff;
+}
+
+static int tbnet_probe(struct tb_service *svc, const struct tb_service_id *id)
+{
+	struct tb_xdomain *xd = tb_service_parent(svc);
+	struct net_device *dev;
+	struct tbnet *net;
+	int ret;
+
+	dev = alloc_etherdev(sizeof(*net));
+	if (!dev)
+		return -ENOMEM;
+
+	SET_NETDEV_DEV(dev, &svc->dev);
+
+	net = netdev_priv(dev);
+	INIT_DELAYED_WORK(&net->login_work, tbnet_login_work);
+	INIT_WORK(&net->connected_work, tbnet_connected_work);
+	mutex_init(&net->connection_lock);
+	atomic_set(&net->command_id, 0);
+	atomic_set(&net->frame_id, 0);
+	net->svc = svc;
+	net->dev = dev;
+	net->xd = xd;
+
+	tbnet_generate_mac(dev);
+
+	strcpy(dev->name, "thunderbolt%d");
+	dev->netdev_ops = &tbnet_netdev_ops;
+
+	/* ThunderboltIP takes advantage of TSO packets but instead of
+	 * segmenting them we just split the packet into Thunderbolt
+	 * frames (maximum payload size of each frame is 4084 bytes) and
+	 * calculate checksum over the whole packet here.
+	 *
+	 * The receiving side does the opposite if the host OS supports
+	 * LRO, otherwise it needs to split the large packet into MTU
+	 * sized smaller packets.
+	 *
+	 * In order to receive large packets from the networking stack,
+	 * we need to announce support for most of the offloading
+	 * features here.
+	 */
+	dev->hw_features = NETIF_F_SG | NETIF_F_ALL_TSO | NETIF_F_GRO |
+			   NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	dev->features = dev->hw_features | NETIF_F_HIGHDMA;
+	dev->hard_header_len += sizeof(struct thunderbolt_ip_frame_header);
+
+	netif_napi_add(dev, &net->napi, tbnet_poll, NAPI_POLL_WEIGHT);
+
+	/* MTU range: 68 - 65522 */
+	dev->min_mtu = ETH_MIN_MTU;
+	dev->max_mtu = TBNET_MAX_MTU - ETH_HLEN;
+
+	net->handler.uuid = &tbnet_svc_uuid;
+	net->handler.callback = tbnet_handle_packet,
+	net->handler.data = net;
+	tb_register_protocol_handler(&net->handler);
+
+	tb_service_set_drvdata(svc, net);
+
+	ret = register_netdev(dev);
+	if (ret) {
+		tb_unregister_protocol_handler(&net->handler);
+		free_netdev(dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void tbnet_remove(struct tb_service *svc)
+{
+	struct tbnet *net = tb_service_get_drvdata(svc);
+
+	unregister_netdev(net->dev);
+	tb_unregister_protocol_handler(&net->handler);
+	free_netdev(net->dev);
+}
+
+static void tbnet_shutdown(struct tb_service *svc)
+{
+	tbnet_tear_down(tb_service_get_drvdata(svc), true);
+}
+
+static int __maybe_unused tbnet_suspend(struct device *dev)
+{
+	struct tb_service *svc = tb_to_service(dev);
+	struct tbnet *net = tb_service_get_drvdata(svc);
+
+	stop_login(net);
+	if (netif_running(net->dev)) {
+		netif_device_detach(net->dev);
+		tb_ring_stop(net->rx_ring.ring);
+		tb_ring_stop(net->tx_ring.ring);
+		tbnet_free_buffers(&net->rx_ring);
+		tbnet_free_buffers(&net->tx_ring);
+	}
+
+	return 0;
+}
+
+static int __maybe_unused tbnet_resume(struct device *dev)
+{
+	struct tb_service *svc = tb_to_service(dev);
+	struct tbnet *net = tb_service_get_drvdata(svc);
+
+	netif_carrier_off(net->dev);
+	if (netif_running(net->dev)) {
+		netif_device_attach(net->dev);
+		start_login(net);
+	}
+
+	return 0;
+}
+
+static const struct dev_pm_ops tbnet_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(tbnet_suspend, tbnet_resume)
+};
+
+static const struct tb_service_id tbnet_ids[] = {
+	{ TB_SERVICE("network", 1) },
+	{ },
+};
+MODULE_DEVICE_TABLE(tbsvc, tbnet_ids);
+
+static struct tb_service_driver tbnet_driver = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "thunderbolt-net",
+		.pm = &tbnet_pm_ops,
+	},
+	.probe = tbnet_probe,
+	.remove = tbnet_remove,
+	.shutdown = tbnet_shutdown,
+	.id_table = tbnet_ids,
+};
+
+static int __init tbnet_init(void)
+{
+	int ret;
+
+	tbnet_dir = tb_property_create_dir(&tbnet_dir_uuid);
+	if (!tbnet_dir)
+		return -ENOMEM;
+
+	tb_property_add_immediate(tbnet_dir, "prtcid", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcvers", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcrevs", 1);
+	tb_property_add_immediate(tbnet_dir, "prtcstns",
+				  TBNET_MATCH_FRAGS_ID);
+
+	ret = tb_register_property_dir("network", tbnet_dir);
+	if (ret) {
+		tb_property_free_dir(tbnet_dir);
+		return ret;
+	}
+
+	return tb_register_service_driver(&tbnet_driver);
+}
+module_init(tbnet_init);
+
+static void __exit tbnet_exit(void)
+{
+	tb_unregister_service_driver(&tbnet_driver);
+	tb_unregister_property_dir("network", tbnet_dir);
+	tb_property_free_dir(tbnet_dir);
+}
+module_exit(tbnet_exit);
+
+MODULE_AUTHOR("Amir Levy <amir.jer.levy@intel.com>");
+MODULE_AUTHOR("Michael Jamet <michael.jamet@intel.com>");
+MODULE_AUTHOR("Mika Westerberg <mika.westerberg@linux.intel.com>");
+MODULE_DESCRIPTION("Thunderbolt network driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5ce580f413b9..57e4c31fa84a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -121,7 +122,8 @@ do {								\
 #define TUN_VNET_BE     0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE)
+		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -172,6 +174,8 @@ struct tun_file {
 		u16 queue_index;
 		unsigned int ifindex;
 	};
+	struct napi_struct napi;
+	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -229,6 +233,74 @@ struct tun_struct {
 	struct bpf_prog __rcu *xdp_prog;
 };
 
+static int tun_napi_receive(struct napi_struct *napi, int budget)
+{
+	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+	struct sk_buff_head process_queue;
+	struct sk_buff *skb;
+	int received = 0;
+
+	__skb_queue_head_init(&process_queue);
+
+	spin_lock(&queue->lock);
+	skb_queue_splice_tail_init(queue, &process_queue);
+	spin_unlock(&queue->lock);
+
+	while (received < budget && (skb = __skb_dequeue(&process_queue))) {
+		napi_gro_receive(napi, skb);
+		++received;
+	}
+
+	if (!skb_queue_empty(&process_queue)) {
+		spin_lock(&queue->lock);
+		skb_queue_splice(&process_queue, queue);
+		spin_unlock(&queue->lock);
+	}
+
+	return received;
+}
+
+static int tun_napi_poll(struct napi_struct *napi, int budget)
+{
+	unsigned int received;
+
+	received = tun_napi_receive(napi, budget);
+
+	if (received < budget)
+		napi_complete_done(napi, received);
+
+	return received;
+}
+
+static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
+			  bool napi_en)
+{
+	if (napi_en) {
+		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
+			       NAPI_POLL_WEIGHT);
+		napi_enable(&tfile->napi);
+		mutex_init(&tfile->napi_mutex);
+	}
+}
+
+static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile)
+{
+	if (tun->flags & IFF_NAPI)
+		napi_disable(&tfile->napi);
+}
+
+static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
+{
+	if (tun->flags & IFF_NAPI)
+		netif_napi_del(&tfile->napi);
+}
+
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -541,6 +613,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 
 	tun = rtnl_dereference(tfile->tun);
 
+	if (tun && clean) {
+		tun_napi_disable(tun, tfile);
+		tun_napi_del(tun, tfile);
+	}
+
 	if (tun && !tfile->detached) {
 		u16 index = tfile->queue_index;
 		BUG_ON(index >= tun->numqueues);
@@ -598,6 +675,7 @@ static void tun_detach_all(struct net_device *dev)
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
 		BUG_ON(!tfile);
+		tun_napi_disable(tun, tfile);
 		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 		RCU_INIT_POINTER(tfile->tun, NULL);
@@ -613,6 +691,7 @@ static void tun_detach_all(struct net_device *dev)
 	synchronize_net();
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
+		tun_napi_del(tun, tfile);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
 		sock_put(&tfile->sk);
@@ -631,7 +710,8 @@ static void tun_detach_all(struct net_device *dev)
 		module_put(THIS_MODULE);
 }
 
-static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
+static int tun_attach(struct tun_struct *tun, struct file *file,
+		      bool skip_filter, bool napi)
 {
 	struct tun_file *tfile = file->private_data;
 	struct net_device *dev = tun->dev;
@@ -677,10 +757,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 	tun->numqueues++;
 
-	if (tfile->detached)
+	if (tfile->detached) {
 		tun_enable_queue(tfile);
-	else
+	} else {
 		sock_hold(&tfile->sk);
+		tun_napi_init(tun, tfile, napi);
+	}
 
 	tun_set_real_num_queues(tun);
 
@@ -692,7 +774,7 @@ out:
 	return err;
 }
 
-static struct tun_struct *__tun_get(struct tun_file *tfile)
+static struct tun_struct *tun_get(struct tun_file *tfile)
 {
 	struct tun_struct *tun;
 
@@ -705,11 +787,6 @@ static struct tun_struct *__tun_get(struct tun_file *tfile)
 	return tun;
 }
 
-static struct tun_struct *tun_get(struct file *file)
-{
-	return __tun_get(file->private_data);
-}
-
 static void tun_put(struct tun_struct *tun)
 {
 	dev_put(tun->dev);
@@ -956,13 +1033,32 @@ static void tun_poll_controller(struct net_device *dev)
 	 * Tun only receives frames when:
 	 * 1) the char device endpoint gets data from user space
 	 * 2) the tun socket gets a sendmsg call from user space
-	 * Since both of those are synchronous operations, we are guaranteed
-	 * never to have pending data when we poll for it
-	 * so there is nothing to do here but return.
+	 * If NAPI is not enabled, since both of those are synchronous
+	 * operations, we are guaranteed never to have pending data when we poll
+	 * for it so there is nothing to do here but return.
 	 * We need this though so netpoll recognizes us as an interface that
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
+	 * If NAPI is enabled, however, we need to schedule polling for all
+	 * queues unless we are using napi_gro_frags(), which we call in
+	 * process context and not in NAPI context.
 	 */
+	struct tun_struct *tun = netdev_priv(dev);
+
+	if (tun->flags & IFF_NAPI) {
+		struct tun_file *tfile;
+		int i;
+
+		if (tun_napi_frags_enabled(tun))
+			return;
+
+		rcu_read_lock();
+		for (i = 0; i < tun->numqueues; i++) {
+			tfile = rcu_dereference(tun->tfiles[i]);
+			napi_schedule(&tfile->napi);
+		}
+		rcu_read_unlock();
+	}
 	return;
 }
 #endif
@@ -1149,7 +1245,7 @@ static void tun_net_init(struct net_device *dev)
 static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 {
 	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun = __tun_get(tfile);
+	struct tun_struct *tun = tun_get(tfile);
 	struct sock *sk;
 	unsigned int mask = 0;
 
@@ -1178,6 +1274,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+					    size_t len,
+					    const struct iov_iter *it)
+{
+	struct sk_buff *skb;
+	size_t linear;
+	int err;
+	int i;
+
+	if (it->nr_segs > MAX_SKB_FRAGS + 1)
+		return ERR_PTR(-ENOMEM);
+
+	local_bh_disable();
+	skb = napi_get_frags(&tfile->napi);
+	local_bh_enable();
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	linear = iov_iter_single_seg_count(it);
+	err = __skb_grow(skb, linear);
+	if (err)
+		goto free;
+
+	skb->len = len;
+	skb->data_len = len - linear;
+	skb->truesize += skb->data_len;
+
+	for (i = 1; i < it->nr_segs; i++) {
+		size_t fragsz = it->iov[i].iov_len;
+		unsigned long offset;
+		struct page *page;
+		void *data;
+
+		if (fragsz == 0 || fragsz > PAGE_SIZE) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		local_bh_disable();
+		data = napi_alloc_frag(fragsz);
+		local_bh_enable();
+		if (!data) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		page = virt_to_head_page(data);
+		offset = data - page_address(page);
+		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+	}
+
+	return skb;
+free:
+	/* frees skb and all frags allocated with napi_alloc_frag() */
+	napi_free_frags(&tfile->napi);
+	return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1314,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		xdp.data_hard_start = buf;
 		xdp.data = buf + pad;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -1390,6 +1545,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int err;
 	u32 rxhash;
 	int skb_xdp = 1;
+	bool frags = tun_napi_frags_enabled(tun);
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1447,7 +1603,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 		/* For the packet that is not easy to be processed
 		 * (e.g gso or jumbo packet), we will do it at after
 		 * skb was created with generic XDP routine.
@@ -1468,10 +1624,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 				linear = tun16_to_cpu(tun, gso.hdr_len);
 		}
 
-		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (frags) {
+			mutex_lock(&tfile->napi_mutex);
+			skb = tun_napi_alloc_frags(tfile, copylen, from);
+			/* tun_napi_alloc_frags() enforces a layout for the skb.
+			 * If zerocopy is enabled, then this layout will be
+			 * overwritten by zerocopy_sg_from_iter().
+			 */
+			zerocopy = false;
+		} else {
+			skb = tun_alloc_skb(tfile, align, copylen, linear,
+					    noblock);
+		}
+
 		if (IS_ERR(skb)) {
 			if (PTR_ERR(skb) != -EAGAIN)
 				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			if (frags)
+				mutex_unlock(&tfile->napi_mutex);
 			return PTR_ERR(skb);
 		}
 
@@ -1483,6 +1653,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			kfree_skb(skb);
+			if (frags) {
+				tfile->napi.skb = NULL;
+				mutex_unlock(&tfile->napi_mutex);
+			}
+
 			return -EFAULT;
 		}
 	}
@@ -1490,6 +1665,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
+		if (frags) {
+			tfile->napi.skb = NULL;
+			mutex_unlock(&tfile->napi_mutex);
+		}
+
 		return -EINVAL;
 	}
 
@@ -1517,7 +1697,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb->dev = tun->dev;
 		break;
 	case IFF_TAP:
-		skb->protocol = eth_type_trans(skb, tun->dev);
+		if (!frags)
+			skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -1551,11 +1732,41 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	}
 
 	rxhash = __skb_get_hash_symmetric(skb);
-#ifndef CONFIG_4KSTACKS
-	tun_rx_batched(tun, tfile, skb, more);
-#else
-	netif_rx_ni(skb);
-#endif
+
+	if (frags) {
+		/* Exercise flow dissector code path. */
+		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+		if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			napi_free_frags(&tfile->napi);
+			mutex_unlock(&tfile->napi_mutex);
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+
+		local_bh_disable();
+		napi_gro_frags(&tfile->napi);
+		local_bh_enable();
+		mutex_unlock(&tfile->napi_mutex);
+	} else if (tun->flags & IFF_NAPI) {
+		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+		int queue_len;
+
+		spin_lock_bh(&queue->lock);
+		__skb_queue_tail(queue, skb);
+		queue_len = skb_queue_len(queue);
+		spin_unlock(&queue->lock);
+
+		if (!more || queue_len > NAPI_POLL_WEIGHT)
+			napi_schedule(&tfile->napi);
+
+		local_bh_enable();
+	} else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
+		tun_rx_batched(tun, tfile, skb, more);
+	} else {
+		netif_rx_ni(skb);
+	}
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -1571,8 +1782,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
-	struct tun_struct *tun = tun_get(file);
 	struct tun_file *tfile = file->private_data;
+	struct tun_struct *tun = tun_get(tfile);
 	ssize_t result;
 
 	if (!tun)
@@ -1756,7 +1967,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
 	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun = __tun_get(tfile);
+	struct tun_struct *tun = tun_get(tfile);
 	ssize_t len = iov_iter_count(to), ret;
 
 	if (!tun)
@@ -1833,7 +2044,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
 	int ret;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
-	struct tun_struct *tun = __tun_get(tfile);
+	struct tun_struct *tun = tun_get(tfile);
 
 	if (!tun)
 		return -EBADFD;
@@ -1849,7 +2060,7 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 		       int flags)
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
-	struct tun_struct *tun = __tun_get(tfile);
+	struct tun_struct *tun = tun_get(tfile);
 	int ret;
 
 	if (!tun)
@@ -1881,7 +2092,7 @@ static int tun_peek_len(struct socket *sock)
 	struct tun_struct *tun;
 	int ret = 0;
 
-	tun = __tun_get(tfile);
+	tun = tun_get(tfile);
 	if (!tun)
 		return 0;
 
@@ -1961,6 +2172,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	if (tfile->detached)
 		return -EINVAL;
 
+	if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		if (!(ifr->ifr_flags & IFF_NAPI) ||
+		    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
+			return -EINVAL;
+	}
+
 	dev = __dev_get_by_name(net, ifr->ifr_name);
 	if (dev) {
 		if (ifr->ifr_flags & IFF_TUN_EXCL)
@@ -1982,7 +2202,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		if (err < 0)
 			return err;
 
-		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);
+		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
+				 ifr->ifr_flags & IFF_NAPI);
 		if (err < 0)
 			return err;
 
@@ -2068,7 +2289,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 				       NETIF_F_HW_VLAN_STAG_TX);
 
 		INIT_LIST_HEAD(&tun->disabled);
-		err = tun_attach(tun, file, false);
+		err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI);
 		if (err < 0)
 			goto err_free_flow;
 
@@ -2218,7 +2439,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
 		ret = security_tun_dev_attach_queue(tun->security);
 		if (ret < 0)
 			goto unlock;
-		ret = tun_attach(tun, file, false);
+		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI);
 	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
 		tun = rtnl_dereference(tfile->tun);
 		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
@@ -2267,7 +2488,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	ret = 0;
 	rtnl_lock();
 
-	tun = __tun_get(tfile);
+	tun = tun_get(tfile);
 	if (cmd == TUNSETIFF) {
 		ret = -EEXIST;
 		if (tun)
@@ -2614,15 +2835,16 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 }
 
 #ifdef CONFIG_PROC_FS
-static void tun_chr_show_fdinfo(struct seq_file *m, struct file *f)
+static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
 {
+	struct tun_file *tfile = file->private_data;
 	struct tun_struct *tun;
 	struct ifreq ifr;
 
 	memset(&ifr, 0, sizeof(ifr));
 
 	rtnl_lock();
-	tun = tun_get(f);
+	tun = tun_get(tfile);
 	if (tun)
 		tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
 	rtnl_unlock();
diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c
index 522d2900cd1d..f4d7362eb325 100644
--- a/drivers/net/usb/asix_common.c
+++ b/drivers/net/usb/asix_common.c
@@ -245,7 +245,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 	 * - We are allowed to put 4 bytes at tail if skb_cloned()
 	 *   is false (and if we have 4 bytes of tailroom)
 	 *
-	 * TCP packets for example are cloned, but skb_header_release()
+	 * TCP packets for example are cloned, but __skb_header_release()
 	 * was called in tcp stack, allowing us to use headroom for our needs.
 	 */
 	if (!skb_header_cloned(skb) &&
diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c
index dbc90313f472..aeb62e17d19d 100644
--- a/drivers/net/usb/catc.c
+++ b/drivers/net/usb/catc.c
@@ -805,9 +805,7 @@ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id
 	spin_lock_init(&catc->tx_lock);
 	spin_lock_init(&catc->ctrl_lock);
 
-	init_timer(&catc->timer);
-	catc->timer.data = (long) catc;
-	catc->timer.function = catc_stats_timer;
+	setup_timer(&catc->timer, catc_stats_timer, (long)catc);
 
 	catc->ctrl_urb = usb_alloc_urb(0, GFP_KERNEL);
 	catc->tx_urb = usb_alloc_urb(0, GFP_KERNEL);
diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 8c3733608271..db7279d5b250 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -221,7 +221,7 @@ static int qmimux_register_device(struct net_device *real_dev, u8 mux_id)
 	/* Account for reference in struct qmimux_priv_priv */
 	dev_hold(real_dev);
 
-	err = netdev_upper_dev_link(real_dev, new_dev);
+	err = netdev_upper_dev_link(real_dev, new_dev, NULL);
 	if (err)
 		goto out_unregister_netdev;
 
diff --git a/drivers/net/virtio_net. b/drivers/net/virtio_net.
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/drivers/net/virtio_net.
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 511f8339fa96..fc059f193e7d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/average.h>
+#include <linux/filter.h>
 #include <net/route.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
@@ -372,9 +373,20 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
-static bool virtnet_xdp_xmit(struct virtnet_info *vi,
-			     struct receive_queue *rq,
-			     struct xdp_buff *xdp)
+static void virtnet_xdp_flush(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct send_queue *sq;
+	unsigned int qp;
+
+	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+	sq = &vi->sq[qp];
+
+	virtqueue_kick(sq->vq);
+}
+
+static bool __virtnet_xdp_xmit(struct virtnet_info *vi,
+			       struct xdp_buff *xdp)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	unsigned int len;
@@ -408,10 +420,19 @@ static bool virtnet_xdp_xmit(struct virtnet_info *vi,
 		return false;
 	}
 
-	virtqueue_kick(sq->vq);
 	return true;
 }
 
+static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	bool sent = __virtnet_xdp_xmit(vi, xdp);
+
+	if (!sent)
+		return -ENOSPC;
+	return 0;
+}
+
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
 {
 	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
@@ -484,7 +505,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
 				     struct virtnet_info *vi,
 				     struct receive_queue *rq,
 				     void *buf, void *ctx,
-				     unsigned int len)
+				     unsigned int len,
+				     bool *xdp_xmit)
 {
 	struct sk_buff *skb;
 	struct bpf_prog *xdp_prog;
@@ -494,7 +516,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	struct page *page = virt_to_head_page(buf);
-	unsigned int delta = 0;
+	unsigned int delta = 0, err;
 	struct page *xdp_page;
 	len -= vi->hdr_len;
 
@@ -532,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 
 		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
 		xdp.data = xdp.data_hard_start + xdp_headroom;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -542,8 +565,16 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			delta = orig_data - xdp.data;
 			break;
 		case XDP_TX:
-			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
+			if (unlikely(!__virtnet_xdp_xmit(vi, &xdp)))
 				trace_xdp_exception(vi->dev, xdp_prog, act);
+			else
+				*xdp_xmit = true;
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(dev, &xdp, xdp_prog);
+			if (!err)
+				*xdp_xmit = true;
 			rcu_read_unlock();
 			goto xdp_xmit;
 		default:
@@ -604,7 +635,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 					 struct receive_queue *rq,
 					 void *buf,
 					 void *ctx,
-					 unsigned int len)
+					 unsigned int len,
+					 bool *xdp_xmit)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
 	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
@@ -614,6 +646,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct bpf_prog *xdp_prog;
 	unsigned int truesize;
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
+	int err;
 
 	head_skb = NULL;
 
@@ -654,9 +687,13 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		data = page_address(xdp_page) + offset;
 		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + (len - vi->hdr_len);
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
+		if (act != XDP_PASS)
+			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
+
 		switch (act) {
 		case XDP_PASS:
 			/* recalculate offset to account for any header
@@ -672,18 +709,24 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 				put_page(page);
 				head_skb = page_to_skb(vi, rq, xdp_page,
 						       offset, len, PAGE_SIZE);
-				ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 				return head_skb;
 			}
 			break;
 		case XDP_TX:
-			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
+			if (unlikely(!__virtnet_xdp_xmit(vi, &xdp)))
 				trace_xdp_exception(vi->dev, xdp_prog, act);
-			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
+			else
+				*xdp_xmit = true;
 			if (unlikely(xdp_page != page))
 				goto err_xdp;
 			rcu_read_unlock();
 			goto xdp_xmit;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(dev, &xdp, xdp_prog);
+			if (!err)
+				*xdp_xmit = true;
+			rcu_read_unlock();
+			goto xdp_xmit;
 		default:
 			bpf_warn_invalid_xdp_action(act);
 		case XDP_ABORTED:
@@ -691,7 +734,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		case XDP_DROP:
 			if (unlikely(xdp_page != page))
 				__free_pages(xdp_page, 0);
-			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 			goto err_xdp;
 		}
 	}
@@ -789,7 +831,7 @@ xdp_xmit:
 }
 
 static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
-		       void *buf, unsigned int len, void **ctx)
+		       void *buf, unsigned int len, void **ctx, bool *xdp_xmit)
 {
 	struct net_device *dev = vi->dev;
 	struct sk_buff *skb;
@@ -810,11 +852,11 @@ static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	}
 
 	if (vi->mergeable_rx_bufs)
-		skb = receive_mergeable(dev, vi, rq, buf, ctx, len);
+		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit);
 	else if (vi->big_packets)
 		skb = receive_big(dev, vi, rq, buf, len);
 	else
-		skb = receive_small(dev, vi, rq, buf, ctx, len);
+		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit);
 
 	if (unlikely(!skb))
 		return 0;
@@ -1072,7 +1114,7 @@ static void refill_work(struct work_struct *work)
 	}
 }
 
-static int virtnet_receive(struct receive_queue *rq, int budget)
+static int virtnet_receive(struct receive_queue *rq, int budget, bool *xdp_xmit)
 {
 	struct virtnet_info *vi = rq->vq->vdev->priv;
 	unsigned int len, received = 0, bytes = 0;
@@ -1084,13 +1126,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget)
 
 		while (received < budget &&
 		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
-			bytes += receive_buf(vi, rq, buf, len, ctx);
+			bytes += receive_buf(vi, rq, buf, len, ctx, xdp_xmit);
 			received++;
 		}
 	} else {
 		while (received < budget &&
 		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
-			bytes += receive_buf(vi, rq, buf, len, NULL);
+			bytes += receive_buf(vi, rq, buf, len, NULL, xdp_xmit);
 			received++;
 		}
 	}
@@ -1162,15 +1204,19 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
 	struct receive_queue *rq =
 		container_of(napi, struct receive_queue, napi);
 	unsigned int received;
+	bool xdp_xmit = false;
 
 	virtnet_poll_cleantx(rq);
 
-	received = virtnet_receive(rq, budget);
+	received = virtnet_receive(rq, budget, &xdp_xmit);
 
 	/* Out of packets? */
 	if (received < budget)
 		virtqueue_napi_complete(napi, rq->vq, received);
 
+	if (xdp_xmit)
+		xdp_do_flush_map();
+
 	return received;
 }
 
@@ -2070,6 +2116,8 @@ static const struct net_device_ops virtnet_netdev = {
 	.ndo_poll_controller = virtnet_netpoll,
 #endif
 	.ndo_xdp		= virtnet_xdp,
+	.ndo_xdp_xmit		= virtnet_xdp_xmit,
+	.ndo_xdp_flush		= virtnet_xdp_flush,
 	.ndo_features_check	= passthru_features_check,
 };
 
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 9b243e6f3008..0b54f553228e 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -132,7 +132,6 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
 	skb_orphan(skb);
 
 	skb_dst_set(skb, dst);
-	skb_dst_force(skb);
 
 	/* set pkt_type to avoid skb hitting packet taps twice -
 	 * once on Tx and again in Rx processing
@@ -765,18 +764,22 @@ static void cycle_netdev(struct net_device *dev)
 	}
 }
 
-static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
+static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
+			    struct netlink_ext_ack *extack)
 {
 	int ret;
 
 	/* do not allow loopback device to be enslaved to a VRF.
 	 * The vrf device acts as the loopback for the vrf.
 	 */
-	if (port_dev == dev_net(dev)->loopback_dev)
+	if (port_dev == dev_net(dev)->loopback_dev) {
+		NL_SET_ERR_MSG(extack,
+			       "Can not enslave loopback device to a VRF");
 		return -EOPNOTSUPP;
+	}
 
 	port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
-	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
+	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack);
 	if (ret < 0)
 		goto err;
 
@@ -789,12 +792,19 @@ err:
 	return ret;
 }
 
-static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
+static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
+			 struct netlink_ext_ack *extack)
 {
-	if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev))
+	if (netif_is_l3_master(port_dev)) {
+		NL_SET_ERR_MSG(extack,
+			       "Can not enslave an L3 master device to a VRF");
+		return -EINVAL;
+	}
+
+	if (netif_is_l3_slave(port_dev))
 		return -EINVAL;
 
-	return do_vrf_add_slave(dev, port_dev);
+	return do_vrf_add_slave(dev, port_dev, extack);
 }
 
 /* inverse of do_vrf_add_slave */
diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c
index a043fb1367bd..64f176496da4 100644
--- a/drivers/net/wan/dscc4.c
+++ b/drivers/net/wan/dscc4.c
@@ -1127,10 +1127,8 @@ static int dscc4_open(struct net_device *dev)
 done:
 	netif_start_queue(dev);
 
-        init_timer(&dpriv->timer);
+	setup_timer(&dpriv->timer, dscc4_timer, (unsigned long)dev);
         dpriv->timer.expires = jiffies + 10*HZ;
-        dpriv->timer.data = (unsigned long)dev;
-	dpriv->timer.function = dscc4_timer;
         add_timer(&dpriv->timer);
 	netif_carrier_on(dev);
 
diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c
index a408abc25512..c696d42f4502 100644
--- a/drivers/net/wan/hdlc_cisco.c
+++ b/drivers/net/wan/hdlc_cisco.c
@@ -293,10 +293,8 @@ static void cisco_start(struct net_device *dev)
 	st->up = st->txseq = st->rxseq = 0;
 	spin_unlock_irqrestore(&st->lock, flags);
 
-	init_timer(&st->timer);
+	setup_timer(&st->timer, cisco_timer, (unsigned long)dev);
 	st->timer.expires = jiffies + HZ; /* First poll after 1 s */
-	st->timer.function = cisco_timer;
-	st->timer.data = (unsigned long)dev;
 	add_timer(&st->timer);
 }
 
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index 78596e42a3f3..425a47ffed25 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -1003,11 +1003,10 @@ static void fr_start(struct net_device *dev)
 		state(hdlc)->n391cnt = 0;
 		state(hdlc)->txseq = state(hdlc)->rxseq = 0;
 
-		init_timer(&state(hdlc)->timer);
+		setup_timer(&state(hdlc)->timer, fr_timer,
+			    (unsigned long)dev);
 		/* First poll after 1 s */
 		state(hdlc)->timer.expires = jiffies + HZ;
-		state(hdlc)->timer.function = fr_timer;
-		state(hdlc)->timer.data = (unsigned long)dev;
 		add_timer(&state(hdlc)->timer);
 	} else
 		fr_set_link_state(1, dev);
diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c
index 0d2e00ece804..c7721c729541 100644
--- a/drivers/net/wan/hdlc_ppp.c
+++ b/drivers/net/wan/hdlc_ppp.c
@@ -610,9 +610,7 @@ static void ppp_start(struct net_device *dev)
 	for (i = 0; i < IDX_COUNT; i++) {
 		struct proto *proto = &ppp->protos[i];
 		proto->dev = dev;
-		init_timer(&proto->timer);
-		proto->timer.function = ppp_timer;
-		proto->timer.data = (unsigned long)proto;
+		setup_timer(&proto->timer, ppp_timer, (unsigned long)proto);
 		proto->state = CLOSED;
 	}
 	ppp->protos[IDX_LCP].pid = PID_LCP;
diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c
index 4698450c77d1..ae69d65158e6 100644
--- a/drivers/net/wan/lmc/lmc_main.c
+++ b/drivers/net/wan/lmc/lmc_main.c
@@ -1084,10 +1084,8 @@ static int lmc_open(struct net_device *dev)
      * Setup a timer for the watchdog on probe, and start it running.
      * Since lmc_ok == 0, it will be a NOP for now.
      */
-    init_timer (&sc->timer);
+    setup_timer(&sc->timer, lmc_watchdog, (unsigned long)dev);
     sc->timer.expires = jiffies + HZ;
-    sc->timer.data = (unsigned long) dev;
-    sc->timer.function = lmc_watchdog;
     add_timer (&sc->timer);
 
     lmc_trace(dev, "lmc_open out");
diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c
index 236c62538036..0cc48902dbb9 100644
--- a/drivers/net/wan/sdla.c
+++ b/drivers/net/wan/sdla.c
@@ -1617,10 +1617,8 @@ static void setup_sdla(struct net_device *dev)
 	flp->deassoc		= sdla_deassoc;
 	flp->dlci_conf		= sdla_dlci_conf;
 
-	init_timer(&flp->timer);
+	setup_timer(&flp->timer, sdla_poll, (unsigned long)dev);
 	flp->timer.expires	= 1;
-	flp->timer.data		= (unsigned long) dev;
-	flp->timer.function	= sdla_poll;
 }
 
 static struct net_device *sdla;
diff --git a/drivers/net/wireless/ath/ath6kl/txrx.c b/drivers/net/wireless/ath/ath6kl/txrx.c
index e6b2517e6334..4e5cc2b7045a 100644
--- a/drivers/net/wireless/ath/ath6kl/txrx.c
+++ b/drivers/net/wireless/ath/ath6kl/txrx.c
@@ -1753,9 +1753,7 @@ void aggr_conn_init(struct ath6kl_vif *vif, struct aggr_info *aggr_info,
 
 	aggr_conn->aggr_sz = AGGR_SZ_DEFAULT;
 	aggr_conn->dev = vif->ndev;
-	init_timer(&aggr_conn->timer);
-	aggr_conn->timer.function = aggr_timeout;
-	aggr_conn->timer.data = (unsigned long) aggr_conn;
+	setup_timer(&aggr_conn->timer, aggr_timeout, (unsigned long)aggr_conn);
 	aggr_conn->aggr_info = aggr_info;
 
 	aggr_conn->timer_scheduled = false;
diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c
index b68436b23a63..e816d53c2c05 100644
--- a/drivers/net/wireless/atmel/atmel.c
+++ b/drivers/net/wireless/atmel/atmel.c
@@ -1579,11 +1579,10 @@ struct net_device *init_atmel_card(unsigned short irq, unsigned long port,
 	priv->default_beacon_period = priv->beacon_period = 100;
 	priv->listen_interval = 1;
 
-	init_timer(&priv->management_timer);
+	setup_timer(&priv->management_timer, atmel_management_timer,
+		    (unsigned long)dev);
 	spin_lock_init(&priv->irqlock);
 	spin_lock_init(&priv->timerlock);
-	priv->management_timer.function = atmel_management_timer;
-	priv->management_timer.data = (unsigned long) dev;
 
 	dev->netdev_ops = &atmel_netdev_ops;
 	dev->wireless_handlers = &atmel_handler_def;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 4157c90ad973..4bfd43a80273 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -3261,9 +3261,8 @@ static void brcmf_init_escan(struct brcmf_cfg80211_info *cfg)
 			    brcmf_cfg80211_escan_handler);
 	cfg->escan_info.escan_state = WL_ESCAN_STATE_IDLE;
 	/* Init scan_timeout timer */
-	init_timer(&cfg->escan_timeout);
-	cfg->escan_timeout.data = (unsigned long) cfg;
-	cfg->escan_timeout.function = brcmf_escan_timeout;
+	setup_timer(&cfg->escan_timeout, brcmf_escan_timeout,
+		    (unsigned long)cfg);
 	INIT_WORK(&cfg->escan_timeout_work,
 		  brcmf_cfg80211_escan_timeout_worker);
 }
diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
index 070dfd68bb83..7ad286d6e9a7 100644
--- a/drivers/net/wireless/rsi/rsi_91x_hal.c
+++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
@@ -411,9 +411,8 @@ static void bl_cmd_timeout(unsigned long priv)
 
 static int bl_start_cmd_timer(struct rsi_hw *adapter, u32 timeout)
 {
-	init_timer(&adapter->bl_cmd_timer);
-	adapter->bl_cmd_timer.data = (unsigned long)adapter;
-	adapter->bl_cmd_timer.function = (void *)&bl_cmd_timeout;
+	setup_timer(&adapter->bl_cmd_timer, (void *)&bl_cmd_timeout,
+		    (unsigned long)adapter);
 	adapter->bl_cmd_timer.expires = (msecs_to_jiffies(timeout) + jiffies);
 
 	adapter->blcmd_timer_expired = false;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ee8ed9da00ad..dcfcb153918c 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -520,8 +520,7 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 
 	queue->credit_bytes = queue->remaining_credit = ~0UL;
 	queue->credit_usec  = 0UL;
-	init_timer(&queue->credit_timeout);
-	queue->credit_timeout.function = xenvif_tx_credit_callback;
+	setup_timer(&queue->credit_timeout, xenvif_tx_credit_callback, 0UL);
 	queue->credit_window_start = get_jiffies_64();
 
 	queue->rx_queue_max = XENVIF_RX_QUEUE_BYTES;
diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 59e09854c4f7..91fcadbede80 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -182,6 +182,21 @@ struct qeth_sbp_info {
 	__u32 reflect_promisc_primary:1;
 };
 
+struct qeth_vnicc_info {
+	/* supported/currently configured VNICCs; updated in IPA exchanges */
+	u32 sup_chars;
+	u32 cur_chars;
+	/* supported commands: bitmasks which VNICCs support respective cmd */
+	u32 set_char_sup;
+	u32 getset_timeout_sup;
+	/* timeout value for the learning characteristic */
+	u32 learning_timeout;
+	/* characteristics wanted/configured by user */
+	u32 wanted_chars;
+	/* has user explicitly enabled rx_bcast while online? */
+	bool rx_bcast_enabled;
+};
+
 static inline int qeth_is_ipa_supported(struct qeth_ipa_info *ipa,
 		enum qeth_ipa_funcs func)
 {
@@ -673,6 +688,7 @@ struct qeth_card_options {
 	struct qeth_routing_info route6;
 	struct qeth_ipa_info ipa6;
 	struct qeth_sbp_info sbp; /* SETBRIDGEPORT options */
+	struct qeth_vnicc_info vnicc; /* VNICC options */
 	int fake_broadcast;
 	int layer2;
 	int performance_stats;
@@ -946,13 +962,13 @@ int qeth_get_priority_queue(struct qeth_card *, struct sk_buff *, int, int);
 int qeth_get_elements_no(struct qeth_card *card, struct sk_buff *skb,
 			 int extra_elems, int data_offset);
 int qeth_get_elements_for_frags(struct sk_buff *);
-int qeth_do_send_packet_fast(struct qeth_card *card,
-			     struct qeth_qdio_out_q *queue, struct sk_buff *skb,
+int qeth_do_send_packet_fast(struct qeth_qdio_out_q *queue, struct sk_buff *skb,
 			     struct qeth_hdr *hdr, unsigned int offset,
 			     unsigned int hd_len);
 int qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
 			struct sk_buff *skb, struct qeth_hdr *hdr,
-			unsigned int hd_len, unsigned int offset, int elements);
+			unsigned int offset, unsigned int hd_len,
+			int elements_needed);
 int qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 int qeth_core_get_sset_count(struct net_device *, int);
 void qeth_core_get_ethtool_stats(struct net_device *,
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index bae7440abc01..68e118f1202e 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -4040,35 +4040,23 @@ static int qeth_fill_buffer(struct qeth_qdio_out_q *queue,
 	return flush_cnt;
 }
 
-int qeth_do_send_packet_fast(struct qeth_card *card,
-			     struct qeth_qdio_out_q *queue, struct sk_buff *skb,
+int qeth_do_send_packet_fast(struct qeth_qdio_out_q *queue, struct sk_buff *skb,
 			     struct qeth_hdr *hdr, unsigned int offset,
 			     unsigned int hd_len)
 {
-	struct qeth_qdio_out_buffer *buffer;
-	int index;
+	int index = queue->next_buf_to_fill;
+	struct qeth_qdio_out_buffer *buffer = queue->bufs[index];
 
-	/* spin until we get the queue ... */
-	while (atomic_cmpxchg(&queue->state, QETH_OUT_Q_UNLOCKED,
-			      QETH_OUT_Q_LOCKED) != QETH_OUT_Q_UNLOCKED);
-	/* ... now we've got the queue */
-	index = queue->next_buf_to_fill;
-	buffer = queue->bufs[queue->next_buf_to_fill];
 	/*
 	 * check if buffer is empty to make sure that we do not 'overtake'
 	 * ourselves and try to fill a buffer that is already primed
 	 */
 	if (atomic_read(&buffer->state) != QETH_QDIO_BUF_EMPTY)
-		goto out;
-	queue->next_buf_to_fill = (queue->next_buf_to_fill + 1) %
-					  QDIO_MAX_BUFFERS_PER_Q;
-	atomic_set(&queue->state, QETH_OUT_Q_UNLOCKED);
+		return -EBUSY;
+	queue->next_buf_to_fill = (index + 1) % QDIO_MAX_BUFFERS_PER_Q;
 	qeth_fill_buffer(queue, buffer, skb, hdr, offset, hd_len);
 	qeth_flush_buffers(queue, index, 1);
 	return 0;
-out:
-	atomic_set(&queue->state, QETH_OUT_Q_UNLOCKED);
-	return -EBUSY;
 }
 EXPORT_SYMBOL_GPL(qeth_do_send_packet_fast);
 
@@ -4923,7 +4911,6 @@ static void qeth_qdio_establish_cq(struct qeth_card *card,
 	if (card->options.cq == QETH_CQ_ENABLED) {
 		int offset = QDIO_MAX_BUFFERS_PER_Q *
 			     (card->qdio.no_in_queues - 1);
-		i = QDIO_MAX_BUFFERS_PER_Q * (card->qdio.no_in_queues - 1);
 		for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i) {
 			in_sbal_ptrs[offset + i] = (struct qdio_buffer *)
 				virt_to_phys(card->qdio.c_q->bufs[i].buffer);
diff --git a/drivers/s390/net/qeth_core_mpc.c b/drivers/s390/net/qeth_core_mpc.c
index 6dd7d05e5693..5f8a2b834d39 100644
--- a/drivers/s390/net/qeth_core_mpc.c
+++ b/drivers/s390/net/qeth_core_mpc.c
@@ -167,7 +167,7 @@ static struct ipa_rc_msg qeth_ipa_rc_msg[] = {
 	{IPA_RC_IP_TABLE_FULL,		"Add Addr IP Table Full - ipv6"},
 	{IPA_RC_UNKNOWN_ERROR,		"IPA command failed - reason unknown"},
 	{IPA_RC_UNSUPPORTED_COMMAND,	"Command not supported"},
-	{IPA_RC_TRACE_ALREADY_ACTIVE,	"trace already active"},
+	{IPA_RC_VNICC_OOSEQ,		"Command issued out of sequence"},
 	{IPA_RC_INVALID_FORMAT,		"invalid format or length"},
 	{IPA_RC_DUP_IPV6_REMOTE, "ipv6 address already registered remote"},
 	{IPA_RC_SBP_IQD_NOT_CONFIGURED,	"Not configured for bridgeport"},
@@ -193,6 +193,7 @@ static struct ipa_rc_msg qeth_ipa_rc_msg[] = {
 	{IPA_RC_L2_INVALID_VLAN_ID,	"L2 invalid vlan id"},
 	{IPA_RC_L2_DUP_VLAN_ID,		"L2 duplicate vlan id"},
 	{IPA_RC_L2_VLAN_ID_NOT_FOUND,	"L2 vlan id not found"},
+	{IPA_RC_VNICC_VNICBP,		"VNIC is BridgePort"},
 	{IPA_RC_SBP_OSA_NOT_CONFIGURED,	"Not configured for bridgeport"},
 	{IPA_RC_SBP_OSA_OS_MISMATCH,	"OS mismatch"},
 	{IPA_RC_SBP_OSA_ANO_DEV_PRIMARY, "Primary bridgeport exists already"},
@@ -253,6 +254,7 @@ static struct ipa_cmd_names qeth_ipa_cmd_names[] = {
 	{IPA_CMD_DELGMAC,	"delgmac"},
 	{IPA_CMD_SETVLAN,	"setvlan"},
 	{IPA_CMD_DELVLAN,	"delvlan"},
+	{IPA_CMD_VNICC,		"vnic_characteristics"},
 	{IPA_CMD_SETBRIDGEPORT_OSA,	"set_bridge_port(osa)"},
 	{IPA_CMD_SETCCID,	"setccid"},
 	{IPA_CMD_DELCCID,	"delccid"},
diff --git a/drivers/s390/net/qeth_core_mpc.h b/drivers/s390/net/qeth_core_mpc.h
index 912e0107de8f..6a7654df6e78 100644
--- a/drivers/s390/net/qeth_core_mpc.h
+++ b/drivers/s390/net/qeth_core_mpc.h
@@ -90,6 +90,7 @@ enum qeth_ipa_cmds {
 	IPA_CMD_DELGMAC			= 0x24,
 	IPA_CMD_SETVLAN			= 0x25,
 	IPA_CMD_DELVLAN			= 0x26,
+	IPA_CMD_VNICC			= 0x2a,
 	IPA_CMD_SETBRIDGEPORT_OSA	= 0x2b,
 	IPA_CMD_SETCCID			= 0x41,
 	IPA_CMD_DELCCID			= 0x42,
@@ -165,6 +166,8 @@ enum qeth_ipa_return_codes {
 	IPA_RC_L2_INVALID_VLAN_ID	= 0x2015,
 	IPA_RC_L2_DUP_VLAN_ID		= 0x2016,
 	IPA_RC_L2_VLAN_ID_NOT_FOUND	= 0x2017,
+	IPA_RC_L2_VLAN_ID_NOT_ALLOWED	= 0x2050,
+	IPA_RC_VNICC_VNICBP		= 0x20B0,
 	IPA_RC_SBP_OSA_NOT_CONFIGURED	= 0x2B0C,
 	IPA_RC_SBP_OSA_OS_MISMATCH	= 0x2B10,
 	IPA_RC_SBP_OSA_ANO_DEV_PRIMARY	= 0x2B14,
@@ -197,6 +200,9 @@ enum qeth_ipa_return_codes {
 	IPA_RC_ENOMEM			= 0xfffe,
 	IPA_RC_FFFF			= 0xffff
 };
+/* for VNIC Characteristics */
+#define IPA_RC_VNICC_OOSEQ 0x0005
+
 /* for SET_DIAGNOSTIC_ASSIST */
 #define IPA_RC_INVALID_SUBCMD		IPA_RC_IP_TABLE_FULL
 #define IPA_RC_HARDWARE_AUTH_ERROR	IPA_RC_UNKNOWN_ERROR
@@ -551,6 +557,71 @@ struct qeth_ipacmd_diagass {
 	__u8   cdata[64];
 } __attribute__ ((packed));
 
+/* VNIC Characteristics IPA Command: *****************************************/
+/* IPA commands/sub commands for VNICC */
+#define IPA_VNICC_QUERY_CHARS		0x00000000L
+#define IPA_VNICC_QUERY_CMDS		0x00000001L
+#define IPA_VNICC_ENABLE		0x00000002L
+#define IPA_VNICC_DISABLE		0x00000004L
+#define IPA_VNICC_SET_TIMEOUT		0x00000008L
+#define IPA_VNICC_GET_TIMEOUT		0x00000010L
+
+/* VNICC flags */
+#define QETH_VNICC_FLOODING		0x80000000
+#define QETH_VNICC_MCAST_FLOODING	0x40000000
+#define QETH_VNICC_LEARNING		0x20000000
+#define QETH_VNICC_TAKEOVER_SETVMAC	0x10000000
+#define QETH_VNICC_TAKEOVER_LEARNING	0x08000000
+#define QETH_VNICC_BRIDGE_INVISIBLE	0x04000000
+#define QETH_VNICC_RX_BCAST		0x02000000
+
+/* VNICC default values */
+#define QETH_VNICC_ALL			0xff000000
+#define QETH_VNICC_DEFAULT		QETH_VNICC_RX_BCAST
+/* default VNICC timeout in seconds */
+#define QETH_VNICC_DEFAULT_TIMEOUT	600
+
+/* VNICC header */
+struct qeth_ipacmd_vnicc_hdr {
+	u32 sup;
+	u32 cur;
+};
+
+/* VNICC sub command header */
+struct qeth_vnicc_sub_hdr {
+	u16 data_length;
+	u16 reserved;
+	u32 sub_command;
+};
+
+/* query supported commands for VNIC characteristic */
+struct qeth_vnicc_query_cmds {
+	u32 vnic_char;
+	u32 sup_cmds;
+};
+
+/* enable/disable VNIC characteristic */
+struct qeth_vnicc_set_char {
+	u32 vnic_char;
+};
+
+/* get/set timeout for VNIC characteristic */
+struct qeth_vnicc_getset_timeout {
+	u32 vnic_char;
+	u32 timeout;
+};
+
+/* complete VNICC IPA command message */
+struct qeth_ipacmd_vnicc {
+	struct qeth_ipacmd_vnicc_hdr hdr;
+	struct qeth_vnicc_sub_hdr sub_hdr;
+	union {
+		struct qeth_vnicc_query_cmds query_cmds;
+		struct qeth_vnicc_set_char set_char;
+		struct qeth_vnicc_getset_timeout getset_timeout;
+	};
+};
+
 /* SETBRIDGEPORT IPA Command:	 *********************************************/
 enum qeth_ipa_sbp_cmd {
 	IPA_SBP_QUERY_COMMANDS_SUPPORTED	= 0x00000000L,
@@ -692,6 +763,7 @@ struct qeth_ipa_cmd {
 		struct qeth_ipacmd_diagass		diagass;
 		struct qeth_ipacmd_setbridgeport	sbp;
 		struct qeth_ipacmd_addr_change		addrchange;
+		struct qeth_ipacmd_vnicc		vnicc;
 	} data;
 } __attribute__ ((packed));
 
diff --git a/drivers/s390/net/qeth_l2.h b/drivers/s390/net/qeth_l2.h
index 0d59f9a45ea9..241df6b98ab4 100644
--- a/drivers/s390/net/qeth_l2.h
+++ b/drivers/s390/net/qeth_l2.h
@@ -14,6 +14,12 @@ int qeth_l2_create_device_attributes(struct device *);
 void qeth_l2_remove_device_attributes(struct device *);
 void qeth_l2_setup_bridgeport_attrs(struct qeth_card *card);
 
+int qeth_l2_vnicc_set_state(struct qeth_card *card, u32 vnicc, bool state);
+int qeth_l2_vnicc_get_state(struct qeth_card *card, u32 vnicc, bool *state);
+int qeth_l2_vnicc_set_timeout(struct qeth_card *card, u32 timeout);
+int qeth_l2_vnicc_get_timeout(struct qeth_card *card, u32 *timeout);
+bool qeth_l2_vnicc_is_in_use(struct qeth_card *card);
+
 struct qeth_mac {
 	u8 mac_addr[OSA_ADDR_LEN];
 	u8 is_uc:1;
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 760b023eae95..7b61c2ef4c74 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -33,6 +33,10 @@ static void qeth_bridge_state_change(struct qeth_card *card,
 					struct qeth_ipa_cmd *cmd);
 static void qeth_bridge_host_event(struct qeth_card *card,
 					struct qeth_ipa_cmd *cmd);
+static void qeth_l2_vnicc_set_defaults(struct qeth_card *card);
+static void qeth_l2_vnicc_init(struct qeth_card *card);
+static bool qeth_l2_vnicc_recover_timeout(struct qeth_card *card, u32 vnicc,
+					  u32 *timeout);
 
 static int qeth_l2_verify_dev(struct net_device *dev)
 {
@@ -74,7 +78,7 @@ static struct net_device *qeth_l2_netdev_by_devno(unsigned char *read_dev_no)
 	return ndev;
 }
 
-static int qeth_setdel_makerc(struct qeth_card *card, int retcode)
+static int qeth_setdelmac_makerc(struct qeth_card *card, int retcode)
 {
 	int rc;
 
@@ -124,8 +128,8 @@ static int qeth_l2_send_setdelmac(struct qeth_card *card, __u8 *mac,
 	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
 	cmd->data.setdelmac.mac_length = OSA_ADDR_LEN;
 	memcpy(&cmd->data.setdelmac.mac, mac, OSA_ADDR_LEN);
-	return qeth_setdel_makerc(card, qeth_send_ipa_cmd(card, iob,
-					NULL, NULL));
+	return qeth_setdelmac_makerc(card, qeth_send_ipa_cmd(card, iob,
+					   NULL, NULL));
 }
 
 static int qeth_l2_send_setmac(struct qeth_card *card, __u8 *mac)
@@ -285,17 +289,40 @@ static void qeth_l2_fill_header(struct qeth_hdr *hdr, struct sk_buff *skb,
 	}
 }
 
+static int qeth_setdelvlan_makerc(struct qeth_card *card, int retcode)
+{
+	if (retcode)
+		QETH_CARD_TEXT_(card, 2, "err%04x", retcode);
+
+	switch (retcode) {
+	case IPA_RC_SUCCESS:
+		return 0;
+	case IPA_RC_L2_INVALID_VLAN_ID:
+		return -EINVAL;
+	case IPA_RC_L2_DUP_VLAN_ID:
+		return -EEXIST;
+	case IPA_RC_L2_VLAN_ID_NOT_FOUND:
+		return -ENOENT;
+	case IPA_RC_L2_VLAN_ID_NOT_ALLOWED:
+		return -EPERM;
+	case -ENOMEM:
+		return -ENOMEM;
+	default:
+		return -EIO;
+	}
+}
+
 static int qeth_l2_send_setdelvlan_cb(struct qeth_card *card,
-			struct qeth_reply *reply, unsigned long data)
+				      struct qeth_reply *reply,
+				      unsigned long data)
 {
-	struct qeth_ipa_cmd *cmd;
+	struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data;
 
 	QETH_CARD_TEXT(card, 2, "L2sdvcb");
-	cmd = (struct qeth_ipa_cmd *) data;
 	if (cmd->hdr.return_code) {
-		QETH_DBF_MESSAGE(2, "Error in processing VLAN %i on %s: 0x%x. "
-			  "Continuing\n", cmd->data.setdelvlan.vlan_id,
-			  QETH_CARD_IFNAME(card), cmd->hdr.return_code);
+		QETH_DBF_MESSAGE(2, "Error in processing VLAN %i on %s: 0x%x.\n",
+				 cmd->data.setdelvlan.vlan_id,
+				 QETH_CARD_IFNAME(card), cmd->hdr.return_code);
 		QETH_CARD_TEXT_(card, 2, "L2VL%4x", cmd->hdr.command);
 		QETH_CARD_TEXT_(card, 2, "err%d", cmd->hdr.return_code);
 	}
@@ -303,7 +330,7 @@ static int qeth_l2_send_setdelvlan_cb(struct qeth_card *card,
 }
 
 static int qeth_l2_send_setdelvlan(struct qeth_card *card, __u16 i,
-				enum qeth_ipa_cmds ipacmd)
+				   enum qeth_ipa_cmds ipacmd)
 {
 	struct qeth_ipa_cmd *cmd;
 	struct qeth_cmd_buffer *iob;
@@ -314,8 +341,8 @@ static int qeth_l2_send_setdelvlan(struct qeth_card *card, __u16 i,
 		return -ENOMEM;
 	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
 	cmd->data.setdelvlan.vlan_id = i;
-	return qeth_send_ipa_cmd(card, iob,
-				 qeth_l2_send_setdelvlan_cb, NULL);
+	return qeth_setdelvlan_makerc(card, qeth_send_ipa_cmd(card, iob,
+					    qeth_l2_send_setdelvlan_cb, NULL));
 }
 
 static void qeth_l2_process_vlans(struct qeth_card *card)
@@ -694,7 +721,7 @@ static int qeth_l2_xmit_iqd(struct qeth_card *card, struct sk_buff *skb,
 		rc = -E2BIG;
 		goto out;
 	}
-	rc = qeth_do_send_packet_fast(card, queue, skb, hdr, data_offset,
+	rc = qeth_do_send_packet_fast(queue, skb, hdr, data_offset,
 				      sizeof(*hdr) + data_offset);
 out:
 	if (rc)
@@ -919,6 +946,7 @@ static int qeth_l2_probe_device(struct ccwgroup_device *gdev)
 	hash_init(card->mac_htable);
 	card->options.layer2 = 1;
 	card->info.hwtrap = 0;
+	qeth_l2_vnicc_set_defaults(card);
 	return 0;
 }
 
@@ -1045,9 +1073,14 @@ static int qeth_l2_start_ipassists(struct qeth_card *card)
 
 static void qeth_l2_trace_features(struct qeth_card *card)
 {
-	QETH_CARD_TEXT(card, 2, "l2featur");
+	/* Set BridgePort features */
+	QETH_CARD_TEXT(card, 2, "featuSBP");
 	QETH_CARD_HEX(card, 2, &card->options.sbp.supported_funcs,
 		      sizeof(card->options.sbp.supported_funcs));
+	/* VNIC Characteristics features */
+	QETH_CARD_TEXT(card, 2, "feaVNICC");
+	QETH_CARD_HEX(card, 2, &card->options.vnicc.sup_chars,
+		      sizeof(card->options.vnicc.sup_chars));
 }
 
 static int __qeth_l2_set_online(struct ccwgroup_device *gdev, int recovery_mode)
@@ -1072,8 +1105,6 @@ static int __qeth_l2_set_online(struct ccwgroup_device *gdev, int recovery_mode)
 	if (card->options.sbp.supported_funcs)
 		dev_info(&card->gdev->dev,
 		"The device represents a Bridge Capable Port\n");
-	qeth_trace_features(card);
-	qeth_l2_trace_features(card);
 
 	if (!card->dev && qeth_l2_setup_netdev(card)) {
 		rc = -ENODEV;
@@ -1090,6 +1121,12 @@ static int __qeth_l2_set_online(struct ccwgroup_device *gdev, int recovery_mode)
 	} else
 		card->info.hwtrap = 0;
 
+	/* for the rx_bcast characteristic, init VNICC after setmac */
+	qeth_l2_vnicc_init(card);
+
+	qeth_trace_features(card);
+	qeth_l2_trace_features(card);
+
 	qeth_l2_setup_bridgeport_attrs(card);
 
 	card->state = CARD_STATE_HARDSETUP;
@@ -2039,6 +2076,454 @@ int qeth_bridgeport_an_set(struct qeth_card *card, int enable)
 }
 EXPORT_SYMBOL_GPL(qeth_bridgeport_an_set);
 
+static bool qeth_bridgeport_is_in_use(struct qeth_card *card)
+{
+	return (card->options.sbp.role || card->options.sbp.reflect_promisc ||
+		card->options.sbp.hostnotification);
+}
+
+/* VNIC Characteristics support */
+
+/* handle VNICC IPA command return codes; convert to error codes */
+static int qeth_l2_vnicc_makerc(struct qeth_card *card, int ipa_rc)
+{
+	int rc;
+
+	switch (ipa_rc) {
+	case IPA_RC_SUCCESS:
+		return ipa_rc;
+	case IPA_RC_L2_UNSUPPORTED_CMD:
+	case IPA_RC_NOTSUPP:
+		rc = -EOPNOTSUPP;
+		break;
+	case IPA_RC_VNICC_OOSEQ:
+		rc = -EALREADY;
+		break;
+	case IPA_RC_VNICC_VNICBP:
+		rc = -EBUSY;
+		break;
+	case IPA_RC_L2_ADDR_TABLE_FULL:
+		rc = -ENOSPC;
+		break;
+	case IPA_RC_L2_MAC_NOT_AUTH_BY_ADP:
+		rc = -EACCES;
+		break;
+	default:
+		rc = -EIO;
+	}
+
+	QETH_CARD_TEXT_(card, 2, "err%04x", ipa_rc);
+	return rc;
+}
+
+/* generic VNICC request call back control */
+struct _qeth_l2_vnicc_request_cbctl {
+	u32 sub_cmd;
+	struct {
+		u32 vnic_char;
+		u32 timeout;
+	} param;
+	struct {
+		union{
+			u32 *sup_cmds;
+			u32 *timeout;
+		};
+	} result;
+};
+
+/* generic VNICC request call back */
+static int qeth_l2_vnicc_request_cb(struct qeth_card *card,
+				    struct qeth_reply *reply,
+				    unsigned long data)
+{
+	struct _qeth_l2_vnicc_request_cbctl *cbctl =
+		(struct _qeth_l2_vnicc_request_cbctl *) reply->param;
+	struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data;
+	struct qeth_ipacmd_vnicc *rep = &cmd->data.vnicc;
+
+	QETH_CARD_TEXT(card, 2, "vniccrcb");
+	if (cmd->hdr.return_code)
+		return 0;
+	/* return results to caller */
+	card->options.vnicc.sup_chars = rep->hdr.sup;
+	card->options.vnicc.cur_chars = rep->hdr.cur;
+
+	if (cbctl->sub_cmd == IPA_VNICC_QUERY_CMDS)
+		*cbctl->result.sup_cmds = rep->query_cmds.sup_cmds;
+
+	if (cbctl->sub_cmd == IPA_VNICC_GET_TIMEOUT)
+		*cbctl->result.timeout = rep->getset_timeout.timeout;
+
+	return 0;
+}
+
+/* generic VNICC request */
+static int qeth_l2_vnicc_request(struct qeth_card *card,
+				 struct _qeth_l2_vnicc_request_cbctl *cbctl)
+{
+	struct qeth_ipacmd_vnicc *req;
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+	int rc;
+
+	QETH_CARD_TEXT(card, 2, "vniccreq");
+
+	/* get new buffer for request */
+	iob = qeth_get_ipacmd_buffer(card, IPA_CMD_VNICC, 0);
+	if (!iob)
+		return -ENOMEM;
+
+	/* create header for request */
+	cmd = (struct qeth_ipa_cmd *)(iob->data + IPA_PDU_HEADER_SIZE);
+	req = &cmd->data.vnicc;
+
+	/* create sub command header for request */
+	req->sub_hdr.data_length = sizeof(req->sub_hdr);
+	req->sub_hdr.sub_command = cbctl->sub_cmd;
+
+	/* create sub command specific request fields */
+	switch (cbctl->sub_cmd) {
+	case IPA_VNICC_QUERY_CHARS:
+		break;
+	case IPA_VNICC_QUERY_CMDS:
+		req->sub_hdr.data_length += sizeof(req->query_cmds);
+		req->query_cmds.vnic_char = cbctl->param.vnic_char;
+		break;
+	case IPA_VNICC_ENABLE:
+	case IPA_VNICC_DISABLE:
+		req->sub_hdr.data_length += sizeof(req->set_char);
+		req->set_char.vnic_char = cbctl->param.vnic_char;
+		break;
+	case IPA_VNICC_SET_TIMEOUT:
+		req->getset_timeout.timeout = cbctl->param.timeout;
+		/* fallthrough */
+	case IPA_VNICC_GET_TIMEOUT:
+		req->sub_hdr.data_length += sizeof(req->getset_timeout);
+		req->getset_timeout.vnic_char = cbctl->param.vnic_char;
+		break;
+	default:
+		qeth_release_buffer(iob->channel, iob);
+		return -EOPNOTSUPP;
+	}
+
+	/* send request */
+	rc = qeth_send_ipa_cmd(card, iob, qeth_l2_vnicc_request_cb,
+			       (void *) cbctl);
+
+	return qeth_l2_vnicc_makerc(card, rc);
+}
+
+/* VNICC query VNIC characteristics request */
+static int qeth_l2_vnicc_query_chars(struct qeth_card *card)
+{
+	struct _qeth_l2_vnicc_request_cbctl cbctl;
+
+	/* prepare callback control */
+	cbctl.sub_cmd = IPA_VNICC_QUERY_CHARS;
+
+	QETH_CARD_TEXT(card, 2, "vniccqch");
+	return qeth_l2_vnicc_request(card, &cbctl);
+}
+
+/* VNICC query sub commands request */
+static int qeth_l2_vnicc_query_cmds(struct qeth_card *card, u32 vnic_char,
+				    u32 *sup_cmds)
+{
+	struct _qeth_l2_vnicc_request_cbctl cbctl;
+
+	/* prepare callback control */
+	cbctl.sub_cmd = IPA_VNICC_QUERY_CMDS;
+	cbctl.param.vnic_char = vnic_char;
+	cbctl.result.sup_cmds = sup_cmds;
+
+	QETH_CARD_TEXT(card, 2, "vniccqcm");
+	return qeth_l2_vnicc_request(card, &cbctl);
+}
+
+/* VNICC enable/disable characteristic request */
+static int qeth_l2_vnicc_set_char(struct qeth_card *card, u32 vnic_char,
+				      u32 cmd)
+{
+	struct _qeth_l2_vnicc_request_cbctl cbctl;
+
+	/* prepare callback control */
+	cbctl.sub_cmd = cmd;
+	cbctl.param.vnic_char = vnic_char;
+
+	QETH_CARD_TEXT(card, 2, "vniccedc");
+	return qeth_l2_vnicc_request(card, &cbctl);
+}
+
+/* VNICC get/set timeout for characteristic request */
+static int qeth_l2_vnicc_getset_timeout(struct qeth_card *card, u32 vnicc,
+					u32 cmd, u32 *timeout)
+{
+	struct _qeth_l2_vnicc_request_cbctl cbctl;
+
+	/* prepare callback control */
+	cbctl.sub_cmd = cmd;
+	cbctl.param.vnic_char = vnicc;
+	if (cmd == IPA_VNICC_SET_TIMEOUT)
+		cbctl.param.timeout = *timeout;
+	if (cmd == IPA_VNICC_GET_TIMEOUT)
+		cbctl.result.timeout = timeout;
+
+	QETH_CARD_TEXT(card, 2, "vniccgst");
+	return qeth_l2_vnicc_request(card, &cbctl);
+}
+
+/* set current VNICC flag state; called from sysfs store function */
+int qeth_l2_vnicc_set_state(struct qeth_card *card, u32 vnicc, bool state)
+{
+	int rc = 0;
+	u32 cmd;
+
+	QETH_CARD_TEXT(card, 2, "vniccsch");
+
+	/* do not change anything if BridgePort is enabled */
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
+	/* check if characteristic and enable/disable are supported */
+	if (!(card->options.vnicc.sup_chars & vnicc) ||
+	    !(card->options.vnicc.set_char_sup & vnicc))
+		return -EOPNOTSUPP;
+
+	/* set enable/disable command and store wanted characteristic */
+	if (state) {
+		cmd = IPA_VNICC_ENABLE;
+		card->options.vnicc.wanted_chars |= vnicc;
+	} else {
+		cmd = IPA_VNICC_DISABLE;
+		card->options.vnicc.wanted_chars &= ~vnicc;
+	}
+
+	/* do we need to do anything? */
+	if (card->options.vnicc.cur_chars == card->options.vnicc.wanted_chars)
+		return rc;
+
+	/* if card is not ready, simply stop here */
+	if (!qeth_card_hw_is_reachable(card)) {
+		if (state)
+			card->options.vnicc.cur_chars |= vnicc;
+		else
+			card->options.vnicc.cur_chars &= ~vnicc;
+		return rc;
+	}
+
+	rc = qeth_l2_vnicc_set_char(card, vnicc, cmd);
+	if (rc)
+		card->options.vnicc.wanted_chars =
+			card->options.vnicc.cur_chars;
+	else {
+		/* successful online VNICC change; handle special cases */
+		if (state && vnicc == QETH_VNICC_RX_BCAST)
+			card->options.vnicc.rx_bcast_enabled = true;
+		if (!state && vnicc == QETH_VNICC_LEARNING)
+			qeth_l2_vnicc_recover_timeout(card, vnicc,
+					&card->options.vnicc.learning_timeout);
+	}
+
+	return rc;
+}
+
+/* get current VNICC flag state; called from sysfs show function */
+int qeth_l2_vnicc_get_state(struct qeth_card *card, u32 vnicc, bool *state)
+{
+	int rc = 0;
+
+	QETH_CARD_TEXT(card, 2, "vniccgch");
+
+	/* do not get anything if BridgePort is enabled */
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
+	/* check if characteristic is supported */
+	if (!(card->options.vnicc.sup_chars & vnicc))
+		return -EOPNOTSUPP;
+
+	/* if card is ready, query current VNICC state */
+	if (qeth_card_hw_is_reachable(card))
+		rc = qeth_l2_vnicc_query_chars(card);
+
+	*state = (card->options.vnicc.cur_chars & vnicc) ? true : false;
+	return rc;
+}
+
+/* set VNICC timeout; called from sysfs store function. Currently, only learning
+ * supports timeout
+ */
+int qeth_l2_vnicc_set_timeout(struct qeth_card *card, u32 timeout)
+{
+	int rc = 0;
+
+	QETH_CARD_TEXT(card, 2, "vniccsto");
+
+	/* do not change anything if BridgePort is enabled */
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
+	/* check if characteristic and set_timeout are supported */
+	if (!(card->options.vnicc.sup_chars & QETH_VNICC_LEARNING) ||
+	    !(card->options.vnicc.getset_timeout_sup & QETH_VNICC_LEARNING))
+		return -EOPNOTSUPP;
+
+	/* do we need to do anything? */
+	if (card->options.vnicc.learning_timeout == timeout)
+		return rc;
+
+	/* if card is not ready, simply store the value internally and return */
+	if (!qeth_card_hw_is_reachable(card)) {
+		card->options.vnicc.learning_timeout = timeout;
+		return rc;
+	}
+
+	/* send timeout value to card; if successful, store value internally */
+	rc = qeth_l2_vnicc_getset_timeout(card, QETH_VNICC_LEARNING,
+					  IPA_VNICC_SET_TIMEOUT, &timeout);
+	if (!rc)
+		card->options.vnicc.learning_timeout = timeout;
+
+	return rc;
+}
+
+/* get current VNICC timeout; called from sysfs show function. Currently, only
+ * learning supports timeout
+ */
+int qeth_l2_vnicc_get_timeout(struct qeth_card *card, u32 *timeout)
+{
+	int rc = 0;
+
+	QETH_CARD_TEXT(card, 2, "vniccgto");
+
+	/* do not get anything if BridgePort is enabled */
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
+	/* check if characteristic and get_timeout are supported */
+	if (!(card->options.vnicc.sup_chars & QETH_VNICC_LEARNING) ||
+	    !(card->options.vnicc.getset_timeout_sup & QETH_VNICC_LEARNING))
+		return -EOPNOTSUPP;
+	/* if card is ready, get timeout. Otherwise, just return stored value */
+	*timeout = card->options.vnicc.learning_timeout;
+	if (qeth_card_hw_is_reachable(card))
+		rc = qeth_l2_vnicc_getset_timeout(card, QETH_VNICC_LEARNING,
+						  IPA_VNICC_GET_TIMEOUT,
+						  timeout);
+
+	return rc;
+}
+
+/* check if VNICC is currently enabled */
+bool qeth_l2_vnicc_is_in_use(struct qeth_card *card)
+{
+	/* if everything is turned off, VNICC is not active */
+	if (!card->options.vnicc.cur_chars)
+		return false;
+	/* default values are only OK if rx_bcast was not enabled by user
+	 * or the card is offline.
+	 */
+	if (card->options.vnicc.cur_chars == QETH_VNICC_DEFAULT) {
+		if (!card->options.vnicc.rx_bcast_enabled ||
+		    !qeth_card_hw_is_reachable(card))
+			return false;
+	}
+	return true;
+}
+
+/* recover user timeout setting */
+static bool qeth_l2_vnicc_recover_timeout(struct qeth_card *card, u32 vnicc,
+					  u32 *timeout)
+{
+	if (card->options.vnicc.sup_chars & vnicc &&
+	    card->options.vnicc.getset_timeout_sup & vnicc &&
+	    !qeth_l2_vnicc_getset_timeout(card, vnicc, IPA_VNICC_SET_TIMEOUT,
+					  timeout))
+		return false;
+	*timeout = QETH_VNICC_DEFAULT_TIMEOUT;
+	return true;
+}
+
+/* recover user characteristic setting */
+static bool qeth_l2_vnicc_recover_char(struct qeth_card *card, u32 vnicc,
+				       bool enable)
+{
+	u32 cmd = enable ? IPA_VNICC_ENABLE : IPA_VNICC_DISABLE;
+
+	if (card->options.vnicc.sup_chars & vnicc &&
+	    card->options.vnicc.set_char_sup & vnicc &&
+	    !qeth_l2_vnicc_set_char(card, vnicc, cmd))
+		return false;
+	card->options.vnicc.wanted_chars &= ~vnicc;
+	card->options.vnicc.wanted_chars |= QETH_VNICC_DEFAULT & vnicc;
+	return true;
+}
+
+/* (re-)initialize VNICC */
+static void qeth_l2_vnicc_init(struct qeth_card *card)
+{
+	u32 *timeout = &card->options.vnicc.learning_timeout;
+	unsigned int chars_len, i;
+	unsigned long chars_tmp;
+	u32 sup_cmds, vnicc;
+	bool enable, error;
+
+	QETH_CARD_TEXT(card, 2, "vniccini");
+	/* reset rx_bcast */
+	card->options.vnicc.rx_bcast_enabled = 0;
+	/* initial query and storage of VNIC characteristics */
+	if (qeth_l2_vnicc_query_chars(card)) {
+		if (card->options.vnicc.wanted_chars != QETH_VNICC_DEFAULT ||
+		    *timeout != QETH_VNICC_DEFAULT_TIMEOUT)
+			dev_err(&card->gdev->dev, "Configuring the VNIC characteristics failed\n");
+		/* fail quietly if user didn't change the default config */
+		card->options.vnicc.sup_chars = 0;
+		card->options.vnicc.cur_chars = 0;
+		card->options.vnicc.wanted_chars = QETH_VNICC_DEFAULT;
+		return;
+	}
+	/* get supported commands for each supported characteristic */
+	chars_tmp = card->options.vnicc.sup_chars;
+	chars_len = sizeof(card->options.vnicc.sup_chars) * BITS_PER_BYTE;
+	for_each_set_bit(i, &chars_tmp, chars_len) {
+		vnicc = BIT(i);
+		qeth_l2_vnicc_query_cmds(card, vnicc, &sup_cmds);
+		if (!(sup_cmds & IPA_VNICC_SET_TIMEOUT) ||
+		    !(sup_cmds & IPA_VNICC_GET_TIMEOUT))
+			card->options.vnicc.getset_timeout_sup &= ~vnicc;
+		if (!(sup_cmds & IPA_VNICC_ENABLE) ||
+		    !(sup_cmds & IPA_VNICC_DISABLE))
+			card->options.vnicc.set_char_sup &= ~vnicc;
+	}
+	/* enforce assumed default values and recover settings, if changed  */
+	error = qeth_l2_vnicc_recover_timeout(card, QETH_VNICC_LEARNING,
+					      timeout);
+	chars_tmp = card->options.vnicc.wanted_chars ^ QETH_VNICC_DEFAULT;
+	chars_tmp |= QETH_VNICC_BRIDGE_INVISIBLE;
+	chars_len = sizeof(card->options.vnicc.wanted_chars) * BITS_PER_BYTE;
+	for_each_set_bit(i, &chars_tmp, chars_len) {
+		vnicc = BIT(i);
+		enable = card->options.vnicc.wanted_chars & vnicc;
+		error |= qeth_l2_vnicc_recover_char(card, vnicc, enable);
+	}
+	if (error)
+		dev_err(&card->gdev->dev, "Configuring the VNIC characteristics failed\n");
+}
+
+/* configure default values of VNIC characteristics */
+static void qeth_l2_vnicc_set_defaults(struct qeth_card *card)
+{
+	/* characteristics values */
+	card->options.vnicc.sup_chars = QETH_VNICC_ALL;
+	card->options.vnicc.cur_chars = QETH_VNICC_DEFAULT;
+	card->options.vnicc.learning_timeout = QETH_VNICC_DEFAULT_TIMEOUT;
+	/* supported commands */
+	card->options.vnicc.set_char_sup = QETH_VNICC_ALL;
+	card->options.vnicc.getset_timeout_sup = QETH_VNICC_LEARNING;
+	/* settings wanted by users */
+	card->options.vnicc.wanted_chars = QETH_VNICC_DEFAULT;
+}
+
 module_init(qeth_l2_init);
 module_exit(qeth_l2_exit);
 MODULE_AUTHOR("Frank Blaschka <frank.blaschka@de.ibm.com>");
diff --git a/drivers/s390/net/qeth_l2_sys.c b/drivers/s390/net/qeth_l2_sys.c
index 9696baa49e2d..4608daedb204 100644
--- a/drivers/s390/net/qeth_l2_sys.c
+++ b/drivers/s390/net/qeth_l2_sys.c
@@ -20,6 +20,9 @@ static ssize_t qeth_bridge_port_role_state_show(struct device *dev,
 	if (!card)
 		return -EINVAL;
 
+	if (qeth_l2_vnicc_is_in_use(card))
+		return sprintf(buf, "n/a (VNIC characteristics)\n");
+
 	if (qeth_card_hw_is_reachable(card) &&
 					card->options.sbp.supported_funcs)
 		rc = qeth_bridgeport_query_ports(card,
@@ -60,6 +63,11 @@ static ssize_t qeth_bridge_port_role_state_show(struct device *dev,
 static ssize_t qeth_bridge_port_role_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
+	struct qeth_card *card = dev_get_drvdata(dev);
+
+	if (qeth_l2_vnicc_is_in_use(card))
+		return sprintf(buf, "n/a (VNIC characteristics)\n");
+
 	return qeth_bridge_port_role_state_show(dev, attr, buf, 0);
 }
 
@@ -83,7 +91,10 @@ static ssize_t qeth_bridge_port_role_store(struct device *dev,
 
 	mutex_lock(&card->conf_mutex);
 
-	if (card->options.sbp.reflect_promisc) /* Forbid direct manipulation */
+	if (qeth_l2_vnicc_is_in_use(card))
+		rc = -EBUSY;
+	else if (card->options.sbp.reflect_promisc)
+		/* Forbid direct manipulation */
 		rc = -EPERM;
 	else if (qeth_card_hw_is_reachable(card)) {
 		rc = qeth_bridgeport_setrole(card, role);
@@ -103,6 +114,11 @@ static DEVICE_ATTR(bridge_role, 0644, qeth_bridge_port_role_show,
 static ssize_t qeth_bridge_port_state_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
+	struct qeth_card *card = dev_get_drvdata(dev);
+
+	if (qeth_l2_vnicc_is_in_use(card))
+		return sprintf(buf, "n/a (VNIC characteristics)\n");
+
 	return qeth_bridge_port_role_state_show(dev, attr, buf, 1);
 }
 
@@ -118,6 +134,9 @@ static ssize_t qeth_bridgeport_hostnotification_show(struct device *dev,
 	if (!card)
 		return -EINVAL;
 
+	if (qeth_l2_vnicc_is_in_use(card))
+		return sprintf(buf, "n/a (VNIC characteristics)\n");
+
 	enabled = card->options.sbp.hostnotification;
 
 	return sprintf(buf, "%d\n", enabled);
@@ -142,7 +161,9 @@ static ssize_t qeth_bridgeport_hostnotification_store(struct device *dev,
 
 	mutex_lock(&card->conf_mutex);
 
-	if (qeth_card_hw_is_reachable(card)) {
+	if (qeth_l2_vnicc_is_in_use(card))
+		rc = -EBUSY;
+	else if (qeth_card_hw_is_reachable(card)) {
 		rc = qeth_bridgeport_an_set(card, enable);
 		if (!rc)
 			card->options.sbp.hostnotification = enable;
@@ -167,6 +188,9 @@ static ssize_t qeth_bridgeport_reflect_show(struct device *dev,
 	if (!card)
 		return -EINVAL;
 
+	if (qeth_l2_vnicc_is_in_use(card))
+		return sprintf(buf, "n/a (VNIC characteristics)\n");
+
 	if (card->options.sbp.reflect_promisc) {
 		if (card->options.sbp.reflect_promisc_primary)
 			state = "primary";
@@ -202,7 +226,9 @@ static ssize_t qeth_bridgeport_reflect_store(struct device *dev,
 
 	mutex_lock(&card->conf_mutex);
 
-	if (card->options.sbp.role != QETH_SBP_ROLE_NONE)
+	if (qeth_l2_vnicc_is_in_use(card))
+		rc = -EBUSY;
+	else if (card->options.sbp.role != QETH_SBP_ROLE_NONE)
 		rc = -EPERM;
 	else {
 		card->options.sbp.reflect_promisc = enable;
@@ -231,16 +257,6 @@ static struct attribute_group qeth_l2_bridgeport_attr_group = {
 	.attrs = qeth_l2_bridgeport_attrs,
 };
 
-int qeth_l2_create_device_attributes(struct device *dev)
-{
-	return sysfs_create_group(&dev->kobj, &qeth_l2_bridgeport_attr_group);
-}
-
-void qeth_l2_remove_device_attributes(struct device *dev)
-{
-	sysfs_remove_group(&dev->kobj, &qeth_l2_bridgeport_attr_group);
-}
-
 /**
  * qeth_l2_setup_bridgeport_attrs() - set/restore attrs when turning online.
  * @card:			      qeth_card structure pointer
@@ -270,10 +286,168 @@ void qeth_l2_setup_bridgeport_attrs(struct qeth_card *card)
 		qeth_bridgeport_an_set(card, 0);
 }
 
+/* VNIC CHARS support */
+
+/* convert sysfs attr name to VNIC characteristic */
+static u32 qeth_l2_vnicc_sysfs_attr_to_char(const char *attr_name)
+{
+	if (sysfs_streq(attr_name, "flooding"))
+		return QETH_VNICC_FLOODING;
+	else if (sysfs_streq(attr_name, "mcast_flooding"))
+		return QETH_VNICC_MCAST_FLOODING;
+	else if (sysfs_streq(attr_name, "learning"))
+		return QETH_VNICC_LEARNING;
+	else if (sysfs_streq(attr_name, "takeover_setvmac"))
+		return QETH_VNICC_TAKEOVER_SETVMAC;
+	else if (sysfs_streq(attr_name, "takeover_learning"))
+		return QETH_VNICC_TAKEOVER_LEARNING;
+	else if (sysfs_streq(attr_name, "bridge_invisible"))
+		return QETH_VNICC_BRIDGE_INVISIBLE;
+	else if (sysfs_streq(attr_name, "rx_bcast"))
+		return QETH_VNICC_RX_BCAST;
+
+	return 0;
+}
+
+/* get current timeout setting */
+static ssize_t qeth_vnicc_timeout_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct qeth_card *card = dev_get_drvdata(dev);
+	u32 timeout;
+	int rc;
+
+	if (!card)
+		return -EINVAL;
+
+	rc = qeth_l2_vnicc_get_timeout(card, &timeout);
+	if (rc == -EBUSY)
+		return sprintf(buf, "n/a (BridgePort)\n");
+	if (rc == -EOPNOTSUPP)
+		return sprintf(buf, "n/a\n");
+	return rc ? rc : sprintf(buf, "%d\n", timeout);
+}
+
+/* change timeout setting */
+static ssize_t qeth_vnicc_timeout_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct qeth_card *card = dev_get_drvdata(dev);
+	u32 timeout;
+	int rc;
+
+	if (!card)
+		return -EINVAL;
+
+	rc = kstrtou32(buf, 10, &timeout);
+	if (rc)
+		return rc;
+
+	mutex_lock(&card->conf_mutex);
+	rc = qeth_l2_vnicc_set_timeout(card, timeout);
+	mutex_unlock(&card->conf_mutex);
+	return rc ? rc : count;
+}
+
+/* get current setting of characteristic */
+static ssize_t qeth_vnicc_char_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct qeth_card *card = dev_get_drvdata(dev);
+	bool state;
+	u32 vnicc;
+	int rc;
+
+	if (!card)
+		return -EINVAL;
+
+	vnicc = qeth_l2_vnicc_sysfs_attr_to_char(attr->attr.name);
+	rc = qeth_l2_vnicc_get_state(card, vnicc, &state);
+
+	if (rc == -EBUSY)
+		return sprintf(buf, "n/a (BridgePort)\n");
+	if (rc == -EOPNOTSUPP)
+		return sprintf(buf, "n/a\n");
+	return rc ? rc : sprintf(buf, "%d\n", state);
+}
+
+/* change setting of characteristic */
+static ssize_t qeth_vnicc_char_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct qeth_card *card = dev_get_drvdata(dev);
+	bool state;
+	u32 vnicc;
+	int rc;
+
+	if (!card)
+		return -EINVAL;
+
+	if (kstrtobool(buf, &state))
+		return -EINVAL;
+
+	vnicc = qeth_l2_vnicc_sysfs_attr_to_char(attr->attr.name);
+	mutex_lock(&card->conf_mutex);
+	rc = qeth_l2_vnicc_set_state(card, vnicc, state);
+	mutex_unlock(&card->conf_mutex);
+
+	return rc ? rc : count;
+}
+
+static DEVICE_ATTR(flooding, 0644, qeth_vnicc_char_show, qeth_vnicc_char_store);
+static DEVICE_ATTR(mcast_flooding, 0644, qeth_vnicc_char_show,
+		   qeth_vnicc_char_store);
+static DEVICE_ATTR(learning, 0644, qeth_vnicc_char_show, qeth_vnicc_char_store);
+static DEVICE_ATTR(learning_timeout, 0644, qeth_vnicc_timeout_show,
+		   qeth_vnicc_timeout_store);
+static DEVICE_ATTR(takeover_setvmac, 0644, qeth_vnicc_char_show,
+		   qeth_vnicc_char_store);
+static DEVICE_ATTR(takeover_learning, 0644, qeth_vnicc_char_show,
+		   qeth_vnicc_char_store);
+static DEVICE_ATTR(bridge_invisible, 0644, qeth_vnicc_char_show,
+		   qeth_vnicc_char_store);
+static DEVICE_ATTR(rx_bcast, 0644, qeth_vnicc_char_show, qeth_vnicc_char_store);
+
+static struct attribute *qeth_l2_vnicc_attrs[] = {
+	&dev_attr_flooding.attr,
+	&dev_attr_mcast_flooding.attr,
+	&dev_attr_learning.attr,
+	&dev_attr_learning_timeout.attr,
+	&dev_attr_takeover_setvmac.attr,
+	&dev_attr_takeover_learning.attr,
+	&dev_attr_bridge_invisible.attr,
+	&dev_attr_rx_bcast.attr,
+	NULL,
+};
+
+static struct attribute_group qeth_l2_vnicc_attr_group = {
+	.attrs = qeth_l2_vnicc_attrs,
+	.name = "vnicc",
+};
+
+static const struct attribute_group *qeth_l2_only_attr_groups[] = {
+	&qeth_l2_bridgeport_attr_group,
+	&qeth_l2_vnicc_attr_group,
+	NULL,
+};
+
+int qeth_l2_create_device_attributes(struct device *dev)
+{
+	return sysfs_create_groups(&dev->kobj, qeth_l2_only_attr_groups);
+}
+
+void qeth_l2_remove_device_attributes(struct device *dev)
+{
+	sysfs_remove_groups(&dev->kobj, qeth_l2_only_attr_groups);
+}
+
 const struct attribute_group *qeth_l2_attr_groups[] = {
 	&qeth_device_attr_group,
 	&qeth_device_blkt_group,
-	/* l2 specific, see l2_{create,remove}_device_attributes(): */
+	/* l2 specific, see qeth_l2_only_attr_groups: */
 	&qeth_l2_bridgeport_attr_group,
+	&qeth_l2_vnicc_attr_group,
 	NULL,
 };
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index ab661a431f7c..270ac9515f6b 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1646,13 +1646,12 @@ static int qeth_l3_vlan_rx_kill_vid(struct net_device *dev,
 	return 0;
 }
 
-static int qeth_l3_rebuild_skb(struct qeth_card *card, struct sk_buff *skb,
-			       struct qeth_hdr *hdr, unsigned short *vlan_id)
+static void qeth_l3_rebuild_skb(struct qeth_card *card, struct sk_buff *skb,
+				struct qeth_hdr *hdr)
 {
 	__u16 prot;
 	struct iphdr *ip_hdr;
 	unsigned char tg_addr[MAX_ADDR_LEN];
-	int is_vlan = 0;
 
 	if (!(hdr->hdr.l3.flags & QETH_HDR_PASSTHRU)) {
 		prot = (hdr->hdr.l3.flags & QETH_HDR_IPV6) ? ETH_P_IPV6 :
@@ -1706,11 +1705,14 @@ static int qeth_l3_rebuild_skb(struct qeth_card *card, struct sk_buff *skb,
 
 	skb->protocol = eth_type_trans(skb, card->dev);
 
-	if (hdr->hdr.l3.ext_flags &
-	    (QETH_HDR_EXT_VLAN_FRAME | QETH_HDR_EXT_INCLUDE_VLAN_TAG)) {
-		*vlan_id = (hdr->hdr.l3.ext_flags & QETH_HDR_EXT_VLAN_FRAME) ?
-		 hdr->hdr.l3.vlan_id : *((u16 *)&hdr->hdr.l3.dest_addr[12]);
-		is_vlan = 1;
+	/* copy VLAN tag from hdr into skb */
+	if (!card->options.sniffer &&
+	    (hdr->hdr.l3.ext_flags & (QETH_HDR_EXT_VLAN_FRAME |
+				      QETH_HDR_EXT_INCLUDE_VLAN_TAG))) {
+		u16 tag = (hdr->hdr.l3.ext_flags & QETH_HDR_EXT_VLAN_FRAME) ?
+				hdr->hdr.l3.vlan_id :
+				*((u16 *)&hdr->hdr.l3.dest_addr[12]);
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), tag);
 	}
 
 	if (card->dev->features & NETIF_F_RXCSUM) {
@@ -1724,7 +1726,6 @@ static int qeth_l3_rebuild_skb(struct qeth_card *card, struct sk_buff *skb,
 			skb->ip_summed = CHECKSUM_NONE;
 	} else
 		skb->ip_summed = CHECKSUM_NONE;
-	return is_vlan;
 }
 
 static int qeth_l3_process_inbound_buffer(struct qeth_card *card,
@@ -1733,8 +1734,6 @@ static int qeth_l3_process_inbound_buffer(struct qeth_card *card,
 	int work_done = 0;
 	struct sk_buff *skb;
 	struct qeth_hdr *hdr;
-	__u16 vlan_tag = 0;
-	int is_vlan;
 	unsigned int len;
 	__u16 magic;
 
@@ -1764,12 +1763,8 @@ static int qeth_l3_process_inbound_buffer(struct qeth_card *card,
 					card->dev->addr_len);
 				netif_receive_skb(skb);
 			} else {
-				is_vlan = qeth_l3_rebuild_skb(card, skb, hdr,
-						      &vlan_tag);
+				qeth_l3_rebuild_skb(card, skb, hdr);
 				len = skb->len;
-				if (is_vlan && !card->options.sniffer)
-					__vlan_hwaccel_put_tag(skb,
-						htons(ETH_P_8021Q), vlan_tag);
 				napi_gro_receive(&card->napi, skb);
 			}
 			break;
@@ -2771,8 +2766,8 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 		rc = qeth_do_send_packet(card, queue, new_skb, hdr, hd_len,
 					 hd_len, elements);
 	} else
-		rc = qeth_do_send_packet_fast(card, queue, new_skb, hdr,
-					      data_offset, hd_len);
+		rc = qeth_do_send_packet_fast(queue, new_skb, hdr, data_offset,
+					      hd_len);
 
 	if (!rc) {
 		card->stats.tx_packets++;
diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
index e8bcc314cc5f..0ad0f7f4b23f 100644
--- a/drivers/s390/net/qeth_l3_sys.c
+++ b/drivers/s390/net/qeth_l3_sys.c
@@ -1028,52 +1028,31 @@ static const struct attribute_group qeth_device_rxip_group = {
 	.attrs = qeth_rxip_device_attrs,
 };
 
+static const struct attribute_group *qeth_l3_only_attr_groups[] = {
+	&qeth_l3_device_attr_group,
+	&qeth_device_ipato_group,
+	&qeth_device_vipa_group,
+	&qeth_device_rxip_group,
+	NULL,
+};
+
 int qeth_l3_create_device_attributes(struct device *dev)
 {
-	int ret;
-
-	ret = sysfs_create_group(&dev->kobj, &qeth_l3_device_attr_group);
-	if (ret)
-		return ret;
-
-	ret = sysfs_create_group(&dev->kobj, &qeth_device_ipato_group);
-	if (ret) {
-		sysfs_remove_group(&dev->kobj, &qeth_l3_device_attr_group);
-		return ret;
-	}
-
-	ret = sysfs_create_group(&dev->kobj, &qeth_device_vipa_group);
-	if (ret) {
-		sysfs_remove_group(&dev->kobj, &qeth_l3_device_attr_group);
-		sysfs_remove_group(&dev->kobj, &qeth_device_ipato_group);
-		return ret;
-	}
-
-	ret = sysfs_create_group(&dev->kobj, &qeth_device_rxip_group);
-	if (ret) {
-		sysfs_remove_group(&dev->kobj, &qeth_l3_device_attr_group);
-		sysfs_remove_group(&dev->kobj, &qeth_device_ipato_group);
-		sysfs_remove_group(&dev->kobj, &qeth_device_vipa_group);
-		return ret;
-	}
-	return 0;
+	return sysfs_create_groups(&dev->kobj, qeth_l3_only_attr_groups);
 }
 
 void qeth_l3_remove_device_attributes(struct device *dev)
 {
-	sysfs_remove_group(&dev->kobj, &qeth_l3_device_attr_group);
-	sysfs_remove_group(&dev->kobj, &qeth_device_ipato_group);
-	sysfs_remove_group(&dev->kobj, &qeth_device_vipa_group);
-	sysfs_remove_group(&dev->kobj, &qeth_device_rxip_group);
+	sysfs_remove_groups(&dev->kobj, qeth_l3_only_attr_groups);
 }
 
 const struct attribute_group *qeth_l3_attr_groups[] = {
 	&qeth_device_attr_group,
 	&qeth_device_blkt_group,
-	/* l3 specific, see l3_{create,remove}_device_attributes(): */
+	/* l3 specific, see qeth_l3_only_attr_groups: */
 	&qeth_l3_device_attr_group,
 	&qeth_device_ipato_group,
 	&qeth_device_vipa_group,
 	&qeth_device_rxip_group,
-NULL,
+	NULL,
 };
diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile
index 4900febc6c8a..f2f0de27252b 100644
--- a/drivers/thunderbolt/Makefile
+++ b/drivers/thunderbolt/Makefile
@@ -1,3 +1,3 @@
 obj-${CONFIG_THUNDERBOLT} := thunderbolt.o
 thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o
-thunderbolt-objs += domain.o dma_port.o icm.o
+thunderbolt-objs += domain.o dma_port.o icm.o property.o xdomain.o
diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c
index fb40dd0588b9..d079dbba2c03 100644
--- a/drivers/thunderbolt/ctl.c
+++ b/drivers/thunderbolt/ctl.c
@@ -289,20 +289,6 @@ static void tb_cfg_print_error(struct tb_ctl *ctl,
 	}
 }
 
-static void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len)
-{
-	int i;
-	for (i = 0; i < len; i++)
-		dst[i] = cpu_to_be32(src[i]);
-}
-
-static void be32_to_cpu_array(u32 *dst, __be32 *src, size_t len)
-{
-	int i;
-	for (i = 0; i < len; i++)
-		dst[i] = be32_to_cpu(src[i]);
-}
-
 static __be32 tb_crc(const void *data, size_t len)
 {
 	return cpu_to_be32(~__crc32c_le(~0, data, len));
@@ -373,7 +359,7 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len,
 	cpu_to_be32_array(pkg->buffer, data, len / 4);
 	*(__be32 *) (pkg->buffer + len) = tb_crc(pkg->buffer, len);
 
-	res = ring_tx(ctl->tx, &pkg->frame);
+	res = tb_ring_tx(ctl->tx, &pkg->frame);
 	if (res) /* ring is stopped */
 		tb_ctl_pkg_free(pkg);
 	return res;
@@ -382,15 +368,15 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len,
 /**
  * tb_ctl_handle_event() - acknowledge a plug event, invoke ctl->callback
  */
-static void tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type,
+static bool tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type,
 				struct ctl_pkg *pkg, size_t size)
 {
-	ctl->callback(ctl->callback_data, type, pkg->buffer, size);
+	return ctl->callback(ctl->callback_data, type, pkg->buffer, size);
 }
 
 static void tb_ctl_rx_submit(struct ctl_pkg *pkg)
 {
-	ring_rx(pkg->ctl->rx, &pkg->frame); /*
+	tb_ring_rx(pkg->ctl->rx, &pkg->frame); /*
 					     * We ignore failures during stop.
 					     * All rx packets are referenced
 					     * from ctl->rx_packets, so we do
@@ -458,6 +444,8 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame,
 		break;
 
 	case TB_CFG_PKG_EVENT:
+	case TB_CFG_PKG_XDOMAIN_RESP:
+	case TB_CFG_PKG_XDOMAIN_REQ:
 		if (*(__be32 *)(pkg->buffer + frame->size) != crc32) {
 			tb_ctl_err(pkg->ctl,
 				   "RX: checksum mismatch, dropping packet\n");
@@ -465,8 +453,9 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame,
 		}
 		/* Fall through */
 	case TB_CFG_PKG_ICM_EVENT:
-		tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size);
-		goto rx;
+		if (tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size))
+			goto rx;
+		break;
 
 	default:
 		break;
@@ -625,11 +614,12 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data)
 	if (!ctl->frame_pool)
 		goto err;
 
-	ctl->tx = ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND);
+	ctl->tx = tb_ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND);
 	if (!ctl->tx)
 		goto err;
 
-	ctl->rx = ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND);
+	ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff,
+				0xffff, NULL, NULL);
 	if (!ctl->rx)
 		goto err;
 
@@ -662,9 +652,9 @@ void tb_ctl_free(struct tb_ctl *ctl)
 		return;
 
 	if (ctl->rx)
-		ring_free(ctl->rx);
+		tb_ring_free(ctl->rx);
 	if (ctl->tx)
-		ring_free(ctl->tx);
+		tb_ring_free(ctl->tx);
 
 	/* free RX packets */
 	for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++)
@@ -683,8 +673,8 @@ void tb_ctl_start(struct tb_ctl *ctl)
 {
 	int i;
 	tb_ctl_info(ctl, "control channel starting...\n");
-	ring_start(ctl->tx); /* is used to ack hotplug packets, start first */
-	ring_start(ctl->rx);
+	tb_ring_start(ctl->tx); /* is used to ack hotplug packets, start first */
+	tb_ring_start(ctl->rx);
 	for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++)
 		tb_ctl_rx_submit(ctl->rx_packets[i]);
 
@@ -705,8 +695,8 @@ void tb_ctl_stop(struct tb_ctl *ctl)
 	ctl->running = false;
 	mutex_unlock(&ctl->request_queue_lock);
 
-	ring_stop(ctl->rx);
-	ring_stop(ctl->tx);
+	tb_ring_stop(ctl->rx);
+	tb_ring_stop(ctl->tx);
 
 	if (!list_empty(&ctl->request_queue))
 		tb_ctl_WARN(ctl, "dangling request in request_queue\n");
diff --git a/drivers/thunderbolt/ctl.h b/drivers/thunderbolt/ctl.h
index 36fd28b1c1c5..85c49dd301ea 100644
--- a/drivers/thunderbolt/ctl.h
+++ b/drivers/thunderbolt/ctl.h
@@ -8,6 +8,7 @@
 #define _TB_CFG
 
 #include <linux/kref.h>
+#include <linux/thunderbolt.h>
 
 #include "nhi.h"
 #include "tb_msgs.h"
@@ -15,7 +16,7 @@
 /* control channel */
 struct tb_ctl;
 
-typedef void (*event_cb)(void *data, enum tb_cfg_pkg_type type,
+typedef bool (*event_cb)(void *data, enum tb_cfg_pkg_type type,
 			 const void *buf, size_t size);
 
 struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data);
diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c
index 9f2dcd48974d..9b90115319ce 100644
--- a/drivers/thunderbolt/domain.c
+++ b/drivers/thunderbolt/domain.c
@@ -20,6 +20,98 @@
 
 static DEFINE_IDA(tb_domain_ida);
 
+static bool match_service_id(const struct tb_service_id *id,
+			     const struct tb_service *svc)
+{
+	if (id->match_flags & TBSVC_MATCH_PROTOCOL_KEY) {
+		if (strcmp(id->protocol_key, svc->key))
+			return false;
+	}
+
+	if (id->match_flags & TBSVC_MATCH_PROTOCOL_ID) {
+		if (id->protocol_id != svc->prtcid)
+			return false;
+	}
+
+	if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) {
+		if (id->protocol_version != svc->prtcvers)
+			return false;
+	}
+
+	if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) {
+		if (id->protocol_revision != svc->prtcrevs)
+			return false;
+	}
+
+	return true;
+}
+
+static const struct tb_service_id *__tb_service_match(struct device *dev,
+						      struct device_driver *drv)
+{
+	struct tb_service_driver *driver;
+	const struct tb_service_id *ids;
+	struct tb_service *svc;
+
+	svc = tb_to_service(dev);
+	if (!svc)
+		return NULL;
+
+	driver = container_of(drv, struct tb_service_driver, driver);
+	if (!driver->id_table)
+		return NULL;
+
+	for (ids = driver->id_table; ids->match_flags != 0; ids++) {
+		if (match_service_id(ids, svc))
+			return ids;
+	}
+
+	return NULL;
+}
+
+static int tb_service_match(struct device *dev, struct device_driver *drv)
+{
+	return !!__tb_service_match(dev, drv);
+}
+
+static int tb_service_probe(struct device *dev)
+{
+	struct tb_service *svc = tb_to_service(dev);
+	struct tb_service_driver *driver;
+	const struct tb_service_id *id;
+
+	driver = container_of(dev->driver, struct tb_service_driver, driver);
+	id = __tb_service_match(dev, &driver->driver);
+
+	return driver->probe(svc, id);
+}
+
+static int tb_service_remove(struct device *dev)
+{
+	struct tb_service *svc = tb_to_service(dev);
+	struct tb_service_driver *driver;
+
+	driver = container_of(dev->driver, struct tb_service_driver, driver);
+	if (driver->remove)
+		driver->remove(svc);
+
+	return 0;
+}
+
+static void tb_service_shutdown(struct device *dev)
+{
+	struct tb_service_driver *driver;
+	struct tb_service *svc;
+
+	svc = tb_to_service(dev);
+	if (!svc || !dev->driver)
+		return;
+
+	driver = container_of(dev->driver, struct tb_service_driver, driver);
+	if (driver->shutdown)
+		driver->shutdown(svc);
+}
+
 static const char * const tb_security_names[] = {
 	[TB_SECURITY_NONE] = "none",
 	[TB_SECURITY_USER] = "user",
@@ -52,6 +144,10 @@ static const struct attribute_group *domain_attr_groups[] = {
 
 struct bus_type tb_bus_type = {
 	.name = "thunderbolt",
+	.match = tb_service_match,
+	.probe = tb_service_probe,
+	.remove = tb_service_remove,
+	.shutdown = tb_service_shutdown,
 };
 
 static void tb_domain_release(struct device *dev)
@@ -128,17 +224,26 @@ err_free:
 	return NULL;
 }
 
-static void tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type,
+static bool tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type,
 			       const void *buf, size_t size)
 {
 	struct tb *tb = data;
 
 	if (!tb->cm_ops->handle_event) {
 		tb_warn(tb, "domain does not have event handler\n");
-		return;
+		return true;
 	}
 
-	tb->cm_ops->handle_event(tb, type, buf, size);
+	switch (type) {
+	case TB_CFG_PKG_XDOMAIN_REQ:
+	case TB_CFG_PKG_XDOMAIN_RESP:
+		return tb_xdomain_handle_request(tb, type, buf, size);
+
+	default:
+		tb->cm_ops->handle_event(tb, type, buf, size);
+	}
+
+	return true;
 }
 
 /**
@@ -443,9 +548,92 @@ int tb_domain_disconnect_pcie_paths(struct tb *tb)
 	return tb->cm_ops->disconnect_pcie_paths(tb);
 }
 
+/**
+ * tb_domain_approve_xdomain_paths() - Enable DMA paths for XDomain
+ * @tb: Domain enabling the DMA paths
+ * @xd: XDomain DMA paths are created to
+ *
+ * Calls connection manager specific method to enable DMA paths to the
+ * XDomain in question.
+ *
+ * Return: 0% in case of success and negative errno otherwise. In
+ * particular returns %-ENOTSUPP if the connection manager
+ * implementation does not support XDomains.
+ */
+int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+{
+	if (!tb->cm_ops->approve_xdomain_paths)
+		return -ENOTSUPP;
+
+	return tb->cm_ops->approve_xdomain_paths(tb, xd);
+}
+
+/**
+ * tb_domain_disconnect_xdomain_paths() - Disable DMA paths for XDomain
+ * @tb: Domain disabling the DMA paths
+ * @xd: XDomain whose DMA paths are disconnected
+ *
+ * Calls connection manager specific method to disconnect DMA paths to
+ * the XDomain in question.
+ *
+ * Return: 0% in case of success and negative errno otherwise. In
+ * particular returns %-ENOTSUPP if the connection manager
+ * implementation does not support XDomains.
+ */
+int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+{
+	if (!tb->cm_ops->disconnect_xdomain_paths)
+		return -ENOTSUPP;
+
+	return tb->cm_ops->disconnect_xdomain_paths(tb, xd);
+}
+
+static int disconnect_xdomain(struct device *dev, void *data)
+{
+	struct tb_xdomain *xd;
+	struct tb *tb = data;
+	int ret = 0;
+
+	xd = tb_to_xdomain(dev);
+	if (xd && xd->tb == tb)
+		ret = tb_xdomain_disable_paths(xd);
+
+	return ret;
+}
+
+/**
+ * tb_domain_disconnect_all_paths() - Disconnect all paths for the domain
+ * @tb: Domain whose paths are disconnected
+ *
+ * This function can be used to disconnect all paths (PCIe, XDomain) for
+ * example in preparation for host NVM firmware upgrade. After this is
+ * called the paths cannot be established without resetting the switch.
+ *
+ * Return: %0 in case of success and negative errno otherwise.
+ */
+int tb_domain_disconnect_all_paths(struct tb *tb)
+{
+	int ret;
+
+	ret = tb_domain_disconnect_pcie_paths(tb);
+	if (ret)
+		return ret;
+
+	return bus_for_each_dev(&tb_bus_type, NULL, tb, disconnect_xdomain);
+}
+
 int tb_domain_init(void)
 {
-	return bus_register(&tb_bus_type);
+	int ret;
+
+	ret = tb_xdomain_init();
+	if (ret)
+		return ret;
+	ret = bus_register(&tb_bus_type);
+	if (ret)
+		tb_xdomain_exit();
+
+	return ret;
 }
 
 void tb_domain_exit(void)
@@ -453,4 +641,5 @@ void tb_domain_exit(void)
 	bus_unregister(&tb_bus_type);
 	ida_destroy(&tb_domain_ida);
 	tb_switch_exit();
+	tb_xdomain_exit();
 }
diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c
index 53250fc057e1..ab02d13f40b7 100644
--- a/drivers/thunderbolt/icm.c
+++ b/drivers/thunderbolt/icm.c
@@ -60,6 +60,8 @@
  * @get_route: Find a route string for given switch
  * @device_connected: Handle device connected ICM message
  * @device_disconnected: Handle device disconnected ICM message
+ * @xdomain_connected - Handle XDomain connected ICM message
+ * @xdomain_disconnected - Handle XDomain disconnected ICM message
  */
 struct icm {
 	struct mutex request_lock;
@@ -74,6 +76,10 @@ struct icm {
 				 const struct icm_pkg_header *hdr);
 	void (*device_disconnected)(struct tb *tb,
 				    const struct icm_pkg_header *hdr);
+	void (*xdomain_connected)(struct tb *tb,
+				  const struct icm_pkg_header *hdr);
+	void (*xdomain_disconnected)(struct tb *tb,
+				     const struct icm_pkg_header *hdr);
 };
 
 struct icm_notification {
@@ -89,7 +95,10 @@ static inline struct tb *icm_to_tb(struct icm *icm)
 
 static inline u8 phy_port_from_route(u64 route, u8 depth)
 {
-	return tb_switch_phy_port_from_link(route >> ((depth - 1) * 8));
+	u8 link;
+
+	link = depth ? route >> ((depth - 1) * 8) : route;
+	return tb_phy_port_from_link(link);
 }
 
 static inline u8 dual_link_from_link(u8 link)
@@ -320,6 +329,51 @@ static int icm_fr_challenge_switch_key(struct tb *tb, struct tb_switch *sw,
 	return 0;
 }
 
+static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+{
+	struct icm_fr_pkg_approve_xdomain_response reply;
+	struct icm_fr_pkg_approve_xdomain request;
+	int ret;
+
+	memset(&request, 0, sizeof(request));
+	request.hdr.code = ICM_APPROVE_XDOMAIN;
+	request.link_info = xd->depth << ICM_LINK_INFO_DEPTH_SHIFT | xd->link;
+	memcpy(&request.remote_uuid, xd->remote_uuid, sizeof(*xd->remote_uuid));
+
+	request.transmit_path = xd->transmit_path;
+	request.transmit_ring = xd->transmit_ring;
+	request.receive_path = xd->receive_path;
+	request.receive_ring = xd->receive_ring;
+
+	memset(&reply, 0, sizeof(reply));
+	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
+			  1, ICM_TIMEOUT);
+	if (ret)
+		return ret;
+
+	if (reply.hdr.flags & ICM_FLAGS_ERROR)
+		return -EIO;
+
+	return 0;
+}
+
+static int icm_fr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+{
+	u8 phy_port;
+	u8 cmd;
+
+	phy_port = tb_phy_port_from_link(xd->link);
+	if (phy_port == 0)
+		cmd = NHI_MAILBOX_DISCONNECT_PA;
+	else
+		cmd = NHI_MAILBOX_DISCONNECT_PB;
+
+	nhi_mailbox_cmd(tb->nhi, cmd, 1);
+	usleep_range(10, 50);
+	nhi_mailbox_cmd(tb->nhi, cmd, 2);
+	return 0;
+}
+
 static void remove_switch(struct tb_switch *sw)
 {
 	struct tb_switch *parent_sw;
@@ -475,6 +529,141 @@ icm_fr_device_disconnected(struct tb *tb, const struct icm_pkg_header *hdr)
 	tb_switch_put(sw);
 }
 
+static void remove_xdomain(struct tb_xdomain *xd)
+{
+	struct tb_switch *sw;
+
+	sw = tb_to_switch(xd->dev.parent);
+	tb_port_at(xd->route, sw)->xdomain = NULL;
+	tb_xdomain_remove(xd);
+}
+
+static void
+icm_fr_xdomain_connected(struct tb *tb, const struct icm_pkg_header *hdr)
+{
+	const struct icm_fr_event_xdomain_connected *pkg =
+		(const struct icm_fr_event_xdomain_connected *)hdr;
+	struct tb_xdomain *xd;
+	struct tb_switch *sw;
+	u8 link, depth;
+	bool approved;
+	u64 route;
+
+	/*
+	 * After NVM upgrade adding root switch device fails because we
+	 * initiated reset. During that time ICM might still send
+	 * XDomain connected message which we ignore here.
+	 */
+	if (!tb->root_switch)
+		return;
+
+	link = pkg->link_info & ICM_LINK_INFO_LINK_MASK;
+	depth = (pkg->link_info & ICM_LINK_INFO_DEPTH_MASK) >>
+		ICM_LINK_INFO_DEPTH_SHIFT;
+	approved = pkg->link_info & ICM_LINK_INFO_APPROVED;
+
+	if (link > ICM_MAX_LINK || depth > ICM_MAX_DEPTH) {
+		tb_warn(tb, "invalid topology %u.%u, ignoring\n", link, depth);
+		return;
+	}
+
+	route = get_route(pkg->local_route_hi, pkg->local_route_lo);
+
+	xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid);
+	if (xd) {
+		u8 xd_phy_port, phy_port;
+
+		xd_phy_port = phy_port_from_route(xd->route, xd->depth);
+		phy_port = phy_port_from_route(route, depth);
+
+		if (xd->depth == depth && xd_phy_port == phy_port) {
+			xd->link = link;
+			xd->route = route;
+			xd->is_unplugged = false;
+			tb_xdomain_put(xd);
+			return;
+		}
+
+		/*
+		 * If we find an existing XDomain connection remove it
+		 * now. We need to go through login handshake and
+		 * everything anyway to be able to re-establish the
+		 * connection.
+		 */
+		remove_xdomain(xd);
+		tb_xdomain_put(xd);
+	}
+
+	/*
+	 * Look if there already exists an XDomain in the same place
+	 * than the new one and in that case remove it because it is
+	 * most likely another host that got disconnected.
+	 */
+	xd = tb_xdomain_find_by_link_depth(tb, link, depth);
+	if (!xd) {
+		u8 dual_link;
+
+		dual_link = dual_link_from_link(link);
+		if (dual_link)
+			xd = tb_xdomain_find_by_link_depth(tb, dual_link,
+							   depth);
+	}
+	if (xd) {
+		remove_xdomain(xd);
+		tb_xdomain_put(xd);
+	}
+
+	/*
+	 * If the user disconnected a switch during suspend and
+	 * connected another host to the same port, remove the switch
+	 * first.
+	 */
+	sw = get_switch_at_route(tb->root_switch, route);
+	if (sw)
+		remove_switch(sw);
+
+	sw = tb_switch_find_by_link_depth(tb, link, depth);
+	if (!sw) {
+		tb_warn(tb, "no switch exists at %u.%u, ignoring\n", link,
+			depth);
+		return;
+	}
+
+	xd = tb_xdomain_alloc(sw->tb, &sw->dev, route,
+			      &pkg->local_uuid, &pkg->remote_uuid);
+	if (!xd) {
+		tb_switch_put(sw);
+		return;
+	}
+
+	xd->link = link;
+	xd->depth = depth;
+
+	tb_port_at(route, sw)->xdomain = xd;
+
+	tb_xdomain_add(xd);
+	tb_switch_put(sw);
+}
+
+static void
+icm_fr_xdomain_disconnected(struct tb *tb, const struct icm_pkg_header *hdr)
+{
+	const struct icm_fr_event_xdomain_disconnected *pkg =
+		(const struct icm_fr_event_xdomain_disconnected *)hdr;
+	struct tb_xdomain *xd;
+
+	/*
+	 * If the connection is through one or multiple devices, the
+	 * XDomain device is removed along with them so it is fine if we
+	 * cannot find it here.
+	 */
+	xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid);
+	if (xd) {
+		remove_xdomain(xd);
+		tb_xdomain_put(xd);
+	}
+}
+
 static struct pci_dev *get_upstream_port(struct pci_dev *pdev)
 {
 	struct pci_dev *parent;
@@ -594,6 +783,12 @@ static void icm_handle_notification(struct work_struct *work)
 	case ICM_EVENT_DEVICE_DISCONNECTED:
 		icm->device_disconnected(tb, n->pkg);
 		break;
+	case ICM_EVENT_XDOMAIN_CONNECTED:
+		icm->xdomain_connected(tb, n->pkg);
+		break;
+	case ICM_EVENT_XDOMAIN_DISCONNECTED:
+		icm->xdomain_disconnected(tb, n->pkg);
+		break;
 	}
 
 	mutex_unlock(&tb->lock);
@@ -927,6 +1122,10 @@ static void icm_unplug_children(struct tb_switch *sw)
 
 		if (tb_is_upstream_port(port))
 			continue;
+		if (port->xdomain) {
+			port->xdomain->is_unplugged = true;
+			continue;
+		}
 		if (!port->remote)
 			continue;
 
@@ -943,6 +1142,13 @@ static void icm_free_unplugged_children(struct tb_switch *sw)
 
 		if (tb_is_upstream_port(port))
 			continue;
+
+		if (port->xdomain && port->xdomain->is_unplugged) {
+			tb_xdomain_remove(port->xdomain);
+			port->xdomain = NULL;
+			continue;
+		}
+
 		if (!port->remote)
 			continue;
 
@@ -1009,8 +1215,10 @@ static int icm_start(struct tb *tb)
 	tb->root_switch->no_nvm_upgrade = x86_apple_machine;
 
 	ret = tb_switch_add(tb->root_switch);
-	if (ret)
+	if (ret) {
 		tb_switch_put(tb->root_switch);
+		tb->root_switch = NULL;
+	}
 
 	return ret;
 }
@@ -1042,6 +1250,8 @@ static const struct tb_cm_ops icm_fr_ops = {
 	.add_switch_key = icm_fr_add_switch_key,
 	.challenge_switch_key = icm_fr_challenge_switch_key,
 	.disconnect_pcie_paths = icm_disconnect_pcie_paths,
+	.approve_xdomain_paths = icm_fr_approve_xdomain_paths,
+	.disconnect_xdomain_paths = icm_fr_disconnect_xdomain_paths,
 };
 
 struct tb *icm_probe(struct tb_nhi *nhi)
@@ -1064,6 +1274,8 @@ struct tb *icm_probe(struct tb_nhi *nhi)
 		icm->get_route = icm_fr_get_route;
 		icm->device_connected = icm_fr_device_connected;
 		icm->device_disconnected = icm_fr_device_disconnected;
+		icm->xdomain_connected = icm_fr_xdomain_connected;
+		icm->xdomain_disconnected = icm_fr_xdomain_disconnected;
 		tb->cm_ops = &icm_fr_ops;
 		break;
 
@@ -1077,6 +1289,8 @@ struct tb *icm_probe(struct tb_nhi *nhi)
 		icm->get_route = icm_ar_get_route;
 		icm->device_connected = icm_fr_device_connected;
 		icm->device_disconnected = icm_fr_device_disconnected;
+		icm->xdomain_connected = icm_fr_xdomain_connected;
+		icm->xdomain_disconnected = icm_fr_xdomain_disconnected;
 		tb->cm_ops = &icm_fr_ops;
 		break;
 	}
diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 05af126a2435..0e79eebfcbb7 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -22,6 +22,14 @@
 #define RING_TYPE(ring) ((ring)->is_tx ? "TX ring" : "RX ring")
 
 /*
+ * Used to enable end-to-end workaround for missing RX packets. Do not
+ * use this ring for anything else.
+ */
+#define RING_E2E_UNUSED_HOPID	2
+/* HopIDs 0-7 are reserved by the Thunderbolt protocol */
+#define RING_FIRST_USABLE_HOPID	8
+
+/*
  * Minimal number of vectors when we use MSI-X. Two for control channel
  * Rx/Tx and the rest four are for cross domain DMA paths.
  */
@@ -206,8 +214,10 @@ static void ring_work(struct work_struct *work)
 	struct tb_ring *ring = container_of(work, typeof(*ring), work);
 	struct ring_frame *frame;
 	bool canceled = false;
+	unsigned long flags;
 	LIST_HEAD(done);
-	mutex_lock(&ring->lock);
+
+	spin_lock_irqsave(&ring->lock, flags);
 
 	if (!ring->running) {
 		/*  Move all frames to done and mark them as canceled. */
@@ -229,30 +239,14 @@ static void ring_work(struct work_struct *work)
 			frame->eof = ring->descriptors[ring->tail].eof;
 			frame->sof = ring->descriptors[ring->tail].sof;
 			frame->flags = ring->descriptors[ring->tail].flags;
-			if (frame->sof != 0)
-				dev_WARN(&ring->nhi->pdev->dev,
-					 "%s %d got unexpected SOF: %#x\n",
-					 RING_TYPE(ring), ring->hop,
-					 frame->sof);
-			/*
-			 * known flags:
-			 * raw not enabled, interupt not set: 0x2=0010
-			 * raw enabled: 0xa=1010
-			 * raw not enabled: 0xb=1011
-			 * partial frame (>MAX_FRAME_SIZE): 0xe=1110
-			 */
-			if (frame->flags != 0xa)
-				dev_WARN(&ring->nhi->pdev->dev,
-					 "%s %d got unexpected flags: %#x\n",
-					 RING_TYPE(ring), ring->hop,
-					 frame->flags);
 		}
 		ring->tail = (ring->tail + 1) % ring->size;
 	}
 	ring_write_descriptors(ring);
 
 invoke_callback:
-	mutex_unlock(&ring->lock); /* allow callbacks to schedule new work */
+	/* allow callbacks to schedule new work */
+	spin_unlock_irqrestore(&ring->lock, flags);
 	while (!list_empty(&done)) {
 		frame = list_first_entry(&done, typeof(*frame), list);
 		/*
@@ -260,29 +254,128 @@ invoke_callback:
 		 * Do not hold on to it.
 		 */
 		list_del_init(&frame->list);
-		frame->callback(ring, frame, canceled);
+		if (frame->callback)
+			frame->callback(ring, frame, canceled);
 	}
 }
 
-int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame)
+int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame)
 {
+	unsigned long flags;
 	int ret = 0;
-	mutex_lock(&ring->lock);
+
+	spin_lock_irqsave(&ring->lock, flags);
 	if (ring->running) {
 		list_add_tail(&frame->list, &ring->queue);
 		ring_write_descriptors(ring);
 	} else {
 		ret = -ESHUTDOWN;
 	}
-	mutex_unlock(&ring->lock);
+	spin_unlock_irqrestore(&ring->lock, flags);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(__tb_ring_enqueue);
+
+/**
+ * tb_ring_poll() - Poll one completed frame from the ring
+ * @ring: Ring to poll
+ *
+ * This function can be called when @start_poll callback of the @ring
+ * has been called. It will read one completed frame from the ring and
+ * return it to the caller. Returns %NULL if there is no more completed
+ * frames.
+ */
+struct ring_frame *tb_ring_poll(struct tb_ring *ring)
+{
+	struct ring_frame *frame = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ring->lock, flags);
+	if (!ring->running)
+		goto unlock;
+	if (ring_empty(ring))
+		goto unlock;
+
+	if (ring->descriptors[ring->tail].flags & RING_DESC_COMPLETED) {
+		frame = list_first_entry(&ring->in_flight, typeof(*frame),
+					 list);
+		list_del_init(&frame->list);
+
+		if (!ring->is_tx) {
+			frame->size = ring->descriptors[ring->tail].length;
+			frame->eof = ring->descriptors[ring->tail].eof;
+			frame->sof = ring->descriptors[ring->tail].sof;
+			frame->flags = ring->descriptors[ring->tail].flags;
+		}
+
+		ring->tail = (ring->tail + 1) % ring->size;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&ring->lock, flags);
+	return frame;
+}
+EXPORT_SYMBOL_GPL(tb_ring_poll);
+
+static void __ring_interrupt_mask(struct tb_ring *ring, bool mask)
+{
+	int idx = ring_interrupt_index(ring);
+	int reg = REG_RING_INTERRUPT_BASE + idx / 32 * 4;
+	int bit = idx % 32;
+	u32 val;
+
+	val = ioread32(ring->nhi->iobase + reg);
+	if (mask)
+		val &= ~BIT(bit);
+	else
+		val |= BIT(bit);
+	iowrite32(val, ring->nhi->iobase + reg);
+}
+
+/* Both @nhi->lock and @ring->lock should be held */
+static void __ring_interrupt(struct tb_ring *ring)
+{
+	if (!ring->running)
+		return;
+
+	if (ring->start_poll) {
+		__ring_interrupt_mask(ring, false);
+		ring->start_poll(ring->poll_data);
+	} else {
+		schedule_work(&ring->work);
+	}
+}
+
+/**
+ * tb_ring_poll_complete() - Re-start interrupt for the ring
+ * @ring: Ring to re-start the interrupt
+ *
+ * This will re-start (unmask) the ring interrupt once the user is done
+ * with polling.
+ */
+void tb_ring_poll_complete(struct tb_ring *ring)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ring->nhi->lock, flags);
+	spin_lock(&ring->lock);
+	if (ring->start_poll)
+		__ring_interrupt_mask(ring, false);
+	spin_unlock(&ring->lock);
+	spin_unlock_irqrestore(&ring->nhi->lock, flags);
+}
+EXPORT_SYMBOL_GPL(tb_ring_poll_complete);
 
 static irqreturn_t ring_msix(int irq, void *data)
 {
 	struct tb_ring *ring = data;
 
-	schedule_work(&ring->work);
+	spin_lock(&ring->nhi->lock);
+	spin_lock(&ring->lock);
+	__ring_interrupt(ring);
+	spin_unlock(&ring->lock);
+	spin_unlock(&ring->nhi->lock);
+
 	return IRQ_HANDLED;
 }
 
@@ -320,30 +413,81 @@ static void ring_release_msix(struct tb_ring *ring)
 	ring->irq = 0;
 }
 
-static struct tb_ring *ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
-				  bool transmit, unsigned int flags)
+static int nhi_alloc_hop(struct tb_nhi *nhi, struct tb_ring *ring)
+{
+	int ret = 0;
+
+	spin_lock_irq(&nhi->lock);
+
+	if (ring->hop < 0) {
+		unsigned int i;
+
+		/*
+		 * Automatically allocate HopID from the non-reserved
+		 * range 8 .. hop_count - 1.
+		 */
+		for (i = RING_FIRST_USABLE_HOPID; i < nhi->hop_count; i++) {
+			if (ring->is_tx) {
+				if (!nhi->tx_rings[i]) {
+					ring->hop = i;
+					break;
+				}
+			} else {
+				if (!nhi->rx_rings[i]) {
+					ring->hop = i;
+					break;
+				}
+			}
+		}
+	}
+
+	if (ring->hop < 0 || ring->hop >= nhi->hop_count) {
+		dev_warn(&nhi->pdev->dev, "invalid hop: %d\n", ring->hop);
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+	if (ring->is_tx && nhi->tx_rings[ring->hop]) {
+		dev_warn(&nhi->pdev->dev, "TX hop %d already allocated\n",
+			 ring->hop);
+		ret = -EBUSY;
+		goto err_unlock;
+	} else if (!ring->is_tx && nhi->rx_rings[ring->hop]) {
+		dev_warn(&nhi->pdev->dev, "RX hop %d already allocated\n",
+			 ring->hop);
+		ret = -EBUSY;
+		goto err_unlock;
+	}
+
+	if (ring->is_tx)
+		nhi->tx_rings[ring->hop] = ring;
+	else
+		nhi->rx_rings[ring->hop] = ring;
+
+err_unlock:
+	spin_unlock_irq(&nhi->lock);
+
+	return ret;
+}
+
+static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
+				     bool transmit, unsigned int flags,
+				     u16 sof_mask, u16 eof_mask,
+				     void (*start_poll)(void *),
+				     void *poll_data)
 {
 	struct tb_ring *ring = NULL;
 	dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n",
 		 transmit ? "TX" : "RX", hop, size);
 
-	mutex_lock(&nhi->lock);
-	if (hop >= nhi->hop_count) {
-		dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop);
-		goto err;
-	}
-	if (transmit && nhi->tx_rings[hop]) {
-		dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop);
-		goto err;
-	} else if (!transmit && nhi->rx_rings[hop]) {
-		dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop);
-		goto err;
-	}
+	/* Tx Ring 2 is reserved for E2E workaround */
+	if (transmit && hop == RING_E2E_UNUSED_HOPID)
+		return NULL;
+
 	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
 	if (!ring)
-		goto err;
+		return NULL;
 
-	mutex_init(&ring->lock);
+	spin_lock_init(&ring->lock);
 	INIT_LIST_HEAD(&ring->queue);
 	INIT_LIST_HEAD(&ring->in_flight);
 	INIT_WORK(&ring->work, ring_work);
@@ -353,55 +497,88 @@ static struct tb_ring *ring_alloc(struct tb_nhi *nhi, u32 hop, int size,
 	ring->is_tx = transmit;
 	ring->size = size;
 	ring->flags = flags;
+	ring->sof_mask = sof_mask;
+	ring->eof_mask = eof_mask;
 	ring->head = 0;
 	ring->tail = 0;
 	ring->running = false;
-
-	if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND))
-		goto err;
+	ring->start_poll = start_poll;
+	ring->poll_data = poll_data;
 
 	ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev,
 			size * sizeof(*ring->descriptors),
 			&ring->descriptors_dma, GFP_KERNEL | __GFP_ZERO);
 	if (!ring->descriptors)
-		goto err;
+		goto err_free_ring;
+
+	if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND))
+		goto err_free_descs;
+
+	if (nhi_alloc_hop(nhi, ring))
+		goto err_release_msix;
 
-	if (transmit)
-		nhi->tx_rings[hop] = ring;
-	else
-		nhi->rx_rings[hop] = ring;
-	mutex_unlock(&nhi->lock);
 	return ring;
 
-err:
-	if (ring)
-		mutex_destroy(&ring->lock);
+err_release_msix:
+	ring_release_msix(ring);
+err_free_descs:
+	dma_free_coherent(&ring->nhi->pdev->dev,
+			  ring->size * sizeof(*ring->descriptors),
+			  ring->descriptors, ring->descriptors_dma);
+err_free_ring:
 	kfree(ring);
-	mutex_unlock(&nhi->lock);
+
 	return NULL;
 }
 
-struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
-			      unsigned int flags)
+/**
+ * tb_ring_alloc_tx() - Allocate DMA ring for transmit
+ * @nhi: Pointer to the NHI the ring is to be allocated
+ * @hop: HopID (ring) to allocate
+ * @size: Number of entries in the ring
+ * @flags: Flags for the ring
+ */
+struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags)
 {
-	return ring_alloc(nhi, hop, size, true, flags);
+	return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(tb_ring_alloc_tx);
 
-struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
-			      unsigned int flags)
+/**
+ * tb_ring_alloc_rx() - Allocate DMA ring for receive
+ * @nhi: Pointer to the NHI the ring is to be allocated
+ * @hop: HopID (ring) to allocate. Pass %-1 for automatic allocation.
+ * @size: Number of entries in the ring
+ * @flags: Flags for the ring
+ * @sof_mask: Mask of PDF values that start a frame
+ * @eof_mask: Mask of PDF values that end a frame
+ * @start_poll: If not %NULL the ring will call this function when an
+ *		interrupt is triggered and masked, instead of callback
+ *		in each Rx frame.
+ * @poll_data: Optional data passed to @start_poll
+ */
+struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags, u16 sof_mask, u16 eof_mask,
+				 void (*start_poll)(void *), void *poll_data)
 {
-	return ring_alloc(nhi, hop, size, false, flags);
+	return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask,
+			     start_poll, poll_data);
 }
+EXPORT_SYMBOL_GPL(tb_ring_alloc_rx);
 
 /**
- * ring_start() - enable a ring
+ * tb_ring_start() - enable a ring
  *
- * Must not be invoked in parallel with ring_stop().
+ * Must not be invoked in parallel with tb_ring_stop().
  */
-void ring_start(struct tb_ring *ring)
+void tb_ring_start(struct tb_ring *ring)
 {
-	mutex_lock(&ring->nhi->lock);
-	mutex_lock(&ring->lock);
+	u16 frame_size;
+	u32 flags;
+
+	spin_lock_irq(&ring->nhi->lock);
+	spin_lock(&ring->lock);
 	if (ring->nhi->going_away)
 		goto err;
 	if (ring->running) {
@@ -411,43 +588,65 @@ void ring_start(struct tb_ring *ring)
 	dev_info(&ring->nhi->pdev->dev, "starting %s %d\n",
 		 RING_TYPE(ring), ring->hop);
 
+	if (ring->flags & RING_FLAG_FRAME) {
+		/* Means 4096 */
+		frame_size = 0;
+		flags = RING_FLAG_ENABLE;
+	} else {
+		frame_size = TB_FRAME_SIZE;
+		flags = RING_FLAG_ENABLE | RING_FLAG_RAW;
+	}
+
+	if (ring->flags & RING_FLAG_E2E && !ring->is_tx) {
+		u32 hop;
+
+		/*
+		 * In order not to lose Rx packets we enable end-to-end
+		 * workaround which transfers Rx credits to an unused Tx
+		 * HopID.
+		 */
+		hop = RING_E2E_UNUSED_HOPID << REG_RX_OPTIONS_E2E_HOP_SHIFT;
+		hop &= REG_RX_OPTIONS_E2E_HOP_MASK;
+		flags |= hop | RING_FLAG_E2E_FLOW_CONTROL;
+	}
+
 	ring_iowrite64desc(ring, ring->descriptors_dma, 0);
 	if (ring->is_tx) {
 		ring_iowrite32desc(ring, ring->size, 12);
 		ring_iowrite32options(ring, 0, 4); /* time releated ? */
-		ring_iowrite32options(ring,
-				      RING_FLAG_ENABLE | RING_FLAG_RAW, 0);
+		ring_iowrite32options(ring, flags, 0);
 	} else {
-		ring_iowrite32desc(ring,
-				   (TB_FRAME_SIZE << 16) | ring->size, 12);
-		ring_iowrite32options(ring, 0xffffffff, 4); /* SOF EOF mask */
-		ring_iowrite32options(ring,
-				      RING_FLAG_ENABLE | RING_FLAG_RAW, 0);
+		u32 sof_eof_mask = ring->sof_mask << 16 | ring->eof_mask;
+
+		ring_iowrite32desc(ring, (frame_size << 16) | ring->size, 12);
+		ring_iowrite32options(ring, sof_eof_mask, 4);
+		ring_iowrite32options(ring, flags, 0);
 	}
 	ring_interrupt_active(ring, true);
 	ring->running = true;
 err:
-	mutex_unlock(&ring->lock);
-	mutex_unlock(&ring->nhi->lock);
+	spin_unlock(&ring->lock);
+	spin_unlock_irq(&ring->nhi->lock);
 }
-
+EXPORT_SYMBOL_GPL(tb_ring_start);
 
 /**
- * ring_stop() - shutdown a ring
+ * tb_ring_stop() - shutdown a ring
  *
  * Must not be invoked from a callback.
  *
- * This method will disable the ring. Further calls to ring_tx/ring_rx will
- * return -ESHUTDOWN until ring_stop has been called.
+ * This method will disable the ring. Further calls to
+ * tb_ring_tx/tb_ring_rx will return -ESHUTDOWN until ring_stop has been
+ * called.
  *
  * All enqueued frames will be canceled and their callbacks will be executed
  * with frame->canceled set to true (on the callback thread). This method
  * returns only after all callback invocations have finished.
  */
-void ring_stop(struct tb_ring *ring)
+void tb_ring_stop(struct tb_ring *ring)
 {
-	mutex_lock(&ring->nhi->lock);
-	mutex_lock(&ring->lock);
+	spin_lock_irq(&ring->nhi->lock);
+	spin_lock(&ring->lock);
 	dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n",
 		 RING_TYPE(ring), ring->hop);
 	if (ring->nhi->going_away)
@@ -468,8 +667,8 @@ void ring_stop(struct tb_ring *ring)
 	ring->running = false;
 
 err:
-	mutex_unlock(&ring->lock);
-	mutex_unlock(&ring->nhi->lock);
+	spin_unlock(&ring->lock);
+	spin_unlock_irq(&ring->nhi->lock);
 
 	/*
 	 * schedule ring->work to invoke callbacks on all remaining frames.
@@ -477,9 +676,10 @@ err:
 	schedule_work(&ring->work);
 	flush_work(&ring->work);
 }
+EXPORT_SYMBOL_GPL(tb_ring_stop);
 
 /*
- * ring_free() - free ring
+ * tb_ring_free() - free ring
  *
  * When this method returns all invocations of ring->callback will have
  * finished.
@@ -488,9 +688,9 @@ err:
  *
  * Must NOT be called from ring_frame->callback!
  */
-void ring_free(struct tb_ring *ring)
+void tb_ring_free(struct tb_ring *ring)
 {
-	mutex_lock(&ring->nhi->lock);
+	spin_lock_irq(&ring->nhi->lock);
 	/*
 	 * Dissociate the ring from the NHI. This also ensures that
 	 * nhi_interrupt_work cannot reschedule ring->work.
@@ -504,6 +704,7 @@ void ring_free(struct tb_ring *ring)
 		dev_WARN(&ring->nhi->pdev->dev, "%s %d still running\n",
 			 RING_TYPE(ring), ring->hop);
 	}
+	spin_unlock_irq(&ring->nhi->lock);
 
 	ring_release_msix(ring);
 
@@ -520,16 +721,15 @@ void ring_free(struct tb_ring *ring)
 		 RING_TYPE(ring),
 		 ring->hop);
 
-	mutex_unlock(&ring->nhi->lock);
 	/**
 	 * ring->work can no longer be scheduled (it is scheduled only
 	 * by nhi_interrupt_work, ring_stop and ring_msix). Wait for it
 	 * to finish before freeing the ring.
 	 */
 	flush_work(&ring->work);
-	mutex_destroy(&ring->lock);
 	kfree(ring);
 }
+EXPORT_SYMBOL_GPL(tb_ring_free);
 
 /**
  * nhi_mailbox_cmd() - Send a command through NHI mailbox
@@ -595,7 +795,7 @@ static void nhi_interrupt_work(struct work_struct *work)
 	int type = 0; /* current interrupt type 0: TX, 1: RX, 2: RX overflow */
 	struct tb_ring *ring;
 
-	mutex_lock(&nhi->lock);
+	spin_lock_irq(&nhi->lock);
 
 	/*
 	 * Starting at REG_RING_NOTIFY_BASE there are three status bitfields
@@ -630,10 +830,12 @@ static void nhi_interrupt_work(struct work_struct *work)
 				 hop);
 			continue;
 		}
-		/* we do not check ring->running, this is done in ring->work */
-		schedule_work(&ring->work);
+
+		spin_lock(&ring->lock);
+		__ring_interrupt(ring);
+		spin_unlock(&ring->lock);
 	}
-	mutex_unlock(&nhi->lock);
+	spin_unlock_irq(&nhi->lock);
 }
 
 static irqreturn_t nhi_msi(int irq, void *data)
@@ -651,6 +853,22 @@ static int nhi_suspend_noirq(struct device *dev)
 	return tb_domain_suspend_noirq(tb);
 }
 
+static void nhi_enable_int_throttling(struct tb_nhi *nhi)
+{
+	/* Throttling is specified in 256ns increments */
+	u32 throttle = DIV_ROUND_UP(128 * NSEC_PER_USEC, 256);
+	unsigned int i;
+
+	/*
+	 * Configure interrupt throttling for all vectors even if we
+	 * only use few.
+	 */
+	for (i = 0; i < MSIX_MAX_VECS; i++) {
+		u32 reg = REG_INT_THROTTLING_RATE + i * 4;
+		iowrite32(throttle, nhi->iobase + reg);
+	}
+}
+
 static int nhi_resume_noirq(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
@@ -663,6 +881,8 @@ static int nhi_resume_noirq(struct device *dev)
 	 */
 	if (!pci_device_is_present(pdev))
 		tb->nhi->going_away = true;
+	else
+		nhi_enable_int_throttling(tb->nhi);
 
 	return tb_domain_resume_noirq(tb);
 }
@@ -705,7 +925,6 @@ static void nhi_shutdown(struct tb_nhi *nhi)
 		devm_free_irq(&nhi->pdev->dev, nhi->pdev->irq, nhi);
 		flush_work(&nhi->interrupt_work);
 	}
-	mutex_destroy(&nhi->lock);
 	ida_destroy(&nhi->msix_ida);
 }
 
@@ -717,6 +936,8 @@ static int nhi_init_msi(struct tb_nhi *nhi)
 	/* In case someone left them on. */
 	nhi_disable_interrupts(nhi);
 
+	nhi_enable_int_throttling(nhi);
+
 	ida_init(&nhi->msix_ida);
 
 	/*
@@ -792,13 +1013,10 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return res;
 	}
 
-	mutex_init(&nhi->lock);
+	spin_lock_init(&nhi->lock);
 
 	pci_set_master(pdev);
 
-	/* magic value - clock related? */
-	iowrite32(3906250 / 10000, nhi->iobase + 0x38c00);
-
 	tb = icm_probe(nhi);
 	if (!tb)
 		tb = tb_probe(nhi);
diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h
index 5b5bb2c436be..771d09ca5dc5 100644
--- a/drivers/thunderbolt/nhi.h
+++ b/drivers/thunderbolt/nhi.h
@@ -7,144 +7,7 @@
 #ifndef DSL3510_H_
 #define DSL3510_H_
 
-#include <linux/idr.h>
-#include <linux/mutex.h>
-#include <linux/workqueue.h>
-
-/**
- * struct tb_nhi - thunderbolt native host interface
- * @lock: Must be held during ring creation/destruction. Is acquired by
- *	  interrupt_work when dispatching interrupts to individual rings.
- * @pdev: Pointer to the PCI device
- * @iobase: MMIO space of the NHI
- * @tx_rings: All Tx rings available on this host controller
- * @rx_rings: All Rx rings available on this host controller
- * @msix_ida: Used to allocate MSI-X vectors for rings
- * @going_away: The host controller device is about to disappear so when
- *		this flag is set, avoid touching the hardware anymore.
- * @interrupt_work: Work scheduled to handle ring interrupt when no
- *		    MSI-X is used.
- * @hop_count: Number of rings (end point hops) supported by NHI.
- */
-struct tb_nhi {
-	struct mutex lock;
-	struct pci_dev *pdev;
-	void __iomem *iobase;
-	struct tb_ring **tx_rings;
-	struct tb_ring **rx_rings;
-	struct ida msix_ida;
-	bool going_away;
-	struct work_struct interrupt_work;
-	u32 hop_count;
-};
-
-/**
- * struct tb_ring - thunderbolt TX or RX ring associated with a NHI
- * @lock: Lock serializing actions to this ring. Must be acquired after
- *	  nhi->lock.
- * @nhi: Pointer to the native host controller interface
- * @size: Size of the ring
- * @hop: Hop (DMA channel) associated with this ring
- * @head: Head of the ring (write next descriptor here)
- * @tail: Tail of the ring (complete next descriptor here)
- * @descriptors: Allocated descriptors for this ring
- * @queue: Queue holding frames to be transferred over this ring
- * @in_flight: Queue holding frames that are currently in flight
- * @work: Interrupt work structure
- * @is_tx: Is the ring Tx or Rx
- * @running: Is the ring running
- * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise.
- * @vector: MSI-X vector number the ring uses (only set if @irq is > 0)
- * @flags: Ring specific flags
- */
-struct tb_ring {
-	struct mutex lock;
-	struct tb_nhi *nhi;
-	int size;
-	int hop;
-	int head;
-	int tail;
-	struct ring_desc *descriptors;
-	dma_addr_t descriptors_dma;
-	struct list_head queue;
-	struct list_head in_flight;
-	struct work_struct work;
-	bool is_tx:1;
-	bool running:1;
-	int irq;
-	u8 vector;
-	unsigned int flags;
-};
-
-/* Leave ring interrupt enabled on suspend */
-#define RING_FLAG_NO_SUSPEND	BIT(0)
-
-struct ring_frame;
-typedef void (*ring_cb)(struct tb_ring*, struct ring_frame*, bool canceled);
-
-/**
- * struct ring_frame - for use with ring_rx/ring_tx
- */
-struct ring_frame {
-	dma_addr_t buffer_phy;
-	ring_cb callback;
-	struct list_head list;
-	u32 size:12; /* TX: in, RX: out*/
-	u32 flags:12; /* RX: out */
-	u32 eof:4; /* TX:in, RX: out */
-	u32 sof:4; /* TX:in, RX: out */
-};
-
-#define TB_FRAME_SIZE 0x100    /* minimum size for ring_rx */
-
-struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
-			      unsigned int flags);
-struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
-			      unsigned int flags);
-void ring_start(struct tb_ring *ring);
-void ring_stop(struct tb_ring *ring);
-void ring_free(struct tb_ring *ring);
-
-int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame);
-
-/**
- * ring_rx() - enqueue a frame on an RX ring
- *
- * frame->buffer, frame->buffer_phy and frame->callback have to be set. The
- * buffer must contain at least TB_FRAME_SIZE bytes.
- *
- * frame->callback will be invoked with frame->size, frame->flags, frame->eof,
- * frame->sof set once the frame has been received.
- *
- * If ring_stop is called after the packet has been enqueued frame->callback
- * will be called with canceled set to true.
- *
- * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise.
- */
-static inline int ring_rx(struct tb_ring *ring, struct ring_frame *frame)
-{
-	WARN_ON(ring->is_tx);
-	return __ring_enqueue(ring, frame);
-}
-
-/**
- * ring_tx() - enqueue a frame on an TX ring
- *
- * frame->buffer, frame->buffer_phy, frame->callback, frame->size, frame->eof
- * and frame->sof have to be set.
- *
- * frame->callback will be invoked with once the frame has been transmitted.
- *
- * If ring_stop is called after the packet has been enqueued frame->callback
- * will be called with canceled set to true.
- *
- * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise.
- */
-static inline int ring_tx(struct tb_ring *ring, struct ring_frame *frame)
-{
-	WARN_ON(!ring->is_tx);
-	return __ring_enqueue(ring, frame);
-}
+#include <linux/thunderbolt.h>
 
 enum nhi_fw_mode {
 	NHI_FW_SAFE_MODE,
@@ -157,6 +20,8 @@ enum nhi_mailbox_cmd {
 	NHI_MAILBOX_SAVE_DEVS = 0x05,
 	NHI_MAILBOX_DISCONNECT_PCIE_PATHS = 0x06,
 	NHI_MAILBOX_DRV_UNLOADS = 0x07,
+	NHI_MAILBOX_DISCONNECT_PA = 0x10,
+	NHI_MAILBOX_DISCONNECT_PB = 0x11,
 	NHI_MAILBOX_ALLOW_ALL_DEVS = 0x23,
 };
 
diff --git a/drivers/thunderbolt/nhi_regs.h b/drivers/thunderbolt/nhi_regs.h
index 09ed574e92ff..5ed6934e31e7 100644
--- a/drivers/thunderbolt/nhi_regs.h
+++ b/drivers/thunderbolt/nhi_regs.h
@@ -17,13 +17,6 @@ enum ring_flags {
 	RING_FLAG_ENABLE = 1 << 31,
 };
 
-enum ring_desc_flags {
-	RING_DESC_ISOCH = 0x1, /* TX only? */
-	RING_DESC_COMPLETED = 0x2, /* set by NHI */
-	RING_DESC_POSTED = 0x4, /* always set this */
-	RING_DESC_INTERRUPT = 0x8, /* request an interrupt on completion */
-};
-
 /**
  * struct ring_desc - TX/RX ring entry
  *
@@ -77,6 +70,8 @@ struct ring_desc {
  * ..: unknown
  */
 #define REG_RX_OPTIONS_BASE	0x29800
+#define REG_RX_OPTIONS_E2E_HOP_MASK	GENMASK(22, 12)
+#define REG_RX_OPTIONS_E2E_HOP_SHIFT	12
 
 /*
  * three bitfields: tx, rx, rx overflow
@@ -95,6 +90,8 @@ struct ring_desc {
 #define REG_RING_INTERRUPT_BASE	0x38200
 #define RING_INTERRUPT_REG_COUNT(nhi) ((31 + 2 * nhi->hop_count) / 32)
 
+#define REG_INT_THROTTLING_RATE	0x38c00
+
 /* Interrupt Vector Allocation */
 #define REG_INT_VEC_ALLOC_BASE	0x38c40
 #define REG_INT_VEC_ALLOC_BITS	4
diff --git a/drivers/thunderbolt/property.c b/drivers/thunderbolt/property.c
new file mode 100644
index 000000000000..8fe913a95b4a
--- /dev/null
+++ b/drivers/thunderbolt/property.c
@@ -0,0 +1,670 @@
+/*
+ * Thunderbolt XDomain property support
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uuid.h>
+#include <linux/thunderbolt.h>
+
+struct tb_property_entry {
+	u32 key_hi;
+	u32 key_lo;
+	u16 length;
+	u8 reserved;
+	u8 type;
+	u32 value;
+};
+
+struct tb_property_rootdir_entry {
+	u32 magic;
+	u32 length;
+	struct tb_property_entry entries[];
+};
+
+struct tb_property_dir_entry {
+	u32 uuid[4];
+	struct tb_property_entry entries[];
+};
+
+#define TB_PROPERTY_ROOTDIR_MAGIC	0x55584401
+
+static struct tb_property_dir *__tb_property_parse_dir(const u32 *block,
+	size_t block_len, unsigned int dir_offset, size_t dir_len,
+	bool is_root);
+
+static inline void parse_dwdata(void *dst, const void *src, size_t dwords)
+{
+	be32_to_cpu_array(dst, src, dwords);
+}
+
+static inline void format_dwdata(void *dst, const void *src, size_t dwords)
+{
+	cpu_to_be32_array(dst, src, dwords);
+}
+
+static bool tb_property_entry_valid(const struct tb_property_entry *entry,
+				  size_t block_len)
+{
+	switch (entry->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+	case TB_PROPERTY_TYPE_DATA:
+	case TB_PROPERTY_TYPE_TEXT:
+		if (entry->length > block_len)
+			return false;
+		if (entry->value + entry->length > block_len)
+			return false;
+		break;
+
+	case TB_PROPERTY_TYPE_VALUE:
+		if (entry->length != 1)
+			return false;
+		break;
+	}
+
+	return true;
+}
+
+static bool tb_property_key_valid(const char *key)
+{
+	return key && strlen(key) <= TB_PROPERTY_KEY_SIZE;
+}
+
+static struct tb_property *
+tb_property_alloc(const char *key, enum tb_property_type type)
+{
+	struct tb_property *property;
+
+	property = kzalloc(sizeof(*property), GFP_KERNEL);
+	if (!property)
+		return NULL;
+
+	strcpy(property->key, key);
+	property->type = type;
+	INIT_LIST_HEAD(&property->list);
+
+	return property;
+}
+
+static struct tb_property *tb_property_parse(const u32 *block, size_t block_len,
+					const struct tb_property_entry *entry)
+{
+	char key[TB_PROPERTY_KEY_SIZE + 1];
+	struct tb_property *property;
+	struct tb_property_dir *dir;
+
+	if (!tb_property_entry_valid(entry, block_len))
+		return NULL;
+
+	parse_dwdata(key, entry, 2);
+	key[TB_PROPERTY_KEY_SIZE] = '\0';
+
+	property = tb_property_alloc(key, entry->type);
+	if (!property)
+		return NULL;
+
+	property->length = entry->length;
+
+	switch (property->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+		dir = __tb_property_parse_dir(block, block_len, entry->value,
+					      entry->length, false);
+		if (!dir) {
+			kfree(property);
+			return NULL;
+		}
+		property->value.dir = dir;
+		break;
+
+	case TB_PROPERTY_TYPE_DATA:
+		property->value.data = kcalloc(property->length, sizeof(u32),
+					       GFP_KERNEL);
+		if (!property->value.data) {
+			kfree(property);
+			return NULL;
+		}
+		parse_dwdata(property->value.data, block + entry->value,
+			     entry->length);
+		break;
+
+	case TB_PROPERTY_TYPE_TEXT:
+		property->value.text = kcalloc(property->length, sizeof(u32),
+					       GFP_KERNEL);
+		if (!property->value.text) {
+			kfree(property);
+			return NULL;
+		}
+		parse_dwdata(property->value.text, block + entry->value,
+			     entry->length);
+		/* Force null termination */
+		property->value.text[property->length * 4 - 1] = '\0';
+		break;
+
+	case TB_PROPERTY_TYPE_VALUE:
+		property->value.immediate = entry->value;
+		break;
+
+	default:
+		property->type = TB_PROPERTY_TYPE_UNKNOWN;
+		break;
+	}
+
+	return property;
+}
+
+static struct tb_property_dir *__tb_property_parse_dir(const u32 *block,
+	size_t block_len, unsigned int dir_offset, size_t dir_len, bool is_root)
+{
+	const struct tb_property_entry *entries;
+	size_t i, content_len, nentries;
+	unsigned int content_offset;
+	struct tb_property_dir *dir;
+
+	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		return NULL;
+
+	if (is_root) {
+		content_offset = dir_offset + 2;
+		content_len = dir_len;
+	} else {
+		dir->uuid = kmemdup(&block[dir_offset], sizeof(*dir->uuid),
+				    GFP_KERNEL);
+		content_offset = dir_offset + 4;
+		content_len = dir_len - 4; /* Length includes UUID */
+	}
+
+	entries = (const struct tb_property_entry *)&block[content_offset];
+	nentries = content_len / (sizeof(*entries) / 4);
+
+	INIT_LIST_HEAD(&dir->properties);
+
+	for (i = 0; i < nentries; i++) {
+		struct tb_property *property;
+
+		property = tb_property_parse(block, block_len, &entries[i]);
+		if (!property) {
+			tb_property_free_dir(dir);
+			return NULL;
+		}
+
+		list_add_tail(&property->list, &dir->properties);
+	}
+
+	return dir;
+}
+
+/**
+ * tb_property_parse_dir() - Parses properties from given property block
+ * @block: Property block to parse
+ * @block_len: Number of dword elements in the property block
+ *
+ * This function parses the XDomain properties data block into format that
+ * can be traversed using the helper functions provided by this module.
+ * Upon success returns the parsed directory. In case of error returns
+ * %NULL. The resulting &struct tb_property_dir needs to be released by
+ * calling tb_property_free_dir() when not needed anymore.
+ *
+ * The @block is expected to be root directory.
+ */
+struct tb_property_dir *tb_property_parse_dir(const u32 *block,
+					      size_t block_len)
+{
+	const struct tb_property_rootdir_entry *rootdir =
+		(const struct tb_property_rootdir_entry *)block;
+
+	if (rootdir->magic != TB_PROPERTY_ROOTDIR_MAGIC)
+		return NULL;
+	if (rootdir->length > block_len)
+		return NULL;
+
+	return __tb_property_parse_dir(block, block_len, 0, rootdir->length,
+				       true);
+}
+
+/**
+ * tb_property_create_dir() - Creates new property directory
+ * @uuid: UUID used to identify the particular directory
+ *
+ * Creates new, empty property directory. If @uuid is %NULL then the
+ * directory is assumed to be root directory.
+ */
+struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid)
+{
+	struct tb_property_dir *dir;
+
+	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+	if (!dir)
+		return NULL;
+
+	INIT_LIST_HEAD(&dir->properties);
+	if (uuid) {
+		dir->uuid = kmemdup(uuid, sizeof(*dir->uuid), GFP_KERNEL);
+		if (!dir->uuid) {
+			kfree(dir);
+			return NULL;
+		}
+	}
+
+	return dir;
+}
+EXPORT_SYMBOL_GPL(tb_property_create_dir);
+
+static void tb_property_free(struct tb_property *property)
+{
+	switch (property->type) {
+	case TB_PROPERTY_TYPE_DIRECTORY:
+		tb_property_free_dir(property->value.dir);
+		break;
+
+	case TB_PROPERTY_TYPE_DATA:
+		kfree(property->value.data);
+		break;
+
+	case TB_PROPERTY_TYPE_TEXT:
+		kfree(property->value.text);
+		break;
+
+	default:
+		break;
+	}
+
+	kfree(property);
+}
+
+/**
+ * tb_property_free_dir() - Release memory allocated for property directory
+ * @dir: Directory to release
+ *
+ * This will release all the memory the directory occupies including all
+ * descendants. It is OK to pass %NULL @dir, then the function does
+ * nothing.
+ */
+void tb_property_free_dir(struct tb_property_dir *dir)
+{
+	struct tb_property *property, *tmp;
+
+	if (!dir)
+		return;
+
+	list_for_each_entry_safe(property, tmp, &dir->properties, list) {
+		list_del(&property->list);
+		tb_property_free(property);
+	}
+	kfree(dir->uuid);
+	kfree(dir);
+}
+EXPORT_SYMBOL_GPL(tb_property_free_dir);
+
+static size_t tb_property_dir_length(const struct tb_property_dir *dir,
+				     bool recurse, size_t *data_len)
+{
+	const struct tb_property *property;
+	size_t len = 0;
+
+	if (dir->uuid)
+		len += sizeof(*dir->uuid) / 4;
+	else
+		len += sizeof(struct tb_property_rootdir_entry) / 4;
+
+	list_for_each_entry(property, &dir->properties, list) {
+		len += sizeof(struct tb_property_entry) / 4;
+
+		switch (property->type) {
+		case TB_PROPERTY_TYPE_DIRECTORY:
+			if (recurse) {
+				len += tb_property_dir_length(
+					property->value.dir, recurse, data_len);
+			}
+			/* Reserve dword padding after each directory */
+			if (data_len)
+				*data_len += 1;
+			break;
+
+		case TB_PROPERTY_TYPE_DATA:
+		case TB_PROPERTY_TYPE_TEXT:
+			if (data_len)
+				*data_len += property->length;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	return len;
+}
+
+static ssize_t __tb_property_format_dir(const struct tb_property_dir *dir,
+	u32 *block, unsigned int start_offset, size_t block_len)
+{
+	unsigned int data_offset, dir_end;
+	const struct tb_property *property;
+	struct tb_property_entry *entry;
+	size_t dir_len, data_len = 0;
+	int ret;
+
+	/*
+	 * The structure of property block looks like following. Leaf
+	 * data/text is included right after the directory and each
+	 * directory follows each other (even nested ones).
+	 *
+	 * +----------+ <-- start_offset
+	 * |  header  | <-- root directory header
+	 * +----------+ ---
+	 * |  entry 0 | -^--------------------.
+	 * +----------+  |                    |
+	 * |  entry 1 | -|--------------------|--.
+	 * +----------+  |                    |  |
+	 * |  entry 2 | -|-----------------.  |  |
+	 * +----------+  |                 |  |  |
+	 * :          :  |  dir_len        |  |  |
+	 * .          .  |                 |  |  |
+	 * :          :  |                 |  |  |
+	 * +----------+  |                 |  |  |
+	 * |  entry n |  v                 |  |  |
+	 * +----------+ <-- data_offset    |  |  |
+	 * |  data 0  | <------------------|--'  |
+	 * +----------+                    |     |
+	 * |  data 1  | <------------------|-----'
+	 * +----------+                    |
+	 * | 00000000 | padding            |
+	 * +----------+ <-- dir_end <------'
+	 * |   UUID   | <-- directory UUID (child directory)
+	 * +----------+
+	 * |  entry 0 |
+	 * +----------+
+	 * |  entry 1 |
+	 * +----------+
+	 * :          :
+	 * .          .
+	 * :          :
+	 * +----------+
+	 * |  entry n |
+	 * +----------+
+	 * |  data 0  |
+	 * +----------+
+	 *
+	 * We use dir_end to hold pointer to the end of the directory. It
+	 * will increase as we add directories and each directory should be
+	 * added starting from previous dir_end.
+	 */
+	dir_len = tb_property_dir_length(dir, false, &data_len);
+	data_offset = start_offset + dir_len;
+	dir_end = start_offset + data_len + dir_len;
+
+	if (data_offset > dir_end)
+		return -EINVAL;
+	if (dir_end > block_len)
+		return -EINVAL;
+
+	/* Write headers first */
+	if (dir->uuid) {
+		struct tb_property_dir_entry *pe;
+
+		pe = (struct tb_property_dir_entry *)&block[start_offset];
+		memcpy(pe->uuid, dir->uuid, sizeof(pe->uuid));
+		entry = pe->entries;
+	} else {
+		struct tb_property_rootdir_entry *re;
+
+		re = (struct tb_property_rootdir_entry *)&block[start_offset];
+		re->magic = TB_PROPERTY_ROOTDIR_MAGIC;
+		re->length = dir_len - sizeof(*re) / 4;
+		entry = re->entries;
+	}
+
+	list_for_each_entry(property, &dir->properties, list) {
+		const struct tb_property_dir *child;
+
+		format_dwdata(entry, property->key, 2);
+		entry->type = property->type;
+
+		switch (property->type) {
+		case TB_PROPERTY_TYPE_DIRECTORY:
+			child = property->value.dir;
+			ret = __tb_property_format_dir(child, block, dir_end,
+						       block_len);
+			if (ret < 0)
+				return ret;
+			entry->length = tb_property_dir_length(child, false,
+							       NULL);
+			entry->value = dir_end;
+			dir_end = ret;
+			break;
+
+		case TB_PROPERTY_TYPE_DATA:
+			format_dwdata(&block[data_offset], property->value.data,
+				      property->length);
+			entry->length = property->length;
+			entry->value = data_offset;
+			data_offset += entry->length;
+			break;
+
+		case TB_PROPERTY_TYPE_TEXT:
+			format_dwdata(&block[data_offset], property->value.text,
+				      property->length);
+			entry->length = property->length;
+			entry->value = data_offset;
+			data_offset += entry->length;
+			break;
+
+		case TB_PROPERTY_TYPE_VALUE:
+			entry->length = property->length;
+			entry->value = property->value.immediate;
+			break;
+
+		default:
+			break;
+		}
+
+		entry++;
+	}
+
+	return dir_end;
+}
+
+/**
+ * tb_property_format_dir() - Formats directory to the packed XDomain format
+ * @dir: Directory to format
+ * @block: Property block where the packed data is placed
+ * @block_len: Length of the property block
+ *
+ * This function formats the directory to the packed format that can be
+ * then send over the thunderbolt fabric to receiving host. Returns %0 in
+ * case of success and negative errno on faulure. Passing %NULL in @block
+ * returns number of entries the block takes.
+ */
+ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
+			       size_t block_len)
+{
+	ssize_t ret;
+
+	if (!block) {
+		size_t dir_len, data_len = 0;
+
+		dir_len = tb_property_dir_length(dir, true, &data_len);
+		return dir_len + data_len;
+	}
+
+	ret = __tb_property_format_dir(dir, block, 0, block_len);
+	return ret < 0 ? ret : 0;
+}
+
+/**
+ * tb_property_add_immediate() - Add immediate property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @value: Immediate value to store with the property
+ */
+int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
+			      u32 value)
+{
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_VALUE);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = 1;
+	property->value.immediate = value;
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_immediate);
+
+/**
+ * tb_property_add_data() - Adds arbitrary data property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @buf: Data buffer to add
+ * @buflen: Number of bytes in the data buffer
+ *
+ * Function takes a copy of @buf and adds it to the directory.
+ */
+int tb_property_add_data(struct tb_property_dir *parent, const char *key,
+			 const void *buf, size_t buflen)
+{
+	/* Need to pad to dword boundary */
+	size_t size = round_up(buflen, 4);
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_DATA);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = size / 4;
+	property->value.data = kzalloc(size, GFP_KERNEL);
+	memcpy(property->value.data, buf, buflen);
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_data);
+
+/**
+ * tb_property_add_text() - Adds string property to directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @text: String to add
+ *
+ * Function takes a copy of @text and adds it to the directory.
+ */
+int tb_property_add_text(struct tb_property_dir *parent, const char *key,
+			 const char *text)
+{
+	/* Need to pad to dword boundary */
+	size_t size = round_up(strlen(text) + 1, 4);
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_TEXT);
+	if (!property)
+		return -ENOMEM;
+
+	property->length = size / 4;
+	property->value.data = kzalloc(size, GFP_KERNEL);
+	strcpy(property->value.text, text);
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_text);
+
+/**
+ * tb_property_add_dir() - Adds a directory to the parent directory
+ * @parent: Directory to add the property
+ * @key: Key for the property
+ * @dir: Directory to add
+ */
+int tb_property_add_dir(struct tb_property_dir *parent, const char *key,
+			struct tb_property_dir *dir)
+{
+	struct tb_property *property;
+
+	if (!tb_property_key_valid(key))
+		return -EINVAL;
+
+	property = tb_property_alloc(key, TB_PROPERTY_TYPE_DIRECTORY);
+	if (!property)
+		return -ENOMEM;
+
+	property->value.dir = dir;
+
+	list_add_tail(&property->list, &parent->properties);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_property_add_dir);
+
+/**
+ * tb_property_remove() - Removes property from a parent directory
+ * @property: Property to remove
+ *
+ * Note memory for @property is released as well so it is not allowed to
+ * touch the object after call to this function.
+ */
+void tb_property_remove(struct tb_property *property)
+{
+	list_del(&property->list);
+	kfree(property);
+}
+EXPORT_SYMBOL_GPL(tb_property_remove);
+
+/**
+ * tb_property_find() - Find a property from a directory
+ * @dir: Directory where the property is searched
+ * @key: Key to look for
+ * @type: Type of the property
+ *
+ * Finds and returns property from the given directory. Does not recurse
+ * into sub-directories. Returns %NULL if the property was not found.
+ */
+struct tb_property *tb_property_find(struct tb_property_dir *dir,
+	const char *key, enum tb_property_type type)
+{
+	struct tb_property *property;
+
+	list_for_each_entry(property, &dir->properties, list) {
+		if (property->type == type && !strcmp(property->key, key))
+			return property;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(tb_property_find);
+
+/**
+ * tb_property_get_next() - Get next property from directory
+ * @dir: Directory holding properties
+ * @prev: Previous property in the directory (%NULL returns the first)
+ */
+struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
+					 struct tb_property *prev)
+{
+	if (prev) {
+		if (list_is_last(&prev->list, &dir->properties))
+			return NULL;
+		return list_next_entry(prev, list);
+	}
+	return list_first_entry_or_null(&dir->properties, struct tb_property,
+					list);
+}
+EXPORT_SYMBOL_GPL(tb_property_get_next);
diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
index 53f40c57df59..dfc357d33e1e 100644
--- a/drivers/thunderbolt/switch.c
+++ b/drivers/thunderbolt/switch.c
@@ -171,11 +171,11 @@ static int nvm_authenticate_host(struct tb_switch *sw)
 
 	/*
 	 * Root switch NVM upgrade requires that we disconnect the
-	 * existing PCIe paths first (in case it is not in safe mode
+	 * existing paths first (in case it is not in safe mode
 	 * already).
 	 */
 	if (!sw->safe_mode) {
-		ret = tb_domain_disconnect_pcie_paths(sw->tb);
+		ret = tb_domain_disconnect_all_paths(sw->tb);
 		if (ret)
 			return ret;
 		/*
@@ -1363,6 +1363,9 @@ void tb_switch_remove(struct tb_switch *sw)
 		if (sw->ports[i].remote)
 			tb_switch_remove(sw->ports[i].remote->sw);
 		sw->ports[i].remote = NULL;
+		if (sw->ports[i].xdomain)
+			tb_xdomain_remove(sw->ports[i].xdomain);
+		sw->ports[i].xdomain = NULL;
 	}
 
 	if (!sw->is_unplugged)
diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h
index e0deee4f1eb0..74af9d4929ab 100644
--- a/drivers/thunderbolt/tb.h
+++ b/drivers/thunderbolt/tb.h
@@ -9,6 +9,7 @@
 
 #include <linux/nvmem-provider.h>
 #include <linux/pci.h>
+#include <linux/thunderbolt.h>
 #include <linux/uuid.h>
 
 #include "tb_regs.h"
@@ -39,23 +40,7 @@ struct tb_switch_nvm {
 	bool authenticating;
 };
 
-/**
- * enum tb_security_level - Thunderbolt security level
- * @TB_SECURITY_NONE: No security, legacy mode
- * @TB_SECURITY_USER: User approval required at minimum
- * @TB_SECURITY_SECURE: One time saved key required at minimum
- * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB)
- */
-enum tb_security_level {
-	TB_SECURITY_NONE,
-	TB_SECURITY_USER,
-	TB_SECURITY_SECURE,
-	TB_SECURITY_DPONLY,
-};
-
 #define TB_SWITCH_KEY_SIZE		32
-/* Each physical port contains 2 links on modern controllers */
-#define TB_SWITCH_LINKS_PER_PHY_PORT	2
 
 /**
  * struct tb_switch - a thunderbolt switch
@@ -125,14 +110,25 @@ struct tb_switch {
 
 /**
  * struct tb_port - a thunderbolt port, part of a tb_switch
+ * @config: Cached port configuration read from registers
+ * @sw: Switch the port belongs to
+ * @remote: Remote port (%NULL if not connected)
+ * @xdomain: Remote host (%NULL if not connected)
+ * @cap_phy: Offset, zero if not found
+ * @port: Port number on switch
+ * @disabled: Disabled by eeprom
+ * @dual_link_port: If the switch is connected using two ports, points
+ *		    to the other port.
+ * @link_nr: Is this primary or secondary port on the dual_link.
  */
 struct tb_port {
 	struct tb_regs_port_header config;
 	struct tb_switch *sw;
-	struct tb_port *remote; /* remote port, NULL if not connected */
-	int cap_phy; /* offset, zero if not found */
-	u8 port; /* port number on switch */
-	bool disabled; /* disabled by eeprom */
+	struct tb_port *remote;
+	struct tb_xdomain *xdomain;
+	int cap_phy;
+	u8 port;
+	bool disabled;
 	struct tb_port *dual_link_port;
 	u8 link_nr:1;
 };
@@ -205,6 +201,8 @@ struct tb_path {
  * @add_switch_key: Add key to switch
  * @challenge_switch_key: Challenge switch using key
  * @disconnect_pcie_paths: Disconnects PCIe paths before NVM update
+ * @approve_xdomain_paths: Approve (establish) XDomain DMA paths
+ * @disconnect_xdomain_paths: Disconnect XDomain DMA paths
  */
 struct tb_cm_ops {
 	int (*driver_ready)(struct tb *tb);
@@ -221,33 +219,8 @@ struct tb_cm_ops {
 	int (*challenge_switch_key)(struct tb *tb, struct tb_switch *sw,
 				    const u8 *challenge, u8 *response);
 	int (*disconnect_pcie_paths)(struct tb *tb);
-};
-
-/**
- * struct tb - main thunderbolt bus structure
- * @dev: Domain device
- * @lock: Big lock. Must be held when accessing any struct
- *	  tb_switch / struct tb_port.
- * @nhi: Pointer to the NHI structure
- * @ctl: Control channel for this domain
- * @wq: Ordered workqueue for all domain specific work
- * @root_switch: Root switch of this domain
- * @cm_ops: Connection manager specific operations vector
- * @index: Linux assigned domain number
- * @security_level: Current security level
- * @privdata: Private connection manager specific data
- */
-struct tb {
-	struct device dev;
-	struct mutex lock;
-	struct tb_nhi *nhi;
-	struct tb_ctl *ctl;
-	struct workqueue_struct *wq;
-	struct tb_switch *root_switch;
-	const struct tb_cm_ops *cm_ops;
-	int index;
-	enum tb_security_level security_level;
-	unsigned long privdata[0];
+	int (*approve_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd);
+	int (*disconnect_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd);
 };
 
 static inline void *tb_priv(struct tb *tb)
@@ -368,13 +341,14 @@ static inline int tb_port_write(struct tb_port *port, const void *buffer,
 struct tb *icm_probe(struct tb_nhi *nhi);
 struct tb *tb_probe(struct tb_nhi *nhi);
 
-extern struct bus_type tb_bus_type;
 extern struct device_type tb_domain_type;
 extern struct device_type tb_switch_type;
 
 int tb_domain_init(void);
 void tb_domain_exit(void);
 void tb_switch_exit(void);
+int tb_xdomain_init(void);
+void tb_xdomain_exit(void);
 
 struct tb *tb_domain_alloc(struct tb_nhi *nhi, size_t privsize);
 int tb_domain_add(struct tb *tb);
@@ -387,6 +361,9 @@ int tb_domain_approve_switch(struct tb *tb, struct tb_switch *sw);
 int tb_domain_approve_switch_key(struct tb *tb, struct tb_switch *sw);
 int tb_domain_challenge_switch_key(struct tb *tb, struct tb_switch *sw);
 int tb_domain_disconnect_pcie_paths(struct tb *tb);
+int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd);
+int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd);
+int tb_domain_disconnect_all_paths(struct tb *tb);
 
 static inline void tb_domain_put(struct tb *tb)
 {
@@ -409,11 +386,6 @@ struct tb_switch *tb_switch_find_by_link_depth(struct tb *tb, u8 link,
 					       u8 depth);
 struct tb_switch *tb_switch_find_by_uuid(struct tb *tb, const uuid_t *uuid);
 
-static inline unsigned int tb_switch_phy_port_from_link(unsigned int link)
-{
-	return (link - 1) / TB_SWITCH_LINKS_PER_PHY_PORT;
-}
-
 static inline void tb_switch_put(struct tb_switch *sw)
 {
 	put_device(&sw->dev);
@@ -471,4 +443,14 @@ static inline u64 tb_downstream_route(struct tb_port *port)
 	       | ((u64) port->port << (port->sw->config.depth * 8));
 }
 
+bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type,
+			       const void *buf, size_t size);
+struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent,
+				    u64 route, const uuid_t *local_uuid,
+				    const uuid_t *remote_uuid);
+void tb_xdomain_add(struct tb_xdomain *xd);
+void tb_xdomain_remove(struct tb_xdomain *xd);
+struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link,
+						 u8 depth);
+
 #endif
diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h
index de6441e4a060..b0a092baa605 100644
--- a/drivers/thunderbolt/tb_msgs.h
+++ b/drivers/thunderbolt/tb_msgs.h
@@ -15,23 +15,6 @@
 #include <linux/types.h>
 #include <linux/uuid.h>
 
-enum tb_cfg_pkg_type {
-	TB_CFG_PKG_READ = 1,
-	TB_CFG_PKG_WRITE = 2,
-	TB_CFG_PKG_ERROR = 3,
-	TB_CFG_PKG_NOTIFY_ACK = 4,
-	TB_CFG_PKG_EVENT = 5,
-	TB_CFG_PKG_XDOMAIN_REQ = 6,
-	TB_CFG_PKG_XDOMAIN_RESP = 7,
-	TB_CFG_PKG_OVERRIDE = 8,
-	TB_CFG_PKG_RESET = 9,
-	TB_CFG_PKG_ICM_EVENT = 10,
-	TB_CFG_PKG_ICM_CMD = 11,
-	TB_CFG_PKG_ICM_RESP = 12,
-	TB_CFG_PKG_PREPARE_TO_SLEEP = 0xd,
-
-};
-
 enum tb_cfg_space {
 	TB_CFG_HOPS = 0,
 	TB_CFG_PORT = 1,
@@ -118,11 +101,14 @@ enum icm_pkg_code {
 	ICM_CHALLENGE_DEVICE = 0x5,
 	ICM_ADD_DEVICE_KEY = 0x6,
 	ICM_GET_ROUTE = 0xa,
+	ICM_APPROVE_XDOMAIN = 0x10,
 };
 
 enum icm_event_code {
 	ICM_EVENT_DEVICE_CONNECTED = 3,
 	ICM_EVENT_DEVICE_DISCONNECTED = 4,
+	ICM_EVENT_XDOMAIN_CONNECTED = 6,
+	ICM_EVENT_XDOMAIN_DISCONNECTED = 7,
 };
 
 struct icm_pkg_header {
@@ -130,7 +116,7 @@ struct icm_pkg_header {
 	u8 flags;
 	u8 packet_id;
 	u8 total_packets;
-} __packed;
+};
 
 #define ICM_FLAGS_ERROR			BIT(0)
 #define ICM_FLAGS_NO_KEY		BIT(1)
@@ -139,20 +125,20 @@ struct icm_pkg_header {
 
 struct icm_pkg_driver_ready {
 	struct icm_pkg_header hdr;
-} __packed;
+};
 
 struct icm_pkg_driver_ready_response {
 	struct icm_pkg_header hdr;
 	u8 romver;
 	u8 ramver;
 	u16 security_level;
-} __packed;
+};
 
 /* Falcon Ridge & Alpine Ridge common messages */
 
 struct icm_fr_pkg_get_topology {
 	struct icm_pkg_header hdr;
-} __packed;
+};
 
 #define ICM_GET_TOPOLOGY_PACKETS	14
 
@@ -167,7 +153,7 @@ struct icm_fr_pkg_get_topology_response {
 	u32 reserved[2];
 	u32 ports[16];
 	u32 port_hop_info[16];
-} __packed;
+};
 
 #define ICM_SWITCH_USED			BIT(0)
 #define ICM_SWITCH_UPSTREAM_PORT_MASK	GENMASK(7, 1)
@@ -184,7 +170,7 @@ struct icm_fr_event_device_connected {
 	u8 connection_id;
 	u16 link_info;
 	u32 ep_name[55];
-} __packed;
+};
 
 #define ICM_LINK_INFO_LINK_MASK		0x7
 #define ICM_LINK_INFO_DEPTH_SHIFT	4
@@ -197,13 +183,32 @@ struct icm_fr_pkg_approve_device {
 	u8 connection_key;
 	u8 connection_id;
 	u16 reserved;
-} __packed;
+};
 
 struct icm_fr_event_device_disconnected {
 	struct icm_pkg_header hdr;
 	u16 reserved;
 	u16 link_info;
-} __packed;
+};
+
+struct icm_fr_event_xdomain_connected {
+	struct icm_pkg_header hdr;
+	u16 reserved;
+	u16 link_info;
+	uuid_t remote_uuid;
+	uuid_t local_uuid;
+	u32 local_route_hi;
+	u32 local_route_lo;
+	u32 remote_route_hi;
+	u32 remote_route_lo;
+};
+
+struct icm_fr_event_xdomain_disconnected {
+	struct icm_pkg_header hdr;
+	u16 reserved;
+	u16 link_info;
+	uuid_t remote_uuid;
+};
 
 struct icm_fr_pkg_add_device_key {
 	struct icm_pkg_header hdr;
@@ -212,7 +217,7 @@ struct icm_fr_pkg_add_device_key {
 	u8 connection_id;
 	u16 reserved;
 	u32 key[8];
-} __packed;
+};
 
 struct icm_fr_pkg_add_device_key_response {
 	struct icm_pkg_header hdr;
@@ -220,7 +225,7 @@ struct icm_fr_pkg_add_device_key_response {
 	u8 connection_key;
 	u8 connection_id;
 	u16 reserved;
-} __packed;
+};
 
 struct icm_fr_pkg_challenge_device {
 	struct icm_pkg_header hdr;
@@ -229,7 +234,7 @@ struct icm_fr_pkg_challenge_device {
 	u8 connection_id;
 	u16 reserved;
 	u32 challenge[8];
-} __packed;
+};
 
 struct icm_fr_pkg_challenge_device_response {
 	struct icm_pkg_header hdr;
@@ -239,7 +244,29 @@ struct icm_fr_pkg_challenge_device_response {
 	u16 reserved;
 	u32 challenge[8];
 	u32 response[8];
-} __packed;
+};
+
+struct icm_fr_pkg_approve_xdomain {
+	struct icm_pkg_header hdr;
+	u16 reserved;
+	u16 link_info;
+	uuid_t remote_uuid;
+	u16 transmit_path;
+	u16 transmit_ring;
+	u16 receive_path;
+	u16 receive_ring;
+};
+
+struct icm_fr_pkg_approve_xdomain_response {
+	struct icm_pkg_header hdr;
+	u16 reserved;
+	u16 link_info;
+	uuid_t remote_uuid;
+	u16 transmit_path;
+	u16 transmit_ring;
+	u16 receive_path;
+	u16 receive_ring;
+};
 
 /* Alpine Ridge only messages */
 
@@ -247,7 +274,7 @@ struct icm_ar_pkg_get_route {
 	struct icm_pkg_header hdr;
 	u16 reserved;
 	u16 link_info;
-} __packed;
+};
 
 struct icm_ar_pkg_get_route_response {
 	struct icm_pkg_header hdr;
@@ -255,6 +282,85 @@ struct icm_ar_pkg_get_route_response {
 	u16 link_info;
 	u32 route_hi;
 	u32 route_lo;
-} __packed;
+};
+
+/* XDomain messages */
+
+struct tb_xdomain_header {
+	u32 route_hi;
+	u32 route_lo;
+	u32 length_sn;
+};
+
+#define TB_XDOMAIN_LENGTH_MASK	GENMASK(5, 0)
+#define TB_XDOMAIN_SN_MASK	GENMASK(28, 27)
+#define TB_XDOMAIN_SN_SHIFT	27
+
+enum tb_xdp_type {
+	UUID_REQUEST_OLD = 1,
+	UUID_RESPONSE = 2,
+	PROPERTIES_REQUEST,
+	PROPERTIES_RESPONSE,
+	PROPERTIES_CHANGED_REQUEST,
+	PROPERTIES_CHANGED_RESPONSE,
+	ERROR_RESPONSE,
+	UUID_REQUEST = 12,
+};
+
+struct tb_xdp_header {
+	struct tb_xdomain_header xd_hdr;
+	uuid_t uuid;
+	u32 type;
+};
+
+struct tb_xdp_properties {
+	struct tb_xdp_header hdr;
+	uuid_t src_uuid;
+	uuid_t dst_uuid;
+	u16 offset;
+	u16 reserved;
+};
+
+struct tb_xdp_properties_response {
+	struct tb_xdp_header hdr;
+	uuid_t src_uuid;
+	uuid_t dst_uuid;
+	u16 offset;
+	u16 data_length;
+	u32 generation;
+	u32 data[0];
+};
+
+/*
+ * Max length of data array single XDomain property response is allowed
+ * to carry.
+ */
+#define TB_XDP_PROPERTIES_MAX_DATA_LENGTH	\
+	(((256 - 4 - sizeof(struct tb_xdp_properties_response))) / 4)
+
+/* Maximum size of the total property block in dwords we allow */
+#define TB_XDP_PROPERTIES_MAX_LENGTH		500
+
+struct tb_xdp_properties_changed {
+	struct tb_xdp_header hdr;
+	uuid_t src_uuid;
+};
+
+struct tb_xdp_properties_changed_response {
+	struct tb_xdp_header hdr;
+};
+
+enum tb_xdp_error {
+	ERROR_SUCCESS,
+	ERROR_UNKNOWN_PACKET,
+	ERROR_UNKNOWN_DOMAIN,
+	ERROR_NOT_SUPPORTED,
+	ERROR_NOT_READY,
+};
+
+struct tb_xdp_error_response {
+	struct tb_xdp_header hdr;
+	u32 error;
+};
 
 #endif
diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
new file mode 100644
index 000000000000..f2d06f6f7be9
--- /dev/null
+++ b/drivers/thunderbolt/xdomain.c
@@ -0,0 +1,1576 @@
+/*
+ * Thunderbolt XDomain discovery protocol support
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/uuid.h>
+#include <linux/workqueue.h>
+
+#include "tb.h"
+
+#define XDOMAIN_DEFAULT_TIMEOUT			5000 /* ms */
+#define XDOMAIN_PROPERTIES_RETRIES		60
+#define XDOMAIN_PROPERTIES_CHANGED_RETRIES	10
+
+struct xdomain_request_work {
+	struct work_struct work;
+	struct tb_xdp_header *pkg;
+	struct tb *tb;
+};
+
+/* Serializes access to the properties and protocol handlers below */
+static DEFINE_MUTEX(xdomain_lock);
+
+/* Properties exposed to the remote domains */
+static struct tb_property_dir *xdomain_property_dir;
+static u32 *xdomain_property_block;
+static u32 xdomain_property_block_len;
+static u32 xdomain_property_block_gen;
+
+/* Additional protocol handlers */
+static LIST_HEAD(protocol_handlers);
+
+/* UUID for XDomain discovery protocol: b638d70e-42ff-40bb-97c2-90e2c0b2ff07 */
+static const uuid_t tb_xdp_uuid =
+	UUID_INIT(0xb638d70e, 0x42ff, 0x40bb,
+		  0x97, 0xc2, 0x90, 0xe2, 0xc0, 0xb2, 0xff, 0x07);
+
+static bool tb_xdomain_match(const struct tb_cfg_request *req,
+			     const struct ctl_pkg *pkg)
+{
+	switch (pkg->frame.eof) {
+	case TB_CFG_PKG_ERROR:
+		return true;
+
+	case TB_CFG_PKG_XDOMAIN_RESP: {
+		const struct tb_xdp_header *res_hdr = pkg->buffer;
+		const struct tb_xdp_header *req_hdr = req->request;
+		u8 req_seq, res_seq;
+
+		if (pkg->frame.size < req->response_size / 4)
+			return false;
+
+		/* Make sure route matches */
+		if ((res_hdr->xd_hdr.route_hi & ~BIT(31)) !=
+		     req_hdr->xd_hdr.route_hi)
+			return false;
+		if ((res_hdr->xd_hdr.route_lo) != req_hdr->xd_hdr.route_lo)
+			return false;
+
+		/* Then check that the sequence number matches */
+		res_seq = res_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK;
+		res_seq >>= TB_XDOMAIN_SN_SHIFT;
+		req_seq = req_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK;
+		req_seq >>= TB_XDOMAIN_SN_SHIFT;
+		if (res_seq != req_seq)
+			return false;
+
+		/* Check that the XDomain protocol matches */
+		if (!uuid_equal(&res_hdr->uuid, &req_hdr->uuid))
+			return false;
+
+		return true;
+	}
+
+	default:
+		return false;
+	}
+}
+
+static bool tb_xdomain_copy(struct tb_cfg_request *req,
+			    const struct ctl_pkg *pkg)
+{
+	memcpy(req->response, pkg->buffer, req->response_size);
+	req->result.err = 0;
+	return true;
+}
+
+static void response_ready(void *data)
+{
+	tb_cfg_request_put(data);
+}
+
+static int __tb_xdomain_response(struct tb_ctl *ctl, const void *response,
+				 size_t size, enum tb_cfg_pkg_type type)
+{
+	struct tb_cfg_request *req;
+
+	req = tb_cfg_request_alloc();
+	if (!req)
+		return -ENOMEM;
+
+	req->match = tb_xdomain_match;
+	req->copy = tb_xdomain_copy;
+	req->request = response;
+	req->request_size = size;
+	req->request_type = type;
+
+	return tb_cfg_request(ctl, req, response_ready, req);
+}
+
+/**
+ * tb_xdomain_response() - Send a XDomain response message
+ * @xd: XDomain to send the message
+ * @response: Response to send
+ * @size: Size of the response
+ * @type: PDF type of the response
+ *
+ * This can be used to send a XDomain response message to the other
+ * domain. No response for the message is expected.
+ *
+ * Return: %0 in case of success and negative errno in case of failure
+ */
+int tb_xdomain_response(struct tb_xdomain *xd, const void *response,
+			size_t size, enum tb_cfg_pkg_type type)
+{
+	return __tb_xdomain_response(xd->tb->ctl, response, size, type);
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_response);
+
+static int __tb_xdomain_request(struct tb_ctl *ctl, const void *request,
+	size_t request_size, enum tb_cfg_pkg_type request_type, void *response,
+	size_t response_size, enum tb_cfg_pkg_type response_type,
+	unsigned int timeout_msec)
+{
+	struct tb_cfg_request *req;
+	struct tb_cfg_result res;
+
+	req = tb_cfg_request_alloc();
+	if (!req)
+		return -ENOMEM;
+
+	req->match = tb_xdomain_match;
+	req->copy = tb_xdomain_copy;
+	req->request = request;
+	req->request_size = request_size;
+	req->request_type = request_type;
+	req->response = response;
+	req->response_size = response_size;
+	req->response_type = response_type;
+
+	res = tb_cfg_request_sync(ctl, req, timeout_msec);
+
+	tb_cfg_request_put(req);
+
+	return res.err == 1 ? -EIO : res.err;
+}
+
+/**
+ * tb_xdomain_request() - Send a XDomain request
+ * @xd: XDomain to send the request
+ * @request: Request to send
+ * @request_size: Size of the request in bytes
+ * @request_type: PDF type of the request
+ * @response: Response is copied here
+ * @response_size: Expected size of the response in bytes
+ * @response_type: Expected PDF type of the response
+ * @timeout_msec: Timeout in milliseconds to wait for the response
+ *
+ * This function can be used to send XDomain control channel messages to
+ * the other domain. The function waits until the response is received
+ * or when timeout triggers. Whichever comes first.
+ *
+ * Return: %0 in case of success and negative errno in case of failure
+ */
+int tb_xdomain_request(struct tb_xdomain *xd, const void *request,
+	size_t request_size, enum tb_cfg_pkg_type request_type,
+	void *response, size_t response_size,
+	enum tb_cfg_pkg_type response_type, unsigned int timeout_msec)
+{
+	return __tb_xdomain_request(xd->tb->ctl, request, request_size,
+				    request_type, response, response_size,
+				    response_type, timeout_msec);
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_request);
+
+static inline void tb_xdp_fill_header(struct tb_xdp_header *hdr, u64 route,
+	u8 sequence, enum tb_xdp_type type, size_t size)
+{
+	u32 length_sn;
+
+	length_sn = (size - sizeof(hdr->xd_hdr)) / 4;
+	length_sn |= (sequence << TB_XDOMAIN_SN_SHIFT) & TB_XDOMAIN_SN_MASK;
+
+	hdr->xd_hdr.route_hi = upper_32_bits(route);
+	hdr->xd_hdr.route_lo = lower_32_bits(route);
+	hdr->xd_hdr.length_sn = length_sn;
+	hdr->type = type;
+	memcpy(&hdr->uuid, &tb_xdp_uuid, sizeof(tb_xdp_uuid));
+}
+
+static int tb_xdp_handle_error(const struct tb_xdp_header *hdr)
+{
+	const struct tb_xdp_error_response *error;
+
+	if (hdr->type != ERROR_RESPONSE)
+		return 0;
+
+	error = (const struct tb_xdp_error_response *)hdr;
+
+	switch (error->error) {
+	case ERROR_UNKNOWN_PACKET:
+	case ERROR_UNKNOWN_DOMAIN:
+		return -EIO;
+	case ERROR_NOT_SUPPORTED:
+		return -ENOTSUPP;
+	case ERROR_NOT_READY:
+		return -EAGAIN;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int tb_xdp_error_response(struct tb_ctl *ctl, u64 route, u8 sequence,
+				 enum tb_xdp_error error)
+{
+	struct tb_xdp_error_response res;
+
+	memset(&res, 0, sizeof(res));
+	tb_xdp_fill_header(&res.hdr, route, sequence, ERROR_RESPONSE,
+			   sizeof(res));
+	res.error = error;
+
+	return __tb_xdomain_response(ctl, &res, sizeof(res),
+				     TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+static int tb_xdp_properties_request(struct tb_ctl *ctl, u64 route,
+	const uuid_t *src_uuid, const uuid_t *dst_uuid, int retry,
+	u32 **block, u32 *generation)
+{
+	struct tb_xdp_properties_response *res;
+	struct tb_xdp_properties req;
+	u16 data_len, len;
+	size_t total_size;
+	u32 *data = NULL;
+	int ret;
+
+	total_size = sizeof(*res) + TB_XDP_PROPERTIES_MAX_DATA_LENGTH * 4;
+	res = kzalloc(total_size, GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	memset(&req, 0, sizeof(req));
+	tb_xdp_fill_header(&req.hdr, route, retry % 4, PROPERTIES_REQUEST,
+			   sizeof(req));
+	memcpy(&req.src_uuid, src_uuid, sizeof(*src_uuid));
+	memcpy(&req.dst_uuid, dst_uuid, sizeof(*dst_uuid));
+
+	len = 0;
+	data_len = 0;
+
+	do {
+		ret = __tb_xdomain_request(ctl, &req, sizeof(req),
+					   TB_CFG_PKG_XDOMAIN_REQ, res,
+					   total_size, TB_CFG_PKG_XDOMAIN_RESP,
+					   XDOMAIN_DEFAULT_TIMEOUT);
+		if (ret)
+			goto err;
+
+		ret = tb_xdp_handle_error(&res->hdr);
+		if (ret)
+			goto err;
+
+		/*
+		 * Package length includes the whole payload without the
+		 * XDomain header. Validate first that the package is at
+		 * least size of the response structure.
+		 */
+		len = res->hdr.xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK;
+		if (len < sizeof(*res) / 4) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		len += sizeof(res->hdr.xd_hdr) / 4;
+		len -= sizeof(*res) / 4;
+
+		if (res->offset != req.offset) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/*
+		 * First time allocate block that has enough space for
+		 * the whole properties block.
+		 */
+		if (!data) {
+			data_len = res->data_length;
+			if (data_len > TB_XDP_PROPERTIES_MAX_LENGTH) {
+				ret = -E2BIG;
+				goto err;
+			}
+
+			data = kcalloc(data_len, sizeof(u32), GFP_KERNEL);
+			if (!data) {
+				ret = -ENOMEM;
+				goto err;
+			}
+		}
+
+		memcpy(data + req.offset, res->data, len * 4);
+		req.offset += len;
+	} while (!data_len || req.offset < data_len);
+
+	*block = data;
+	*generation = res->generation;
+
+	kfree(res);
+
+	return data_len;
+
+err:
+	kfree(data);
+	kfree(res);
+
+	return ret;
+}
+
+static int tb_xdp_properties_response(struct tb *tb, struct tb_ctl *ctl,
+	u64 route, u8 sequence, const uuid_t *src_uuid,
+	const struct tb_xdp_properties *req)
+{
+	struct tb_xdp_properties_response *res;
+	size_t total_size;
+	u16 len;
+	int ret;
+
+	/*
+	 * Currently we expect all requests to be directed to us. The
+	 * protocol supports forwarding, though which we might add
+	 * support later on.
+	 */
+	if (!uuid_equal(src_uuid, &req->dst_uuid)) {
+		tb_xdp_error_response(ctl, route, sequence,
+				      ERROR_UNKNOWN_DOMAIN);
+		return 0;
+	}
+
+	mutex_lock(&xdomain_lock);
+
+	if (req->offset >= xdomain_property_block_len) {
+		mutex_unlock(&xdomain_lock);
+		return -EINVAL;
+	}
+
+	len = xdomain_property_block_len - req->offset;
+	len = min_t(u16, len, TB_XDP_PROPERTIES_MAX_DATA_LENGTH);
+	total_size = sizeof(*res) + len * 4;
+
+	res = kzalloc(total_size, GFP_KERNEL);
+	if (!res) {
+		mutex_unlock(&xdomain_lock);
+		return -ENOMEM;
+	}
+
+	tb_xdp_fill_header(&res->hdr, route, sequence, PROPERTIES_RESPONSE,
+			   total_size);
+	res->generation = xdomain_property_block_gen;
+	res->data_length = xdomain_property_block_len;
+	res->offset = req->offset;
+	uuid_copy(&res->src_uuid, src_uuid);
+	uuid_copy(&res->dst_uuid, &req->src_uuid);
+	memcpy(res->data, &xdomain_property_block[req->offset], len * 4);
+
+	mutex_unlock(&xdomain_lock);
+
+	ret = __tb_xdomain_response(ctl, res, total_size,
+				    TB_CFG_PKG_XDOMAIN_RESP);
+
+	kfree(res);
+	return ret;
+}
+
+static int tb_xdp_properties_changed_request(struct tb_ctl *ctl, u64 route,
+					     int retry, const uuid_t *uuid)
+{
+	struct tb_xdp_properties_changed_response res;
+	struct tb_xdp_properties_changed req;
+	int ret;
+
+	memset(&req, 0, sizeof(req));
+	tb_xdp_fill_header(&req.hdr, route, retry % 4,
+			   PROPERTIES_CHANGED_REQUEST, sizeof(req));
+	uuid_copy(&req.src_uuid, uuid);
+
+	memset(&res, 0, sizeof(res));
+	ret = __tb_xdomain_request(ctl, &req, sizeof(req),
+				   TB_CFG_PKG_XDOMAIN_REQ, &res, sizeof(res),
+				   TB_CFG_PKG_XDOMAIN_RESP,
+				   XDOMAIN_DEFAULT_TIMEOUT);
+	if (ret)
+		return ret;
+
+	return tb_xdp_handle_error(&res.hdr);
+}
+
+static int
+tb_xdp_properties_changed_response(struct tb_ctl *ctl, u64 route, u8 sequence)
+{
+	struct tb_xdp_properties_changed_response res;
+
+	memset(&res, 0, sizeof(res));
+	tb_xdp_fill_header(&res.hdr, route, sequence,
+			   PROPERTIES_CHANGED_RESPONSE, sizeof(res));
+	return __tb_xdomain_response(ctl, &res, sizeof(res),
+				     TB_CFG_PKG_XDOMAIN_RESP);
+}
+
+/**
+ * tb_register_protocol_handler() - Register protocol handler
+ * @handler: Handler to register
+ *
+ * This allows XDomain service drivers to hook into incoming XDomain
+ * messages. After this function is called the service driver needs to
+ * be able to handle calls to callback whenever a package with the
+ * registered protocol is received.
+ */
+int tb_register_protocol_handler(struct tb_protocol_handler *handler)
+{
+	if (!handler->uuid || !handler->callback)
+		return -EINVAL;
+	if (uuid_equal(handler->uuid, &tb_xdp_uuid))
+		return -EINVAL;
+
+	mutex_lock(&xdomain_lock);
+	list_add_tail(&handler->list, &protocol_handlers);
+	mutex_unlock(&xdomain_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tb_register_protocol_handler);
+
+/**
+ * tb_unregister_protocol_handler() - Unregister protocol handler
+ * @handler: Handler to unregister
+ *
+ * Removes the previously registered protocol handler.
+ */
+void tb_unregister_protocol_handler(struct tb_protocol_handler *handler)
+{
+	mutex_lock(&xdomain_lock);
+	list_del_init(&handler->list);
+	mutex_unlock(&xdomain_lock);
+}
+EXPORT_SYMBOL_GPL(tb_unregister_protocol_handler);
+
+static void tb_xdp_handle_request(struct work_struct *work)
+{
+	struct xdomain_request_work *xw = container_of(work, typeof(*xw), work);
+	const struct tb_xdp_header *pkg = xw->pkg;
+	const struct tb_xdomain_header *xhdr = &pkg->xd_hdr;
+	struct tb *tb = xw->tb;
+	struct tb_ctl *ctl = tb->ctl;
+	const uuid_t *uuid;
+	int ret = 0;
+	u8 sequence;
+	u64 route;
+
+	route = ((u64)xhdr->route_hi << 32 | xhdr->route_lo) & ~BIT_ULL(63);
+	sequence = xhdr->length_sn & TB_XDOMAIN_SN_MASK;
+	sequence >>= TB_XDOMAIN_SN_SHIFT;
+
+	mutex_lock(&tb->lock);
+	if (tb->root_switch)
+		uuid = tb->root_switch->uuid;
+	else
+		uuid = NULL;
+	mutex_unlock(&tb->lock);
+
+	if (!uuid) {
+		tb_xdp_error_response(ctl, route, sequence, ERROR_NOT_READY);
+		goto out;
+	}
+
+	switch (pkg->type) {
+	case PROPERTIES_REQUEST:
+		ret = tb_xdp_properties_response(tb, ctl, route, sequence, uuid,
+			(const struct tb_xdp_properties *)pkg);
+		break;
+
+	case PROPERTIES_CHANGED_REQUEST: {
+		const struct tb_xdp_properties_changed *xchg =
+			(const struct tb_xdp_properties_changed *)pkg;
+		struct tb_xdomain *xd;
+
+		ret = tb_xdp_properties_changed_response(ctl, route, sequence);
+
+		/*
+		 * Since the properties have been changed, let's update
+		 * the xdomain related to this connection as well in
+		 * case there is a change in services it offers.
+		 */
+		xd = tb_xdomain_find_by_uuid_locked(tb, &xchg->src_uuid);
+		if (xd) {
+			queue_delayed_work(tb->wq, &xd->get_properties_work,
+					   msecs_to_jiffies(50));
+			tb_xdomain_put(xd);
+		}
+
+		break;
+	}
+
+	default:
+		break;
+	}
+
+	if (ret) {
+		tb_warn(tb, "failed to send XDomain response for %#x\n",
+			pkg->type);
+	}
+
+out:
+	kfree(xw->pkg);
+	kfree(xw);
+}
+
+static void
+tb_xdp_schedule_request(struct tb *tb, const struct tb_xdp_header *hdr,
+			size_t size)
+{
+	struct xdomain_request_work *xw;
+
+	xw = kmalloc(sizeof(*xw), GFP_KERNEL);
+	if (!xw)
+		return;
+
+	INIT_WORK(&xw->work, tb_xdp_handle_request);
+	xw->pkg = kmemdup(hdr, size, GFP_KERNEL);
+	xw->tb = tb;
+
+	queue_work(tb->wq, &xw->work);
+}
+
+/**
+ * tb_register_service_driver() - Register XDomain service driver
+ * @drv: Driver to register
+ *
+ * Registers new service driver from @drv to the bus.
+ */
+int tb_register_service_driver(struct tb_service_driver *drv)
+{
+	drv->driver.bus = &tb_bus_type;
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(tb_register_service_driver);
+
+/**
+ * tb_unregister_service_driver() - Unregister XDomain service driver
+ * @xdrv: Driver to unregister
+ *
+ * Unregisters XDomain service driver from the bus.
+ */
+void tb_unregister_service_driver(struct tb_service_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(tb_unregister_service_driver);
+
+static ssize_t key_show(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+
+	/*
+	 * It should be null terminated but anything else is pretty much
+	 * allowed.
+	 */
+	return sprintf(buf, "%*pEp\n", (int)strlen(svc->key), svc->key);
+}
+static DEVICE_ATTR_RO(key);
+
+static int get_modalias(struct tb_service *svc, char *buf, size_t size)
+{
+	return snprintf(buf, size, "tbsvc:k%sp%08Xv%08Xr%08X", svc->key,
+			svc->prtcid, svc->prtcvers, svc->prtcrevs);
+}
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+
+	/* Full buffer size except new line and null termination */
+	get_modalias(svc, buf, PAGE_SIZE - 2);
+	return sprintf(buf, "%s\n", buf);
+}
+static DEVICE_ATTR_RO(modalias);
+
+static ssize_t prtcid_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+
+	return sprintf(buf, "%u\n", svc->prtcid);
+}
+static DEVICE_ATTR_RO(prtcid);
+
+static ssize_t prtcvers_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+
+	return sprintf(buf, "%u\n", svc->prtcvers);
+}
+static DEVICE_ATTR_RO(prtcvers);
+
+static ssize_t prtcrevs_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+
+	return sprintf(buf, "%u\n", svc->prtcrevs);
+}
+static DEVICE_ATTR_RO(prtcrevs);
+
+static ssize_t prtcstns_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+
+	return sprintf(buf, "0x%08x\n", svc->prtcstns);
+}
+static DEVICE_ATTR_RO(prtcstns);
+
+static struct attribute *tb_service_attrs[] = {
+	&dev_attr_key.attr,
+	&dev_attr_modalias.attr,
+	&dev_attr_prtcid.attr,
+	&dev_attr_prtcvers.attr,
+	&dev_attr_prtcrevs.attr,
+	&dev_attr_prtcstns.attr,
+	NULL,
+};
+
+static struct attribute_group tb_service_attr_group = {
+	.attrs = tb_service_attrs,
+};
+
+static const struct attribute_group *tb_service_attr_groups[] = {
+	&tb_service_attr_group,
+	NULL,
+};
+
+static int tb_service_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+	char modalias[64];
+
+	get_modalias(svc, modalias, sizeof(modalias));
+	return add_uevent_var(env, "MODALIAS=%s", modalias);
+}
+
+static void tb_service_release(struct device *dev)
+{
+	struct tb_service *svc = container_of(dev, struct tb_service, dev);
+	struct tb_xdomain *xd = tb_service_parent(svc);
+
+	ida_simple_remove(&xd->service_ids, svc->id);
+	kfree(svc->key);
+	kfree(svc);
+}
+
+struct device_type tb_service_type = {
+	.name = "thunderbolt_service",
+	.groups = tb_service_attr_groups,
+	.uevent = tb_service_uevent,
+	.release = tb_service_release,
+};
+EXPORT_SYMBOL_GPL(tb_service_type);
+
+static int remove_missing_service(struct device *dev, void *data)
+{
+	struct tb_xdomain *xd = data;
+	struct tb_service *svc;
+
+	svc = tb_to_service(dev);
+	if (!svc)
+		return 0;
+
+	if (!tb_property_find(xd->properties, svc->key,
+			      TB_PROPERTY_TYPE_DIRECTORY))
+		device_unregister(dev);
+
+	return 0;
+}
+
+static int find_service(struct device *dev, void *data)
+{
+	const struct tb_property *p = data;
+	struct tb_service *svc;
+
+	svc = tb_to_service(dev);
+	if (!svc)
+		return 0;
+
+	return !strcmp(svc->key, p->key);
+}
+
+static int populate_service(struct tb_service *svc,
+			    struct tb_property *property)
+{
+	struct tb_property_dir *dir = property->value.dir;
+	struct tb_property *p;
+
+	/* Fill in standard properties */
+	p = tb_property_find(dir, "prtcid", TB_PROPERTY_TYPE_VALUE);
+	if (p)
+		svc->prtcid = p->value.immediate;
+	p = tb_property_find(dir, "prtcvers", TB_PROPERTY_TYPE_VALUE);
+	if (p)
+		svc->prtcvers = p->value.immediate;
+	p = tb_property_find(dir, "prtcrevs", TB_PROPERTY_TYPE_VALUE);
+	if (p)
+		svc->prtcrevs = p->value.immediate;
+	p = tb_property_find(dir, "prtcstns", TB_PROPERTY_TYPE_VALUE);
+	if (p)
+		svc->prtcstns = p->value.immediate;
+
+	svc->key = kstrdup(property->key, GFP_KERNEL);
+	if (!svc->key)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void enumerate_services(struct tb_xdomain *xd)
+{
+	struct tb_service *svc;
+	struct tb_property *p;
+	struct device *dev;
+
+	/*
+	 * First remove all services that are not available anymore in
+	 * the updated property block.
+	 */
+	device_for_each_child_reverse(&xd->dev, xd, remove_missing_service);
+
+	/* Then re-enumerate properties creating new services as we go */
+	tb_property_for_each(xd->properties, p) {
+		if (p->type != TB_PROPERTY_TYPE_DIRECTORY)
+			continue;
+
+		/* If the service exists already we are fine */
+		dev = device_find_child(&xd->dev, p, find_service);
+		if (dev) {
+			put_device(dev);
+			continue;
+		}
+
+		svc = kzalloc(sizeof(*svc), GFP_KERNEL);
+		if (!svc)
+			break;
+
+		if (populate_service(svc, p)) {
+			kfree(svc);
+			break;
+		}
+
+		svc->id = ida_simple_get(&xd->service_ids, 0, 0, GFP_KERNEL);
+		svc->dev.bus = &tb_bus_type;
+		svc->dev.type = &tb_service_type;
+		svc->dev.parent = &xd->dev;
+		dev_set_name(&svc->dev, "%s.%d", dev_name(&xd->dev), svc->id);
+
+		if (device_register(&svc->dev)) {
+			put_device(&svc->dev);
+			break;
+		}
+	}
+}
+
+static int populate_properties(struct tb_xdomain *xd,
+			       struct tb_property_dir *dir)
+{
+	const struct tb_property *p;
+
+	/* Required properties */
+	p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_VALUE);
+	if (!p)
+		return -EINVAL;
+	xd->device = p->value.immediate;
+
+	p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_VALUE);
+	if (!p)
+		return -EINVAL;
+	xd->vendor = p->value.immediate;
+
+	kfree(xd->device_name);
+	xd->device_name = NULL;
+	kfree(xd->vendor_name);
+	xd->vendor_name = NULL;
+
+	/* Optional properties */
+	p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_TEXT);
+	if (p)
+		xd->device_name = kstrdup(p->value.text, GFP_KERNEL);
+	p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_TEXT);
+	if (p)
+		xd->vendor_name = kstrdup(p->value.text, GFP_KERNEL);
+
+	return 0;
+}
+
+/* Called with @xd->lock held */
+static void tb_xdomain_restore_paths(struct tb_xdomain *xd)
+{
+	if (!xd->resume)
+		return;
+
+	xd->resume = false;
+	if (xd->transmit_path) {
+		dev_dbg(&xd->dev, "re-establishing DMA path\n");
+		tb_domain_approve_xdomain_paths(xd->tb, xd);
+	}
+}
+
+static void tb_xdomain_get_properties(struct work_struct *work)
+{
+	struct tb_xdomain *xd = container_of(work, typeof(*xd),
+					     get_properties_work.work);
+	struct tb_property_dir *dir;
+	struct tb *tb = xd->tb;
+	bool update = false;
+	u32 *block = NULL;
+	u32 gen = 0;
+	int ret;
+
+	ret = tb_xdp_properties_request(tb->ctl, xd->route, xd->local_uuid,
+					xd->remote_uuid, xd->properties_retries,
+					&block, &gen);
+	if (ret < 0) {
+		if (xd->properties_retries-- > 0) {
+			queue_delayed_work(xd->tb->wq, &xd->get_properties_work,
+					   msecs_to_jiffies(1000));
+		} else {
+			/* Give up now */
+			dev_err(&xd->dev,
+				"failed read XDomain properties from %pUb\n",
+				xd->remote_uuid);
+		}
+		return;
+	}
+
+	xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES;
+
+	mutex_lock(&xd->lock);
+
+	/* Only accept newer generation properties */
+	if (xd->properties && gen <= xd->property_block_gen) {
+		/*
+		 * On resume it is likely that the properties block is
+		 * not changed (unless the other end added or removed
+		 * services). However, we need to make sure the existing
+		 * DMA paths are restored properly.
+		 */
+		tb_xdomain_restore_paths(xd);
+		goto err_free_block;
+	}
+
+	dir = tb_property_parse_dir(block, ret);
+	if (!dir) {
+		dev_err(&xd->dev, "failed to parse XDomain properties\n");
+		goto err_free_block;
+	}
+
+	ret = populate_properties(xd, dir);
+	if (ret) {
+		dev_err(&xd->dev, "missing XDomain properties in response\n");
+		goto err_free_dir;
+	}
+
+	/* Release the existing one */
+	if (xd->properties) {
+		tb_property_free_dir(xd->properties);
+		update = true;
+	}
+
+	xd->properties = dir;
+	xd->property_block_gen = gen;
+
+	tb_xdomain_restore_paths(xd);
+
+	mutex_unlock(&xd->lock);
+
+	kfree(block);
+
+	/*
+	 * Now the device should be ready enough so we can add it to the
+	 * bus and let userspace know about it. If the device is already
+	 * registered, we notify the userspace that it has changed.
+	 */
+	if (!update) {
+		if (device_add(&xd->dev)) {
+			dev_err(&xd->dev, "failed to add XDomain device\n");
+			return;
+		}
+	} else {
+		kobject_uevent(&xd->dev.kobj, KOBJ_CHANGE);
+	}
+
+	enumerate_services(xd);
+	return;
+
+err_free_dir:
+	tb_property_free_dir(dir);
+err_free_block:
+	kfree(block);
+	mutex_unlock(&xd->lock);
+}
+
+static void tb_xdomain_properties_changed(struct work_struct *work)
+{
+	struct tb_xdomain *xd = container_of(work, typeof(*xd),
+					     properties_changed_work.work);
+	int ret;
+
+	ret = tb_xdp_properties_changed_request(xd->tb->ctl, xd->route,
+				xd->properties_changed_retries, xd->local_uuid);
+	if (ret) {
+		if (xd->properties_changed_retries-- > 0)
+			queue_delayed_work(xd->tb->wq,
+					   &xd->properties_changed_work,
+					   msecs_to_jiffies(1000));
+		return;
+	}
+
+	xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES;
+}
+
+static ssize_t device_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	return sprintf(buf, "%#x\n", xd->device);
+}
+static DEVICE_ATTR_RO(device);
+
+static ssize_t
+device_name_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+	int ret;
+
+	if (mutex_lock_interruptible(&xd->lock))
+		return -ERESTARTSYS;
+	ret = sprintf(buf, "%s\n", xd->device_name ? xd->device_name : "");
+	mutex_unlock(&xd->lock);
+
+	return ret;
+}
+static DEVICE_ATTR_RO(device_name);
+
+static ssize_t vendor_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	return sprintf(buf, "%#x\n", xd->vendor);
+}
+static DEVICE_ATTR_RO(vendor);
+
+static ssize_t
+vendor_name_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+	int ret;
+
+	if (mutex_lock_interruptible(&xd->lock))
+		return -ERESTARTSYS;
+	ret = sprintf(buf, "%s\n", xd->vendor_name ? xd->vendor_name : "");
+	mutex_unlock(&xd->lock);
+
+	return ret;
+}
+static DEVICE_ATTR_RO(vendor_name);
+
+static ssize_t unique_id_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	return sprintf(buf, "%pUb\n", xd->remote_uuid);
+}
+static DEVICE_ATTR_RO(unique_id);
+
+static struct attribute *xdomain_attrs[] = {
+	&dev_attr_device.attr,
+	&dev_attr_device_name.attr,
+	&dev_attr_unique_id.attr,
+	&dev_attr_vendor.attr,
+	&dev_attr_vendor_name.attr,
+	NULL,
+};
+
+static struct attribute_group xdomain_attr_group = {
+	.attrs = xdomain_attrs,
+};
+
+static const struct attribute_group *xdomain_attr_groups[] = {
+	&xdomain_attr_group,
+	NULL,
+};
+
+static void tb_xdomain_release(struct device *dev)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	put_device(xd->dev.parent);
+
+	tb_property_free_dir(xd->properties);
+	ida_destroy(&xd->service_ids);
+
+	kfree(xd->local_uuid);
+	kfree(xd->remote_uuid);
+	kfree(xd->device_name);
+	kfree(xd->vendor_name);
+	kfree(xd);
+}
+
+static void start_handshake(struct tb_xdomain *xd)
+{
+	xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES;
+	xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES;
+
+	/* Start exchanging properties with the other host */
+	queue_delayed_work(xd->tb->wq, &xd->properties_changed_work,
+			   msecs_to_jiffies(100));
+	queue_delayed_work(xd->tb->wq, &xd->get_properties_work,
+			   msecs_to_jiffies(1000));
+}
+
+static void stop_handshake(struct tb_xdomain *xd)
+{
+	xd->properties_retries = 0;
+	xd->properties_changed_retries = 0;
+
+	cancel_delayed_work_sync(&xd->get_properties_work);
+	cancel_delayed_work_sync(&xd->properties_changed_work);
+}
+
+static int __maybe_unused tb_xdomain_suspend(struct device *dev)
+{
+	stop_handshake(tb_to_xdomain(dev));
+	return 0;
+}
+
+static int __maybe_unused tb_xdomain_resume(struct device *dev)
+{
+	struct tb_xdomain *xd = tb_to_xdomain(dev);
+
+	/*
+	 * Ask tb_xdomain_get_properties() restore any existing DMA
+	 * paths after properties are re-read.
+	 */
+	xd->resume = true;
+	start_handshake(xd);
+
+	return 0;
+}
+
+static const struct dev_pm_ops tb_xdomain_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(tb_xdomain_suspend, tb_xdomain_resume)
+};
+
+struct device_type tb_xdomain_type = {
+	.name = "thunderbolt_xdomain",
+	.release = tb_xdomain_release,
+	.pm = &tb_xdomain_pm_ops,
+};
+EXPORT_SYMBOL_GPL(tb_xdomain_type);
+
+/**
+ * tb_xdomain_alloc() - Allocate new XDomain object
+ * @tb: Domain where the XDomain belongs
+ * @parent: Parent device (the switch through the connection to the
+ *	    other domain is reached).
+ * @route: Route string used to reach the other domain
+ * @local_uuid: Our local domain UUID
+ * @remote_uuid: UUID of the other domain
+ *
+ * Allocates new XDomain structure and returns pointer to that. The
+ * object must be released by calling tb_xdomain_put().
+ */
+struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent,
+				    u64 route, const uuid_t *local_uuid,
+				    const uuid_t *remote_uuid)
+{
+	struct tb_xdomain *xd;
+
+	xd = kzalloc(sizeof(*xd), GFP_KERNEL);
+	if (!xd)
+		return NULL;
+
+	xd->tb = tb;
+	xd->route = route;
+	ida_init(&xd->service_ids);
+	mutex_init(&xd->lock);
+	INIT_DELAYED_WORK(&xd->get_properties_work, tb_xdomain_get_properties);
+	INIT_DELAYED_WORK(&xd->properties_changed_work,
+			  tb_xdomain_properties_changed);
+
+	xd->local_uuid = kmemdup(local_uuid, sizeof(uuid_t), GFP_KERNEL);
+	if (!xd->local_uuid)
+		goto err_free;
+
+	xd->remote_uuid = kmemdup(remote_uuid, sizeof(uuid_t), GFP_KERNEL);
+	if (!xd->remote_uuid)
+		goto err_free_local_uuid;
+
+	device_initialize(&xd->dev);
+	xd->dev.parent = get_device(parent);
+	xd->dev.bus = &tb_bus_type;
+	xd->dev.type = &tb_xdomain_type;
+	xd->dev.groups = xdomain_attr_groups;
+	dev_set_name(&xd->dev, "%u-%llx", tb->index, route);
+
+	return xd;
+
+err_free_local_uuid:
+	kfree(xd->local_uuid);
+err_free:
+	kfree(xd);
+
+	return NULL;
+}
+
+/**
+ * tb_xdomain_add() - Add XDomain to the bus
+ * @xd: XDomain to add
+ *
+ * This function starts XDomain discovery protocol handshake and
+ * eventually adds the XDomain to the bus. After calling this function
+ * the caller needs to call tb_xdomain_remove() in order to remove and
+ * release the object regardless whether the handshake succeeded or not.
+ */
+void tb_xdomain_add(struct tb_xdomain *xd)
+{
+	/* Start exchanging properties with the other host */
+	start_handshake(xd);
+}
+
+static int unregister_service(struct device *dev, void *data)
+{
+	device_unregister(dev);
+	return 0;
+}
+
+/**
+ * tb_xdomain_remove() - Remove XDomain from the bus
+ * @xd: XDomain to remove
+ *
+ * This will stop all ongoing configuration work and remove the XDomain
+ * along with any services from the bus. When the last reference to @xd
+ * is released the object will be released as well.
+ */
+void tb_xdomain_remove(struct tb_xdomain *xd)
+{
+	stop_handshake(xd);
+
+	device_for_each_child_reverse(&xd->dev, xd, unregister_service);
+
+	if (!device_is_registered(&xd->dev))
+		put_device(&xd->dev);
+	else
+		device_unregister(&xd->dev);
+}
+
+/**
+ * tb_xdomain_enable_paths() - Enable DMA paths for XDomain connection
+ * @xd: XDomain connection
+ * @transmit_path: HopID of the transmit path the other end is using to
+ *		   send packets
+ * @transmit_ring: DMA ring used to receive packets from the other end
+ * @receive_path: HopID of the receive path the other end is using to
+ *		  receive packets
+ * @receive_ring: DMA ring used to send packets to the other end
+ *
+ * The function enables DMA paths accordingly so that after successful
+ * return the caller can send and receive packets using high-speed DMA
+ * path.
+ *
+ * Return: %0 in case of success and negative errno in case of error
+ */
+int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path,
+			    u16 transmit_ring, u16 receive_path,
+			    u16 receive_ring)
+{
+	int ret;
+
+	mutex_lock(&xd->lock);
+
+	if (xd->transmit_path) {
+		ret = xd->transmit_path == transmit_path ? 0 : -EBUSY;
+		goto exit_unlock;
+	}
+
+	xd->transmit_path = transmit_path;
+	xd->transmit_ring = transmit_ring;
+	xd->receive_path = receive_path;
+	xd->receive_ring = receive_ring;
+
+	ret = tb_domain_approve_xdomain_paths(xd->tb, xd);
+
+exit_unlock:
+	mutex_unlock(&xd->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_enable_paths);
+
+/**
+ * tb_xdomain_disable_paths() - Disable DMA paths for XDomain connection
+ * @xd: XDomain connection
+ *
+ * This does the opposite of tb_xdomain_enable_paths(). After call to
+ * this the caller is not expected to use the rings anymore.
+ *
+ * Return: %0 in case of success and negative errno in case of error
+ */
+int tb_xdomain_disable_paths(struct tb_xdomain *xd)
+{
+	int ret = 0;
+
+	mutex_lock(&xd->lock);
+	if (xd->transmit_path) {
+		xd->transmit_path = 0;
+		xd->transmit_ring = 0;
+		xd->receive_path = 0;
+		xd->receive_ring = 0;
+
+		ret = tb_domain_disconnect_xdomain_paths(xd->tb, xd);
+	}
+	mutex_unlock(&xd->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_disable_paths);
+
+struct tb_xdomain_lookup {
+	const uuid_t *uuid;
+	u8 link;
+	u8 depth;
+};
+
+static struct tb_xdomain *switch_find_xdomain(struct tb_switch *sw,
+	const struct tb_xdomain_lookup *lookup)
+{
+	int i;
+
+	for (i = 1; i <= sw->config.max_port_number; i++) {
+		struct tb_port *port = &sw->ports[i];
+		struct tb_xdomain *xd;
+
+		if (tb_is_upstream_port(port))
+			continue;
+
+		if (port->xdomain) {
+			xd = port->xdomain;
+
+			if (lookup->uuid) {
+				if (uuid_equal(xd->remote_uuid, lookup->uuid))
+					return xd;
+			} else if (lookup->link == xd->link &&
+				   lookup->depth == xd->depth) {
+				return xd;
+			}
+		} else if (port->remote) {
+			xd = switch_find_xdomain(port->remote->sw, lookup);
+			if (xd)
+				return xd;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * tb_xdomain_find_by_uuid() - Find an XDomain by UUID
+ * @tb: Domain where the XDomain belongs to
+ * @uuid: UUID to look for
+ *
+ * Finds XDomain by walking through the Thunderbolt topology below @tb.
+ * The returned XDomain will have its reference count increased so the
+ * caller needs to call tb_xdomain_put() when it is done with the
+ * object.
+ *
+ * This will find all XDomains including the ones that are not yet added
+ * to the bus (handshake is still in progress).
+ *
+ * The caller needs to hold @tb->lock.
+ */
+struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid)
+{
+	struct tb_xdomain_lookup lookup;
+	struct tb_xdomain *xd;
+
+	memset(&lookup, 0, sizeof(lookup));
+	lookup.uuid = uuid;
+
+	xd = switch_find_xdomain(tb->root_switch, &lookup);
+	if (xd) {
+		get_device(&xd->dev);
+		return xd;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_find_by_uuid);
+
+/**
+ * tb_xdomain_find_by_link_depth() - Find an XDomain by link and depth
+ * @tb: Domain where the XDomain belongs to
+ * @link: Root switch link number
+ * @depth: Depth in the link
+ *
+ * Finds XDomain by walking through the Thunderbolt topology below @tb.
+ * The returned XDomain will have its reference count increased so the
+ * caller needs to call tb_xdomain_put() when it is done with the
+ * object.
+ *
+ * This will find all XDomains including the ones that are not yet added
+ * to the bus (handshake is still in progress).
+ *
+ * The caller needs to hold @tb->lock.
+ */
+struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link,
+						 u8 depth)
+{
+	struct tb_xdomain_lookup lookup;
+	struct tb_xdomain *xd;
+
+	memset(&lookup, 0, sizeof(lookup));
+	lookup.link = link;
+	lookup.depth = depth;
+
+	xd = switch_find_xdomain(tb->root_switch, &lookup);
+	if (xd) {
+		get_device(&xd->dev);
+		return xd;
+	}
+
+	return NULL;
+}
+
+bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type,
+			       const void *buf, size_t size)
+{
+	const struct tb_protocol_handler *handler, *tmp;
+	const struct tb_xdp_header *hdr = buf;
+	unsigned int length;
+	int ret = 0;
+
+	/* We expect the packet is at least size of the header */
+	length = hdr->xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK;
+	if (length != size / 4 - sizeof(hdr->xd_hdr) / 4)
+		return true;
+	if (length < sizeof(*hdr) / 4 - sizeof(hdr->xd_hdr) / 4)
+		return true;
+
+	/*
+	 * Handle XDomain discovery protocol packets directly here. For
+	 * other protocols (based on their UUID) we call registered
+	 * handlers in turn.
+	 */
+	if (uuid_equal(&hdr->uuid, &tb_xdp_uuid)) {
+		if (type == TB_CFG_PKG_XDOMAIN_REQ) {
+			tb_xdp_schedule_request(tb, hdr, size);
+			return true;
+		}
+		return false;
+	}
+
+	mutex_lock(&xdomain_lock);
+	list_for_each_entry_safe(handler, tmp, &protocol_handlers, list) {
+		if (!uuid_equal(&hdr->uuid, handler->uuid))
+			continue;
+
+		mutex_unlock(&xdomain_lock);
+		ret = handler->callback(buf, size, handler->data);
+		mutex_lock(&xdomain_lock);
+
+		if (ret)
+			break;
+	}
+	mutex_unlock(&xdomain_lock);
+
+	return ret > 0;
+}
+
+static int rebuild_property_block(void)
+{
+	u32 *block, len;
+	int ret;
+
+	ret = tb_property_format_dir(xdomain_property_dir, NULL, 0);
+	if (ret < 0)
+		return ret;
+
+	len = ret;
+
+	block = kcalloc(len, sizeof(u32), GFP_KERNEL);
+	if (!block)
+		return -ENOMEM;
+
+	ret = tb_property_format_dir(xdomain_property_dir, block, len);
+	if (ret) {
+		kfree(block);
+		return ret;
+	}
+
+	kfree(xdomain_property_block);
+	xdomain_property_block = block;
+	xdomain_property_block_len = len;
+	xdomain_property_block_gen++;
+
+	return 0;
+}
+
+static int update_xdomain(struct device *dev, void *data)
+{
+	struct tb_xdomain *xd;
+
+	xd = tb_to_xdomain(dev);
+	if (xd) {
+		queue_delayed_work(xd->tb->wq, &xd->properties_changed_work,
+				   msecs_to_jiffies(50));
+	}
+
+	return 0;
+}
+
+static void update_all_xdomains(void)
+{
+	bus_for_each_dev(&tb_bus_type, NULL, NULL, update_xdomain);
+}
+
+static bool remove_directory(const char *key, const struct tb_property_dir *dir)
+{
+	struct tb_property *p;
+
+	p = tb_property_find(xdomain_property_dir, key,
+			     TB_PROPERTY_TYPE_DIRECTORY);
+	if (p && p->value.dir == dir) {
+		tb_property_remove(p);
+		return true;
+	}
+	return false;
+}
+
+/**
+ * tb_register_property_dir() - Register property directory to the host
+ * @key: Key (name) of the directory to add
+ * @dir: Directory to add
+ *
+ * Service drivers can use this function to add new property directory
+ * to the host available properties. The other connected hosts are
+ * notified so they can re-read properties of this host if they are
+ * interested.
+ *
+ * Return: %0 on success and negative errno on failure
+ */
+int tb_register_property_dir(const char *key, struct tb_property_dir *dir)
+{
+	int ret;
+
+	if (!key || strlen(key) > 8)
+		return -EINVAL;
+
+	mutex_lock(&xdomain_lock);
+	if (tb_property_find(xdomain_property_dir, key,
+			     TB_PROPERTY_TYPE_DIRECTORY)) {
+		ret = -EEXIST;
+		goto err_unlock;
+	}
+
+	ret = tb_property_add_dir(xdomain_property_dir, key, dir);
+	if (ret)
+		goto err_unlock;
+
+	ret = rebuild_property_block();
+	if (ret) {
+		remove_directory(key, dir);
+		goto err_unlock;
+	}
+
+	mutex_unlock(&xdomain_lock);
+	update_all_xdomains();
+	return 0;
+
+err_unlock:
+	mutex_unlock(&xdomain_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tb_register_property_dir);
+
+/**
+ * tb_unregister_property_dir() - Removes property directory from host
+ * @key: Key (name) of the directory
+ * @dir: Directory to remove
+ *
+ * This will remove the existing directory from this host and notify the
+ * connected hosts about the change.
+ */
+void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir)
+{
+	int ret = 0;
+
+	mutex_lock(&xdomain_lock);
+	if (remove_directory(key, dir))
+		ret = rebuild_property_block();
+	mutex_unlock(&xdomain_lock);
+
+	if (!ret)
+		update_all_xdomains();
+}
+EXPORT_SYMBOL_GPL(tb_unregister_property_dir);
+
+int tb_xdomain_init(void)
+{
+	int ret;
+
+	xdomain_property_dir = tb_property_create_dir(NULL);
+	if (!xdomain_property_dir)
+		return -ENOMEM;
+
+	/*
+	 * Initialize standard set of properties without any service
+	 * directories. Those will be added by service drivers
+	 * themselves when they are loaded.
+	 */
+	tb_property_add_immediate(xdomain_property_dir, "vendorid",
+				  PCI_VENDOR_ID_INTEL);
+	tb_property_add_text(xdomain_property_dir, "vendorid", "Intel Corp.");
+	tb_property_add_immediate(xdomain_property_dir, "deviceid", 0x1);
+	tb_property_add_text(xdomain_property_dir, "deviceid",
+			     utsname()->nodename);
+	tb_property_add_immediate(xdomain_property_dir, "devicerv", 0x80000100);
+
+	ret = rebuild_property_block();
+	if (ret) {
+		tb_property_free_dir(xdomain_property_dir);
+		xdomain_property_dir = NULL;
+	}
+
+	return ret;
+}
+
+void tb_xdomain_exit(void)
+{
+	kfree(xdomain_property_block);
+	tb_property_free_dir(xdomain_property_dir);
+}
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 2b038442c352..60e5d90cb18a 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -135,6 +135,7 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_SET_RSS_HENA = 26,
 	VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27,
 	VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28,
+	VIRTCHNL_OP_REQUEST_QUEUES = 29,
 };
 
 /* This macro is used to generate a compilation error if a structure
@@ -235,6 +236,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_RSS_AQ		0x00000008
 #define VIRTCHNL_VF_OFFLOAD_RSS_REG		0x00000010
 #define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR		0x00000020
+#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		0x00000040
 #define VIRTCHNL_VF_OFFLOAD_VLAN		0x00010000
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING		0x00020000
 #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2	0x00040000
@@ -325,6 +327,21 @@ struct virtchnl_vsi_queue_config_info {
 	struct virtchnl_queue_pair_info qpair[1];
 };
 
+/* VIRTCHNL_OP_REQUEST_QUEUES
+ * VF sends this message to request the PF to allocate additional queues to
+ * this VF.  Each VF gets a guaranteed number of queues on init but asking for
+ * additional queues must be negotiated.  This is a best effort request as it
+ * is possible the PF does not have enough queues left to support the request.
+ * If the PF cannot support the number requested it will respond with the
+ * maximum number it is able to support; otherwise it will respond with the
+ * number requested.
+ */
+
+/* VF resource request */
+struct virtchnl_vf_res_request {
+	u16 num_queue_pairs;
+};
+
 VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info);
 
 /* VIRTCHNL_OP_CONFIG_IRQ_MAP
@@ -691,6 +708,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 	case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING:
 	case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING:
 		break;
+	case VIRTCHNL_OP_REQUEST_QUEUES:
+		valid_len = sizeof(struct virtchnl_vf_res_request);
+		break;
 	/* These are always errors coming from the VF. */
 	case VIRTCHNL_OP_EVENT:
 	case VIRTCHNL_OP_UNKNOWN:
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d41d40ac3efd..359b6f5d3d90 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -14,27 +14,46 @@ struct bpf_sock_ops_kern;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_prog_list {
+	struct list_head node;
+	struct bpf_prog *prog;
+};
+
+struct bpf_prog_array;
+
 struct cgroup_bpf {
-	/*
-	 * Store two sets of bpf_prog pointers, one for programs that are
-	 * pinned directly to this cgroup, and one for those that are effective
-	 * when this cgroup is accessed.
+	/* array of effective progs in this cgroup */
+	struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+
+	/* attached progs to this cgroup and attach flags
+	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
+	 * have either zero or one element
+	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
-	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
-	bool disallow_override[MAX_BPF_ATTACH_TYPE];
+	struct list_head progs[MAX_BPF_ATTACH_TYPE];
+	u32 flags[MAX_BPF_ATTACH_TYPE];
+
+	/* temp storage for effective prog array used by prog_attach/detach */
+	struct bpf_prog_array __rcu *inactive;
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
-
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool overridable);
-
-/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable);
+int cgroup_bpf_inherit(struct cgroup *cgrp);
+
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
+
+/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -96,8 +115,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 
 struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
-static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
-				      struct cgroup *parent) {}
+static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..a67daea731ab 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
+	u8 name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
@@ -137,6 +138,7 @@ enum bpf_reg_type {
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
 	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
+	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
@@ -186,6 +188,8 @@ struct bpf_prog_aux {
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
+	u64 load_time; /* ns since boottime */
+	u8 name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -237,6 +241,41 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr);
 
+/* an array of programs to be executed under rcu_lock.
+ *
+ * Typical usage:
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ *
+ * the structure returned by bpf_prog_array_alloc() should be populated
+ * with program pointers and the last pointer must be NULL.
+ * The user has to keep refcnt on the program and make sure the program
+ * is removed from the array before bpf_prog_put().
+ * The 'struct bpf_prog_array *' should only be replaced with xchg()
+ * since other cpus are walking the array of pointers in parallel.
+ */
+struct bpf_prog_array {
+	struct rcu_head rcu;
+	struct bpf_prog *progs[0];
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt);
+
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog **_prog;		\
+		u32 _ret = 1;				\
+		rcu_read_lock();			\
+		_prog = rcu_dereference(array)->progs;	\
+		for (; *_prog; _prog++)			\
+			_ret &= func(*_prog, ctx);	\
+		rcu_read_unlock();			\
+		_ret;					\
+	 })
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 89f67c1c3160..805d16654459 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -170,4 +170,20 @@ static inline void be64_add_cpu(__be64 *var, u64 val)
 	*var = cpu_to_be64(be64_to_cpu(*var) + val);
 }
 
+static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = cpu_to_be32(src[i]);
+}
+
+static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = be32_to_cpu(src[i]);
+}
+
 #endif /* _LINUX_BYTEORDER_GENERIC_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d29e58fde364..2d2db394b0ca 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -481,30 +481,36 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
 struct bpf_skb_data_end {
 	struct qdisc_skb_cb qdisc_cb;
+	void *data_meta;
 	void *data_end;
 };
 
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_meta;
 	void *data_hard_start;
 };
 
-/* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf, act_bpf and lwt programs
+/* Compute the linear packet data range [data, data_end) which
+ * will be accessed by various program types (cls_bpf, act_bpf,
+ * lwt, ...). Subsystems allowing direct data access must (!)
+ * ensure that cb[] area can be written to when BPF program is
+ * invoked (otherwise cb[] save/restore is necessary).
  */
-static inline void bpf_compute_data_end(struct sk_buff *skb)
+static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 {
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
 	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
-	cb->data_end = skb->data + skb_headlen(skb);
+	cb->data_meta = skb->data - skb_metadata_len(skb);
+	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -725,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
 struct sock *do_sk_redirect_map(void);
 
diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index 3355efc89781..6756fea18b69 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -31,7 +31,7 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb)
 	return (struct arphdr *)skb_network_header(skb);
 }
 
-static inline int arp_hdr_len(struct net_device *dev)
+static inline unsigned int arp_hdr_len(const struct net_device *dev)
 {
 	switch (dev->type) {
 #if IS_ENABLED(CONFIG_FIREWIRE_NET)
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index c9ec1343d187..10e319f41fb1 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -72,7 +72,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 extern void macvlan_common_setup(struct net_device *dev);
 
 extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
-				  struct nlattr *tb[], struct nlattr *data[]);
+				  struct nlattr *tb[], struct nlattr *data[],
+				  struct netlink_ext_ack *extack);
 
 extern void macvlan_count_rx(const struct macvlan_dev *vlan,
 			     unsigned int len, bool success,
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index fb3f809e34e4..751d051f0bc7 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -179,6 +179,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst,
 			 __be32 local, int scope);
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 				    __be32 mask);
+struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
 static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa)
 {
 	return !((addr^ifa->ifa_address)&ifa->ifa_mask);
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 694cebb50f72..7625c3b81f84 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -683,5 +683,31 @@ struct fsl_mc_device_id {
 	const char obj_type[16];
 };
 
+/**
+ * struct tb_service_id - Thunderbolt service identifiers
+ * @match_flags: Flags used to match the structure
+ * @protocol_key: Protocol key the service supports
+ * @protocol_id: Protocol id the service supports
+ * @protocol_version: Version of the protocol
+ * @protocol_revision: Revision of the protocol software
+ * @driver_data: Driver specific data
+ *
+ * Thunderbolt XDomain services are exposed as devices where each device
+ * carries the protocol information the service supports. Thunderbolt
+ * XDomain service drivers match against that information.
+ */
+struct tb_service_id {
+	__u32 match_flags;
+	char protocol_key[8 + 1];
+	__u32 protocol_id;
+	__u32 protocol_version;
+	__u32 protocol_revision;
+	kernel_ulong_t driver_data;
+};
+
+#define TBSVC_MATCH_PROTOCOL_KEY	0x0001
+#define TBSVC_MATCH_PROTOCOL_ID		0x0002
+#define TBSVC_MATCH_PROTOCOL_VERSION	0x0004
+#define TBSVC_MATCH_PROTOCOL_REVISION	0x0008
 
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index d7f63339ef0b..8242d05df35e 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -5,6 +5,8 @@
 #include <linux/pim.h>
 #include <linux/rhashtable.h>
 #include <net/sock.h>
+#include <net/fib_rules.h>
+#include <net/fib_notifier.h>
 #include <uapi/linux/mroute.h>
 
 #ifdef CONFIG_IP_MROUTE
@@ -18,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
 int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
 int ip_mr_init(void);
+bool ipmr_rule_default(const struct fib_rule *rule);
 #else
 static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
 				       char __user *optval, unsigned int optlen)
@@ -45,10 +48,16 @@ static inline int ip_mroute_opt(int opt)
 {
 	return 0;
 }
+
+static inline bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return true;
+}
 #endif
 
 struct vif_device {
 	struct net_device 	*dev;			/* Device we are using */
+	struct netdev_phys_item_id dev_parent_id;	/* Device parent ID    */
 	unsigned long	bytes_in,bytes_out;
 	unsigned long	pkt_in,pkt_out;		/* Statistics 			*/
 	unsigned long	rate_limit;		/* Traffic shaping (NI) 	*/
@@ -58,6 +67,14 @@ struct vif_device {
 	int		link;			/* Physical interface index	*/
 };
 
+struct vif_entry_notifier_info {
+	struct fib_notifier_info info;
+	struct net_device *dev;
+	vifi_t vif_index;
+	unsigned short vif_flags;
+	u32 tb_id;
+};
+
 #define VIFF_STATIC 0x8000
 
 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
@@ -81,9 +98,11 @@ struct mr_table {
 
 /* mfc_flags:
  * MFC_STATIC - the entry was added statically (not by a routing daemon)
+ * MFC_OFFLOAD - the entry was offloaded to the hardware
  */
 enum {
 	MFC_STATIC = BIT(0),
+	MFC_OFFLOAD = BIT(1),
 };
 
 struct mfc_cache_cmp_arg {
@@ -109,6 +128,7 @@ struct mfc_cache_cmp_arg {
  * @wrong_if: number of wrong source interface hits
  * @lastuse: time of last use of the group (traffic or update)
  * @ttls: OIF TTL threshold array
+ * @refcount: reference count for this entry
  * @list: global entry list
  * @rcu: used for entry destruction
  */
@@ -138,14 +158,40 @@ struct mfc_cache {
 			unsigned long wrong_if;
 			unsigned long lastuse;
 			unsigned char ttls[MAXVIFS];
+			refcount_t refcount;
 		} res;
 	} mfc_un;
 	struct list_head list;
 	struct rcu_head	rcu;
 };
 
+struct mfc_entry_notifier_info {
+	struct fib_notifier_info info;
+	struct mfc_cache *mfc;
+	u32 tb_id;
+};
+
 struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
 		   struct rtmsg *rtm, u32 portid);
+
+#ifdef CONFIG_IP_MROUTE
+void ipmr_cache_free(struct mfc_cache *mfc_cache);
+#else
+static inline void ipmr_cache_free(struct mfc_cache *mfc_cache)
+{
+}
+#endif
+
+static inline void ipmr_cache_put(struct mfc_cache *c)
+{
+	if (refcount_dec_and_test(&c->mfc_un.res.refcount))
+		ipmr_cache_free(c);
+}
+static inline void ipmr_cache_hold(struct mfc_cache *c)
+{
+	refcount_inc(&c->mfc_un.res.refcount);
+}
+
 #endif
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f535779d9dc1..31bb3010c69b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -55,7 +55,7 @@
 struct netpoll_info;
 struct device;
 struct phy_device;
-struct dsa_switch_tree;
+struct dsa_port;
 
 /* 802.11 specific */
 struct wireless_dev;
@@ -826,6 +826,11 @@ struct xfrmdev_ops {
 };
 #endif
 
+struct dev_ifalias {
+	struct rcu_head rcuhead;
+	char ifalias[];
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1241,7 +1246,8 @@ struct net_device_ops {
 						     u32 flow_id);
 #endif
 	int			(*ndo_add_slave)(struct net_device *dev,
-						 struct net_device *slave_dev);
+						 struct net_device *slave_dev,
+						 struct netlink_ext_ack *extack);
 	int			(*ndo_del_slave)(struct net_device *dev,
 						 struct net_device *slave_dev);
 	netdev_features_t	(*ndo_fix_features)(struct net_device *dev,
@@ -1632,7 +1638,7 @@ enum netdev_priv_flags {
 struct net_device {
 	char			name[IFNAMSIZ];
 	struct hlist_node	name_hlist;
-	char 			*ifalias;
+	struct dev_ifalias	__rcu *ifalias;
 	/*
 	 *	I/O specific fields
 	 *	FIXME: Merge these and struct ifmap into one
@@ -1752,7 +1758,7 @@ struct net_device {
 	struct vlan_info __rcu	*vlan_info;
 #endif
 #if IS_ENABLED(CONFIG_NET_DSA)
-	struct dsa_switch_tree	*dsa_ptr;
+	struct dsa_port		*dsa_ptr;
 #endif
 #if IS_ENABLED(CONFIG_TIPC)
 	struct tipc_bearer __rcu *tipc_ptr;
@@ -2304,7 +2310,8 @@ int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
 
 struct netdev_notifier_info {
-	struct net_device *dev;
+	struct net_device	*dev;
+	struct netlink_ext_ack	*extack;
 };
 
 struct netdev_notifier_change_info {
@@ -2329,6 +2336,7 @@ static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
 					     struct net_device *dev)
 {
 	info->dev = dev;
+	info->extack = NULL;
 }
 
 static inline struct net_device *
@@ -2337,6 +2345,12 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
 	return info->dev;
 }
 
+static inline struct netlink_ext_ack *
+netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
+{
+	return info->extack;
+}
+
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 
 
@@ -3275,6 +3289,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags,
 			unsigned int gchanges);
 int dev_change_name(struct net_device *, const char *);
 int dev_set_alias(struct net_device *, const char *, size_t);
+int dev_get_alias(const struct net_device *, char *, size_t);
 int dev_change_net_namespace(struct net_device *, struct net *, const char *);
 int __dev_set_mtu(struct net_device *, int);
 int dev_set_mtu(struct net_device *, int);
@@ -3904,10 +3919,12 @@ void *netdev_adjacent_get_private(struct list_head *adj_list);
 void *netdev_lower_get_first_private_rcu(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
-int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev);
+int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
+			  struct netlink_ext_ack *extack);
 int netdev_master_upper_dev_link(struct net_device *dev,
 				 struct net_device *upper_dev,
-				 void *upper_priv, void *upper_info);
+				 void *upper_priv, void *upper_info,
+				 struct netlink_ext_ack *extack);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev);
 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index dd7a3b86bb9e..89fa0bbd54f3 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data {
 	void *cookie;
 	dma_addr_t rx_buf_addr;
 	u16 parse_flags;
+	u16 err_flags;
 	u16 vlan;
 	bool b_last_packet;
 	u8 connection_handle;
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index dea59c8eec54..1251638e60d3 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -17,9 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 			      u32 id, long expires, u32 error);
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+			 gfp_t flags, int *new_nsid);
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned change, u32 event,
-				       gfp_t flags);
+				       gfp_t flags, int *new_nsid);
 void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
 		       gfp_t flags);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 72299ef00061..ada821466e88 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	unsigned short	_unused;
-	unsigned char	nr_frags;
+	__u8		__unused;
+	__u8		meta_len;
+	__u8		nr_frags;
 	__u8		tx_flags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -661,8 +662,12 @@ struct sk_buff {
 			struct sk_buff		*prev;
 
 			union {
-				ktime_t		tstamp;
-				u64		skb_mstamp;
+				struct net_device	*dev;
+				/* Some protocols might use this space to store information,
+				 * while device pointer would be NULL.
+				 * UDP receive path is one user.
+				 */
+				unsigned long		dev_scratch;
 			};
 		};
 		struct rb_node	rbnode; /* used in netem & tcp stack */
@@ -670,12 +675,8 @@ struct sk_buff {
 	struct sock		*sk;
 
 	union {
-		struct net_device	*dev;
-		/* Some protocols might use this space to store information,
-		 * while device pointer would be NULL.
-		 * UDP receive path is one user.
-		 */
-		unsigned long		dev_scratch;
+		ktime_t		tstamp;
+		u64		skb_mstamp;
 	};
 	/*
 	 * This is the control buffer. It is free to use for every
@@ -771,6 +772,7 @@ struct sk_buff {
 	__u8			remcsum_offload:1;
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
+	__u8			offload_mr_fwd_mark:1;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
@@ -1457,27 +1459,8 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
 }
 
 /**
- *	skb_header_release - release reference to header
- *	@skb: buffer to operate on
- *
- *	Drop a reference to the header part of the buffer.  This is done
- *	by acquiring a payload reference.  You must not read from the header
- *	part of skb->data after this.
- *	Note : Check if you can use __skb_header_release() instead.
- */
-static inline void skb_header_release(struct sk_buff *skb)
-{
-	BUG_ON(skb->nohdr);
-	skb->nohdr = 1;
-	atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref);
-}
-
-/**
  *	__skb_header_release - release reference to header
  *	@skb: buffer to operate on
- *
- *	Variant of skb_header_release() assuming skb is private to caller.
- *	We can avoid one atomic operation.
  */
 static inline void __skb_header_release(struct sk_buff *skb)
 {
@@ -3419,6 +3402,69 @@ static inline ktime_t net_invalid_timestamp(void)
 	return 0;
 }
 
+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+					  const struct sk_buff *skb_b,
+					  u8 meta_len)
+{
+	const void *a = skb_metadata_end(skb_a);
+	const void *b = skb_metadata_end(skb_b);
+	/* Using more efficient varaiant than plain call to memcmp(). */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	u64 diffs = 0;
+
+	switch (meta_len) {
+#define __it(x, op) (x -= sizeof(u##op))
+#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
+	case 32: diffs |= __it_diff(a, b, 64);
+	case 24: diffs |= __it_diff(a, b, 64);
+	case 16: diffs |= __it_diff(a, b, 64);
+	case  8: diffs |= __it_diff(a, b, 64);
+		break;
+	case 28: diffs |= __it_diff(a, b, 64);
+	case 20: diffs |= __it_diff(a, b, 64);
+	case 12: diffs |= __it_diff(a, b, 64);
+	case  4: diffs |= __it_diff(a, b, 32);
+		break;
+	}
+	return diffs;
+#else
+	return memcmp(a - meta_len, b - meta_len, meta_len);
+#endif
+}
+
+static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
+					const struct sk_buff *skb_b)
+{
+	u8 len_a = skb_metadata_len(skb_a);
+	u8 len_b = skb_metadata_len(skb_b);
+
+	if (!(len_a | len_b))
+		return false;
+
+	return len_a != len_b ?
+	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
+}
+
+static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
+{
+	skb_shinfo(skb)->meta_len = meta_len;
+}
+
+static inline void skb_metadata_clear(struct sk_buff *skb)
+{
+	skb_metadata_set(skb, 0);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
new file mode 100644
index 000000000000..7b69853188b1
--- /dev/null
+++ b/include/linux/thunderbolt.h
@@ -0,0 +1,598 @@
+/*
+ * Thunderbolt service API
+ *
+ * Copyright (C) 2014 Andreas Noever <andreas.noever@gmail.com>
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef THUNDERBOLT_H_
+#define THUNDERBOLT_H_
+
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/mod_devicetable.h>
+#include <linux/pci.h>
+#include <linux/uuid.h>
+#include <linux/workqueue.h>
+
+enum tb_cfg_pkg_type {
+	TB_CFG_PKG_READ = 1,
+	TB_CFG_PKG_WRITE = 2,
+	TB_CFG_PKG_ERROR = 3,
+	TB_CFG_PKG_NOTIFY_ACK = 4,
+	TB_CFG_PKG_EVENT = 5,
+	TB_CFG_PKG_XDOMAIN_REQ = 6,
+	TB_CFG_PKG_XDOMAIN_RESP = 7,
+	TB_CFG_PKG_OVERRIDE = 8,
+	TB_CFG_PKG_RESET = 9,
+	TB_CFG_PKG_ICM_EVENT = 10,
+	TB_CFG_PKG_ICM_CMD = 11,
+	TB_CFG_PKG_ICM_RESP = 12,
+	TB_CFG_PKG_PREPARE_TO_SLEEP = 13,
+};
+
+/**
+ * enum tb_security_level - Thunderbolt security level
+ * @TB_SECURITY_NONE: No security, legacy mode
+ * @TB_SECURITY_USER: User approval required at minimum
+ * @TB_SECURITY_SECURE: One time saved key required at minimum
+ * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB)
+ */
+enum tb_security_level {
+	TB_SECURITY_NONE,
+	TB_SECURITY_USER,
+	TB_SECURITY_SECURE,
+	TB_SECURITY_DPONLY,
+};
+
+/**
+ * struct tb - main thunderbolt bus structure
+ * @dev: Domain device
+ * @lock: Big lock. Must be held when accessing any struct
+ *	  tb_switch / struct tb_port.
+ * @nhi: Pointer to the NHI structure
+ * @ctl: Control channel for this domain
+ * @wq: Ordered workqueue for all domain specific work
+ * @root_switch: Root switch of this domain
+ * @cm_ops: Connection manager specific operations vector
+ * @index: Linux assigned domain number
+ * @security_level: Current security level
+ * @privdata: Private connection manager specific data
+ */
+struct tb {
+	struct device dev;
+	struct mutex lock;
+	struct tb_nhi *nhi;
+	struct tb_ctl *ctl;
+	struct workqueue_struct *wq;
+	struct tb_switch *root_switch;
+	const struct tb_cm_ops *cm_ops;
+	int index;
+	enum tb_security_level security_level;
+	unsigned long privdata[0];
+};
+
+extern struct bus_type tb_bus_type;
+extern struct device_type tb_service_type;
+extern struct device_type tb_xdomain_type;
+
+#define TB_LINKS_PER_PHY_PORT	2
+
+static inline unsigned int tb_phy_port_from_link(unsigned int link)
+{
+	return (link - 1) / TB_LINKS_PER_PHY_PORT;
+}
+
+/**
+ * struct tb_property_dir - XDomain property directory
+ * @uuid: Directory UUID or %NULL if root directory
+ * @properties: List of properties in this directory
+ *
+ * User needs to provide serialization if needed.
+ */
+struct tb_property_dir {
+	const uuid_t *uuid;
+	struct list_head properties;
+};
+
+enum tb_property_type {
+	TB_PROPERTY_TYPE_UNKNOWN = 0x00,
+	TB_PROPERTY_TYPE_DIRECTORY = 0x44,
+	TB_PROPERTY_TYPE_DATA = 0x64,
+	TB_PROPERTY_TYPE_TEXT = 0x74,
+	TB_PROPERTY_TYPE_VALUE = 0x76,
+};
+
+#define TB_PROPERTY_KEY_SIZE	8
+
+/**
+ * struct tb_property - XDomain property
+ * @list: Used to link properties together in a directory
+ * @key: Key for the property (always terminated).
+ * @type: Type of the property
+ * @length: Length of the property data in dwords
+ * @value: Property value
+ *
+ * Users use @type to determine which field in @value is filled.
+ */
+struct tb_property {
+	struct list_head list;
+	char key[TB_PROPERTY_KEY_SIZE + 1];
+	enum tb_property_type type;
+	size_t length;
+	union {
+		struct tb_property_dir *dir;
+		u8 *data;
+		char *text;
+		u32 immediate;
+	} value;
+};
+
+struct tb_property_dir *tb_property_parse_dir(const u32 *block,
+					      size_t block_len);
+ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
+			       size_t block_len);
+struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid);
+void tb_property_free_dir(struct tb_property_dir *dir);
+int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
+			      u32 value);
+int tb_property_add_data(struct tb_property_dir *parent, const char *key,
+			 const void *buf, size_t buflen);
+int tb_property_add_text(struct tb_property_dir *parent, const char *key,
+			 const char *text);
+int tb_property_add_dir(struct tb_property_dir *parent, const char *key,
+			struct tb_property_dir *dir);
+void tb_property_remove(struct tb_property *tb_property);
+struct tb_property *tb_property_find(struct tb_property_dir *dir,
+			const char *key, enum tb_property_type type);
+struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
+					 struct tb_property *prev);
+
+#define tb_property_for_each(dir, property)			\
+	for (property = tb_property_get_next(dir, NULL);	\
+	     property;						\
+	     property = tb_property_get_next(dir, property))
+
+int tb_register_property_dir(const char *key, struct tb_property_dir *dir);
+void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
+
+/**
+ * struct tb_xdomain - Cross-domain (XDomain) connection
+ * @dev: XDomain device
+ * @tb: Pointer to the domain
+ * @remote_uuid: UUID of the remote domain (host)
+ * @local_uuid: Cached local UUID
+ * @route: Route string the other domain can be reached
+ * @vendor: Vendor ID of the remote domain
+ * @device: Device ID of the demote domain
+ * @lock: Lock to serialize access to the following fields of this structure
+ * @vendor_name: Name of the vendor (or %NULL if not known)
+ * @device_name: Name of the device (or %NULL if not known)
+ * @is_unplugged: The XDomain is unplugged
+ * @resume: The XDomain is being resumed
+ * @transmit_path: HopID which the remote end expects us to transmit
+ * @transmit_ring: Local ring (hop) where outgoing packets are pushed
+ * @receive_path: HopID which we expect the remote end to transmit
+ * @receive_ring: Local ring (hop) where incoming packets arrive
+ * @service_ids: Used to generate IDs for the services
+ * @properties: Properties exported by the remote domain
+ * @property_block_gen: Generation of @properties
+ * @properties_lock: Lock protecting @properties.
+ * @get_properties_work: Work used to get remote domain properties
+ * @properties_retries: Number of times left to read properties
+ * @properties_changed_work: Work used to notify the remote domain that
+ *			     our properties have changed
+ * @properties_changed_retries: Number of times left to send properties
+ *				changed notification
+ * @link: Root switch link the remote domain is connected (ICM only)
+ * @depth: Depth in the chain the remote domain is connected (ICM only)
+ *
+ * This structure represents connection across two domains (hosts).
+ * Each XDomain contains zero or more services which are exposed as
+ * &struct tb_service objects.
+ *
+ * Service drivers may access this structure if they need to enumerate
+ * non-standard properties but they need hold @lock when doing so
+ * because properties can be changed asynchronously in response to
+ * changes in the remote domain.
+ */
+struct tb_xdomain {
+	struct device dev;
+	struct tb *tb;
+	uuid_t *remote_uuid;
+	const uuid_t *local_uuid;
+	u64 route;
+	u16 vendor;
+	u16 device;
+	struct mutex lock;
+	const char *vendor_name;
+	const char *device_name;
+	bool is_unplugged;
+	bool resume;
+	u16 transmit_path;
+	u16 transmit_ring;
+	u16 receive_path;
+	u16 receive_ring;
+	struct ida service_ids;
+	struct tb_property_dir *properties;
+	u32 property_block_gen;
+	struct delayed_work get_properties_work;
+	int properties_retries;
+	struct delayed_work properties_changed_work;
+	int properties_changed_retries;
+	u8 link;
+	u8 depth;
+};
+
+int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path,
+			    u16 transmit_ring, u16 receive_path,
+			    u16 receive_ring);
+int tb_xdomain_disable_paths(struct tb_xdomain *xd);
+struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid);
+
+static inline struct tb_xdomain *
+tb_xdomain_find_by_uuid_locked(struct tb *tb, const uuid_t *uuid)
+{
+	struct tb_xdomain *xd;
+
+	mutex_lock(&tb->lock);
+	xd = tb_xdomain_find_by_uuid(tb, uuid);
+	mutex_unlock(&tb->lock);
+
+	return xd;
+}
+
+static inline struct tb_xdomain *tb_xdomain_get(struct tb_xdomain *xd)
+{
+	if (xd)
+		get_device(&xd->dev);
+	return xd;
+}
+
+static inline void tb_xdomain_put(struct tb_xdomain *xd)
+{
+	if (xd)
+		put_device(&xd->dev);
+}
+
+static inline bool tb_is_xdomain(const struct device *dev)
+{
+	return dev->type == &tb_xdomain_type;
+}
+
+static inline struct tb_xdomain *tb_to_xdomain(struct device *dev)
+{
+	if (tb_is_xdomain(dev))
+		return container_of(dev, struct tb_xdomain, dev);
+	return NULL;
+}
+
+int tb_xdomain_response(struct tb_xdomain *xd, const void *response,
+			size_t size, enum tb_cfg_pkg_type type);
+int tb_xdomain_request(struct tb_xdomain *xd, const void *request,
+		       size_t request_size, enum tb_cfg_pkg_type request_type,
+		       void *response, size_t response_size,
+		       enum tb_cfg_pkg_type response_type,
+		       unsigned int timeout_msec);
+
+/**
+ * tb_protocol_handler - Protocol specific handler
+ * @uuid: XDomain messages with this UUID are dispatched to this handler
+ * @callback: Callback called with the XDomain message. Returning %1
+ *	      here tells the XDomain core that the message was handled
+ *	      by this handler and should not be forwared to other
+ *	      handlers.
+ * @data: Data passed with the callback
+ * @list: Handlers are linked using this
+ *
+ * Thunderbolt services can hook into incoming XDomain requests by
+ * registering protocol handler. Only limitation is that the XDomain
+ * discovery protocol UUID cannot be registered since it is handled by
+ * the core XDomain code.
+ *
+ * The @callback must check that the message is really directed to the
+ * service the driver implements.
+ */
+struct tb_protocol_handler {
+	const uuid_t *uuid;
+	int (*callback)(const void *buf, size_t size, void *data);
+	void *data;
+	struct list_head list;
+};
+
+int tb_register_protocol_handler(struct tb_protocol_handler *handler);
+void tb_unregister_protocol_handler(struct tb_protocol_handler *handler);
+
+/**
+ * struct tb_service - Thunderbolt service
+ * @dev: XDomain device
+ * @id: ID of the service (shown in sysfs)
+ * @key: Protocol key from the properties directory
+ * @prtcid: Protocol ID from the properties directory
+ * @prtcvers: Protocol version from the properties directory
+ * @prtcrevs: Protocol software revision from the properties directory
+ * @prtcstns: Protocol settings mask from the properties directory
+ *
+ * Each domain exposes set of services it supports as collection of
+ * properties. For each service there will be one corresponding
+ * &struct tb_service. Service drivers are bound to these.
+ */
+struct tb_service {
+	struct device dev;
+	int id;
+	const char *key;
+	u32 prtcid;
+	u32 prtcvers;
+	u32 prtcrevs;
+	u32 prtcstns;
+};
+
+static inline struct tb_service *tb_service_get(struct tb_service *svc)
+{
+	if (svc)
+		get_device(&svc->dev);
+	return svc;
+}
+
+static inline void tb_service_put(struct tb_service *svc)
+{
+	if (svc)
+		put_device(&svc->dev);
+}
+
+static inline bool tb_is_service(const struct device *dev)
+{
+	return dev->type == &tb_service_type;
+}
+
+static inline struct tb_service *tb_to_service(struct device *dev)
+{
+	if (tb_is_service(dev))
+		return container_of(dev, struct tb_service, dev);
+	return NULL;
+}
+
+/**
+ * tb_service_driver - Thunderbolt service driver
+ * @driver: Driver structure
+ * @probe: Called when the driver is probed
+ * @remove: Called when the driver is removed (optional)
+ * @shutdown: Called at shutdown time to stop the service (optional)
+ * @id_table: Table of service identifiers the driver supports
+ */
+struct tb_service_driver {
+	struct device_driver driver;
+	int (*probe)(struct tb_service *svc, const struct tb_service_id *id);
+	void (*remove)(struct tb_service *svc);
+	void (*shutdown)(struct tb_service *svc);
+	const struct tb_service_id *id_table;
+};
+
+#define TB_SERVICE(key, id)				\
+	.match_flags = TBSVC_MATCH_PROTOCOL_KEY |	\
+		       TBSVC_MATCH_PROTOCOL_ID,		\
+	.protocol_key = (key),				\
+	.protocol_id = (id)
+
+int tb_register_service_driver(struct tb_service_driver *drv);
+void tb_unregister_service_driver(struct tb_service_driver *drv);
+
+static inline void *tb_service_get_drvdata(const struct tb_service *svc)
+{
+	return dev_get_drvdata(&svc->dev);
+}
+
+static inline void tb_service_set_drvdata(struct tb_service *svc, void *data)
+{
+	dev_set_drvdata(&svc->dev, data);
+}
+
+static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc)
+{
+	return tb_to_xdomain(svc->dev.parent);
+}
+
+/**
+ * struct tb_nhi - thunderbolt native host interface
+ * @lock: Must be held during ring creation/destruction. Is acquired by
+ *	  interrupt_work when dispatching interrupts to individual rings.
+ * @pdev: Pointer to the PCI device
+ * @iobase: MMIO space of the NHI
+ * @tx_rings: All Tx rings available on this host controller
+ * @rx_rings: All Rx rings available on this host controller
+ * @msix_ida: Used to allocate MSI-X vectors for rings
+ * @going_away: The host controller device is about to disappear so when
+ *		this flag is set, avoid touching the hardware anymore.
+ * @interrupt_work: Work scheduled to handle ring interrupt when no
+ *		    MSI-X is used.
+ * @hop_count: Number of rings (end point hops) supported by NHI.
+ */
+struct tb_nhi {
+	spinlock_t lock;
+	struct pci_dev *pdev;
+	void __iomem *iobase;
+	struct tb_ring **tx_rings;
+	struct tb_ring **rx_rings;
+	struct ida msix_ida;
+	bool going_away;
+	struct work_struct interrupt_work;
+	u32 hop_count;
+};
+
+/**
+ * struct tb_ring - thunderbolt TX or RX ring associated with a NHI
+ * @lock: Lock serializing actions to this ring. Must be acquired after
+ *	  nhi->lock.
+ * @nhi: Pointer to the native host controller interface
+ * @size: Size of the ring
+ * @hop: Hop (DMA channel) associated with this ring
+ * @head: Head of the ring (write next descriptor here)
+ * @tail: Tail of the ring (complete next descriptor here)
+ * @descriptors: Allocated descriptors for this ring
+ * @queue: Queue holding frames to be transferred over this ring
+ * @in_flight: Queue holding frames that are currently in flight
+ * @work: Interrupt work structure
+ * @is_tx: Is the ring Tx or Rx
+ * @running: Is the ring running
+ * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise.
+ * @vector: MSI-X vector number the ring uses (only set if @irq is > 0)
+ * @flags: Ring specific flags
+ * @sof_mask: Bit mask used to detect start of frame PDF
+ * @eof_mask: Bit mask used to detect end of frame PDF
+ * @start_poll: Called when ring interrupt is triggered to start
+ *		polling. Passing %NULL keeps the ring in interrupt mode.
+ * @poll_data: Data passed to @start_poll
+ */
+struct tb_ring {
+	spinlock_t lock;
+	struct tb_nhi *nhi;
+	int size;
+	int hop;
+	int head;
+	int tail;
+	struct ring_desc *descriptors;
+	dma_addr_t descriptors_dma;
+	struct list_head queue;
+	struct list_head in_flight;
+	struct work_struct work;
+	bool is_tx:1;
+	bool running:1;
+	int irq;
+	u8 vector;
+	unsigned int flags;
+	u16 sof_mask;
+	u16 eof_mask;
+	void (*start_poll)(void *data);
+	void *poll_data;
+};
+
+/* Leave ring interrupt enabled on suspend */
+#define RING_FLAG_NO_SUSPEND	BIT(0)
+/* Configure the ring to be in frame mode */
+#define RING_FLAG_FRAME		BIT(1)
+/* Enable end-to-end flow control */
+#define RING_FLAG_E2E		BIT(2)
+
+struct ring_frame;
+typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled);
+
+/**
+ * enum ring_desc_flags - Flags for DMA ring descriptor
+ * %RING_DESC_ISOCH: Enable isonchronous DMA (Tx only)
+ * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only)
+ * %RING_DESC_COMPLETED: Descriptor completed (set by NHI)
+ * %RING_DESC_POSTED: Always set this
+ * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun
+ * %RING_DESC_INTERRUPT: Request an interrupt on completion
+ */
+enum ring_desc_flags {
+	RING_DESC_ISOCH = 0x1,
+	RING_DESC_CRC_ERROR = 0x1,
+	RING_DESC_COMPLETED = 0x2,
+	RING_DESC_POSTED = 0x4,
+	RING_DESC_BUFFER_OVERRUN = 0x04,
+	RING_DESC_INTERRUPT = 0x8,
+};
+
+/**
+ * struct ring_frame - For use with ring_rx/ring_tx
+ * @buffer_phy: DMA mapped address of the frame
+ * @callback: Callback called when the frame is finished (optional)
+ * @list: Frame is linked to a queue using this
+ * @size: Size of the frame in bytes (%0 means %4096)
+ * @flags: Flags for the frame (see &enum ring_desc_flags)
+ * @eof: End of frame protocol defined field
+ * @sof: Start of frame protocol defined field
+ */
+struct ring_frame {
+	dma_addr_t buffer_phy;
+	ring_cb callback;
+	struct list_head list;
+	u32 size:12;
+	u32 flags:12;
+	u32 eof:4;
+	u32 sof:4;
+};
+
+/* Minimum size for ring_rx */
+#define TB_FRAME_SIZE		0x100
+
+struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags);
+struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags, u16 sof_mask, u16 eof_mask,
+				 void (*start_poll)(void *), void *poll_data);
+void tb_ring_start(struct tb_ring *ring);
+void tb_ring_stop(struct tb_ring *ring);
+void tb_ring_free(struct tb_ring *ring);
+
+int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame);
+
+/**
+ * tb_ring_rx() - enqueue a frame on an RX ring
+ * @ring: Ring to enqueue the frame
+ * @frame: Frame to enqueue
+ *
+ * @frame->buffer, @frame->buffer_phy have to be set. The buffer must
+ * contain at least %TB_FRAME_SIZE bytes.
+ *
+ * @frame->callback will be invoked with @frame->size, @frame->flags,
+ * @frame->eof, @frame->sof set once the frame has been received.
+ *
+ * If ring_stop() is called after the packet has been enqueued
+ * @frame->callback will be called with canceled set to true.
+ *
+ * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise.
+ */
+static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame)
+{
+	WARN_ON(ring->is_tx);
+	return __tb_ring_enqueue(ring, frame);
+}
+
+/**
+ * tb_ring_tx() - enqueue a frame on an TX ring
+ * @ring: Ring the enqueue the frame
+ * @frame: Frame to enqueue
+ *
+ * @frame->buffer, @frame->buffer_phy, @frame->size, @frame->eof and
+ * @frame->sof have to be set.
+ *
+ * @frame->callback will be invoked with once the frame has been transmitted.
+ *
+ * If ring_stop() is called after the packet has been enqueued @frame->callback
+ * will be called with canceled set to true.
+ *
+ * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise.
+ */
+static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame)
+{
+	WARN_ON(!ring->is_tx);
+	return __tb_ring_enqueue(ring, frame);
+}
+
+/* Used only when the ring is in polling mode */
+struct ring_frame *tb_ring_poll(struct tb_ring *ring);
+void tb_ring_poll_complete(struct tb_ring *ring);
+
+/**
+ * tb_ring_dma_device() - Return device used for DMA mapping
+ * @ring: Ring whose DMA device is retrieved
+ *
+ * Use this function when you are mapping DMA for buffers that are
+ * passed to the ring for sending/receiving.
+ */
+static inline struct device *tb_ring_dma_device(struct tb_ring *ring)
+{
+	return &ring->nhi->pdev->dev;
+}
+
+#endif /* THUNDERBOLT_H_ */
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f44ff2476758..87981cd63180 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -94,8 +94,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
 		      u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
-int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-			 bool match_wildcard);
+bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			  bool match_wildcard);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index b2e68657a216..2860cc66c2bb 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -596,7 +596,8 @@ void bond_destroy_sysfs(struct bond_net *net);
 void bond_prepare_sysfs_group(struct bonding *bond);
 int bond_sysfs_slave_add(struct slave *slave);
 void bond_sysfs_slave_del(struct slave *slave);
-int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev);
+int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
+		 struct netlink_ext_ack *extack);
 int bond_release(struct net_device *bond_dev, struct net_device *slave_dev);
 u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb);
 int bond_set_carrier(struct bonding *bond);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index dd44d6ce1097..10dceccd9ce8 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -130,11 +130,6 @@ struct dsa_switch_tree {
 	 */
 	struct dsa_platform_data	*pd;
 
-	/* Copy of tag_ops->rcv for faster access in hot path */
-	struct sk_buff *	(*rcv)(struct sk_buff *skb,
-				       struct net_device *dev,
-				       struct packet_type *pt);
-
 	/*
 	 * The switch port to which the CPU is attached.
 	 */
@@ -144,12 +139,6 @@ struct dsa_switch_tree {
 	 * Data for the individual switch chips.
 	 */
 	struct dsa_switch	*ds[DSA_MAX_SWITCHES];
-
-	/*
-	 * Tagging protocol operations for adding and removing an
-	 * encapsulation tag.
-	 */
-	const struct dsa_device_ops *tag_ops;
 };
 
 /* TC matchall action types, only mirroring for now */
@@ -175,6 +164,14 @@ struct dsa_mall_tc_entry {
 
 
 struct dsa_port {
+	/* CPU port tagging operations used by master or slave devices */
+	const struct dsa_device_ops *tag_ops;
+
+	/* Copies for faster access in master receive hot path */
+	struct dsa_switch_tree *dst;
+	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev,
+			       struct packet_type *pt);
+
 	struct dsa_switch	*ds;
 	unsigned int		index;
 	const char		*name;
@@ -188,7 +185,6 @@ struct dsa_port {
 	/*
 	 * Original copy of the master netdev ethtool_ops
 	 */
-	struct ethtool_ops	ethtool_ops;
 	const struct ethtool_ops *orig_ethtool_ops;
 };
 
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index a803129a4849..9fba2ebf6dda 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -24,7 +24,7 @@ struct metadata_dst {
 	} u;
 };
 
-static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
+static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb)
 {
 	struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb);
 
@@ -34,7 +34,8 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
 	return NULL;
 }
 
-static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
+static inline struct ip_tunnel_info *
+skb_tunnel_info(const struct sk_buff *skb)
 {
 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
 	struct dst_entry *dst;
diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h
index 669b9716dc7a..54cd6b839d2f 100644
--- a/include/net/fib_notifier.h
+++ b/include/net/fib_notifier.h
@@ -20,6 +20,8 @@ enum fib_event_type {
 	FIB_EVENT_RULE_DEL,
 	FIB_EVENT_NH_ADD,
 	FIB_EVENT_NH_DEL,
+	FIB_EVENT_VIF_ADD,
+	FIB_EVENT_VIF_DEL,
 };
 
 struct fib_notifier_ops {
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index dce2d586d9ce..f5ff16d72fe6 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -133,11 +133,6 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph)
 	return 1;
 }
 
-static inline void IP6_ECN_clear(struct ipv6hdr *iph)
-{
-	*(__be32*)iph &= ~htonl(INET_ECN_MASK << 20);
-}
-
 static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner)
 {
 	dscp &= ~INET_ECN_MASK;
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1a7f7e424320..f80524396c06 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -122,9 +122,6 @@ struct fib_info {
 #define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
 #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
 	int			fib_nhs;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	int			fib_weight;
-#endif
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].nh_dev
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 992652856fe8..b41a1e057fce 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -258,7 +258,8 @@ int ip_tunnel_get_iflink(const struct net_device *dev);
 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
+void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id,
+			   struct rtnl_link_ops *ops);
 
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, const u8 protocol);
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 9816df225af3..2492000e1035 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -190,8 +190,8 @@ struct neigh_hash_table {
 
 struct neigh_table {
 	int			family;
-	int			entry_size;
-	int			key_len;
+	unsigned int		entry_size;
+	unsigned int		key_len;
 	__be16			protocol;
 	__u32			(*hash)(const void *pkey,
 					const struct net_device *dev,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 20d061c805e3..2c4222a5d102 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -36,6 +36,8 @@ struct inet_timewait_death_row {
 	int			sysctl_max_tw_buckets;
 };
 
+struct tcp_fastopen_context;
+
 struct netns_ipv4 {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header	*forw_hdr;
@@ -52,6 +54,7 @@ struct netns_ipv4 {
 	struct fib_table __rcu	*fib_main;
 	struct fib_table __rcu	*fib_default;
 #endif
+	bool			fib_has_custom_local_routes;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	int			fib_num_tclassid_users;
 #endif
@@ -127,6 +130,12 @@ struct netns_ipv4 {
 	int sysctl_tcp_timestamps;
 	struct inet_timewait_death_row tcp_death_row;
 	int sysctl_max_syn_backlog;
+	int sysctl_tcp_fastopen;
+	struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+	spinlock_t tcp_fastopen_ctx_lock;
+	unsigned int sysctl_tcp_fastopen_blackhole_timeout;
+	atomic_t tfo_active_disable_times;
+	unsigned long tfo_active_disable_stamp;
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	int sysctl_udp_l3mdev_accept;
@@ -162,6 +171,9 @@ struct netns_ipv4 {
 	struct fib_notifier_ops	*notifier_ops;
 	unsigned int	fib_seq;	/* protected by rtnl_mutex */
 
+	struct fib_notifier_ops	*ipmr_notifier_ops;
+	unsigned int	ipmr_seq;	/* protected by rtnl_mutex */
+
 	atomic_t	rt_genid;
 };
 #endif
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 2544f9760a42..2ea1ed341ef8 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -89,6 +89,11 @@ struct netns_ipv6 {
 	atomic_t		fib6_sernum;
 	struct seg6_pernet_data *seg6_data;
 	struct fib_notifier_ops	*notifier_ops;
+	struct {
+		struct hlist_head head;
+		spinlock_t	lock;
+		u32		seq;
+	} ip6addrlbl_table;
 };
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 21837ca68ecc..e3ca8e2e3103 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -93,9 +93,6 @@ struct rtnl_link_ops {
 
 	int			slave_maxtype;
 	const struct nla_policy	*slave_policy;
-	int			(*slave_validate)(struct nlattr *tb[],
-						  struct nlattr *data[],
-						  struct netlink_ext_ack *extack);
 	int			(*slave_changelink)(struct net_device *dev,
 						    struct net_device *slave_dev,
 						    struct nlattr *tb[],
@@ -154,8 +151,6 @@ struct rtnl_af_ops {
 	size_t			(*get_stats_af_size)(const struct net_device *dev);
 };
 
-void __rtnl_af_unregister(struct rtnl_af_ops *ops);
-
 void rtnl_af_register(struct rtnl_af_ops *ops);
 void rtnl_af_unregister(struct rtnl_af_ops *ops);
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 135f5a2dd931..684d8ed27eaa 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -93,7 +93,6 @@ struct Qdisc {
 	unsigned long		state;
 	struct Qdisc            *next_sched;
 	struct sk_buff		*skb_bad_txq;
-	struct rcu_head		rcu_head;
 	int			padded;
 	refcount_t		refcnt;
 
diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
new file mode 100644
index 000000000000..c676550a4c7d
--- /dev/null
+++ b/include/net/sctp/stream_sched.h
@@ -0,0 +1,72 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * These are definitions used by the stream schedulers, defined in RFC
+ * draft ndata (https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11)
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation  is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresses:
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *   Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#ifndef __sctp_stream_sched_h__
+#define __sctp_stream_sched_h__
+
+struct sctp_sched_ops {
+	/* Property handling for a given stream */
+	int (*set)(struct sctp_stream *stream, __u16 sid, __u16 value,
+		   gfp_t gfp);
+	int (*get)(struct sctp_stream *stream, __u16 sid, __u16 *value);
+
+	/* Init the specific scheduler */
+	int (*init)(struct sctp_stream *stream);
+	/* Init a stream */
+	int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
+	/* Frees the entire thing */
+	void (*free)(struct sctp_stream *stream);
+
+	/* Enqueue a chunk */
+	void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg);
+	/* Dequeue a chunk */
+	struct sctp_chunk *(*dequeue)(struct sctp_outq *q);
+	/* Called only if the chunk fit the packet */
+	void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk);
+	/* Sched all chunks already enqueued */
+	void (*sched_all)(struct sctp_stream *steam);
+	/* Unched all chunks already enqueued */
+	void (*unsched_all)(struct sctp_stream *steam);
+};
+
+int sctp_sched_set_sched(struct sctp_association *asoc,
+			 enum sctp_sched_type sched);
+int sctp_sched_get_sched(struct sctp_association *asoc);
+int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 value, gfp_t gfp);
+int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 *value);
+void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch);
+
+void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch);
+int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
+struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
+
+#endif /* __sctp_stream_sched_h__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 0477945de1a3..16f949eef52f 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -380,6 +380,7 @@ struct sctp_sender_hb_info {
 
 int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 		     gfp_t gfp);
+int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid);
 void sctp_stream_free(struct sctp_stream *stream);
 void sctp_stream_clear(struct sctp_stream *stream);
 void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new);
@@ -529,8 +530,12 @@ struct sctp_chunk {
 	/* How many times this chunk have been sent, for prsctp RTX policy */
 	int sent_count;
 
-	/* This is our link to the per-transport transmitted list.  */
-	struct list_head transmitted_list;
+	union {
+		/* This is our link to the per-transport transmitted list.  */
+		struct list_head transmitted_list;
+		/* List in specific stream outq */
+		struct list_head stream_list;
+	};
 
 	/* This field is used by chunks that hold fragmented data.
 	 * For the first fragment this is the list that holds the rest of
@@ -640,6 +645,11 @@ void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *,
 		     union sctp_addr *);
 const union sctp_addr *sctp_source(const struct sctp_chunk *chunk);
 
+static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch)
+{
+	return ntohs(ch->subh.data_hdr->stream);
+}
+
 enum {
 	SCTP_ADDR_NEW,		/* new address added to assoc/ep */
 	SCTP_ADDR_SRC,		/* address can be used as source */
@@ -1012,6 +1022,9 @@ struct sctp_outq {
 	/* Data pending that has never been transmitted.  */
 	struct list_head out_chunk_list;
 
+	/* Stream scheduler being used */
+	struct sctp_sched_ops *sched;
+
 	unsigned int out_qlen;	/* Total length of queued data chunks. */
 
 	/* Error of send failed, may used in SCTP_SEND_FAILED event. */
@@ -1315,11 +1328,37 @@ struct sctp_inithdr_host {
 	__u32 initial_tsn;
 };
 
+struct sctp_stream_priorities {
+	/* List of priorities scheduled */
+	struct list_head prio_sched;
+	/* List of streams scheduled */
+	struct list_head active;
+	/* The next stream stream in line */
+	struct sctp_stream_out_ext *next;
+	__u16 prio;
+};
+
+struct sctp_stream_out_ext {
+	__u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
+	__u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
+	struct list_head outq; /* chunks enqueued by this stream */
+	union {
+		struct {
+			/* Scheduled streams list */
+			struct list_head prio_list;
+			struct sctp_stream_priorities *prio_head;
+		};
+		/* Fields used by RR scheduler */
+		struct {
+			struct list_head rr_list;
+		};
+	};
+};
+
 struct sctp_stream_out {
 	__u16	ssn;
 	__u8	state;
-	__u64	abandoned_unsent[SCTP_PR_INDEX(MAX) + 1];
-	__u64	abandoned_sent[SCTP_PR_INDEX(MAX) + 1];
+	struct sctp_stream_out_ext *ext;
 };
 
 struct sctp_stream_in {
@@ -1331,6 +1370,22 @@ struct sctp_stream {
 	struct sctp_stream_in *in;
 	__u16 outcnt;
 	__u16 incnt;
+	/* Current stream being sent, if any */
+	struct sctp_stream_out *out_curr;
+	union {
+		/* Fields used by priority scheduler */
+		struct {
+			/* List of priorities scheduled */
+			struct list_head prio_list;
+		};
+		/* Fields used by RR scheduler */
+		struct {
+			/* List of streams scheduled */
+			struct list_head rr_list;
+			/* The next stream stream in line */
+			struct sctp_stream_out_ext *rr_next;
+		};
+	};
 };
 
 #define SCTP_STREAM_CLOSED		0x00
diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index 41afe1ce7b16..d979a0d48f9e 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -33,6 +33,11 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act,
 	return false;
 }
 
+static inline bool is_tcf_gact_ok(const struct tc_action *a)
+{
+	return __is_tcf_gact_act(a, TC_ACT_OK, false);
+}
+
 static inline bool is_tcf_gact_shot(const struct tc_action *a)
 {
 	return __is_tcf_gact_act(a, TC_ACT_SHOT, false);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 89974c5286d8..7a3a8af56fd6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -240,7 +240,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 
 /* sysctl variables for tcp */
-extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
 extern int sysctl_tcp_rfc1337;
@@ -796,12 +795,6 @@ struct tcp_skb_cb {
 			u16	tcp_gso_segs;
 			u16	tcp_gso_size;
 		};
-
-		/* Used to stash the receive timestamp while this skb is in the
-		 * out of order queue, as skb->tstamp is overwritten by the
-		 * rbnode.
-		 */
-		ktime_t		swtstamp;
 	};
 	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
 
@@ -1556,13 +1549,13 @@ struct tcp_fastopen_request {
 };
 void tcp_free_fastopen_req(struct tcp_sock *tp);
 
-extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
-int tcp_fastopen_reset_cipher(void *key, unsigned int len);
+void tcp_fastopen_ctx_destroy(struct net *net);
+int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
 			      struct tcp_fastopen_cookie *foc);
-void tcp_fastopen_init_key_once(bool publish);
+void tcp_fastopen_init_key_once(struct net *net);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 			     struct tcp_fastopen_cookie *cookie);
 bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f90860d1f897..6082faf5fd2a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_cmd {
 	BPF_PROG_GET_FD_BY_ID,
 	BPF_MAP_GET_FD_BY_ID,
 	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
 };
 
 enum bpf_map_type {
@@ -143,11 +144,47 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
- * to the given target_fd cgroup the descendent cgroup will be able to
- * override effective bpf program that was inherited from this cgroup
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
@@ -175,6 +212,11 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -188,6 +230,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
+		__u8	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -210,6 +253,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
+		__u8		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -249,6 +293,15 @@ union bpf_attr {
 		__u32		info_len;
 		__aligned_u64	info;
 	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -582,6 +635,12 @@ union bpf_attr {
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
+ *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -638,6 +697,7 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -715,7 +775,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -723,6 +783,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -783,6 +846,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
@@ -801,6 +865,11 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -810,6 +879,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d062c58d5cb..cd580fc0e58f 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -158,6 +158,7 @@ enum {
 	IFLA_PAD,
 	IFLA_XDP,
 	IFLA_EVENT,
+	IFLA_NEW_NETNSID,
 	__IFLA_MAX
 };
 
@@ -325,6 +326,7 @@ enum {
 	IFLA_BRPORT_MCAST_TO_UCAST,
 	IFLA_BRPORT_VLAN_TUNNEL,
 	IFLA_BRPORT_BCAST_FLOOD,
+	IFLA_BRPORT_GROUP_FWD_MASK,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d85ddd..365ade5685c9 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,8 @@
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
+#define IFF_NAPI	0x0010
+#define IFF_NAPI_FRAGS	0x0020
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index 46444f8fbee4..4f8f3eb0699f 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -284,6 +284,7 @@ struct in6_flowlabel_req {
 #define IPV6_TRANSPARENT        75
 #define IPV6_UNICAST_IF         76
 #define IPV6_RECVFRAGSIZE	77
+#define IPV6_FREEBIND		78
 
 /*
  * Multicast Routing:
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6217ff8500a1..6cd7d416ca40 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -122,6 +122,8 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_RESET_ASSOC	120
 #define SCTP_ADD_STREAMS	121
 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122
+#define SCTP_STREAM_SCHEDULER	123
+#define SCTP_STREAM_SCHEDULER_VALUE	124
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
@@ -814,6 +816,12 @@ struct sctp_assoc_value {
     uint32_t                assoc_value;
 };
 
+struct sctp_stream_value {
+	sctp_assoc_t assoc_id;
+	uint16_t stream_id;
+	uint16_t stream_value;
+};
+
 /*
  * 7.2.2 Peer Address Information
  *
@@ -1088,4 +1096,12 @@ struct sctp_add_streams {
 	uint16_t sas_outstrms;
 };
 
+/* SCTP Stream schedulers */
+enum sctp_sched_type {
+	SCTP_SS_FCFS,
+	SCTP_SS_PRIO,
+	SCTP_SS_RR,
+	SCTP_SS_MAX = SCTP_SS_RR
+};
+
 #endif /* _UAPI_SCTP_H */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..e88abc0865d5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,407 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 {
 	unsigned int type;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
-		struct bpf_prog *prog = cgrp->bpf.prog[type];
-
-		if (prog) {
-			bpf_prog_put(prog);
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+		struct list_head *progs = &cgrp->bpf.progs[type];
+		struct bpf_prog_list *pl, *tmp;
+
+		list_for_each_entry_safe(pl, tmp, progs, node) {
+			list_del(&pl->node);
+			bpf_prog_put(pl->prog);
+			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
+		bpf_prog_array_free(cgrp->bpf.effective[type]);
+	}
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+	struct bpf_prog_list *pl;
+	u32 cnt = 0;
+
+	list_for_each_entry(pl, head, node) {
+		if (!pl->prog)
+			continue;
+		cnt++;
 	}
+	return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+				    enum bpf_attach_type type,
+				    u32 new_flags)
+{
+	struct cgroup *p;
+
+	p = cgroup_parent(cgrp);
+	if (!p)
+		return true;
+	do {
+		u32 flags = p->bpf.flags[type];
+		u32 cnt;
+
+		if (flags & BPF_F_ALLOW_MULTI)
+			return true;
+		cnt = prog_list_length(&p->bpf.progs[type]);
+		WARN_ON_ONCE(cnt > 1);
+		if (cnt == 1)
+			return !!(flags & BPF_F_ALLOW_OVERRIDE);
+		p = cgroup_parent(p);
+	} while (p);
+	return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_prog_array __rcu **array)
+{
+	struct bpf_prog_array __rcu *progs;
+	struct bpf_prog_list *pl;
+	struct cgroup *p = cgrp;
+	int cnt = 0;
+
+	/* count number of effective programs by walking parents */
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			cnt += prog_list_length(&p->bpf.progs[type]);
+		p = cgroup_parent(p);
+	} while (p);
+
+	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+	if (!progs)
+		return -ENOMEM;
+
+	/* populate the array with effective progs */
+	cnt = 0;
+	p = cgrp;
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			list_for_each_entry(pl,
+					    &p->bpf.progs[type], node) {
+				if (!pl->prog)
+					continue;
+				rcu_dereference_protected(progs, 1)->
+					progs[cnt++] = pl->prog;
+			}
+		p = cgroup_parent(p);
+	} while (p);
+
+	*array = progs;
+	return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+				     enum bpf_attach_type type,
+				     struct bpf_prog_array __rcu *array)
+{
+	struct bpf_prog_array __rcu *old_array;
+
+	old_array = xchg(&cgrp->bpf.effective[type], array);
+	/* free prog array after grace period, since __cgroup_bpf_run_*()
+	 * might be still walking the array
+	 */
+	bpf_prog_array_free(old_array);
 }
 
 /**
  * cgroup_bpf_inherit() - inherit effective programs from parent
  * @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
  */
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
 {
-	unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define	NR ARRAY_SIZE(cgrp->bpf.effective)
+	struct bpf_prog_array __rcu *arrays[NR] = {};
+	int i;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
-		struct bpf_prog *e;
+	for (i = 0; i < NR; i++)
+		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 
-		e = rcu_dereference_protected(parent->bpf.effective[type],
-					      lockdep_is_held(&cgroup_mutex));
-		rcu_assign_pointer(cgrp->bpf.effective[type], e);
-		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
-	}
+	for (i = 0; i < NR; i++)
+		if (compute_effective_progs(cgrp, i, &arrays[i]))
+			goto cleanup;
+
+	for (i = 0; i < NR; i++)
+		activate_effective_progs(cgrp, i, arrays[i]);
+
+	return 0;
+cleanup:
+	for (i = 0; i < NR; i++)
+		bpf_prog_array_free(arrays[i]);
+	return -ENOMEM;
 }
 
+#define BPF_CGROUP_MAX_PROGS 64
+
 /**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
  *                         propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
  *
  * Must be called with cgroup_mutex held.
  */
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags)
 {
-	struct bpf_prog *old_prog, *effective = NULL;
-	struct cgroup_subsys_state *pos;
-	bool overridable = true;
-
-	if (parent) {
-		overridable = !parent->bpf.disallow_override[type];
-		effective = rcu_dereference_protected(parent->bpf.effective[type],
-						      lockdep_is_held(&cgroup_mutex));
-	}
-
-	if (prog && effective && !overridable)
-		/* if parent has non-overridable prog attached, disallow
-		 * attaching new programs to descendent cgroup
-		 */
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	bool pl_was_allocated;
+	u32 old_flags;
+	int err;
+
+	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+		/* invalid combination */
+		return -EINVAL;
+
+	if (!hierarchy_allows_attach(cgrp, type, flags))
 		return -EPERM;
 
-	if (prog && effective && overridable != new_overridable)
-		/* if parent has overridable prog attached, only
-		 * allow overridable programs in descendent cgroup
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+		/* Disallow attaching non-overridable on top
+		 * of existing overridable in this cgroup.
+		 * Disallow attaching multi-prog if overridable or none
 		 */
 		return -EPERM;
 
-	old_prog = cgrp->bpf.prog[type];
-
-	if (prog) {
-		overridable = new_overridable;
-		effective = prog;
-		if (old_prog &&
-		    cgrp->bpf.disallow_override[type] == new_overridable)
-			/* disallow attaching non-overridable on top
-			 * of existing overridable in this cgroup
-			 * and vice versa
-			 */
-			return -EPERM;
+	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+		return -E2BIG;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		list_for_each_entry(pl, progs, node)
+			if (pl->prog == prog)
+				/* disallow attaching the same prog twice */
+				return -EINVAL;
+
+		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+		if (!pl)
+			return -ENOMEM;
+		pl_was_allocated = true;
+		pl->prog = prog;
+		list_add_tail(&pl->node, progs);
+	} else {
+		if (list_empty(progs)) {
+			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+			if (!pl)
+				return -ENOMEM;
+			pl_was_allocated = true;
+			list_add_tail(&pl->node, progs);
+		} else {
+			pl = list_first_entry(progs, typeof(*pl), node);
+			old_prog = pl->prog;
+			pl_was_allocated = false;
+		}
+		pl->prog = prog;
 	}
 
-	if (!prog && !old_prog)
-		/* report error when trying to detach and nothing is attached */
-		return -ENOENT;
-
-	cgrp->bpf.prog[type] = prog;
+	old_flags = cgrp->bpf.flags[type];
+	cgrp->bpf.flags[type] = flags;
 
-	css_for_each_descendant_pre(pos, &cgrp->self) {
-		struct cgroup *desc = container_of(pos, struct cgroup, self);
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
-		/* skip the subtree if the descendant has its own program */
-		if (desc->bpf.prog[type] && desc != cgrp) {
-			pos = css_rightmost_descendant(pos);
-		} else {
-			rcu_assign_pointer(desc->bpf.effective[type],
-					   effective);
-			desc->bpf.disallow_override[type] = !overridable;
-		}
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
 	}
 
-	if (prog)
-		static_branch_inc(&cgroup_bpf_enabled_key);
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
 
+	static_branch_inc(&cgroup_bpf_enabled_key);
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
 	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and cleanup the prog list */
+	pl->prog = old_prog;
+	if (pl_was_allocated) {
+		list_del(&pl->node);
+		kfree(pl);
+	}
+	return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 unused_flags)
+{
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	int err;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		if (!prog)
+			/* to detach MULTI prog the user has to specify valid FD
+			 * of the program to be detached
+			 */
+			return -EINVAL;
+	} else {
+		if (list_empty(progs))
+			/* report error when trying to detach and nothing is attached */
+			return -ENOENT;
+	}
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		/* find the prog and detach it */
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog != prog)
+				continue;
+			old_prog = prog;
+			/* mark it deleted, so it's ignored while
+			 * recomputing effective
+			 */
+			pl->prog = NULL;
+			break;
+		}
+		if (!old_prog)
+			return -ENOENT;
+	} else {
+		/* to maintain backward compatibility NONE and OVERRIDE cgroups
+		 * allow detaching with invalid FD (prog==NULL)
+		 */
+		pl = list_first_entry(progs, typeof(*pl), node);
+		old_prog = pl->prog;
+		pl->prog = NULL;
+	}
+
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
+	}
+
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* now can actually delete it from this cgroup list */
+	list_del(&pl->node);
+	kfree(pl);
+	if (list_empty(progs))
+		/* last program was detached, reset flags to zero */
+		cgrp->bpf.flags[type] = 0;
+
+	bpf_prog_put(old_prog);
+	static_branch_dec(&cgroup_bpf_enabled_key);
+	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and restore back old_prog */
+	pl->prog = old_prog;
+	return err;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	enum bpf_attach_type type = attr->query.attach_type;
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	int cnt, ret = 0, i;
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+	else
+		cnt = prog_list_length(progs);
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+		return -EFAULT;
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+		/* return early if user requested only program count + flags */
+		return 0;
+	if (attr->query.prog_cnt < cnt) {
+		cnt = attr->query.prog_cnt;
+		ret = -ENOSPC;
+	}
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+						   prog_ids, cnt);
+	} else {
+		struct bpf_prog_list *pl;
+		u32 id;
+
+		i = 0;
+		list_for_each_entry(pl, progs, node) {
+			id = pl->prog->aux->id;
+			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+				return -EFAULT;
+			if (++i == cnt)
+				break;
+		}
+	}
+	return ret;
 }
 
 /**
@@ -171,36 +449,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type)
 {
-	struct bpf_prog *prog;
+	unsigned int offset = skb->data - skb_network_header(skb);
+	struct sock *save_sk;
 	struct cgroup *cgrp;
-	int ret = 0;
+	int ret;
 
 	if (!sk || !sk_fullsock(sk))
 		return 0;
 
-	if (sk->sk_family != AF_INET &&
-	    sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog) {
-		unsigned int offset = skb->data - skb_network_header(skb);
-		struct sock *save_sk = skb->sk;
-
-		skb->sk = sk;
-		__skb_push(skb, offset);
-		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
-		__skb_pull(skb, offset);
-		skb->sk = save_sk;
-	}
-
-	rcu_read_unlock();
-
-	return ret;
+	save_sk = skb->sk;
+	skb->sk = sk;
+	__skb_push(skb, offset);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+				 bpf_prog_run_save_cb);
+	__skb_pull(skb, offset);
+	skb->sk = save_sk;
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 
@@ -221,19 +489,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
-
-	rcu_read_unlock();
+	int ret;
 
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -258,18 +517,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
+	int ret;
 
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
-
-	rcu_read_unlock();
-
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+				 BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..c6be15ae83ee 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1381,6 +1381,75 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+	struct bpf_prog_array hdr;
+	struct bpf_prog *null_prog;
+} empty_prog_array = {
+	.null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+	if (prog_cnt)
+		return kzalloc(sizeof(struct bpf_prog_array) +
+			       sizeof(struct bpf_prog *) * (prog_cnt + 1),
+			       flags);
+
+	return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+	if (!progs ||
+	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+		return;
+	kfree_rcu(progs, rcu);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+	struct bpf_prog **prog;
+	u32 cnt = 0;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++)
+		cnt++;
+	rcu_read_unlock();
+	return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt)
+{
+	struct bpf_prog **prog;
+	u32 i = 0, id;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++) {
+		id = (*prog)->aux->id;
+		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+			rcu_read_unlock();
+			return -EFAULT;
+		}
+		if (++i == cnt) {
+			prog++;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (*prog)
+		return -ENOSPC;
+	return 0;
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..34d8a690ea05 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,99 @@ out:
 	return ret;
 }
 
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
-	/* TODO */
-	return -ENOSYS;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie_node __rcu **trim, **trim2;
+	struct lpm_trie_node *node, *parent;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Walk the tree looking for an exact key/length match and keeping
+	 * track of the path we traverse.  We will need to know the node
+	 * we wish to delete, and the slot that points to the node we want
+	 * to delete.  We may also need to know the nodes parent and the
+	 * slot that contains it.
+	 */
+	trim = &trie->root;
+	trim2 = trim;
+	parent = NULL;
+	while ((node = rcu_dereference_protected(
+		       *trim, lockdep_is_held(&trie->lock)))) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		parent = node;
+		trim2 = trim;
+		next_bit = extract_bit(key->data, node->prefixlen);
+		trim = &node->child[next_bit];
+	}
+
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	trie->n_entries--;
+
+	/* If the node we are removing has two children, simply mark it
+	 * as intermediate and we are done.
+	 */
+	if (rcu_access_pointer(node->child[0]) &&
+	    rcu_access_pointer(node->child[1])) {
+		node->flags |= LPM_TREE_NODE_FLAG_IM;
+		goto out;
+	}
+
+	/* If the parent of the node we are about to delete is an intermediate
+	 * node, and the deleted node doesn't have any children, we can delete
+	 * the intermediate parent as well and promote its other child
+	 * up the tree.  Doing this maintains the invariant that all
+	 * intermediate nodes have exactly 2 children and that there are no
+	 * unnecessary intermediate nodes in the tree.
+	 */
+	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+	    !node->child[0] && !node->child[1]) {
+		if (node == rcu_access_pointer(parent->child[0]))
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[1]));
+		else
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[0]));
+		kfree_rcu(parent, rcu);
+		kfree_rcu(node, rcu);
+		goto out;
+	}
+
+	/* The node we are removing has either zero or one child. If there
+	 * is a child, move it into the removed node's slot then delete
+	 * the node.  Otherwise just clear the slot and delete the node.
+	 */
+	if (node->child[0])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+	else if (node->child[1])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+	else
+		RCU_INIT_POINTER(*trim, NULL);
+	kfree_rcu(node, rcu);
+
+out:
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
 }
 
 #define LPM_DATA_SIZE_MAX	256
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..a298d6666698 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 
 	skb_orphan(skb);
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 
@@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..0048cb24ba7b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
 #include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -312,7 +315,31 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+	const char *end = src + BPF_OBJ_NAME_LEN;
+
+	/* Copy all isalnum() and '_' char */
+	while (src < end && *src) {
+		if (!isalnum(*src) && *src != '_')
+			return -EINVAL;
+		*dst++ = *src++;
+	}
+
+	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+	if (src == end)
+		return -EINVAL;
+
+	/* '\0' terminates dst */
+	*dst = 0;
+
+	return 0;
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_name
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -334,6 +361,10 @@ static int map_create(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	err = bpf_obj_name_cpy(map->name, attr->map_name);
+	if (err)
+		goto free_map_nouncharge;
+
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
@@ -973,7 +1004,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define	BPF_PROG_LOAD_LAST_FIELD prog_name
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1037,6 +1068,11 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_prog;
 
+	prog->aux->load_time = ktime_get_boot_ns();
+	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+	if (err)
+		goto free_prog;
+
 	/* run eBPF verifier */
 	err = bpf_check(&prog, attr);
 	if (err < 0)
@@ -1132,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
 	return 0;
 }
 
+#define BPF_F_ATTACH_MASK \
+	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
@@ -1145,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
 		return -EINVAL;
 
-	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
 		return -EINVAL;
 
 	switch (attr->attach_type) {
@@ -1176,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return PTR_ERR(cgrp);
 	}
 
-	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
-				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
 	cgroup_put(cgrp);
@@ -1189,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
+	enum bpf_prog_type ptype;
+	struct bpf_prog *prog;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -1201,26 +1242,67 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
+		break;
 	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	case BPF_CGROUP_SOCK_OPS:
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp))
-			return PTR_ERR(cgrp);
-
-		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		ret = sockmap_get_from_fd(attr, false);
-		break;
+		return sockmap_get_from_fd(attr, false);
 	default:
 		return -EINVAL;
 	}
 
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
 	return ret;
 }
 
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (CHECK_ATTR(BPF_PROG_QUERY))
+		return -EINVAL;
+	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+		return -EINVAL;
+
+	switch (attr->query.attach_type) {
+	case BPF_CGROUP_INET_INGRESS:
+	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_SOCK_OPS:
+		break;
+	default:
+		return -EINVAL;
+	}
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+	cgroup_put(cgrp);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1358,8 +1440,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	info.type = prog->type;
 	info.id = prog->aux->id;
+	info.load_time = prog->aux->load_time;
+	info.created_by_uid = from_kuid_munged(current_user_ns(),
+					       prog->aux->user->uid);
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+	ulen = info.nr_map_ids;
+	info.nr_map_ids = prog->aux->used_map_cnt;
+	ulen = min_t(u32, info.nr_map_ids, ulen);
+	if (ulen) {
+		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+		u32 i;
+
+		for (i = 0; i < ulen; i++)
+			if (put_user(prog->aux->used_maps[i]->id,
+				     &user_map_ids[i]))
+				return -EFAULT;
+	}
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
@@ -1413,6 +1512,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.value_size = map->value_size;
 	info.max_entries = map->max_entries;
 	info.map_flags = map->map_flags;
+	memcpy(info.name, map->name, sizeof(map->name));
 
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
@@ -1499,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_DETACH:
 		err = bpf_prog_detach(&attr);
 		break;
+	case BPF_PROG_QUERY:
+		err = bpf_prog_query(&attr, uattr);
+		break;
 #endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..52b022310f6a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
 	va_end(args);
 }
 
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -187,6 +193,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
 	[PTR_TO_STACK]		= "fp",
 	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
@@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 			verbose("(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
 				verbose(",off=%d", reg->off);
-			if (t == PTR_TO_PACKET)
+			if (type_is_pkt_pointer(t))
 				verbose(",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
@@ -325,26 +332,45 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
+static void print_bpf_end_insn(const struct bpf_verifier_env *env,
+			       const struct bpf_insn *insn)
+{
+	verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+		insn->imm, insn->dst_reg);
+}
+
 static void print_bpf_insn(const struct bpf_verifier_env *env,
 			   const struct bpf_insn *insn)
 {
 	u8 class = BPF_CLASS(insn->code);
 
 	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (BPF_SRC(insn->code) == BPF_X)
+		if (BPF_OP(insn->code) == BPF_END) {
+			if (class == BPF_ALU64)
+				verbose("BUG_alu64_%02x\n", insn->code);
+			else
+				print_bpf_end_insn(env, insn);
+		} else if (BPF_OP(insn->code) == BPF_NEG) {
+			verbose("(%02x) r%d = %s-r%d\n",
+				insn->code, insn->dst_reg,
+				class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
 			verbose("(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->src_reg);
-		else
+		} else {
 			verbose("(%02x) %sr%d %s %s%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->imm);
+		}
 	} else if (class == BPF_STX) {
 		if (BPF_MODE(insn->code) == BPF_MEM)
 			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
@@ -519,6 +545,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_known_zero(regs + regno);
 }
 
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+	return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+	return reg_is_pkt_pointer(reg) ||
+	       reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+				    enum bpf_reg_type which)
+{
+	/* The register can already have a range from prior markings.
+	 * This is fine as long as it hasn't been advanced from its
+	 * origin.
+	 */
+	return reg->type == which &&
+	       reg->id == 0 &&
+	       reg->off == 0 &&
+	       tnum_equals_const(reg->var_off, 0);
+}
+
 /* Attempts to improve min/max values based on var_off information */
 static void __update_reg_bounds(struct bpf_reg_state *reg)
 {
@@ -702,6 +753,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -1047,7 +1099,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
-		/* special case, because of NET_IP_ALIGN */
+	case PTR_TO_PACKET_META:
+		/* Special case, because of NET_IP_ALIGN. Given metadata sits
+		 * right in front, treat it the very same way.
+		 */
 		return check_pkt_ptr_alignment(reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
@@ -1124,8 +1179,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
-			 * PTR_TO_PACKET[_END].  In the latter case, we know
-			 * the offset is zero.
+			 * PTR_TO_PACKET[_META,_END]. In the latter
+			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
 				mark_reg_unknown(state->regs, value_regno);
@@ -1170,7 +1225,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		} else {
 			err = check_stack_read(state, off, size, value_regno);
 		}
-	} else if (reg->type == PTR_TO_PACKET) {
+	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
@@ -1310,6 +1365,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size);
@@ -1342,7 +1398,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET &&
+	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
@@ -1351,7 +1407,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (type != PTR_TO_PACKET && type != expected_type)
+		if (!type_is_pkt_pointer(type) &&
+		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
 		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,7 +1432,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (register_is_null(*reg))
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+		else if (!type_is_pkt_pointer(type) &&
+			 type != PTR_TO_MAP_VALUE &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -1401,7 +1459,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->key_size);
 		else
@@ -1417,7 +1475,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->value_size);
 		else
@@ -1590,8 +1648,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -1600,18 +1658,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET ||
-		    regs[i].type == PTR_TO_PACKET_END)
+		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(regs, i);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type != PTR_TO_PACKET &&
-		    reg->type != PTR_TO_PACKET_END)
-			continue;
-		__mark_reg_unknown(reg);
+		if (reg_is_pkt_pointer_any(reg))
+			__mark_reg_unknown(reg);
 	}
 }
 
@@ -1871,7 +1926,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			dst_reg->range = 0;
@@ -1931,7 +1986,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			if (smin_val < 0)
@@ -2421,7 +2476,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   enum bpf_reg_type type)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
@@ -2483,7 +2539,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
 			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
 
@@ -2491,7 +2547,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max_t(u16, reg->range, dst_reg->off);
 	}
 }
@@ -2856,19 +2912,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
@@ -2997,6 +3073,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static int check_return_code(struct bpf_verifier_env *env)
+{
+	struct bpf_reg_state *reg;
+	struct tnum range = tnum_range(0, 1);
+
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_CGROUP_SKB:
+	case BPF_PROG_TYPE_CGROUP_SOCK:
+	case BPF_PROG_TYPE_SOCK_OPS:
+		break;
+	default:
+		return 0;
+	}
+
+	reg = &env->cur_state.regs[BPF_REG_0];
+	if (reg->type != SCALAR_VALUE) {
+		verbose("At program exit the register R0 is not a known value (%s)\n",
+			reg_type_str[reg->type]);
+		return -EINVAL;
+	}
+
+	if (!tnum_in(range, reg->var_off)) {
+		verbose("At program exit the register R0 ");
+		if (!tnum_is_unknown(reg->var_off)) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose("has value %s", tn_buf);
+		} else {
+			verbose("has unknown scalar value");
+		}
+		verbose(" should have been 0 or 1\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -3298,8 +3411,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return false;
 		/* Check our ids match any regs they're supposed to */
 		return check_ids(rold->id, rcur->id, idmap);
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
-		if (rcur->type != PTR_TO_PACKET)
+		if (rcur->type != rold->type)
 			return false;
 		/* We must have at least as much range as the old ptr
 		 * did, so that any accesses which were safe before are
@@ -3786,6 +3900,9 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EACCES;
 				}
 
+				err = check_return_code(env);
+				if (err)
+					return err;
 process_bpf_exit:
 				insn_idx = pop_stack(env, &prev_insn_idx);
 				if (insn_idx < 0) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 44857278eb8a..00f5b358aeac 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	if (ret)
 		goto destroy_root;
 
+	ret = cgroup_bpf_inherit(root_cgrp);
+	WARN_ON_ONCE(ret);
+
 	trace_cgroup_setup_root(root);
 
 	/*
@@ -4721,6 +4724,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->self.parent = &parent->self;
 	cgrp->root = root;
 	cgrp->level = level;
+	ret = cgroup_bpf_inherit(cgrp);
+	if (ret)
+		goto out_idr_free;
 
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4755,13 +4761,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
-	if (parent)
-		cgroup_bpf_inherit(cgrp, parent);
-
 	cgroup_propagate_control(cgrp);
 
 	return cgrp;
 
+out_idr_free:
+	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5744,14 +5749,33 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
 #ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+	ret = __cgroup_bpf_query(cgrp, attr, uattr);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index f237a09a5862..c3e84edc47c9 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -294,6 +294,55 @@ static void cleanup_uevent_env(struct subprocess_info *info)
 }
 #endif
 
+static int kobject_uevent_net_broadcast(struct kobject *kobj,
+					struct kobj_uevent_env *env,
+					const char *action_string,
+					const char *devpath)
+{
+	int retval = 0;
+#if defined(CONFIG_NET)
+	struct sk_buff *skb = NULL;
+	struct uevent_sock *ue_sk;
+
+	/* send netlink message */
+	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
+		struct sock *uevent_sock = ue_sk->sk;
+
+		if (!netlink_has_listeners(uevent_sock, 1))
+			continue;
+
+		if (!skb) {
+			/* allocate message with the maximum possible size */
+			size_t len = strlen(action_string) + strlen(devpath) + 2;
+			char *scratch;
+
+			retval = -ENOMEM;
+			skb = alloc_skb(len + env->buflen, GFP_KERNEL);
+			if (!skb)
+				continue;
+
+			/* add header */
+			scratch = skb_put(skb, len);
+			sprintf(scratch, "%s@%s", action_string, devpath);
+
+			skb_put_data(skb, env->buf, env->buflen);
+
+			NETLINK_CB(skb).dst_group = 1;
+		}
+
+		retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb),
+						    0, 1, GFP_KERNEL,
+						    kobj_bcast_filter,
+						    kobj);
+		/* ENOBUFS should be handled in userspace */
+		if (retval == -ENOBUFS || retval == -ESRCH)
+			retval = 0;
+	}
+	consume_skb(skb);
+#endif
+	return retval;
+}
+
 static void zap_modalias_env(struct kobj_uevent_env *env)
 {
 	static const char modalias_prefix[] = "MODALIAS=";
@@ -336,9 +385,6 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 	const struct kset_uevent_ops *uevent_ops;
 	int i = 0;
 	int retval = 0;
-#ifdef CONFIG_NET
-	struct uevent_sock *ue_sk;
-#endif
 
 	pr_debug("kobject: '%s' (%p): %s\n",
 		 kobject_name(kobj), kobj, __func__);
@@ -460,46 +506,8 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 		mutex_unlock(&uevent_sock_mutex);
 		goto exit;
 	}
-
-#if defined(CONFIG_NET)
-	/* send netlink message */
-	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
-		struct sock *uevent_sock = ue_sk->sk;
-		struct sk_buff *skb;
-		size_t len;
-
-		if (!netlink_has_listeners(uevent_sock, 1))
-			continue;
-
-		/* allocate message with the maximum possible size */
-		len = strlen(action_string) + strlen(devpath) + 2;
-		skb = alloc_skb(len + env->buflen, GFP_KERNEL);
-		if (skb) {
-			char *scratch;
-
-			/* add header */
-			scratch = skb_put(skb, len);
-			sprintf(scratch, "%s@%s", action_string, devpath);
-
-			/* copy keys to our continuous event payload buffer */
-			for (i = 0; i < env->envp_idx; i++) {
-				len = strlen(env->envp[i]) + 1;
-				scratch = skb_put(skb, len);
-				strcpy(scratch, env->envp[i]);
-			}
-
-			NETLINK_CB(skb).dst_group = 1;
-			retval = netlink_broadcast_filtered(uevent_sock, skb,
-							    0, 1, GFP_KERNEL,
-							    kobj_bcast_filter,
-							    kobj);
-			/* ENOBUFS should be handled in userspace */
-			if (retval == -ENOBUFS || retval == -ESRCH)
-				retval = 0;
-		} else
-			retval = -ENOMEM;
-	}
-#endif
+	retval = kobject_uevent_net_broadcast(kobj, env, action_string,
+					      devpath);
 	mutex_unlock(&uevent_sock_mutex);
 
 #ifdef CONFIG_UEVENT_HELPER
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 0ffca990a833..8e83cbdc049c 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -23,14 +23,15 @@
 #include <linux/semaphore.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/random.h>
 #include <linux/vmalloc.h>
 
 #define MAX_ENTRIES	1000000
 #define TEST_INSERT_FAIL INT_MAX
 
-static int entries = 50000;
-module_param(entries, int, 0);
-MODULE_PARM_DESC(entries, "Number of entries to add (default: 50000)");
+static int parm_entries = 50000;
+module_param(parm_entries, int, 0);
+MODULE_PARM_DESC(parm_entries, "Number of entries to add (default: 50000)");
 
 static int runs = 4;
 module_param(runs, int, 0);
@@ -66,14 +67,18 @@ struct test_obj {
 	struct rhash_head	node;
 };
 
+struct test_obj_rhl {
+	struct test_obj_val	value;
+	struct rhlist_head	list_node;
+};
+
 struct thread_data {
+	unsigned int entries;
 	int id;
 	struct task_struct *task;
 	struct test_obj *objs;
 };
 
-static struct test_obj array[MAX_ENTRIES];
-
 static struct rhashtable_params test_rht_params = {
 	.head_offset = offsetof(struct test_obj, node),
 	.key_offset = offsetof(struct test_obj, value),
@@ -85,7 +90,7 @@ static struct rhashtable_params test_rht_params = {
 static struct semaphore prestart_sem;
 static struct semaphore startup_sem = __SEMAPHORE_INITIALIZER(startup_sem, 0);
 
-static int insert_retry(struct rhashtable *ht, struct rhash_head *obj,
+static int insert_retry(struct rhashtable *ht, struct test_obj *obj,
                         const struct rhashtable_params params)
 {
 	int err, retries = -1, enomem_retries = 0;
@@ -93,7 +98,7 @@ static int insert_retry(struct rhashtable *ht, struct rhash_head *obj,
 	do {
 		retries++;
 		cond_resched();
-		err = rhashtable_insert_fast(ht, obj, params);
+		err = rhashtable_insert_fast(ht, &obj->node, params);
 		if (err == -ENOMEM && enomem_retry) {
 			enomem_retries++;
 			err = -EBUSY;
@@ -107,11 +112,12 @@ static int insert_retry(struct rhashtable *ht, struct rhash_head *obj,
 	return err ? : retries;
 }
 
-static int __init test_rht_lookup(struct rhashtable *ht)
+static int __init test_rht_lookup(struct rhashtable *ht, struct test_obj *array,
+				  unsigned int entries)
 {
 	unsigned int i;
 
-	for (i = 0; i < entries * 2; i++) {
+	for (i = 0; i < entries; i++) {
 		struct test_obj *obj;
 		bool expected = !(i % 2);
 		struct test_obj_val key = {
@@ -144,7 +150,7 @@ static int __init test_rht_lookup(struct rhashtable *ht)
 	return 0;
 }
 
-static void test_bucket_stats(struct rhashtable *ht)
+static void test_bucket_stats(struct rhashtable *ht, unsigned int entries)
 {
 	unsigned int err, total = 0, chain_len = 0;
 	struct rhashtable_iter hti;
@@ -186,7 +192,8 @@ static void test_bucket_stats(struct rhashtable *ht)
 		pr_warn("Test failed: Total count mismatch ^^^");
 }
 
-static s64 __init test_rhashtable(struct rhashtable *ht)
+static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array,
+				  unsigned int entries)
 {
 	struct test_obj *obj;
 	int err;
@@ -203,7 +210,7 @@ static s64 __init test_rhashtable(struct rhashtable *ht)
 		struct test_obj *obj = &array[i];
 
 		obj->value.id = i * 2;
-		err = insert_retry(ht, &obj->node, test_rht_params);
+		err = insert_retry(ht, obj, test_rht_params);
 		if (err > 0)
 			insert_retries += err;
 		else if (err)
@@ -214,12 +221,12 @@ static s64 __init test_rhashtable(struct rhashtable *ht)
 		pr_info("  %u insertions retried due to memory pressure\n",
 			insert_retries);
 
-	test_bucket_stats(ht);
+	test_bucket_stats(ht, entries);
 	rcu_read_lock();
-	test_rht_lookup(ht);
+	test_rht_lookup(ht, array, entries);
 	rcu_read_unlock();
 
-	test_bucket_stats(ht);
+	test_bucket_stats(ht, entries);
 
 	pr_info("  Deleting %d keys\n", entries);
 	for (i = 0; i < entries; i++) {
@@ -244,9 +251,227 @@ static s64 __init test_rhashtable(struct rhashtable *ht)
 }
 
 static struct rhashtable ht;
+static struct rhltable rhlt;
+
+static int __init test_rhltable(unsigned int entries)
+{
+	struct test_obj_rhl *rhl_test_objects;
+	unsigned long *obj_in_table;
+	unsigned int i, j, k;
+	int ret, err;
+
+	if (entries == 0)
+		entries = 1;
+
+	rhl_test_objects = vzalloc(sizeof(*rhl_test_objects) * entries);
+	if (!rhl_test_objects)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	obj_in_table = vzalloc(BITS_TO_LONGS(entries) * sizeof(unsigned long));
+	if (!obj_in_table)
+		goto out_free;
+
+	/* nulls_base not supported in rhlist interface */
+	test_rht_params.nulls_base = 0;
+	err = rhltable_init(&rhlt, &test_rht_params);
+	if (WARN_ON(err))
+		goto out_free;
+
+	k = prandom_u32();
+	ret = 0;
+	for (i = 0; i < entries; i++) {
+		rhl_test_objects[i].value.id = k;
+		err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node,
+				      test_rht_params);
+		if (WARN(err, "error %d on element %d\n", err, i))
+			break;
+		if (err == 0)
+			set_bit(i, obj_in_table);
+	}
+
+	if (err)
+		ret = err;
+
+	pr_info("test %d add/delete pairs into rhlist\n", entries);
+	for (i = 0; i < entries; i++) {
+		struct rhlist_head *h, *pos;
+		struct test_obj_rhl *obj;
+		struct test_obj_val key = {
+			.id = k,
+		};
+		bool found;
+
+		rcu_read_lock();
+		h = rhltable_lookup(&rhlt, &key, test_rht_params);
+		if (WARN(!h, "key not found during iteration %d of %d", i, entries)) {
+			rcu_read_unlock();
+			break;
+		}
+
+		if (i) {
+			j = i - 1;
+			rhl_for_each_entry_rcu(obj, pos, h, list_node) {
+				if (WARN(pos == &rhl_test_objects[j].list_node, "old element found, should be gone"))
+					break;
+			}
+		}
+
+		cond_resched_rcu();
+
+		found = false;
+
+		rhl_for_each_entry_rcu(obj, pos, h, list_node) {
+			if (pos == &rhl_test_objects[i].list_node) {
+				found = true;
+				break;
+			}
+		}
+
+		rcu_read_unlock();
+
+		if (WARN(!found, "element %d not found", i))
+			break;
+
+		err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params);
+		WARN(err, "rhltable_remove: err %d for iteration %d\n", err, i);
+		if (err == 0)
+			clear_bit(i, obj_in_table);
+	}
+
+	if (ret == 0 && err)
+		ret = err;
+
+	for (i = 0; i < entries; i++) {
+		WARN(test_bit(i, obj_in_table), "elem %d allegedly still present", i);
+
+		err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node,
+				      test_rht_params);
+		if (WARN(err, "error %d on element %d\n", err, i))
+			break;
+		if (err == 0)
+			set_bit(i, obj_in_table);
+	}
+
+	pr_info("test %d random rhlist add/delete operations\n", entries);
+	for (j = 0; j < entries; j++) {
+		u32 i = prandom_u32_max(entries);
+		u32 prand = prandom_u32();
+
+		cond_resched();
+
+		if (prand == 0)
+			prand = prandom_u32();
+
+		if (prand & 1) {
+			prand >>= 1;
+			continue;
+		}
+
+		err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params);
+		if (test_bit(i, obj_in_table)) {
+			clear_bit(i, obj_in_table);
+			if (WARN(err, "cannot remove element at slot %d", i))
+				continue;
+		} else {
+			if (WARN(err != -ENOENT, "removed non-existant element %d, error %d not %d",
+			     i, err, -ENOENT))
+				continue;
+		}
+
+		if (prand & 1) {
+			prand >>= 1;
+			continue;
+		}
+
+		err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node, test_rht_params);
+		if (err == 0) {
+			if (WARN(test_and_set_bit(i, obj_in_table), "succeeded to insert same object %d", i))
+				continue;
+		} else {
+			if (WARN(!test_bit(i, obj_in_table), "failed to insert object %d", i))
+				continue;
+		}
+
+		if (prand & 1) {
+			prand >>= 1;
+			continue;
+		}
+
+		i = prandom_u32_max(entries);
+		if (test_bit(i, obj_in_table)) {
+			err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params);
+			WARN(err, "cannot remove element at slot %d", i);
+			if (err == 0)
+				clear_bit(i, obj_in_table);
+		} else {
+			err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node, test_rht_params);
+			WARN(err, "failed to insert object %d", i);
+			if (err == 0)
+				set_bit(i, obj_in_table);
+		}
+	}
+
+	for (i = 0; i < entries; i++) {
+		cond_resched();
+		err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params);
+		if (test_bit(i, obj_in_table)) {
+			if (WARN(err, "cannot remove element at slot %d", i))
+				continue;
+		} else {
+			if (WARN(err != -ENOENT, "removed non-existant element, error %d not %d",
+				 err, -ENOENT))
+			continue;
+		}
+	}
+
+	rhltable_destroy(&rhlt);
+out_free:
+	vfree(rhl_test_objects);
+	vfree(obj_in_table);
+	return ret;
+}
+
+static int __init test_rhashtable_max(struct test_obj *array,
+				      unsigned int entries)
+{
+	unsigned int i, insert_retries = 0;
+	int err;
+
+	test_rht_params.max_size = roundup_pow_of_two(entries / 8);
+	err = rhashtable_init(&ht, &test_rht_params);
+	if (err)
+		return err;
+
+	for (i = 0; i < ht.max_elems; i++) {
+		struct test_obj *obj = &array[i];
+
+		obj->value.id = i * 2;
+		err = insert_retry(&ht, obj, test_rht_params);
+		if (err > 0)
+			insert_retries += err;
+		else if (err)
+			return err;
+	}
+
+	err = insert_retry(&ht, &array[ht.max_elems], test_rht_params);
+	if (err == -E2BIG) {
+		err = 0;
+	} else {
+		pr_info("insert element %u should have failed with %d, got %d\n",
+				ht.max_elems, -E2BIG, err);
+		if (err == 0)
+			err = -1;
+	}
+
+	rhashtable_destroy(&ht);
+
+	return err;
+}
 
 static int thread_lookup_test(struct thread_data *tdata)
 {
+	unsigned int entries = tdata->entries;
 	int i, err = 0;
 
 	for (i = 0; i < entries; i++) {
@@ -283,10 +508,10 @@ static int threadfunc(void *data)
 	if (down_interruptible(&startup_sem))
 		pr_err("  thread[%d]: down_interruptible failed\n", tdata->id);
 
-	for (i = 0; i < entries; i++) {
+	for (i = 0; i < tdata->entries; i++) {
 		tdata->objs[i].value.id = i;
 		tdata->objs[i].value.tid = tdata->id;
-		err = insert_retry(&ht, &tdata->objs[i].node, test_rht_params);
+		err = insert_retry(&ht, &tdata->objs[i], test_rht_params);
 		if (err > 0) {
 			insert_retries += err;
 		} else if (err) {
@@ -307,7 +532,7 @@ static int threadfunc(void *data)
 	}
 
 	for (step = 10; step > 0; step--) {
-		for (i = 0; i < entries; i += step) {
+		for (i = 0; i < tdata->entries; i += step) {
 			if (tdata->objs[i].value.id == TEST_INSERT_FAIL)
 				continue;
 			err = rhashtable_remove_fast(&ht, &tdata->objs[i].node,
@@ -338,17 +563,25 @@ out:
 
 static int __init test_rht_init(void)
 {
+	unsigned int entries;
 	int i, err, started_threads = 0, failed_threads = 0;
 	u64 total_time = 0;
 	struct thread_data *tdata;
 	struct test_obj *objs;
 
-	entries = min(entries, MAX_ENTRIES);
+	if (parm_entries < 0)
+		parm_entries = 1;
+
+	entries = min(parm_entries, MAX_ENTRIES);
 
 	test_rht_params.automatic_shrinking = shrinking;
 	test_rht_params.max_size = max_size ? : roundup_pow_of_two(entries);
 	test_rht_params.nelem_hint = size;
 
+	objs = vzalloc((test_rht_params.max_size + 1) * sizeof(struct test_obj));
+	if (!objs)
+		return -ENOMEM;
+
 	pr_info("Running rhashtable test nelem=%d, max_size=%d, shrinking=%d\n",
 		size, max_size, shrinking);
 
@@ -356,7 +589,8 @@ static int __init test_rht_init(void)
 		s64 time;
 
 		pr_info("Test %02d:\n", i);
-		memset(&array, 0, sizeof(array));
+		memset(objs, 0, test_rht_params.max_size * sizeof(struct test_obj));
+
 		err = rhashtable_init(&ht, &test_rht_params);
 		if (err < 0) {
 			pr_warn("Test failed: Unable to initialize hashtable: %d\n",
@@ -364,9 +598,10 @@ static int __init test_rht_init(void)
 			continue;
 		}
 
-		time = test_rhashtable(&ht);
+		time = test_rhashtable(&ht, objs, entries);
 		rhashtable_destroy(&ht);
 		if (time < 0) {
+			vfree(objs);
 			pr_warn("Test failed: return code %lld\n", time);
 			return -EINVAL;
 		}
@@ -374,6 +609,11 @@ static int __init test_rht_init(void)
 		total_time += time;
 	}
 
+	pr_info("test if its possible to exceed max_size %d: %s\n",
+			test_rht_params.max_size, test_rhashtable_max(objs, entries) == 0 ?
+			"no, ok" : "YES, failed");
+	vfree(objs);
+
 	do_div(total_time, runs);
 	pr_info("Average test time: %llu\n", total_time);
 
@@ -404,6 +644,7 @@ static int __init test_rht_init(void)
 	}
 	for (i = 0; i < tcount; i++) {
 		tdata[i].id = i;
+		tdata[i].entries = entries;
 		tdata[i].objs = objs + i * entries;
 		tdata[i].task = kthread_run(threadfunc, &tdata[i],
 		                            "rhashtable_thrad[%d]", i);
@@ -425,11 +666,17 @@ static int __init test_rht_init(void)
 			failed_threads++;
 		}
 	}
-	pr_info("Started %d threads, %d failed\n",
-	        started_threads, failed_threads);
 	rhashtable_destroy(&ht);
 	vfree(tdata);
 	vfree(objs);
+
+	/*
+	 * rhltable_remove is very expensive, default values can cause test
+	 * to run for 2 minutes or more,  use a smaller number instead.
+	 */
+	err = test_rhltable(entries / 16);
+	pr_info("Started %d threads, %d failed, rhltable test returns %d\n",
+	        started_threads, failed_threads, err);
 	return 0;
 }
 
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 9649579b5b9f..71c3e045505b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -138,7 +138,7 @@ int vlan_check_real_dev(struct net_device *real_dev,
 	return 0;
 }
 
-int register_vlan_dev(struct net_device *dev)
+int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
 {
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 	struct net_device *real_dev = vlan->real_dev;
@@ -174,7 +174,7 @@ int register_vlan_dev(struct net_device *dev)
 	if (err < 0)
 		goto out_uninit_mvrp;
 
-	err = netdev_upper_dev_link(real_dev, dev);
+	err = netdev_upper_dev_link(real_dev, dev, extack);
 	if (err)
 		goto out_unregister_netdev;
 
@@ -270,7 +270,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 	vlan->flags = VLAN_FLAG_REORDER_HDR;
 
 	new_dev->rtnl_link_ops = &vlan_link_ops;
-	err = register_vlan_dev(new_dev);
+	err = register_vlan_dev(new_dev, NULL);
 	if (err < 0)
 		goto out_free_newdev;
 
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index df8bd65dd370..94f8eed9f9b3 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -107,7 +107,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
 int vlan_check_real_dev(struct net_device *real_dev,
 			__be16 protocol, u16 vlan_id);
 void vlan_setup(struct net_device *dev);
-int register_vlan_dev(struct net_device *dev);
+int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
 void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
 bool vlan_dev_inherit_address(struct net_device *dev,
 			      struct net_device *real_dev);
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 5e831de3103e..6e7c5a6a7930 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -160,7 +160,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
 	if (err < 0)
 		return err;
 
-	return register_vlan_dev(dev);
+	return register_vlan_dev(dev, extack);
 }
 
 static inline size_t vlan_qos_map_size(unsigned int n)
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index e348f76ea8c1..f7b413b9297e 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -738,7 +738,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
 	bat_priv = netdev_priv(hard_iface->soft_iface);
 
 	ret = netdev_master_upper_dev_link(hard_iface->net_dev,
-					   soft_iface, NULL, NULL);
+					   soft_iface, NULL, NULL, NULL);
 	if (ret)
 		goto err_dev;
 
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 10f7edfb176e..e7d5fbb6ad53 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
 	int result;
 
 	/* TODO: We must check if we can release all references to non-payload
-	 * data using skb_header_release in our skbs to allow skb_cow_header to
+	 * data using __skb_header_release in our skbs to allow skb_cow_header to
 	 * work optimally. This means that those skbs are not allowed to read
 	 * or write any data which is before the current position of skb->data
 	 * after that call and thus allow other skbs with the same data buffer
@@ -867,7 +867,8 @@ free_bat_counters:
  * Return: 0 if successful or error otherwise.
  */
 static int batadv_softif_slave_add(struct net_device *dev,
-				   struct net_device *slave_dev)
+				   struct net_device *slave_dev,
+				   struct netlink_ext_ack *extack)
 {
 	struct batadv_hard_iface *hard_iface;
 	struct net *net = dev_net(dev);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6be41a44d688..a86e6687026e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (is_l2)
 		__skb_push(skb, ETH_HLEN);
 	if (is_direct_pkt_access)
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 	retval = bpf_test_run(prog, skb, repeat, &duration);
 	if (!is_l2)
 		__skb_push(skb, ETH_HLEN);
@@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 	xdp.data_hard_start = data;
 	xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + size;
 
 	retval = bpf_test_run(prog, &xdp, repeat, &duration);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f6b6a92f1c48..7acb77c9bd65 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -320,12 +320,13 @@ void br_netpoll_disable(struct net_bridge_port *p)
 
 #endif
 
-static int br_add_slave(struct net_device *dev, struct net_device *slave_dev)
+static int br_add_slave(struct net_device *dev, struct net_device *slave_dev,
+			struct netlink_ext_ack *extack)
 
 {
 	struct net_bridge *br = netdev_priv(dev);
 
-	return br_add_if(br, slave_dev);
+	return br_add_if(br, slave_dev, extack);
 }
 
 static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f3aef22931ab..59a74a414e20 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -480,7 +480,8 @@ netdev_features_t br_features_recompute(struct net_bridge *br,
 }
 
 /* called with RTNL */
-int br_add_if(struct net_bridge *br, struct net_device *dev)
+int br_add_if(struct net_bridge *br, struct net_device *dev,
+	      struct netlink_ext_ack *extack)
 {
 	struct net_bridge_port *p;
 	int err = 0;
@@ -500,16 +501,22 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 		return -EINVAL;
 
 	/* No bridging of bridges */
-	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
+	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) {
+		NL_SET_ERR_MSG(extack,
+			       "Can not enslave a bridge to a bridge");
 		return -ELOOP;
+	}
 
 	/* Device is already being bridged */
 	if (br_port_exists(dev))
 		return -EBUSY;
 
 	/* No bridging devices that dislike that (e.g. wireless) */
-	if (dev->priv_flags & IFF_DONT_BRIDGE)
+	if (dev->priv_flags & IFF_DONT_BRIDGE) {
+		NL_SET_ERR_MSG(extack,
+			       "Device does not allow enslaving to a bridge");
 		return -EOPNOTSUPP;
+	}
 
 	p = new_nbp(br, dev);
 	if (IS_ERR(p))
@@ -540,7 +547,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 
 	dev->priv_flags |= IFF_BRIDGE_PORT;
 
-	err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL);
+	err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack);
 	if (err)
 		goto err5;
 
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7637f58c1226..7cb613776b31 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -289,6 +289,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 		 *
 		 * Others reserved for future standardization
 		 */
+		fwd_mask |= p->group_fwd_mask;
 		switch (dest[5]) {
 		case 0x00:	/* Bridge Group Address */
 			/* If STP is turned off,
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 7970f8540cbb..8f29103935a3 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -98,10 +98,13 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
 		return -EINVAL;
 
 	if (isadd)
-		ret = br_add_if(br, dev);
+		ret = br_add_if(br, dev, NULL);
 	else
 		ret = br_del_if(br, dev);
 
+	if (!ret)
+		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL);
+
 	return ret;
 }
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 3bc890716c89..dea88a255d26 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void)
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 		+ nla_total_size(sizeof(u8))	/* IFLA_BRPORT_MULTICAST_ROUTER */
 #endif
+		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_GROUP_FWD_MASK */
 		+ 0;
 }
 
@@ -208,7 +209,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		       p->topology_change_ack) ||
 	    nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
 	    nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
-							BR_VLAN_TUNNEL)))
+							BR_VLAN_TUNNEL)) ||
+	    nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -637,6 +639,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 },
 	[IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 },
 	[IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 },
+	[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -773,6 +776,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 			return err;
 	}
 #endif
+
+	if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) {
+		u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]);
+
+		if (fwd_mask & BR_GROUPFWD_MACPAUSE)
+			return -EINVAL;
+		p->group_fwd_mask = fwd_mask;
+	}
+
 	br_port_flags_change(p, old_flags ^ p->flags);
 	return 0;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index e870cfc85b14..ab4df24f7bba 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -36,7 +36,14 @@
 /* Control of forwarding link local multicast */
 #define BR_GROUPFWD_DEFAULT	0
 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */
-#define BR_GROUPFWD_RESTRICTED	0x0007u
+enum {
+	BR_GROUPFWD_STP		= BIT(0),
+	BR_GROUPFWD_MACPAUSE	= BIT(1),
+	BR_GROUPFWD_LACP	= BIT(2),
+};
+
+#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \
+				BR_GROUPFWD_LACP)
 /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */
 #define BR_GROUPFWD_8021AD	0xB801u
 
@@ -268,6 +275,7 @@ struct net_bridge_port {
 #ifdef CONFIG_NET_SWITCHDEV
 	int				offload_fwd_mark;
 #endif
+	u16				group_fwd_mask;
 };
 
 #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
@@ -558,7 +566,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 void br_port_carrier_check(struct net_bridge_port *p);
 int br_add_bridge(struct net *net, const char *name);
 int br_del_bridge(struct net *net, const char *name);
-int br_add_if(struct net_bridge *br, struct net_device *dev);
+int br_add_if(struct net_bridge *br, struct net_device *dev,
+	      struct netlink_ext_ack *extack);
 int br_del_if(struct net_bridge *br, struct net_device *dev);
 int br_min_mtu(const struct net_bridge *br);
 netdev_features_t br_features_recompute(struct net_bridge *br,
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 5d5d413a6cf8..9110d5e56085 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -165,6 +165,23 @@ static int store_flush(struct net_bridge_port *p, unsigned long v)
 }
 static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);
 
+static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%#x\n", p->group_fwd_mask);
+}
+
+static int store_group_fwd_mask(struct net_bridge_port *p,
+				unsigned long v)
+{
+	if (v & BR_GROUPFWD_MACPAUSE)
+		return -EINVAL;
+	p->group_fwd_mask = v;
+
+	return 0;
+}
+static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask,
+		   store_group_fwd_mask);
+
 BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
 BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
 BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
@@ -223,6 +240,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_proxyarp_wifi,
 	&brport_attr_multicast_flood,
 	&brport_attr_broadcast_flood,
+	&brport_attr_group_fwd_mask,
 	NULL
 };
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f7fb7e3f2acf..0b7b4c22719e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -188,7 +188,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
 			}
 			if (!skb->len) {
 				skb = skb_set_peeked(skb);
-				if (unlikely(IS_ERR(skb))) {
+				if (IS_ERR(skb)) {
 					*err = PTR_ERR(skb);
 					return NULL;
 				}
diff --git a/net/core/dev.c b/net/core/dev.c
index 588b473194a8..fcddccb6be41 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -145,6 +145,7 @@
 #include <linux/crash_dump.h>
 #include <linux/sctp.h>
 #include <net/udp_tunnel.h>
+#include <linux/net_namespace.h>
 
 #include "net-sysfs.h"
 
@@ -162,7 +163,6 @@ static struct list_head offload_base __read_mostly;
 
 static int netif_rx_internal(struct sk_buff *skb);
 static int call_netdevice_notifiers_info(unsigned long val,
-					 struct net_device *dev,
 					 struct netdev_notifier_info *info);
 static struct napi_struct *napi_by_id(unsigned int napi_id);
 
@@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id);
 DEFINE_RWLOCK(dev_base_lock);
 EXPORT_SYMBOL(dev_base_lock);
 
+static DEFINE_MUTEX(ifalias_mutex);
+
 /* protects napi_hash addition/deletion and napi_gen_id */
 static DEFINE_SPINLOCK(napi_hash_lock);
 
@@ -1265,29 +1267,53 @@ rollback:
  */
 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 {
-	char *new_ifalias;
-
-	ASSERT_RTNL();
+	struct dev_ifalias *new_alias = NULL;
 
 	if (len >= IFALIASZ)
 		return -EINVAL;
 
-	if (!len) {
-		kfree(dev->ifalias);
-		dev->ifalias = NULL;
-		return 0;
+	if (len) {
+		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
+		if (!new_alias)
+			return -ENOMEM;
+
+		memcpy(new_alias->ifalias, alias, len);
+		new_alias->ifalias[len] = 0;
 	}
 
-	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
-	if (!new_ifalias)
-		return -ENOMEM;
-	dev->ifalias = new_ifalias;
-	memcpy(dev->ifalias, alias, len);
-	dev->ifalias[len] = 0;
+	mutex_lock(&ifalias_mutex);
+	rcu_swap_protected(dev->ifalias, new_alias,
+			   mutex_is_locked(&ifalias_mutex));
+	mutex_unlock(&ifalias_mutex);
+
+	if (new_alias)
+		kfree_rcu(new_alias, rcuhead);
 
 	return len;
 }
 
+/**
+ *	dev_get_alias - get ifalias of a device
+ *	@dev: device
+ *	@name: buffer to store name of ifalias
+ *	@len: size of buffer
+ *
+ *	get ifalias for a device.  Caller must make sure dev cannot go
+ *	away,  e.g. rcu read lock or own a reference count to device.
+ */
+int dev_get_alias(const struct net_device *dev, char *name, size_t len)
+{
+	const struct dev_ifalias *alias;
+	int ret = 0;
+
+	rcu_read_lock();
+	alias = rcu_dereference(dev->ifalias);
+	if (alias)
+		ret = snprintf(name, len, "%s", alias->ifalias);
+	rcu_read_unlock();
+
+	return ret;
+}
 
 /**
  *	netdev_features_change - device changes features
@@ -1312,10 +1338,11 @@ EXPORT_SYMBOL(netdev_features_change);
 void netdev_state_change(struct net_device *dev)
 {
 	if (dev->flags & IFF_UP) {
-		struct netdev_notifier_change_info change_info;
+		struct netdev_notifier_change_info change_info = {
+			.info.dev = dev,
+		};
 
-		change_info.flags_changed = 0;
-		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
+		call_netdevice_notifiers_info(NETDEV_CHANGE,
 					      &change_info.info);
 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 	}
@@ -1536,9 +1563,10 @@ EXPORT_SYMBOL(dev_disable_lro);
 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 				   struct net_device *dev)
 {
-	struct netdev_notifier_info info;
+	struct netdev_notifier_info info = {
+		.dev = dev,
+	};
 
-	netdev_notifier_info_init(&info, dev);
 	return nb->notifier_call(nb, val, &info);
 }
 
@@ -1663,11 +1691,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
  */
 
 static int call_netdevice_notifiers_info(unsigned long val,
-					 struct net_device *dev,
 					 struct netdev_notifier_info *info)
 {
 	ASSERT_RTNL();
-	netdev_notifier_info_init(info, dev);
 	return raw_notifier_call_chain(&netdev_chain, val, info);
 }
 
@@ -1682,9 +1708,11 @@ static int call_netdevice_notifiers_info(unsigned long val,
 
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 {
-	struct netdev_notifier_info info;
+	struct netdev_notifier_info info = {
+		.dev = dev,
+	};
 
-	return call_netdevice_notifiers_info(val, dev, &info);
+	return call_netdevice_notifiers_info(val, &info);
 }
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
@@ -3864,8 +3892,8 @@ drop:
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
-	u32 act = XDP_DROP;
 	void *orig_data;
 	int hlen, off;
 	u32 mac_len;
@@ -3876,8 +3904,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	if (skb_cloned(skb))
 		return XDP_PASS;
 
-	if (skb_linearize(skb))
-		goto do_drop;
+	/* XDP packets must be linear and must have sufficient headroom
+	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+	 * native XDP provides, thus we need to do it here as well.
+	 */
+	if (skb_is_nonlinear(skb) ||
+	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+		int troom = skb->tail + skb->data_len - skb->end;
+
+		/* In case we have to go down the path and also linearize,
+		 * then lets do the pskb_expand_head() work just once here.
+		 */
+		if (pskb_expand_head(skb,
+				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+			goto do_drop;
+		if (troom > 0 && __skb_linearize(skb))
+			goto do_drop;
+	}
 
 	/* The XDP program wants to see the packet starting at the MAC
 	 * header.
@@ -3885,6 +3930,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	mac_len = skb->data - skb_mac_header(skb);
 	hlen = skb_headlen(skb) + mac_len;
 	xdp.data = skb->data - mac_len;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + hlen;
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
@@ -3902,10 +3948,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	case XDP_REDIRECT:
 	case XDP_TX:
 		__skb_push(skb, mac_len);
-		/* fall through */
+		break;
 	case XDP_PASS:
+		metalen = xdp.data - xdp.data_meta;
+		if (metalen)
+			skb_metadata_set(skb, metalen);
 		break;
-
 	default:
 		bpf_warn_invalid_xdp_action(act);
 		/* fall through */
@@ -4695,6 +4743,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 		diffs |= p->vlan_tci ^ skb->vlan_tci;
 		diffs |= skb_metadata_dst_cmp(p, skb);
+		diffs |= skb_metadata_differs(p, skb);
 		if (maclen == ETH_HLEN)
 			diffs |= compare_ether_header(skb_mac_header(p),
 						      skb_mac_header(skb));
@@ -6228,9 +6277,19 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 
 static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master,
-				   void *upper_priv, void *upper_info)
-{
-	struct netdev_notifier_changeupper_info changeupper_info;
+				   void *upper_priv, void *upper_info,
+				   struct netlink_ext_ack *extack)
+{
+	struct netdev_notifier_changeupper_info changeupper_info = {
+		.info = {
+			.dev = dev,
+			.extack = extack,
+		},
+		.upper_dev = upper_dev,
+		.master = master,
+		.linking = true,
+		.upper_info = upper_info,
+	};
 	int ret = 0;
 
 	ASSERT_RTNL();
@@ -6248,12 +6307,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (master && netdev_master_upper_dev_get(dev))
 		return -EBUSY;
 
-	changeupper_info.upper_dev = upper_dev;
-	changeupper_info.master = master;
-	changeupper_info.linking = true;
-	changeupper_info.upper_info = upper_info;
-
-	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 					    &changeupper_info.info);
 	ret = notifier_to_errno(ret);
 	if (ret)
@@ -6264,7 +6318,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (ret)
 		return ret;
 
-	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 					    &changeupper_info.info);
 	ret = notifier_to_errno(ret);
 	if (ret)
@@ -6289,9 +6343,11 @@ rollback:
  * returns zero.
  */
 int netdev_upper_dev_link(struct net_device *dev,
-			  struct net_device *upper_dev)
+			  struct net_device *upper_dev,
+			  struct netlink_ext_ack *extack)
 {
-	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
+	return __netdev_upper_dev_link(dev, upper_dev, false,
+				       NULL, NULL, extack);
 }
 EXPORT_SYMBOL(netdev_upper_dev_link);
 
@@ -6310,10 +6366,11 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
  */
 int netdev_master_upper_dev_link(struct net_device *dev,
 				 struct net_device *upper_dev,
-				 void *upper_priv, void *upper_info)
+				 void *upper_priv, void *upper_info,
+				 struct netlink_ext_ack *extack)
 {
 	return __netdev_upper_dev_link(dev, upper_dev, true,
-				       upper_priv, upper_info);
+				       upper_priv, upper_info, extack);
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_link);
 
@@ -6328,20 +6385,24 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
-	struct netdev_notifier_changeupper_info changeupper_info;
+	struct netdev_notifier_changeupper_info changeupper_info = {
+		.info = {
+			.dev = dev,
+		},
+		.upper_dev = upper_dev,
+		.linking = false,
+	};
 
 	ASSERT_RTNL();
 
-	changeupper_info.upper_dev = upper_dev;
 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
-	changeupper_info.linking = false;
 
-	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
+	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 				      &changeupper_info.info);
 
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
-	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 				      &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
@@ -6357,11 +6418,13 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink);
 void netdev_bonding_info_change(struct net_device *dev,
 				struct netdev_bonding_info *bonding_info)
 {
-	struct netdev_notifier_bonding_info	info;
+	struct netdev_notifier_bonding_info info = {
+		.info.dev = dev,
+	};
 
 	memcpy(&info.bonding_info, bonding_info,
 	       sizeof(struct netdev_bonding_info));
-	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
+	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 				      &info.info);
 }
 EXPORT_SYMBOL(netdev_bonding_info_change);
@@ -6487,11 +6550,13 @@ EXPORT_SYMBOL(dev_get_nest_level);
 void netdev_lower_state_changed(struct net_device *lower_dev,
 				void *lower_state_info)
 {
-	struct netdev_notifier_changelowerstate_info changelowerstate_info;
+	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
+		.info.dev = lower_dev,
+	};
 
 	ASSERT_RTNL();
 	changelowerstate_info.lower_state_info = lower_state_info;
-	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
+	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 				      &changelowerstate_info.info);
 }
 EXPORT_SYMBOL(netdev_lower_state_changed);
@@ -6782,11 +6847,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 
 	if (dev->flags & IFF_UP &&
 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
-		struct netdev_notifier_change_info change_info;
+		struct netdev_notifier_change_info change_info = {
+			.info = {
+				.dev = dev,
+			},
+			.flags_changed = changes,
+		};
 
-		change_info.flags_changed = changes;
-		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
-					      &change_info.info);
+		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 	}
 }
 
@@ -7157,7 +7225,7 @@ static void rollback_registered_many(struct list_head *head)
 		if (!dev->rtnl_link_ops ||
 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
-						     GFP_KERNEL);
+						     GFP_KERNEL, NULL);
 
 		/*
 		 *	Flush the unicast and multicast chains
@@ -7994,7 +8062,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *dev;
-	size_t alloc_size;
+	unsigned int alloc_size;
 	struct net_device *p;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -8244,7 +8312,7 @@ EXPORT_SYMBOL(unregister_netdev);
 
 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
 {
-	int err;
+	int err, new_nsid;
 
 	ASSERT_RTNL();
 
@@ -8300,7 +8368,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
-	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
+	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net)
+		new_nsid = peernet2id_alloc(dev_net(dev), net);
+	else
+		new_nsid = peernet2id(dev_net(dev), net);
+	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid);
 
 	/*
 	 *	Flush the unicast and multicast chains
diff --git a/net/core/filter.c b/net/core/filter.c
index 74b8c91fb5f4..b7e8caa1e790 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1406,7 +1406,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
 {
 	int err = __bpf_try_make_writable(skb, write_len);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return err;
 }
 
@@ -1966,7 +1966,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
 	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -1988,7 +1988,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
 	ret = skb_vlan_pop(skb);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2182,7 +2182,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
 	 * need to be verified first.
 	 */
 	ret = bpf_skb_proto_xlat(skb, proto);
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2307,7 +2307,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
 	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
 		       bpf_skb_net_grow(skb, len_diff_abs);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2398,7 +2398,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
 			skb_gso_reset(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2438,7 +2438,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
 		skb_reset_mac_header(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return 0;
 }
 
@@ -2451,14 +2451,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
+{
+	return xdp_data_meta_unsupported(xdp) ? 0 :
+	       xdp->data - xdp->data_meta;
+}
+
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	unsigned long metalen = xdp_get_metalen(xdp);
+	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
 
-	if (unlikely(data < xdp->data_hard_start ||
+	if (unlikely(data < data_start ||
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	if (metalen)
+		memmove(xdp->data_meta + offset,
+			xdp->data_meta, metalen);
+	xdp->data_meta += offset;
 	xdp->data = data;
 
 	return 0;
@@ -2472,6 +2484,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
+{
+	void *meta = xdp->data_meta + offset;
+	unsigned long metalen = xdp->data - meta;
+
+	if (xdp_data_meta_unsupported(xdp))
+		return -ENOTSUPP;
+	if (unlikely(meta < xdp->data_hard_start ||
+		     meta > xdp->data))
+		return -EINVAL;
+	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
+		     (metalen > 32)))
+		return -EACCES;
+
+	xdp->data_meta = meta;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
+	.func		= bpf_xdp_adjust_meta,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static int __bpf_tx_xdp(struct net_device *dev,
 			struct bpf_map *map,
 			struct xdp_buff *xdp,
@@ -2696,7 +2735,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_clone_redirect ||
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head)
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta)
 		return true;
 
 	return false;
@@ -3292,6 +3332,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
+	case BPF_FUNC_xdp_adjust_meta:
+		return &bpf_xdp_adjust_meta_proto;
 	case BPF_FUNC_redirect:
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
@@ -3422,6 +3464,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 	case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
 	case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		if (size != size_default)
 			return false;
@@ -3448,6 +3491,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
@@ -3472,6 +3516,7 @@ static bool lwt_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 		return false;
 	}
 
@@ -3590,6 +3635,9 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3623,6 +3671,9 @@ static bool xdp_is_valid_access(int off, int size,
 	case offsetof(struct xdp_md, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case offsetof(struct xdp_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3681,6 +3732,12 @@ static bool sk_skb_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
 				   struct bpf_insn_access_aux *info)
 {
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		return false;
+	}
+
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case bpf_ctx_range(struct __sk_buff, mark):
@@ -3693,8 +3750,6 @@ static bool sk_skb_is_valid_access(int off, int size,
 	}
 
 	switch (off) {
-	case bpf_ctx_range(struct __sk_buff, tc_classid):
-		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
@@ -3851,6 +3906,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct sk_buff, data));
 		break;
 
+	case offsetof(struct __sk_buff, data_meta):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_meta);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_meta);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
+
 	case offsetof(struct __sk_buff, data_end):
 		off  = si->off;
 		off -= offsetof(struct __sk_buff, data_end);
@@ -4099,6 +4163,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, data_meta));
+		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
 				      si->dst_reg, si->src_reg,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0a977373d003..1f5caafb4492 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -5,6 +5,7 @@
 #include <linux/ipv6.h>
 #include <linux/if_vlan.h>
 #include <net/dsa.h>
+#include <net/dst_metadata.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/gre.h>
@@ -115,6 +116,102 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
+static void
+skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
+				   struct flow_dissector *flow_dissector,
+				   void *target_container)
+{
+	struct flow_dissector_key_control *ctrl;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL))
+		return;
+
+	ctrl = skb_flow_dissector_target(flow_dissector,
+					 FLOW_DISSECTOR_KEY_ENC_CONTROL,
+					 target_container);
+	ctrl->addr_type = type;
+}
+
+static void
+__skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
+			       struct flow_dissector *flow_dissector,
+			       void *target_container)
+{
+	struct ip_tunnel_info *info;
+	struct ip_tunnel_key *key;
+
+	/* A quick check to see if there might be something to do. */
+	if (!dissector_uses_key(flow_dissector,
+				FLOW_DISSECTOR_KEY_ENC_KEYID) &&
+	    !dissector_uses_key(flow_dissector,
+				FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) &&
+	    !dissector_uses_key(flow_dissector,
+				FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) &&
+	    !dissector_uses_key(flow_dissector,
+				FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
+	    !dissector_uses_key(flow_dissector,
+				FLOW_DISSECTOR_KEY_ENC_PORTS))
+		return;
+
+	info = skb_tunnel_info(skb);
+	if (!info)
+		return;
+
+	key = &info->key;
+
+	switch (ip_tunnel_info_af(info)) {
+	case AF_INET:
+		skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						   flow_dissector,
+						   target_container);
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
+			struct flow_dissector_key_ipv4_addrs *ipv4;
+
+			ipv4 = skb_flow_dissector_target(flow_dissector,
+							 FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+							 target_container);
+			ipv4->src = key->u.ipv4.src;
+			ipv4->dst = key->u.ipv4.dst;
+		}
+		break;
+	case AF_INET6:
+		skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						   flow_dissector,
+						   target_container);
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
+			struct flow_dissector_key_ipv6_addrs *ipv6;
+
+			ipv6 = skb_flow_dissector_target(flow_dissector,
+							 FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
+							 target_container);
+			ipv6->src = key->u.ipv6.src;
+			ipv6->dst = key->u.ipv6.dst;
+		}
+		break;
+	}
+
+	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
+		struct flow_dissector_key_keyid *keyid;
+
+		keyid = skb_flow_dissector_target(flow_dissector,
+						  FLOW_DISSECTOR_KEY_ENC_KEYID,
+						  target_container);
+		keyid->keyid = tunnel_id_to_key32(key->tun_id);
+	}
+
+	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
+		struct flow_dissector_key_ports *tp;
+
+		tp = skb_flow_dissector_target(flow_dissector,
+					       FLOW_DISSECTOR_KEY_ENC_PORTS,
+					       target_container);
+		tp->src = key->tp_src;
+		tp->dst = key->tp_dst;
+	}
+}
+
 static enum flow_dissect_ret
 __skb_flow_dissect_mpls(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
@@ -478,6 +575,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
+	__skb_flow_dissect_tunnel_info(skb, flow_dissector,
+				       target_container);
+
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 1307731ddfe4..e7e626fb87bb 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	 */
 	preempt_disable();
 	rcu_read_lock();
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 	rcu_read_unlock();
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 16a1a4c4eb57..6ea3a1a7f36a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -457,7 +457,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
 				     const void *pkey)
 {
 	struct neighbour *n;
-	int key_len = tbl->key_len;
+	unsigned int key_len = tbl->key_len;
 	u32 hash_val;
 	struct neigh_hash_table *nht;
 
@@ -488,7 +488,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
 				 struct net_device *dev, bool want_ref)
 {
 	u32 hash_val;
-	int key_len = tbl->key_len;
+	unsigned int key_len = tbl->key_len;
 	int error;
 	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
 	struct neigh_hash_table *nht;
@@ -572,7 +572,7 @@ out_neigh_release:
 }
 EXPORT_SYMBOL(__neigh_create);
 
-static u32 pneigh_hash(const void *pkey, int key_len)
+static u32 pneigh_hash(const void *pkey, unsigned int key_len)
 {
 	u32 hash_val = *(u32 *)(pkey + key_len - 4);
 	hash_val ^= (hash_val >> 16);
@@ -585,7 +585,7 @@ static u32 pneigh_hash(const void *pkey, int key_len)
 static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
 					      struct net *net,
 					      const void *pkey,
-					      int key_len,
+					      unsigned int key_len,
 					      struct net_device *dev)
 {
 	while (n) {
@@ -601,7 +601,7 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
 struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
 		struct net *net, const void *pkey, struct net_device *dev)
 {
-	int key_len = tbl->key_len;
+	unsigned int key_len = tbl->key_len;
 	u32 hash_val = pneigh_hash(pkey, key_len);
 
 	return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
@@ -614,7 +614,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
 				    struct net_device *dev, int creat)
 {
 	struct pneigh_entry *n;
-	int key_len = tbl->key_len;
+	unsigned int key_len = tbl->key_len;
 	u32 hash_val = pneigh_hash(pkey, key_len);
 
 	read_lock_bh(&tbl->lock);
@@ -659,7 +659,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 		  struct net_device *dev)
 {
 	struct pneigh_entry *n, **np;
-	int key_len = tbl->key_len;
+	unsigned int key_len = tbl->key_len;
 	u32 hash_val = pneigh_hash(pkey, key_len);
 
 	write_lock_bh(&tbl->lock);
@@ -1662,7 +1662,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tbl == NULL)
 		return -EAFNOSUPPORT;
 
-	if (nla_len(dst_attr) < tbl->key_len)
+	if (nla_len(dst_attr) < (int)tbl->key_len)
 		goto out;
 
 	if (ndm->ndm_flags & NTF_PROXY) {
@@ -1730,7 +1730,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tbl == NULL)
 		return -EAFNOSUPPORT;
 
-	if (nla_len(tb[NDA_DST]) < tbl->key_len)
+	if (nla_len(tb[NDA_DST]) < (int)tbl->key_len)
 		goto out;
 	dst = nla_data(tb[NDA_DST]);
 	lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 927a6dcbad96..51d5836d8fb9 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
 	if (len >  0 && buf[len - 1] == '\n')
 		--count;
 
-	if (!rtnl_trylock())
-		return restart_syscall();
 	ret = dev_set_alias(netdev, buf, count);
-	rtnl_unlock();
 
 	return ret < 0 ? ret : len;
 }
@@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
 	const struct net_device *netdev = to_net_dev(dev);
+	char tmp[IFALIASZ];
 	ssize_t ret = 0;
 
-	if (!rtnl_trylock())
-		return restart_syscall();
-	if (netdev->ifalias)
-		ret = sprintf(buf, "%s\n", netdev->ifalias);
-	rtnl_unlock();
+	ret = dev_get_alias(netdev, tmp, sizeof(tmp));
+	if (ret > 0)
+		ret = sprintf(buf, "%s\n", tmp);
 	return ret;
 }
 static DEVICE_ATTR_RW(ifalias);
@@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d)
 
 	BUG_ON(dev->reg_state != NETREG_RELEASED);
 
-	kfree(dev->ifalias);
+	/* no need to wait for rcu grace period:
+	 * device is dead and about to be freed.
+	 */
+	kfree(rcu_access_pointer(dev->ifalias));
 	netdev_freemem(dev);
 }
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d4bcdcc68e92..e84d108cfee4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -476,25 +476,13 @@ void rtnl_af_register(struct rtnl_af_ops *ops)
 EXPORT_SYMBOL_GPL(rtnl_af_register);
 
 /**
- * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
- * @ops: struct rtnl_af_ops * to unregister
- *
- * The caller must hold the rtnl_mutex.
- */
-void __rtnl_af_unregister(struct rtnl_af_ops *ops)
-{
-	list_del(&ops->list);
-}
-EXPORT_SYMBOL_GPL(__rtnl_af_unregister);
-
-/**
  * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
  * @ops: struct rtnl_af_ops * to unregister
  */
 void rtnl_af_unregister(struct rtnl_af_ops *ops)
 {
 	rtnl_lock();
-	__rtnl_af_unregister(ops);
+	list_del(&ops->list);
 	rtnl_unlock();
 }
 EXPORT_SYMBOL_GPL(rtnl_af_unregister);
@@ -522,11 +510,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev,
 static bool rtnl_have_link_slave_info(const struct net_device *dev)
 {
 	struct net_device *master_dev;
+	bool ret = false;
 
-	master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+	rcu_read_lock();
+
+	master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
 	if (master_dev && master_dev->rtnl_link_ops)
-		return true;
-	return false;
+		ret = true;
+	rcu_read_unlock();
+	return ret;
 }
 
 static int rtnl_link_slave_info_fill(struct sk_buff *skb,
@@ -923,6 +915,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
 	       + rtnl_xdp_size() /* IFLA_XDP */
 	       + nla_total_size(4)  /* IFLA_EVENT */
+	       + nla_total_size(4)  /* IFLA_NEW_NETNSID */
 	       + nla_total_size(1); /* IFLA_PROTO_DOWN */
 
 }
@@ -1211,6 +1204,36 @@ nla_put_vfinfo_failure:
 	return -EMSGSIZE;
 }
 
+static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
+					   struct net_device *dev,
+					   u32 ext_filter_mask)
+{
+	struct nlattr *vfinfo;
+	int i, num_vfs;
+
+	if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0))
+		return 0;
+
+	num_vfs = dev_num_vf(dev->dev.parent);
+	if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs))
+		return -EMSGSIZE;
+
+	if (!dev->netdev_ops->ndo_get_vf_config)
+		return 0;
+
+	vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+	if (!vfinfo)
+		return -EMSGSIZE;
+
+	for (i = 0; i < num_vfs; i++) {
+		if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
+			return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, vfinfo);
+	return 0;
+}
+
 static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 {
 	struct rtnl_link_ifmap map;
@@ -1307,16 +1330,67 @@ static u32 rtnl_get_event(unsigned long event)
 	return rtnl_event_type;
 }
 
+static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct net_device *upper_dev;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	upper_dev = netdev_master_upper_dev_get_rcu(dev);
+	if (upper_dev)
+		ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex);
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev)
+{
+	int ifindex = dev_get_iflink(dev);
+
+	if (dev->ifindex == ifindex)
+		return 0;
+
+	return nla_put_u32(skb, IFLA_LINK, ifindex);
+}
+
+static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
+					      struct net_device *dev)
+{
+	char buf[IFALIASZ];
+	int ret;
+
+	ret = dev_get_alias(dev, buf, sizeof(buf));
+	return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0;
+}
+
+static int rtnl_fill_link_netnsid(struct sk_buff *skb,
+				  const struct net_device *dev)
+{
+	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) {
+		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
+
+		if (!net_eq(dev_net(dev), link_net)) {
+			int id = peernet2id_alloc(dev_net(dev), link_net);
+
+			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
+				return -EMSGSIZE;
+		}
+	}
+
+	return 0;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
-			    u32 event)
+			    u32 event, int *new_nsid)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
 	struct nlattr *af_spec;
 	struct rtnl_af_ops *af_ops;
-	struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
 
 	ASSERT_RTNL();
 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
@@ -1345,15 +1419,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 #ifdef CONFIG_RPS
 	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
 #endif
-	    (dev->ifindex != dev_get_iflink(dev) &&
-	     nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
-	    (upper_dev &&
-	     nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) ||
+	    nla_put_iflink(skb, dev) ||
+	    put_master_ifindex(skb, dev) ||
 	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
 	    (dev->qdisc &&
 	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
-	    (dev->ifalias &&
-	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+	    nla_put_ifalias(skb, dev) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
 			atomic_read(&dev->carrier_changes)) ||
 	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
@@ -1385,27 +1456,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (rtnl_fill_stats(skb, dev))
 		goto nla_put_failure;
 
-	if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) &&
-	    nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)))
+	if (rtnl_fill_vf(skb, dev, ext_filter_mask))
 		goto nla_put_failure;
 
-	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent &&
-	    ext_filter_mask & RTEXT_FILTER_VF) {
-		int i;
-		struct nlattr *vfinfo;
-		int num_vfs = dev_num_vf(dev->dev.parent);
-
-		vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
-		if (!vfinfo)
-			goto nla_put_failure;
-		for (i = 0; i < num_vfs; i++) {
-			if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
-				goto nla_put_failure;
-		}
-
-		nla_nest_end(skb, vfinfo);
-	}
-
 	if (rtnl_port_fill(skb, dev, ext_filter_mask))
 		goto nla_put_failure;
 
@@ -1417,17 +1470,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			goto nla_put_failure;
 	}
 
-	if (dev->rtnl_link_ops &&
-	    dev->rtnl_link_ops->get_link_net) {
-		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
-
-		if (!net_eq(dev_net(dev), link_net)) {
-			int id = peernet2id_alloc(dev_net(dev), link_net);
+	if (rtnl_fill_link_netnsid(skb, dev))
+		goto nla_put_failure;
 
-			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
-				goto nla_put_failure;
-		}
-	}
+	if (new_nsid &&
+	    nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
+		goto nla_put_failure;
 
 	if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
 		goto nla_put_failure;
@@ -1658,7 +1706,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
-					       ext_filter_mask, 0);
+					       ext_filter_mask, 0, NULL);
 
 			if (err < 0) {
 				if (likely(skb->len))
@@ -1909,7 +1957,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
 	return err;
 }
 
-static int do_set_master(struct net_device *dev, int ifindex)
+static int do_set_master(struct net_device *dev, int ifindex,
+			 struct netlink_ext_ack *extack)
 {
 	struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
 	const struct net_device_ops *ops;
@@ -1934,7 +1983,7 @@ static int do_set_master(struct net_device *dev, int ifindex)
 			return -EINVAL;
 		ops = upper_dev->netdev_ops;
 		if (ops->ndo_add_slave) {
-			err = ops->ndo_add_slave(upper_dev, dev);
+			err = ops->ndo_add_slave(upper_dev, dev, extack);
 			if (err)
 				return err;
 		} else {
@@ -2067,7 +2116,7 @@ static int do_setlink(const struct sk_buff *skb,
 	}
 
 	if (tb[IFLA_MASTER]) {
-		err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+		err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
 		if (err)
 			goto errout;
 		status |= DO_SETLINK_MODIFIED;
@@ -2576,12 +2625,6 @@ replay:
 					return err;
 				slave_data = slave_attr;
 			}
-			if (m_ops->slave_validate) {
-				err = m_ops->slave_validate(tb, slave_data,
-							    extack);
-				if (err < 0)
-					return err;
-			}
 		}
 
 		if (dev) {
@@ -2711,7 +2754,8 @@ replay:
 				goto out_unregister;
 		}
 		if (tb[IFLA_MASTER]) {
-			err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+			err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]),
+					    extack);
 			if (err)
 				goto out_unregister;
 		}
@@ -2771,7 +2815,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -ENOBUFS;
 
 	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
-			       nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0);
+			       nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -2856,7 +2900,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned int change,
-				       u32 event, gfp_t flags)
+				       u32 event, gfp_t flags, int *new_nsid)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
@@ -2867,7 +2911,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 	if (skb == NULL)
 		goto errout;
 
-	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event);
+	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event,
+			       new_nsid);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -2890,14 +2935,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
 
 static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 			       unsigned int change, u32 event,
-			       gfp_t flags)
+			       gfp_t flags, int *new_nsid)
 {
 	struct sk_buff *skb;
 
 	if (dev->reg_state != NETREG_REGISTERED)
 		return;
 
-	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags);
+	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags);
 }
@@ -2905,10 +2950,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 		  gfp_t flags)
 {
-	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags);
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL);
 }
 EXPORT_SYMBOL(rtmsg_ifinfo);
 
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+			 gfp_t flags, int *new_nsid)
+{
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
+			   new_nsid);
+}
+
 static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
 				   struct net_device *dev,
 				   u8 *addr, u16 vid, u32 pid, u32 seq,
@@ -4287,7 +4339,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
 		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
-				   GFP_KERNEL);
+				   GFP_KERNEL, NULL);
 		break;
 	default:
 		break;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 16982de649b9..822a90e56aea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 
+	skb_metadata_clear(skb);
+
 	/* It is not generally safe to change skb->truesize.
 	 * For the moment, we really care of rx path, or
 	 * when skb is orphaned (not attached to a socket).
@@ -2848,12 +2850,15 @@ EXPORT_SYMBOL(skb_queue_purge);
  */
 void skb_rbtree_purge(struct rb_root *root)
 {
-	struct sk_buff *skb, *next;
+	struct rb_node *p = rb_first(root);
 
-	rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
-		kfree_skb(skb);
+	while (p) {
+		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 
-	*root = RB_ROOT;
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, root);
+		kfree_skb(skb);
+	}
 }
 
 /**
@@ -4762,6 +4767,7 @@ EXPORT_SYMBOL(kfree_skb_partial);
 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 		      bool *fragstolen, int *delta_truesize)
 {
+	struct skb_shared_info *to_shinfo, *from_shinfo;
 	int i, delta, len = from->len;
 
 	*fragstolen = false;
@@ -4776,7 +4782,9 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 		return true;
 	}
 
-	if (skb_has_frag_list(to) || skb_has_frag_list(from))
+	to_shinfo = skb_shinfo(to);
+	from_shinfo = skb_shinfo(from);
+	if (to_shinfo->frag_list || from_shinfo->frag_list)
 		return false;
 	if (skb_zcopy(to) || skb_zcopy(from))
 		return false;
@@ -4785,8 +4793,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 		struct page *page;
 		unsigned int offset;
 
-		if (skb_shinfo(to)->nr_frags +
-		    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+		if (to_shinfo->nr_frags +
+		    from_shinfo->nr_frags >= MAX_SKB_FRAGS)
 			return false;
 
 		if (skb_head_is_locked(from))
@@ -4797,12 +4805,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 		page = virt_to_head_page(from->head);
 		offset = from->data - (unsigned char *)page_address(page);
 
-		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
+		skb_fill_page_desc(to, to_shinfo->nr_frags,
 				   page, offset, skb_headlen(from));
 		*fragstolen = true;
 	} else {
-		if (skb_shinfo(to)->nr_frags +
-		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+		if (to_shinfo->nr_frags +
+		    from_shinfo->nr_frags > MAX_SKB_FRAGS)
 			return false;
 
 		delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
@@ -4810,19 +4818,19 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 
 	WARN_ON_ONCE(delta < len);
 
-	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
-	       skb_shinfo(from)->frags,
-	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
-	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+	memcpy(to_shinfo->frags + to_shinfo->nr_frags,
+	       from_shinfo->frags,
+	       from_shinfo->nr_frags * sizeof(skb_frag_t));
+	to_shinfo->nr_frags += from_shinfo->nr_frags;
 
 	if (!skb_cloned(from))
-		skb_shinfo(from)->nr_frags = 0;
+		from_shinfo->nr_frags = 0;
 
 	/* if the skb is not cloned this does nothing
 	 * since we set nr_frags to 0.
 	 */
-	for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
-		skb_frag_ref(from, i);
+	for (i = 0; i < from_shinfo->nr_frags; i++)
+		__skb_frag_ref(&from_shinfo->frags[i]);
 
 	to->truesize += delta;
 	to->len += len;
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index fcce25da937c..2e7ac8bab19d 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,6 +1,6 @@
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o
+dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o
 
 # tagging formats
 dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 03c58b0eb082..51ca2a524a27 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -112,34 +112,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol)
 	return ops;
 }
 
-int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp)
-{
-	struct dsa_switch *ds = cpu_dp->ds;
-	struct net_device *master;
-	struct ethtool_ops *cpu_ops;
-
-	master = cpu_dp->netdev;
-
-	cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL);
-	if (!cpu_ops)
-		return -ENOMEM;
-
-	memcpy(&cpu_dp->ethtool_ops, master->ethtool_ops,
-	       sizeof(struct ethtool_ops));
-	cpu_dp->orig_ethtool_ops = master->ethtool_ops;
-	memcpy(cpu_ops, &cpu_dp->ethtool_ops,
-	       sizeof(struct ethtool_ops));
-	dsa_cpu_port_ethtool_init(cpu_ops);
-	master->ethtool_ops = cpu_ops;
-
-	return 0;
-}
-
-void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp)
-{
-	cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops;
-}
-
 void dsa_cpu_dsa_destroy(struct dsa_port *port)
 {
 	struct device_node *port_dn = port->dn;
@@ -188,12 +160,12 @@ EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
 static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 			  struct packet_type *pt, struct net_device *unused)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
 	struct sk_buff *nskb = NULL;
 	struct pcpu_sw_netstats *s;
 	struct dsa_slave_priv *p;
 
-	if (unlikely(dst == NULL)) {
+	if (unlikely(!cpu_dp)) {
 		kfree_skb(skb);
 		return 0;
 	}
@@ -202,7 +174,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!skb)
 		return 0;
 
-	nskb = dst->rcv(skb, dev, pt);
+	nskb = cpu_dp->rcv(skb, dev, pt);
 	if (!nskb) {
 		kfree_skb(skb);
 		return 0;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 873af0108e24..54ed054777bd 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -433,18 +433,17 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst)
 			return err;
 	}
 
-	if (dst->cpu_dp) {
-		err = dsa_cpu_port_ethtool_setup(dst->cpu_dp);
-		if (err)
-			return err;
-	}
-
 	/* If we use a tagging format that doesn't have an ethertype
 	 * field, make sure that all packets from this point on get
 	 * sent to the tag format's receive function.
 	 */
 	wmb();
-	dst->cpu_dp->netdev->dsa_ptr = dst;
+	dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp;
+
+	err = dsa_master_ethtool_setup(dst->cpu_dp->netdev);
+	if (err)
+		return err;
+
 	dst->applied = true;
 
 	return 0;
@@ -458,6 +457,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
 	if (!dst->applied)
 		return;
 
+	dsa_master_ethtool_restore(dst->cpu_dp->netdev);
+
 	dst->cpu_dp->netdev->dsa_ptr = NULL;
 
 	/* If we used a tagging format that doesn't have an ethertype
@@ -474,10 +475,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst)
 		dsa_ds_unapply(dst, ds);
 	}
 
-	if (dst->cpu_dp) {
-		dsa_cpu_port_ethtool_restore(dst->cpu_dp);
-		dst->cpu_dp = NULL;
-	}
+	dst->cpu_dp = NULL;
 
 	pr_info("DSA: tree %d unapplied\n", dst->tree);
 	dst->applied = false;
@@ -487,6 +485,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index,
 			 struct dsa_switch_tree *dst,
 			 struct dsa_switch *ds)
 {
+	const struct dsa_device_ops *tag_ops;
 	enum dsa_tag_protocol tag_protocol;
 	struct net_device *ethernet_dev;
 	struct device_node *ethernet;
@@ -516,14 +515,18 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index,
 	ds->cpu_port_mask |= BIT(index);
 
 	tag_protocol = ds->ops->get_tag_protocol(ds);
-	dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
-	if (IS_ERR(dst->tag_ops)) {
+	tag_ops = dsa_resolve_tag_protocol(tag_protocol);
+	if (IS_ERR(tag_ops)) {
 		dev_warn(ds->dev, "No tagger for this switch\n");
 		ds->cpu_port_mask &= ~BIT(index);
-		return PTR_ERR(dst->tag_ops);
+		return PTR_ERR(tag_ops);
 	}
 
-	dst->rcv = dst->tag_ops->rcv;
+	dst->cpu_dp->tag_ops = tag_ops;
+
+	/* Make a few copies for faster access in master receive hot path */
+	dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv;
+	dst->cpu_dp->dst = dst;
 
 	return 0;
 }
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 9c3eeb72462d..2850077cc9cc 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -66,7 +66,7 @@ struct dsa_notifier_vlan_info {
 };
 
 struct dsa_slave_priv {
-	/* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */
+	/* Copy of CPU port xmit for faster access in slave transmit hot path */
 	struct sk_buff *	(*xmit)(struct sk_buff *skb,
 					struct net_device *dev);
 
@@ -79,7 +79,6 @@ struct dsa_slave_priv {
 	 * The phylib phy_device pointer for the PHY connected
 	 * to this port.
 	 */
-	struct phy_device	*phy;
 	phy_interface_t		phy_interface;
 	int			old_link;
 	int			old_pause;
@@ -97,8 +96,6 @@ struct dsa_slave_priv {
 int dsa_cpu_dsa_setup(struct dsa_port *port);
 void dsa_cpu_dsa_destroy(struct dsa_port *dport);
 const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
-int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp);
-void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp);
 bool dsa_schedule_work(struct work_struct *work);
 
 /* legacy.c */
@@ -112,10 +109,35 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 		       struct net_device *dev,
 		       const unsigned char *addr, u16 vid);
 
+/* master.c */
+int dsa_master_ethtool_setup(struct net_device *dev);
+void dsa_master_ethtool_restore(struct net_device *dev);
+
+static inline struct net_device *dsa_master_get_slave(struct net_device *dev,
+						      int device, int port)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	struct dsa_switch *ds;
+
+	if (device < 0 || device >= DSA_MAX_SWITCHES)
+		return NULL;
+
+	ds = dst->ds[device];
+	if (!ds)
+		return NULL;
+
+	if (port < 0 || port >= ds->num_ports)
+		return NULL;
+
+	return ds->ports[port].netdev;
+}
+
 /* port.c */
 int dsa_port_set_state(struct dsa_port *dp, u8 state,
 		       struct switchdev_trans *trans);
-void dsa_port_set_state_now(struct dsa_port *dp, u8 state);
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
+void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy);
 int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br);
 void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
 int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
@@ -126,6 +148,7 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
 		     u16 vid);
 int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
 		     u16 vid);
+int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data);
 int dsa_port_mdb_add(struct dsa_port *dp,
 		     const struct switchdev_obj_port_mdb *mdb,
 		     struct switchdev_trans *trans);
@@ -139,7 +162,6 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 /* slave.c */
 extern const struct dsa_device_ops notag_netdev_ops;
 void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops);
 int dsa_slave_create(struct dsa_port *port, const char *name);
 void dsa_slave_destroy(struct net_device *slave_dev);
 int dsa_slave_suspend(struct net_device *slave_dev);
@@ -180,9 +202,4 @@ static inline struct net_device *dsa_master_netdev(struct dsa_slave_priv *p)
 	return p->dp->cpu_dp->netdev;
 }
 
-static inline struct dsa_port *dsa_get_cpu_port(struct dsa_switch_tree *dst)
-{
-	return dst->cpu_dp;
-}
-
 #endif
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 91e6f7981d39..19ff6e0a21dc 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -144,14 +144,19 @@ static int dsa_switch_setup_one(struct dsa_switch *ds,
 	 * switch.
 	 */
 	if (dst->cpu_dp->ds == ds) {
+		const struct dsa_device_ops *tag_ops;
 		enum dsa_tag_protocol tag_protocol;
 
 		tag_protocol = ops->get_tag_protocol(ds);
-		dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol);
-		if (IS_ERR(dst->tag_ops))
-			return PTR_ERR(dst->tag_ops);
+		tag_ops = dsa_resolve_tag_protocol(tag_protocol);
+		if (IS_ERR(tag_ops))
+			return PTR_ERR(tag_ops);
 
-		dst->rcv = dst->tag_ops->rcv;
+		dst->cpu_dp->tag_ops = tag_ops;
+
+		/* Few copies for faster access in master receive hot path */
+		dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv;
+		dst->cpu_dp->dst = dst;
 	}
 
 	memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable));
@@ -206,10 +211,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds,
 		netdev_err(master, "[%d] : can't configure CPU and DSA ports\n",
 			   index);
 
-	ret = dsa_cpu_port_ethtool_setup(ds->dst->cpu_dp);
-	if (ret)
-		return ret;
-
 	return 0;
 }
 
@@ -604,9 +605,9 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
 	 * sent to the tag format's receive function.
 	 */
 	wmb();
-	dev->dsa_ptr = dst;
+	dev->dsa_ptr = dst->cpu_dp;
 
-	return 0;
+	return dsa_master_ethtool_setup(dst->cpu_dp->netdev);
 }
 
 static int dsa_probe(struct platform_device *pdev)
@@ -671,6 +672,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 {
 	int i;
 
+	dsa_master_ethtool_restore(dst->cpu_dp->netdev);
+
 	dst->cpu_dp->netdev->dsa_ptr = NULL;
 
 	/* If we used a tagging format that doesn't have an ethertype
@@ -686,8 +689,6 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
 			dsa_switch_destroy(ds);
 	}
 
-	dsa_cpu_port_ethtool_restore(dst->cpu_dp);
-
 	dev_put(dst->cpu_dp->netdev);
 }
 
diff --git a/net/dsa/master.c b/net/dsa/master.c
new file mode 100644
index 000000000000..5f3f57e372e0
--- /dev/null
+++ b/net/dsa/master.c
@@ -0,0 +1,117 @@
+/*
+ * Handling of a master device, switching frames via its switch fabric CPU port
+ *
+ * Copyright (c) 2017 Savoir-faire Linux Inc.
+ *	Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include "dsa_priv.h"
+
+static void dsa_master_get_ethtool_stats(struct net_device *dev,
+					 struct ethtool_stats *stats,
+					 uint64_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int count = 0;
+
+	if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+		count = ops->get_sset_count(dev, ETH_SS_STATS);
+		ops->get_ethtool_stats(dev, stats, data);
+	}
+
+	if (ds->ops->get_ethtool_stats)
+		ds->ops->get_ethtool_stats(ds, port, data + count);
+}
+
+static int dsa_master_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int count = 0;
+
+	if (ops && ops->get_sset_count)
+		count += ops->get_sset_count(dev, sset);
+
+	if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
+		count += ds->ops->get_sset_count(ds);
+
+	return count;
+}
+
+static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
+				   uint8_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int len = ETH_GSTRING_LEN;
+	int mcount = 0, count;
+	unsigned int i;
+	uint8_t pfx[4];
+	uint8_t *ndata;
+
+	snprintf(pfx, sizeof(pfx), "p%.2d", port);
+	/* We do not want to be NULL-terminated, since this is a prefix */
+	pfx[sizeof(pfx) - 1] = '_';
+
+	if (ops && ops->get_sset_count && ops->get_strings) {
+		mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+		ops->get_strings(dev, stringset, data);
+	}
+
+	if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
+		ndata = data + mcount * len;
+		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
+		 * the output after to prepend our CPU port prefix we
+		 * constructed earlier
+		 */
+		ds->ops->get_strings(ds, port, ndata);
+		count = ds->ops->get_sset_count(ds);
+		for (i = 0; i < count; i++) {
+			memmove(ndata + (i * len + sizeof(pfx)),
+				ndata + i * len, len - sizeof(pfx));
+			memcpy(ndata + i * len, pfx, sizeof(pfx));
+		}
+	}
+}
+
+int dsa_master_ethtool_setup(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_switch *ds = cpu_dp->ds;
+	struct ethtool_ops *ops;
+
+	ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
+	if (!ops)
+		return -ENOMEM;
+
+	cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
+	if (cpu_dp->orig_ethtool_ops)
+		memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
+
+	ops->get_sset_count = dsa_master_get_sset_count;
+	ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
+	ops->get_strings = dsa_master_get_strings;
+
+	dev->ethtool_ops = ops;
+
+	return 0;
+}
+
+void dsa_master_ethtool_restore(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+
+	dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
+	cpu_dp->orig_ethtool_ops = NULL;
+}
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 659676ba3f8b..72c8dbd3d3f2 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -56,7 +56,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state,
 	return 0;
 }
 
-void dsa_port_set_state_now(struct dsa_port *dp, u8 state)
+static void dsa_port_set_state_now(struct dsa_port *dp, u8 state)
 {
 	int err;
 
@@ -65,6 +65,35 @@ void dsa_port_set_state_now(struct dsa_port *dp, u8 state)
 		pr_err("DSA: failed to set STP state %u (%d)\n", state, err);
 }
 
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
+{
+	u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING;
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+	int err;
+
+	if (ds->ops->port_enable) {
+		err = ds->ops->port_enable(ds, port, phy);
+		if (err)
+			return err;
+	}
+
+	dsa_port_set_state_now(dp, stp_state);
+
+	return 0;
+}
+
+void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy)
+{
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	dsa_port_set_state_now(dp, BR_STATE_DISABLED);
+
+	if (ds->ops->port_disable)
+		ds->ops->port_disable(ds, port, phy);
+}
+
 int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br)
 {
 	struct dsa_notifier_bridge_info info = {
@@ -173,6 +202,17 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
 	return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info);
 }
 
+int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data)
+{
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->port_fdb_dump)
+		return -EOPNOTSUPP;
+
+	return ds->ops->port_fdb_dump(ds, port, cb, data);
+}
+
 int dsa_port_mdb_add(struct dsa_port *dp,
 		     const struct switchdev_obj_port_mdb *mdb,
 		     struct switchdev_trans *trans)
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 865e29e62bad..fb2954ff198c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -73,9 +73,7 @@ static int dsa_slave_open(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_port *dp = p->dp;
-	struct dsa_switch *ds = dp->ds;
 	struct net_device *master = dsa_master_netdev(p);
-	u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING;
 	int err;
 
 	if (!(master->flags & IFF_UP))
@@ -98,16 +96,12 @@ static int dsa_slave_open(struct net_device *dev)
 			goto clear_allmulti;
 	}
 
-	if (ds->ops->port_enable) {
-		err = ds->ops->port_enable(ds, p->dp->index, p->phy);
-		if (err)
-			goto clear_promisc;
-	}
-
-	dsa_port_set_state_now(p->dp, stp_state);
+	err = dsa_port_enable(dp, dev->phydev);
+	if (err)
+		goto clear_promisc;
 
-	if (p->phy)
-		phy_start(p->phy);
+	if (dev->phydev)
+		phy_start(dev->phydev);
 
 	return 0;
 
@@ -128,10 +122,12 @@ static int dsa_slave_close(struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct net_device *master = dsa_master_netdev(p);
-	struct dsa_switch *ds = p->dp->ds;
+	struct dsa_port *dp = p->dp;
 
-	if (p->phy)
-		phy_stop(p->phy);
+	if (dev->phydev)
+		phy_stop(dev->phydev);
+
+	dsa_port_disable(dp, dev->phydev);
 
 	dev_mc_unsync(master, dev);
 	dev_uc_unsync(master, dev);
@@ -143,11 +139,6 @@ static int dsa_slave_close(struct net_device *dev)
 	if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
 		dev_uc_del(master, dev->dev_addr);
 
-	if (ds->ops->port_disable)
-		ds->ops->port_disable(ds, p->dp->index, p->phy);
-
-	dsa_port_set_state_now(p->dp, BR_STATE_DISABLED);
-
 	return 0;
 }
 
@@ -263,27 +254,20 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	};
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_port *dp = p->dp;
-	struct dsa_switch *ds = dp->ds;
 	int err;
 
-	if (!ds->ops->port_fdb_dump)
-		return -EOPNOTSUPP;
-
-	err = ds->ops->port_fdb_dump(ds, dp->index,
-				     dsa_slave_port_fdb_do_dump,
-				     &dump);
+	err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump);
 	*idx = dump.idx;
+
 	return err;
 }
 
 static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	if (p->phy != NULL)
-		return phy_mii_ioctl(p->phy, ifr, cmd);
+	if (!dev->phydev)
+		return -ENODEV;
 
-	return -EOPNOTSUPP;
+	return phy_mii_ioctl(dev->phydev, ifr, cmd);
 }
 
 static int dsa_slave_port_attr_set(struct net_device *dev,
@@ -385,10 +369,12 @@ static int dsa_slave_port_attr_get(struct net_device *dev,
 	return 0;
 }
 
-static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p,
-					       struct sk_buff *skb)
+static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
+						     struct sk_buff *skb)
 {
 #ifdef CONFIG_NET_POLL_CONTROLLER
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
 	if (p->netpoll)
 		netpoll_send_skb(p->netpoll, skb);
 #else
@@ -422,7 +408,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * tag to be successfully transmitted
 	 */
 	if (unlikely(netpoll_tx_running(dev)))
-		return dsa_netpoll_send_skb(p, nskb);
+		return dsa_slave_netpoll_send_skb(dev, nskb);
 
 	/* Queue the SKB for transmission on the parent interface, but
 	 * do not modify its EtherType
@@ -434,31 +420,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* ethtool operations *******************************************************/
-static int
-dsa_slave_get_link_ksettings(struct net_device *dev,
-			     struct ethtool_link_ksettings *cmd)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	if (!p->phy)
-		return -EOPNOTSUPP;
-
-	phy_ethtool_ksettings_get(p->phy, cmd);
-
-	return 0;
-}
-
-static int
-dsa_slave_set_link_ksettings(struct net_device *dev,
-			     const struct ethtool_link_ksettings *cmd)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	if (p->phy != NULL)
-		return phy_ethtool_ksettings_set(p->phy, cmd);
-
-	return -EOPNOTSUPP;
-}
 
 static void dsa_slave_get_drvinfo(struct net_device *dev,
 				  struct ethtool_drvinfo *drvinfo)
@@ -489,26 +450,14 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
 		ds->ops->get_regs(ds, p->dp->index, regs, _p);
 }
 
-static int dsa_slave_nway_reset(struct net_device *dev)
-{
-	struct dsa_slave_priv *p = netdev_priv(dev);
-
-	if (p->phy != NULL)
-		return genphy_restart_aneg(p->phy);
-
-	return -EOPNOTSUPP;
-}
-
 static u32 dsa_slave_get_link(struct net_device *dev)
 {
-	struct dsa_slave_priv *p = netdev_priv(dev);
+	if (!dev->phydev)
+		return -ENODEV;
 
-	if (p->phy != NULL) {
-		genphy_update_link(p->phy);
-		return p->phy->link;
-	}
+	genphy_update_link(dev->phydev);
 
-	return -EOPNOTSUPP;
+	return dev->phydev->link;
 }
 
 static int dsa_slave_get_eeprom_len(struct net_device *dev)
@@ -567,79 +516,6 @@ static void dsa_slave_get_strings(struct net_device *dev,
 	}
 }
 
-static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
-					   struct ethtool_stats *stats,
-					   uint64_t *data)
-{
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
-	struct dsa_switch *ds = cpu_dp->ds;
-	s8 cpu_port = cpu_dp->index;
-	int count = 0;
-
-	if (cpu_dp->ethtool_ops.get_sset_count) {
-		count = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS);
-		cpu_dp->ethtool_ops.get_ethtool_stats(dev, stats, data);
-	}
-
-	if (ds->ops->get_ethtool_stats)
-		ds->ops->get_ethtool_stats(ds, cpu_port, data + count);
-}
-
-static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
-{
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
-	struct dsa_switch *ds = cpu_dp->ds;
-	int count = 0;
-
-	if (cpu_dp->ethtool_ops.get_sset_count)
-		count += cpu_dp->ethtool_ops.get_sset_count(dev, sset);
-
-	if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
-		count += ds->ops->get_sset_count(ds);
-
-	return count;
-}
-
-static void dsa_cpu_port_get_strings(struct net_device *dev,
-				     uint32_t stringset, uint8_t *data)
-{
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
-	struct dsa_switch *ds = cpu_dp->ds;
-	s8 cpu_port = cpu_dp->index;
-	int len = ETH_GSTRING_LEN;
-	int mcount = 0, count;
-	unsigned int i;
-	uint8_t pfx[4];
-	uint8_t *ndata;
-
-	snprintf(pfx, sizeof(pfx), "p%.2d", cpu_port);
-	/* We do not want to be NULL-terminated, since this is a prefix */
-	pfx[sizeof(pfx) - 1] = '_';
-
-	if (cpu_dp->ethtool_ops.get_sset_count) {
-		mcount = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS);
-		cpu_dp->ethtool_ops.get_strings(dev, stringset, data);
-	}
-
-	if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
-		ndata = data + mcount * len;
-		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
-		 * the output after to prepend our CPU port prefix we
-		 * constructed earlier
-		 */
-		ds->ops->get_strings(ds, cpu_port, ndata);
-		count = ds->ops->get_sset_count(ds);
-		for (i = 0; i < count; i++) {
-			memmove(ndata + (i * len + sizeof(pfx)),
-				ndata + i * len, len - sizeof(pfx));
-			memcpy(ndata + i * len, pfx, sizeof(pfx));
-		}
-	}
-}
-
 static void dsa_slave_get_ethtool_stats(struct net_device *dev,
 					struct ethtool_stats *stats,
 					uint64_t *data)
@@ -716,7 +592,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 	int ret;
 
 	/* Port's PHY and MAC both need to be EEE capable */
-	if (!p->phy)
+	if (!dev->phydev)
 		return -ENODEV;
 
 	if (!ds->ops->set_mac_eee)
@@ -727,12 +603,12 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 		return ret;
 
 	if (e->eee_enabled) {
-		ret = phy_init_eee(p->phy, 0);
+		ret = phy_init_eee(dev->phydev, 0);
 		if (ret)
 			return ret;
 	}
 
-	return phy_ethtool_set_eee(p->phy, e);
+	return phy_ethtool_set_eee(dev->phydev, e);
 }
 
 static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
@@ -742,7 +618,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
 	int ret;
 
 	/* Port's PHY and MAC both need to be EEE capable */
-	if (!p->phy)
+	if (!dev->phydev)
 		return -ENODEV;
 
 	if (!ds->ops->get_mac_eee)
@@ -752,7 +628,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
 	if (ret)
 		return ret;
 
-	return phy_ethtool_get_eee(p->phy, e);
+	return phy_ethtool_get_eee(dev->phydev, e);
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -809,9 +685,9 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev,
 }
 
 static struct dsa_mall_tc_entry *
-dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p,
-			     unsigned long cookie)
+dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
 {
+	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_mall_tc_entry *mall_tc_entry;
 
 	list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
@@ -893,7 +769,7 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev,
 	if (!ds->ops->port_mirror_del)
 		return;
 
-	mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie);
+	mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie);
 	if (!mall_tc_entry)
 		return;
 
@@ -976,13 +852,6 @@ static void dsa_slave_get_stats64(struct net_device *dev,
 	}
 }
 
-void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops)
-{
-	ops->get_sset_count = dsa_cpu_port_get_sset_count;
-	ops->get_ethtool_stats = dsa_cpu_port_get_ethtool_stats;
-	ops->get_strings = dsa_cpu_port_get_strings;
-}
-
 static int dsa_slave_get_rxnfc(struct net_device *dev,
 			       struct ethtool_rxnfc *nfc, u32 *rule_locs)
 {
@@ -1011,7 +880,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_drvinfo		= dsa_slave_get_drvinfo,
 	.get_regs_len		= dsa_slave_get_regs_len,
 	.get_regs		= dsa_slave_get_regs,
-	.nway_reset		= dsa_slave_nway_reset,
+	.nway_reset		= phy_ethtool_nway_reset,
 	.get_link		= dsa_slave_get_link,
 	.get_eeprom_len		= dsa_slave_get_eeprom_len,
 	.get_eeprom		= dsa_slave_get_eeprom,
@@ -1023,8 +892,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_wol		= dsa_slave_get_wol,
 	.set_eee		= dsa_slave_set_eee,
 	.get_eee		= dsa_slave_get_eee,
-	.get_link_ksettings	= dsa_slave_get_link_ksettings,
-	.set_link_ksettings	= dsa_slave_set_link_ksettings,
+	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
+	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
 	.get_rxnfc		= dsa_slave_get_rxnfc,
 	.set_rxnfc		= dsa_slave_set_rxnfc,
 };
@@ -1068,26 +937,26 @@ static void dsa_slave_adjust_link(struct net_device *dev)
 	struct dsa_switch *ds = p->dp->ds;
 	unsigned int status_changed = 0;
 
-	if (p->old_link != p->phy->link) {
+	if (p->old_link != dev->phydev->link) {
 		status_changed = 1;
-		p->old_link = p->phy->link;
+		p->old_link = dev->phydev->link;
 	}
 
-	if (p->old_duplex != p->phy->duplex) {
+	if (p->old_duplex != dev->phydev->duplex) {
 		status_changed = 1;
-		p->old_duplex = p->phy->duplex;
+		p->old_duplex = dev->phydev->duplex;
 	}
 
-	if (p->old_pause != p->phy->pause) {
+	if (p->old_pause != dev->phydev->pause) {
 		status_changed = 1;
-		p->old_pause = p->phy->pause;
+		p->old_pause = dev->phydev->pause;
 	}
 
 	if (ds->ops->adjust_link && status_changed)
-		ds->ops->adjust_link(ds, p->dp->index, p->phy);
+		ds->ops->adjust_link(ds, p->dp->index, dev->phydev);
 
 	if (status_changed)
-		phy_print_status(p->phy);
+		phy_print_status(dev->phydev);
 }
 
 static int dsa_slave_fixed_link_update(struct net_device *dev,
@@ -1107,28 +976,28 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
 }
 
 /* slave device setup *******************************************************/
-static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
-				 struct net_device *slave_dev,
-				 int addr)
+static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
 {
+	struct dsa_slave_priv *p = netdev_priv(slave_dev);
 	struct dsa_switch *ds = p->dp->ds;
 
-	p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr);
-	if (!p->phy) {
+	slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
+	if (!slave_dev->phydev) {
 		netdev_err(slave_dev, "no phy at %d\n", addr);
 		return -ENODEV;
 	}
 
 	/* Use already configured phy mode */
 	if (p->phy_interface == PHY_INTERFACE_MODE_NA)
-		p->phy_interface = p->phy->interface;
-	return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
-				  p->phy_interface);
+		p->phy_interface = slave_dev->phydev->interface;
+
+	return phy_connect_direct(slave_dev, slave_dev->phydev,
+				  dsa_slave_adjust_link, p->phy_interface);
 }
 
-static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
-				struct net_device *slave_dev)
+static int dsa_slave_phy_setup(struct net_device *slave_dev)
 {
+	struct dsa_slave_priv *p = netdev_priv(slave_dev);
 	struct dsa_switch *ds = p->dp->ds;
 	struct device_node *phy_dn, *port_dn;
 	bool phy_is_fixed = false;
@@ -1168,30 +1037,31 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 		 */
 		if (!phy_is_fixed && phy_id >= 0 &&
 		    (ds->phys_mii_mask & (1 << phy_id))) {
-			ret = dsa_slave_phy_connect(p, slave_dev, phy_id);
+			ret = dsa_slave_phy_connect(slave_dev, phy_id);
 			if (ret) {
 				netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret);
 				of_node_put(phy_dn);
 				return ret;
 			}
 		} else {
-			p->phy = of_phy_connect(slave_dev, phy_dn,
-						dsa_slave_adjust_link,
-						phy_flags,
-						p->phy_interface);
+			slave_dev->phydev = of_phy_connect(slave_dev, phy_dn,
+							   dsa_slave_adjust_link,
+							   phy_flags,
+							   p->phy_interface);
 		}
 
 		of_node_put(phy_dn);
 	}
 
-	if (p->phy && phy_is_fixed)
-		fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update);
+	if (slave_dev->phydev && phy_is_fixed)
+		fixed_phy_set_link_update(slave_dev->phydev,
+					  dsa_slave_fixed_link_update);
 
 	/* We could not connect to a designated PHY, so use the switch internal
 	 * MDIO bus instead
 	 */
-	if (!p->phy) {
-		ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index);
+	if (!slave_dev->phydev) {
+		ret = dsa_slave_phy_connect(slave_dev, p->dp->index);
 		if (ret) {
 			netdev_err(slave_dev, "failed to connect to port %d: %d\n",
 				   p->dp->index, ret);
@@ -1201,7 +1071,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 		}
 	}
 
-	phy_attached_info(p->phy);
+	phy_attached_info(slave_dev->phydev);
 
 	return 0;
 }
@@ -1221,12 +1091,12 @@ int dsa_slave_suspend(struct net_device *slave_dev)
 
 	netif_device_detach(slave_dev);
 
-	if (p->phy) {
-		phy_stop(p->phy);
+	if (slave_dev->phydev) {
+		phy_stop(slave_dev->phydev);
 		p->old_pause = -1;
 		p->old_link = -1;
 		p->old_duplex = -1;
-		phy_suspend(p->phy);
+		phy_suspend(slave_dev->phydev);
 	}
 
 	return 0;
@@ -1234,13 +1104,11 @@ int dsa_slave_suspend(struct net_device *slave_dev)
 
 int dsa_slave_resume(struct net_device *slave_dev)
 {
-	struct dsa_slave_priv *p = netdev_priv(slave_dev);
-
 	netif_device_attach(slave_dev);
 
-	if (p->phy) {
-		phy_resume(p->phy);
-		phy_start(p->phy);
+	if (slave_dev->phydev) {
+		phy_resume(slave_dev->phydev);
+		phy_start(slave_dev->phydev);
 	}
 
 	return 0;
@@ -1249,7 +1117,6 @@ int dsa_slave_resume(struct net_device *slave_dev)
 int dsa_slave_create(struct dsa_port *port, const char *name)
 {
 	struct dsa_switch *ds = port->ds;
-	struct dsa_switch_tree *dst = ds->dst;
 	struct net_device *master;
 	struct net_device *slave_dev;
 	struct dsa_slave_priv *p;
@@ -1294,7 +1161,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name)
 	}
 	p->dp = port;
 	INIT_LIST_HEAD(&p->mall_tc_list);
-	p->xmit = dst->tag_ops->xmit;
+	p->xmit = cpu_dp->tag_ops->xmit;
 
 	p->old_pause = -1;
 	p->old_link = -1;
@@ -1304,7 +1171,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name)
 
 	netif_carrier_off(slave_dev);
 
-	ret = dsa_slave_phy_setup(p, slave_dev);
+	ret = dsa_slave_phy_setup(slave_dev);
 	if (ret) {
 		netdev_err(master, "error %d setting up slave phy\n", ret);
 		goto out_free;
@@ -1320,7 +1187,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name)
 	return 0;
 
 out_phy:
-	phy_disconnect(p->phy);
+	phy_disconnect(slave_dev->phydev);
 	if (of_phy_is_fixed_link(p->dp->dn))
 		of_phy_deregister_fixed_link(p->dp->dn);
 out_free:
@@ -1338,8 +1205,8 @@ void dsa_slave_destroy(struct net_device *slave_dev)
 	port_dn = p->dp->dn;
 
 	netif_carrier_off(slave_dev);
-	if (p->phy) {
-		phy_disconnect(p->phy);
+	if (slave_dev->phydev) {
+		phy_disconnect(slave_dev->phydev);
 
 		if (of_phy_is_fixed_link(port_dn))
 			of_phy_deregister_fixed_link(port_dn);
@@ -1379,7 +1246,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
-	if (dev->netdev_ops != &dsa_slave_netdev_ops)
+	if (!dsa_slave_dev_check(dev))
 		return NOTIFY_DONE;
 
 	if (event == NETDEV_CHANGEUPPER)
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index dbb016434ace..8e4bdb9d9ae3 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -92,9 +92,6 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev
 static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 				    struct packet_type *pt)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
-	struct dsa_switch *ds = cpu_dp->ds;
 	int source_port;
 	u8 *brcm_tag;
 
@@ -117,8 +114,8 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	/* Locate which port this is coming from */
 	source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
 
-	/* Validate port against switch setup, either the port is totally */
-	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
+	skb->dev = dsa_master_get_slave(dev, 0, source_port);
+	if (!skb->dev)
 		return NULL;
 
 	/* Remove Broadcom tag and update checksum */
@@ -129,8 +126,6 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 		skb->data - ETH_HLEN - BRCM_TAG_LEN,
 		2 * ETH_ALEN);
 
-	skb->dev = ds->ports[source_port].netdev;
-
 	return skb;
 }
 
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index fbf9ca954773..c77218f173d1 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -67,8 +67,6 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
 static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 			       struct packet_type *pt)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds;
 	u8 *dsa_header;
 	int source_device;
 	int source_port;
@@ -93,18 +91,8 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	source_device = dsa_header[0] & 0x1f;
 	source_port = (dsa_header[1] >> 3) & 0x1f;
 
-	/*
-	 * Check that the source device exists and that the source
-	 * port is a registered DSA port.
-	 */
-	if (source_device >= DSA_MAX_SWITCHES)
-		return NULL;
-
-	ds = dst->ds[source_device];
-	if (!ds)
-		return NULL;
-
-	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
+	skb->dev = dsa_master_get_slave(dev, source_device, source_port);
+	if (!skb->dev)
 		return NULL;
 
 	/*
@@ -153,8 +141,6 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
 			2 * ETH_ALEN);
 	}
 
-	skb->dev = ds->ports[source_port].netdev;
-
 	return skb;
 }
 
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 76367ba1b2e2..0b83cbe0c9e8 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -80,8 +80,6 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
 static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 				struct packet_type *pt)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds;
 	u8 *edsa_header;
 	int source_device;
 	int source_port;
@@ -106,18 +104,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 	source_device = edsa_header[0] & 0x1f;
 	source_port = (edsa_header[1] >> 3) & 0x1f;
 
-	/*
-	 * Check that the source device exists and that the source
-	 * port is a registered DSA port.
-	 */
-	if (source_device >= DSA_MAX_SWITCHES)
-		return NULL;
-
-	ds = dst->ds[source_device];
-	if (!ds)
-		return NULL;
-
-	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
+	skb->dev = dsa_master_get_slave(dev, source_device, source_port);
+	if (!skb->dev)
 		return NULL;
 
 	/*
@@ -172,8 +160,6 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
 			2 * ETH_ALEN);
 	}
 
-	skb->dev = ds->ports[source_port].netdev;
-
 	return skb;
 }
 
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 010ca0a336c4..b241c990cde0 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -80,22 +80,19 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
 static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev,
 			       struct packet_type *pt)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
-	struct dsa_switch *ds = cpu_dp->ds;
 	u8 *tag;
 	int source_port;
 
 	tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
 
 	source_port = tag[0] & 7;
-	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
+
+	skb->dev = dsa_master_get_slave(dev, 0, source_port);
+	if (!skb->dev)
 		return NULL;
 
 	pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN);
 
-	skb->dev = ds->ports[source_port].netdev;
-
 	return skb;
 }
 
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index 0b9826105e42..4f211e56c81f 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -71,17 +71,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
 			struct packet_type *pt)
 {
 	u16 *lan9303_tag;
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds;
 	unsigned int source_port;
 
-	ds = dst->ds[0];
-
-	if (unlikely(!ds)) {
-		dev_warn_ratelimited(&dev->dev, "Dropping packet, due to missing DSA switch device\n");
-		return NULL;
-	}
-
 	if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) {
 		dev_warn_ratelimited(&dev->dev,
 				     "Dropping packet, cannot pull\n");
@@ -103,16 +94,12 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	source_port = ntohs(lan9303_tag[1]) & 0x3;
 
-	if (source_port >= ds->num_ports) {
+	skb->dev = dsa_master_get_slave(dev, 0, source_port);
+	if (!skb->dev) {
 		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
 		return NULL;
 	}
 
-	if (!ds->ports[source_port].netdev) {
-		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid netdev or device\n");
-		return NULL;
-	}
-
 	/* remove the special VLAN tag between the MAC addresses
 	 * and the current ethertype field.
 	 */
@@ -120,9 +107,6 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
 	memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN),
 		2 * ETH_ALEN);
 
-	/* forward the packet to the dedicated interface */
-	skb->dev = ds->ports[source_port].netdev;
-
 	return skb;
 }
 
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index ec8ee5f43255..968586c5d40e 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -46,8 +46,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 				   struct packet_type *pt)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_switch *ds;
 	int port;
 	__be16 *phdr, hdr;
 
@@ -68,20 +66,12 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 		skb->data - ETH_HLEN - MTK_HDR_LEN,
 		2 * ETH_ALEN);
 
-	/* This protocol doesn't support cascading multiple
-	 * switches so it's safe to assume the switch is first
-	 * in the tree.
-	 */
-	ds = dst->ds[0];
-	if (!ds)
-		return NULL;
-
 	/* Get source port information */
 	port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK);
-	if (!ds->ports[port].netdev)
-		return NULL;
 
-	skb->dev = ds->ports[port].netdev;
+	skb->dev = dsa_master_get_slave(dev, 0, port);
+	if (!skb->dev)
+		return NULL;
 
 	return skb;
 }
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 1d4c70711c0f..8d33d9ebf910 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -65,9 +65,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
 static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 				   struct packet_type *pt)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
-	struct dsa_switch *ds;
 	u8 ver;
 	int port;
 	__be16 *phdr, hdr;
@@ -92,20 +89,12 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN,
 		ETH_HLEN - QCA_HDR_LEN);
 
-	/* This protocol doesn't support cascading multiple switches so it's
-	 * safe to assume the switch is first in the tree
-	 */
-	ds = cpu_dp->ds;
-	if (!ds)
-		return NULL;
-
 	/* Get source port information */
 	port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
-	if (!ds->ports[port].netdev)
-		return NULL;
 
-	/* Update skb & forward the frame accordingly */
-	skb->dev = ds->ports[port].netdev;
+	skb->dev = dsa_master_get_slave(dev, 0, port);
+	if (!skb->dev)
+		return NULL;
 
 	return skb;
 }
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index d2fd4923aa3e..61668be267f5 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -58,9 +58,6 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
 static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 				   struct packet_type *pt)
 {
-	struct dsa_switch_tree *dst = dev->dsa_ptr;
-	struct dsa_port *cpu_dp = dsa_get_cpu_port(dst);
-	struct dsa_switch *ds = cpu_dp->ds;
 	u8 *trailer;
 	int source_port;
 
@@ -73,13 +70,13 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
 		return NULL;
 
 	source_port = trailer[1] & 7;
-	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
+
+	skb->dev = dsa_master_get_slave(dev, 0, source_port);
+	if (!skb->dev)
 		return NULL;
 
 	pskb_trim_rcsum(skb, skb->len - 4);
 
-	skb->dev = ds->ports[source_port].netdev;
-
 	return skb;
 }
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e31108e5ef79..43a1bbed7a42 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog)
 {
 	struct sock *sk = sock->sk;
 	unsigned char old_state;
-	int err;
+	int err, tcp_fastopen;
 
 	lock_sock(sk);
 
@@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog)
 		 * because the socket was in TCP_LISTEN state previously but
 		 * was shutdown() rather than close().
 		 */
-		if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
-		    (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+		tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+		if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
+		    (tcp_fastopen & TFO_SERVER_ENABLE) &&
 		    !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
 			fastopen_queue_tune(sk, backlog);
-			tcp_fastopen_init_key_once(true);
+			tcp_fastopen_init_key_once(sock_net(sk));
 		}
 
 		err = inet_csk_listen_start(sk, backlog);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7adc0616599..7ce22a2c07ce 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
  */
 struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
-	u32 hash = inet_addr_hash(net, addr);
 	struct net_device *result = NULL;
 	struct in_ifaddr *ifa;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) {
-		if (ifa->ifa_local == addr) {
-			struct net_device *dev = ifa->ifa_dev->dev;
-
-			if (!net_eq(dev_net(dev), net))
-				continue;
-			result = dev;
-			break;
-		}
-	}
-	if (!result) {
+	ifa = inet_lookup_ifaddr_rcu(net, addr);
+	if (!ifa) {
 		struct flowi4 fl4 = { .daddr = addr };
 		struct fib_result res = { 0 };
 		struct fib_table *local;
@@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 		    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
 		    res.type == RTN_LOCAL)
 			result = FIB_RES_DEV(res);
+	} else {
+		result = ifa->ifa_dev->dev;
 	}
 	if (result && devref)
 		dev_hold(result);
@@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 }
 EXPORT_SYMBOL(__ip_dev_find);
 
+/* called under RCU lock */
+struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr)
+{
+	u32 hash = inet_addr_hash(net, addr);
+	struct in_ifaddr *ifa;
+
+	hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash)
+		if (ifa->ifa_local == addr &&
+		    net_eq(dev_net(ifa->ifa_dev->dev), net))
+			return ifa;
+
+	return NULL;
+}
+
 static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 37819ab4cc74..f02819134ba2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -345,9 +345,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	if (res.type != RTN_UNICAST &&
 	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 		goto e_inval;
-	if (!rpf && !fib_num_tclassid_users(net) &&
-	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
-		goto last_resort;
 	fib_combine_itag(itag, &res);
 	dev_match = false;
 
@@ -402,13 +399,26 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			struct in_device *idev, u32 *itag)
 {
 	int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
+	struct net *net = dev_net(dev);
 
-	if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
-	    IN_DEV_ACCEPT_LOCAL(idev) &&
+	if (!r && !fib_num_tclassid_users(net) &&
 	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
+		if (IN_DEV_ACCEPT_LOCAL(idev))
+			goto ok;
+		/* if no local routes are added from user space we can check
+		 * for local addresses looking-up the ifaddr table
+		 */
+		if (net->ipv4.fib_has_custom_local_routes)
+			goto full_check;
+		if (inet_lookup_ifaddr_rcu(net, src))
+			return -EINVAL;
+
+ok:
 		*itag = 0;
 		return 0;
 	}
+
+full_check:
 	return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
 }
 
@@ -759,6 +769,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	}
 
 	err = fib_table_insert(net, tb, &cfg, extack);
+	if (!err && cfg.fc_type == RTN_LOCAL)
+		net->ipv4.fib_has_custom_local_routes = true;
 errout:
 	return err;
 }
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 57a5d48acee8..467b3c15395f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi)
 		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
 	} endfor_nexthops(fi);
 }
-
-static inline void fib_add_weight(struct fib_info *fi,
-				  const struct fib_nh *nh)
-{
-	fi->fib_weight += nh->nh_weight;
-}
-
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 
 #define fib_rebalance(fi) do { } while (0)
-#define fib_add_weight(fi, nh) do { } while (0)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
@@ -774,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
  *					|
  *					|-> {local prefix} (terminal node)
  */
-static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
-			struct fib_nh *nh, struct netlink_ext_ack *extack)
+static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
+			struct netlink_ext_ack *extack)
 {
 	int err = 0;
 	struct net *net;
@@ -1258,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 		int linkdown = 0;
 
 		change_nexthops(fi) {
-			err = fib_check_nh(cfg, fi, nexthop_nh, extack);
+			err = fib_check_nh(cfg, nexthop_nh, extack);
 			if (err != 0)
 				goto failure;
 			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
@@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 
 	change_nexthops(fi) {
 		fib_info_update_nh_saddr(net, nexthop_nh);
-		fib_add_weight(fi, nexthop_nh);
 	} endfor_nexthops(fi)
 
 	fib_rebalance(fi);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c039c937ba90..8a91ebbf0c01 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -39,11 +39,11 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
  *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
  *                          and 0.0.0.0 equals to 0.0.0.0 only
  */
-static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
-				const struct in6_addr *sk2_rcv_saddr6,
-				__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
-				bool sk1_ipv6only, bool sk2_ipv6only,
-				bool match_wildcard)
+static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
+				 const struct in6_addr *sk2_rcv_saddr6,
+				 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				 bool sk1_ipv6only, bool sk2_ipv6only,
+				 bool match_wildcard)
 {
 	int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
 	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -52,29 +52,29 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
 		if (!sk2_ipv6only) {
 			if (sk1_rcv_saddr == sk2_rcv_saddr)
-				return 1;
+				return true;
 			if (!sk1_rcv_saddr || !sk2_rcv_saddr)
 				return match_wildcard;
 		}
-		return 0;
+		return false;
 	}
 
 	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
-		return 1;
+		return true;
 
 	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
 	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
-		return 1;
+		return true;
 
 	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
 	    !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
-		return 1;
+		return true;
 
 	if (sk2_rcv_saddr6 &&
 	    ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 #endif
 
@@ -82,20 +82,20 @@ static int ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
  * match_wildcard == false: addresses must be exactly the same, i.e.
  *                          0.0.0.0 only equals to 0.0.0.0
  */
-static int ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
-				bool sk2_ipv6only, bool match_wildcard)
+static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
+				 bool sk2_ipv6only, bool match_wildcard)
 {
 	if (!sk2_ipv6only) {
 		if (sk1_rcv_saddr == sk2_rcv_saddr)
-			return 1;
+			return true;
 		if (!sk1_rcv_saddr || !sk2_rcv_saddr)
 			return match_wildcard;
 	}
-	return 0;
+	return false;
 }
 
-int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
-			 bool match_wildcard)
+bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+			  bool match_wildcard)
 {
 #if IS_ENABLED(CONFIG_IPV6)
 	if (sk->sk_family == AF_INET6)
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index b20c8ac64081..914d56928578 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow);
 
 void inetpeer_invalidate_tree(struct inet_peer_base *base)
 {
-	struct inet_peer *p, *n;
+	struct rb_node *p = rb_first(&base->rb_root);
 
-	rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) {
-		inet_putpeer(p);
+	while (p) {
+		struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node);
+
+		p = rb_next(p);
+		rb_erase(&peer->rb_node, &base->rb_root);
+		inet_putpeer(peer);
 		cond_resched();
 	}
 
-	base->rb_root = RB_ROOT;
 	base->total = 0;
 }
 EXPORT_SYMBOL(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 467e44d7587d..dc2317776499 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1011,15 +1011,14 @@ static int __net_init ipgre_init_net(struct net *net)
 	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
 }
 
-static void __net_exit ipgre_exit_net(struct net *net)
+static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
 {
-	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
-	ip_tunnel_delete_net(itn, &ipgre_link_ops);
+	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
 }
 
 static struct pernet_operations ipgre_net_ops = {
 	.init = ipgre_init_net,
-	.exit = ipgre_exit_net,
+	.exit_batch = ipgre_exit_batch_net,
 	.id   = &ipgre_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
@@ -1542,15 +1541,14 @@ static int __net_init ipgre_tap_init_net(struct net *net)
 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
 }
 
-static void __net_exit ipgre_tap_exit_net(struct net *net)
+static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
 {
-	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
-	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
+	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
 }
 
 static struct pernet_operations ipgre_tap_net_ops = {
 	.init = ipgre_tap_init_net,
-	.exit = ipgre_tap_exit_net,
+	.exit_batch = ipgre_tap_exit_batch_net,
 	.id   = &gre_tap_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
@@ -1561,16 +1559,14 @@ static int __net_init erspan_init_net(struct net *net)
 				  &erspan_link_ops, "erspan0");
 }
 
-static void __net_exit erspan_exit_net(struct net *net)
+static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
 {
-	struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);
-
-	ip_tunnel_delete_net(itn, &erspan_link_ops);
+	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
 }
 
 static struct pernet_operations erspan_net_ops = {
 	.init = erspan_init_net,
-	.exit = erspan_exit_net,
+	.exit_batch = erspan_exit_batch_net,
 	.id   = &erspan_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index e9805ad664ac..fe6fee728ce4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
 	}
 }
 
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
+void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
+			   struct rtnl_link_ops *ops)
 {
+	struct ip_tunnel_net *itn;
+	struct net *net;
 	LIST_HEAD(list);
 
 	rtnl_lock();
-	ip_tunnel_destroy(itn, &list, ops);
+	list_for_each_entry(net, net_list, exit_list) {
+		itn = net_generic(net, id);
+		ip_tunnel_destroy(itn, &list, ops);
+	}
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
 }
-EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
 
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p, __u32 fwmark)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 89453cf62158..58465c0a8682 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -453,15 +453,14 @@ static int __net_init vti_init_net(struct net *net)
 	return 0;
 }
 
-static void __net_exit vti_exit_net(struct net *net)
+static void __net_exit vti_exit_batch_net(struct list_head *list_net)
 {
-	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
-	ip_tunnel_delete_net(itn, &vti_link_ops);
+	ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops);
 }
 
 static struct pernet_operations vti_net_ops = {
 	.init = vti_init_net,
-	.exit = vti_exit_net,
+	.exit_batch = vti_exit_batch_net,
 	.id   = &vti_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index fb1ad22b5e29..1e47818e38c7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -634,15 +634,14 @@ static int __net_init ipip_init_net(struct net *net)
 	return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
 }
 
-static void __net_exit ipip_exit_net(struct net *net)
+static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
 {
-	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
-	ip_tunnel_delete_net(itn, &ipip_link_ops);
+	ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
 }
 
 static struct pernet_operations ipip_net_ops = {
 	.init = ipip_init_net,
-	.exit = ipip_exit_net,
+	.exit_batch = ipip_exit_batch_net,
 	.id   = &ipip_net_id,
 	.size = sizeof(struct ip_tunnel_net),
 };
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c9b3e6e069ae..b3ee01b0551b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -67,6 +67,7 @@
 #include <net/fib_rules.h>
 #include <linux/netconf.h>
 #include <net/nexthop.h>
+#include <net/switchdev.h>
 
 struct ipmr_rule {
 	struct fib_rule		common;
@@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 	fib_rules_unregister(net->ipv4.mr_rules_ops);
 	rtnl_unlock();
 }
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+	return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
 #else
 #define ipmr_for_each_table(mrt, net) \
 	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
@@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 	net->ipv4.mrt = NULL;
 	rtnl_unlock();
 }
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+	return 0;
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+	return 0;
+}
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return true;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
 #endif
 
 static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -587,6 +620,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 }
 #endif
 
+static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
+					struct net *net,
+					enum fib_event_type event_type,
+					struct vif_device *vif,
+					vifi_t vif_index, u32 tb_id)
+{
+	struct vif_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.dev = vif->dev,
+		.vif_index = vif_index,
+		.vif_flags = vif->flags,
+		.tb_id = tb_id,
+	};
+
+	return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_ipmr_vif_entry_notifiers(struct net *net,
+					 enum fib_event_type event_type,
+					 struct vif_device *vif,
+					 vifi_t vif_index, u32 tb_id)
+{
+	struct vif_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.dev = vif->dev,
+		.vif_index = vif_index,
+		.vif_flags = vif->flags,
+		.tb_id = tb_id,
+	};
+
+	ASSERT_RTNL();
+	net->ipv4.ipmr_seq++;
+	return call_fib_notifiers(net, event_type, &info.info);
+}
+
+static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
+					struct net *net,
+					enum fib_event_type event_type,
+					struct mfc_cache *mfc, u32 tb_id)
+{
+	struct mfc_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.mfc = mfc,
+		.tb_id = tb_id
+	};
+
+	return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_ipmr_mfc_entry_notifiers(struct net *net,
+					 enum fib_event_type event_type,
+					 struct mfc_cache *mfc, u32 tb_id)
+{
+	struct mfc_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.mfc = mfc,
+		.tb_id = tb_id
+	};
+
+	ASSERT_RTNL();
+	net->ipv4.ipmr_seq++;
+	return call_fib_notifiers(net, event_type, &info.info);
+}
+
 /**
  *	vif_delete - Delete a VIF entry
  *	@notify: Set to 1, if the caller is a notifier_call
@@ -594,6 +703,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 		      struct list_head *head)
 {
+	struct net *net = read_pnet(&mrt->net);
 	struct vif_device *v;
 	struct net_device *dev;
 	struct in_device *in_dev;
@@ -603,6 +713,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 
 	v = &mrt->vif_table[vifi];
 
+	if (VIF_EXISTS(mrt, vifi))
+		call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
+					      mrt->id);
+
 	write_lock_bh(&mrt_lock);
 	dev = v->dev;
 	v->dev = NULL;
@@ -652,10 +766,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head)
 	kmem_cache_free(mrt_cachep, c);
 }
 
-static inline void ipmr_cache_free(struct mfc_cache *c)
+void ipmr_cache_free(struct mfc_cache *c)
 {
 	call_rcu(&c->rcu, ipmr_cache_free_rcu);
 }
+EXPORT_SYMBOL(ipmr_cache_free);
 
 /* Destroy an unresolved cache entry, killing queued skbs
  * and reporting error to netlink readers.
@@ -754,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 		   struct vifctl *vifc, int mrtsock)
 {
 	int vifi = vifc->vifc_vifi;
+	struct switchdev_attr attr = {
+		.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
+	};
 	struct vif_device *v = &mrt->vif_table[vifi];
 	struct net_device *dev;
 	struct in_device *in_dev;
@@ -828,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 
 	/* Fill in the VIF structures */
 
+	attr.orig_dev = dev;
+	if (!switchdev_port_attr_get(dev, &attr)) {
+		memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len);
+		v->dev_parent_id.id_len = attr.u.ppid.id_len;
+	} else {
+		v->dev_parent_id.id_len = 0;
+	}
 	v->rate_limit = vifc->vifc_rate_limit;
 	v->local = vifc->vifc_lcl_addr.s_addr;
 	v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -851,6 +976,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	if (vifi+1 > mrt->maxvif)
 		mrt->maxvif = vifi+1;
 	write_unlock_bh(&mrt_lock);
+	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
 	return 0;
 }
 
@@ -949,6 +1075,7 @@ static struct mfc_cache *ipmr_cache_alloc(void)
 	if (c) {
 		c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
 		c->mfc_un.res.minvif = MAXVIFS;
+		refcount_set(&c->mfc_un.res.refcount, 1);
 	}
 	return c;
 }
@@ -1150,6 +1277,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 {
+	struct net *net = read_pnet(&mrt->net);
 	struct mfc_cache *c;
 
 	/* The entries are added/deleted only under RTNL */
@@ -1161,8 +1289,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 		return -ENOENT;
 	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
 	list_del_rcu(&c->list);
+	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
 	mroute_netlink_event(mrt, c, RTM_DELROUTE);
-	ipmr_cache_free(c);
+	ipmr_cache_put(c);
 
 	return 0;
 }
@@ -1189,6 +1318,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 		if (!mrtsock)
 			c->mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
+		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
+					      mrt->id);
 		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
 		return 0;
 	}
@@ -1238,6 +1369,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 		ipmr_cache_resolve(net, mrt, uc, c);
 		ipmr_cache_free(uc);
 	}
+	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
 	mroute_netlink_event(mrt, c, RTM_NEWROUTE);
 	return 0;
 }
@@ -1245,6 +1377,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 /* Close the multicast socket, and clear the vif tables etc */
 static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
+	struct net *net = read_pnet(&mrt->net);
 	struct mfc_cache *c, *tmp;
 	LIST_HEAD(list);
 	int i;
@@ -1263,8 +1396,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 			continue;
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
 		list_del_rcu(&c->list);
+		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c,
+					      mrt->id);
 		mroute_netlink_event(mrt, c, RTM_DELROUTE);
-		ipmr_cache_free(c);
+		ipmr_cache_put(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
@@ -1724,10 +1859,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
 	return dst_output(net, sk, skb);
 }
 
+#ifdef CONFIG_NET_SWITCHDEV
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+				   int in_vifi, int out_vifi)
+{
+	struct vif_device *out_vif = &mrt->vif_table[out_vifi];
+	struct vif_device *in_vif = &mrt->vif_table[in_vifi];
+
+	if (!skb->offload_mr_fwd_mark)
+		return false;
+	if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
+		return false;
+	return netdev_phys_item_id_same(&out_vif->dev_parent_id,
+					&in_vif->dev_parent_id);
+}
+#else
+static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
+				   int in_vifi, int out_vifi)
+{
+	return false;
+}
+#endif
+
 /* Processing handlers for ipmr_forward */
 
 static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
-			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
+			    int in_vifi, struct sk_buff *skb,
+			    struct mfc_cache *c, int vifi)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct vif_device *vif = &mrt->vif_table[vifi];
@@ -1748,6 +1906,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 		goto out_free;
 	}
 
+	if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
+		goto out_free;
+
 	if (vif->flags & VIFF_TUNNEL) {
 		rt = ip_route_output_ports(net, &fl4, NULL,
 					   vif->remote, vif->local,
@@ -1925,8 +2086,8 @@ forward:
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
 				if (skb2)
-					ipmr_queue_xmit(net, mrt, skb2, cache,
-							psend);
+					ipmr_queue_xmit(net, mrt, true_vifi,
+							skb2, cache, psend);
 			}
 			psend = ct;
 		}
@@ -1937,9 +2098,10 @@ last_forward:
 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
 			if (skb2)
-				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
+				ipmr_queue_xmit(net, mrt, true_vifi, skb2,
+						cache, psend);
 		} else {
-			ipmr_queue_xmit(net, mrt, skb, cache, psend);
+			ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend);
 			return;
 		}
 	}
@@ -2156,6 +2318,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
 		return -EMSGSIZE;
 
+	if (c->mfc_flags & MFC_OFFLOAD)
+		rtm->rtm_flags |= RTNH_F_OFFLOAD;
+
 	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
 		return -EMSGSIZE;
 
@@ -3048,14 +3213,87 @@ static const struct net_protocol pim_protocol = {
 };
 #endif
 
+static unsigned int ipmr_seq_read(struct net *net)
+{
+	ASSERT_RTNL();
+
+	return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
+}
+
+static int ipmr_dump(struct net *net, struct notifier_block *nb)
+{
+	struct mr_table *mrt;
+	int err;
+
+	err = ipmr_rules_dump(net, nb);
+	if (err)
+		return err;
+
+	ipmr_for_each_table(mrt, net) {
+		struct vif_device *v = &mrt->vif_table[0];
+		struct mfc_cache *mfc;
+		int vifi;
+
+		/* Notifiy on table VIF entries */
+		read_lock(&mrt_lock);
+		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
+			if (!v->dev)
+				continue;
+
+			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
+						     v, vifi, mrt->id);
+		}
+		read_unlock(&mrt_lock);
+
+		/* Notify on table MFC entries */
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+			call_ipmr_mfc_entry_notifier(nb, net,
+						     FIB_EVENT_ENTRY_ADD, mfc,
+						     mrt->id);
+	}
+
+	return 0;
+}
+
+static const struct fib_notifier_ops ipmr_notifier_ops_template = {
+	.family		= RTNL_FAMILY_IPMR,
+	.fib_seq_read	= ipmr_seq_read,
+	.fib_dump	= ipmr_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __net_init ipmr_notifier_init(struct net *net)
+{
+	struct fib_notifier_ops *ops;
+
+	net->ipv4.ipmr_seq = 0;
+
+	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+	net->ipv4.ipmr_notifier_ops = ops;
+
+	return 0;
+}
+
+static void __net_exit ipmr_notifier_exit(struct net *net)
+{
+	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
+	net->ipv4.ipmr_notifier_ops = NULL;
+}
+
 /* Setup for IP multicast routing */
 static int __net_init ipmr_net_init(struct net *net)
 {
 	int err;
 
+	err = ipmr_notifier_init(net);
+	if (err)
+		goto ipmr_notifier_fail;
+
 	err = ipmr_rules_init(net);
 	if (err < 0)
-		goto fail;
+		goto ipmr_rules_fail;
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
@@ -3072,7 +3310,9 @@ proc_cache_fail:
 proc_vif_fail:
 	ipmr_rules_exit(net);
 #endif
-fail:
+ipmr_rules_fail:
+	ipmr_notifier_exit(net);
+ipmr_notifier_fail:
 	return err;
 }
 
@@ -3082,6 +3322,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
 	remove_proc_entry("ip_mr_cache", net->proc_net);
 	remove_proc_entry("ip_mr_vif", net->proc_net);
 #endif
+	ipmr_notifier_exit(net);
 	ipmr_rules_exit(net);
 }
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0d3c038d7b04..cac8dd309f39 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -251,10 +251,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
 	return ret;
 }
 
-static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
+static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos)
 {
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_tcp_fastopen);
 	struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
 	struct tcp_fastopen_context *ctxt;
 	int ret;
@@ -265,7 +267,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
 		return -ENOMEM;
 
 	rcu_read_lock();
-	ctxt = rcu_dereference(tcp_fastopen_ctx);
+	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
 	if (ctxt)
 		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
 	else
@@ -282,12 +284,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
 			ret = -EINVAL;
 			goto bad_key;
 		}
-		/* Generate a dummy secret but don't publish it. This
-		 * is needed so we don't regenerate a new key on the
-		 * first invocation of tcp_fastopen_cookie_gen
-		 */
-		tcp_fastopen_init_key_once(false);
-		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+		tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH);
 	}
 
 bad_key:
@@ -358,11 +355,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
 					     void __user *buffer,
 					     size_t *lenp, loff_t *ppos)
 {
+	struct net *net = container_of(table->data, struct net,
+	    ipv4.sysctl_tcp_fastopen_blackhole_timeout);
 	int ret;
 
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (write && ret == 0)
-		tcp_fastopen_active_timeout_reset();
+		atomic_set(&net->ipv4.tfo_active_disable_times, 0);
 
 	return ret;
 }
@@ -401,27 +400,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.procname	= "tcp_fastopen",
-		.data		= &sysctl_tcp_fastopen,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "tcp_fastopen_key",
-		.mode		= 0600,
-		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
-		.proc_handler	= proc_tcp_fastopen_key,
-	},
-	{
-		.procname	= "tcp_fastopen_blackhole_timeout_sec",
-		.data		= &sysctl_tcp_fastopen_blackhole_timeout,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_tfo_blackhole_detect_timeout,
-		.extra1		= &zero,
-	},
-	{
 		.procname	= "tcp_abort_on_overflow",
 		.data		= &sysctl_tcp_abort_on_overflow,
 		.maxlen		= sizeof(int),
@@ -1085,6 +1063,28 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_fastopen",
+		.data		= &init_net.ipv4.sysctl_tcp_fastopen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_fastopen_key",
+		.mode		= 0600,
+		.data		= &init_net.ipv4.sysctl_tcp_fastopen,
+		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+		.proc_handler	= proc_tcp_fastopen_key,
+	},
+	{
+		.procname	= "tcp_fastopen_blackhole_timeout_sec",
+		.data		= &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_tfo_blackhole_detect_timeout,
+		.extra1		= &zero,
+	},
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	{
 		.procname	= "fib_multipath_use_neigh",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5091402720ab..23225c98d287 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1126,7 +1126,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	struct sockaddr *uaddr = msg->msg_name;
 	int err, flags;
 
-	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+	if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
 	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
 	     uaddr->sa_family == AF_UNSPEC))
 		return -EOPNOTSUPP;
@@ -2749,7 +2749,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_FASTOPEN:
 		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
 		    TCPF_LISTEN))) {
-			tcp_fastopen_init_key_once(true);
+			tcp_fastopen_init_key_once(net);
 
 			fastopen_queue_tune(sk, val);
 		} else {
@@ -2759,7 +2759,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 	case TCP_FASTOPEN_CONNECT:
 		if (val > 1 || val < 0) {
 			err = -EINVAL;
-		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+		} else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
 			if (sk->sk_state == TCP_CLOSE)
 				tp->fastopen_connect = val;
 			else
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index e3c33220c418..de470e7e586f 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -9,15 +9,18 @@
 #include <net/inetpeer.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
-
-struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
-
-static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
-
-void tcp_fastopen_init_key_once(bool publish)
+void tcp_fastopen_init_key_once(struct net *net)
 {
-	static u8 key[TCP_FASTOPEN_KEY_LENGTH];
+	u8 key[TCP_FASTOPEN_KEY_LENGTH];
+	struct tcp_fastopen_context *ctxt;
+
+	rcu_read_lock();
+	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
+	if (ctxt) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
 
 	/* tcp_fastopen_reset_cipher publishes the new context
 	 * atomically, so we allow this race happening here.
@@ -25,8 +28,8 @@ void tcp_fastopen_init_key_once(bool publish)
 	 * All call sites of tcp_fastopen_cookie_gen also check
 	 * for a valid cookie, so this is an acceptable risk.
 	 */
-	if (net_get_random_once(key, sizeof(key)) && publish)
-		tcp_fastopen_reset_cipher(key, sizeof(key));
+	get_random_bytes(key, sizeof(key));
+	tcp_fastopen_reset_cipher(net, key, sizeof(key));
 }
 
 static void tcp_fastopen_ctx_free(struct rcu_head *head)
@@ -37,7 +40,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head)
 	kfree(ctx);
 }
 
-int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+void tcp_fastopen_ctx_destroy(struct net *net)
+{
+	struct tcp_fastopen_context *ctxt;
+
+	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
+
+	ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
+				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
+	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+
+	if (ctxt)
+		call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
+}
+
+int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len)
 {
 	int err;
 	struct tcp_fastopen_context *ctx, *octx;
@@ -61,26 +79,27 @@ error:		kfree(ctx);
 	}
 	memcpy(ctx->key, key, len);
 
-	spin_lock(&tcp_fastopen_ctx_lock);
+	spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
 
-	octx = rcu_dereference_protected(tcp_fastopen_ctx,
-				lockdep_is_held(&tcp_fastopen_ctx_lock));
-	rcu_assign_pointer(tcp_fastopen_ctx, ctx);
-	spin_unlock(&tcp_fastopen_ctx_lock);
+	octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
+				lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
+	rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
+	spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
 
 	if (octx)
 		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
 	return err;
 }
 
-static bool __tcp_fastopen_cookie_gen(const void *path,
+static bool __tcp_fastopen_cookie_gen(struct net *net,
+				      const void *path,
 				      struct tcp_fastopen_cookie *foc)
 {
 	struct tcp_fastopen_context *ctx;
 	bool ok = false;
 
 	rcu_read_lock();
-	ctx = rcu_dereference(tcp_fastopen_ctx);
+	ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
 	if (ctx) {
 		crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
 		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
@@ -96,7 +115,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path,
  *
  * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
  */
-static bool tcp_fastopen_cookie_gen(struct request_sock *req,
+static bool tcp_fastopen_cookie_gen(struct net *net,
+				    struct request_sock *req,
 				    struct sk_buff *syn,
 				    struct tcp_fastopen_cookie *foc)
 {
@@ -104,7 +124,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
 		const struct iphdr *iph = ip_hdr(syn);
 
 		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
-		return __tcp_fastopen_cookie_gen(path, foc);
+		return __tcp_fastopen_cookie_gen(net, path, foc);
 	}
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -112,13 +132,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
 		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
 		struct tcp_fastopen_cookie tmp;
 
-		if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
+		if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) {
 			struct in6_addr *buf = &tmp.addr;
 			int i;
 
 			for (i = 0; i < 4; i++)
 				buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
-			return __tcp_fastopen_cookie_gen(buf, foc);
+			return __tcp_fastopen_cookie_gen(net, buf, foc);
 		}
 	}
 #endif
@@ -279,25 +299,26 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct request_sock *req,
 			      struct tcp_fastopen_cookie *foc)
 {
-	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
 	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
+	int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
 	struct sock *child;
 
 	if (foc->len == 0) /* Client requests a cookie */
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
 
-	if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
+	if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
 	      (syn_data || foc->len >= 0) &&
 	      tcp_fastopen_queue_check(sk))) {
 		foc->len = -1;
 		return NULL;
 	}
 
-	if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
+	if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
 		goto fastopen;
 
 	if (foc->len >= 0 &&  /* Client presents or requests a cookie */
-	    tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
+	    tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) &&
 	    foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
 	    foc->len == valid_foc.len &&
 	    !memcmp(foc->val, valid_foc.val, foc->len)) {
@@ -347,7 +368,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 		return false;
 	}
 
-	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+	if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
 		cookie->len = -1;
 		return true;
 	}
@@ -401,25 +422,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect);
  * TFO connection with data exchanges.
  */
 
-/* Default to 1hr */
-unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60;
-static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0);
-static unsigned long tfo_active_disable_stamp __read_mostly;
-
 /* Disable active TFO and record current jiffies and
  * tfo_active_disable_times
  */
 void tcp_fastopen_active_disable(struct sock *sk)
 {
-	atomic_inc(&tfo_active_disable_times);
-	tfo_active_disable_stamp = jiffies;
-	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE);
-}
+	struct net *net = sock_net(sk);
 
-/* Reset tfo_active_disable_times to 0 */
-void tcp_fastopen_active_timeout_reset(void)
-{
-	atomic_set(&tfo_active_disable_times, 0);
+	atomic_inc(&net->ipv4.tfo_active_disable_times);
+	net->ipv4.tfo_active_disable_stamp = jiffies;
+	NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
 }
 
 /* Calculate timeout for tfo active disable
@@ -428,17 +440,18 @@ void tcp_fastopen_active_timeout_reset(void)
  */
 bool tcp_fastopen_active_should_disable(struct sock *sk)
 {
-	int tfo_da_times = atomic_read(&tfo_active_disable_times);
-	int multiplier;
+	unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout;
+	int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
 	unsigned long timeout;
+	int multiplier;
 
 	if (!tfo_da_times)
 		return false;
 
 	/* Limit timout to max: 2^6 * initial timeout */
 	multiplier = 1 << min(tfo_da_times - 1, 6);
-	timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ;
-	if (time_before(jiffies, tfo_active_disable_stamp + timeout))
+	timeout = multiplier * tfo_bh_timeout * HZ;
+	if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout))
 		return true;
 
 	/* Mark check bit so we can check for successful active TFO
@@ -474,10 +487,10 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 			}
 		}
 	} else if (tp->syn_fastopen_ch &&
-		   atomic_read(&tfo_active_disable_times)) {
+		   atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
 		dst = sk_dst_get(sk);
 		if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
-			tcp_fastopen_active_timeout_reset();
+			atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
 		dst_release(dst);
 	}
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5d7656beeee..db9bb46b5776 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2333,9 +2333,9 @@ static bool tcp_any_retrans_done(const struct sock *sk)
 	return false;
 }
 
-#if FASTRETRANS_DEBUG > 1
 static void DBGUNDO(struct sock *sk, const char *msg)
 {
+#if FASTRETRANS_DEBUG > 1
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_sock *inet = inet_sk(sk);
 
@@ -2357,10 +2357,8 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 			 tp->packets_out);
 	}
 #endif
-}
-#else
-#define DBGUNDO(x...) do { } while (0)
 #endif
+}
 
 static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 {
@@ -4268,11 +4266,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
 	tp->rx_opt.num_sacks = num_sacks;
 }
 
-enum tcp_queue {
-	OOO_QUEUE,
-	RCV_QUEUE,
-};
-
 /**
  * tcp_try_coalesce - try to merge skb to prior one
  * @sk: socket
@@ -4288,7 +4281,6 @@ enum tcp_queue {
  * Returns true if caller should free @from instead of queueing it
  */
 static bool tcp_try_coalesce(struct sock *sk,
-			     enum tcp_queue dest,
 			     struct sk_buff *to,
 			     struct sk_buff *from,
 			     bool *fragstolen)
@@ -4313,10 +4305,7 @@ static bool tcp_try_coalesce(struct sock *sk,
 
 	if (TCP_SKB_CB(from)->has_rxtstamp) {
 		TCP_SKB_CB(to)->has_rxtstamp = true;
-		if (dest == OOO_QUEUE)
-			TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp;
-		else
-			to->tstamp = from->tstamp;
+		to->tstamp = from->tstamp;
 	}
 
 	return true;
@@ -4353,9 +4342,6 @@ static void tcp_ofo_queue(struct sock *sk)
 		}
 		p = rb_next(p);
 		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
-		/* Replace tstamp which was stomped by rbnode */
-		if (TCP_SKB_CB(skb)->has_rxtstamp)
-			skb->tstamp = TCP_SKB_CB(skb)->swtstamp;
 
 		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
@@ -4367,8 +4353,7 @@ static void tcp_ofo_queue(struct sock *sk)
 			   TCP_SKB_CB(skb)->end_seq);
 
 		tail = skb_peek_tail(&sk->sk_receive_queue);
-		eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE,
-						 tail, skb, &fragstolen);
+		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
 		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
 		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
 		if (!eaten)
@@ -4422,10 +4407,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		return;
 	}
 
-	/* Stash tstamp to avoid being stomped on by rbnode */
-	if (TCP_SKB_CB(skb)->has_rxtstamp)
-		TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
-
 	/* Disable header prediction. */
 	tp->pred_flags = 0;
 	inet_csk_schedule_ack(sk);
@@ -4453,7 +4434,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	/* In the typical case, we are adding an skb to the end of the list.
 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
 	 */
-	if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb,
+	if (tcp_try_coalesce(sk, tp->ooo_last_skb,
 			     skb, &fragstolen)) {
 coalesce_done:
 		tcp_grow_window(sk, skb);
@@ -4504,7 +4485,7 @@ coalesce_done:
 				__kfree_skb(skb1);
 				goto merge_right;
 			}
-		} else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1,
+		} else if (tcp_try_coalesce(sk, skb1,
 					    skb, &fragstolen)) {
 			goto coalesce_done;
 		}
@@ -4556,7 +4537,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 
 	__skb_pull(skb, hdrlen);
 	eaten = (tail &&
-		 tcp_try_coalesce(sk, RCV_QUEUE, tail,
+		 tcp_try_coalesce(sk, tail,
 				  skb, fragstolen)) ? 1 : 0;
 	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
 	if (!eaten) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 85164d4d3e53..c7460fd90884 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2473,6 +2473,11 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_window_scaling = 1;
 	net->ipv4.sysctl_tcp_timestamps = 1;
 
+	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
+	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
+	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
+	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+
 	return 0;
 fail:
 	tcp_sk_exit(net);
@@ -2482,7 +2487,12 @@ fail:
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 {
+	struct net *net;
+
 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
+
+	list_for_each_entry(net, net_exit_list, exit_list)
+		tcp_fastopen_ctx_destroy(net);
 }
 
 static struct pernet_operations __net_initdata tcp_sk_ops = {
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 102b2c90bb80..0ab78abc811b 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -892,10 +892,14 @@ static void tcp_metrics_flush_all(struct net *net)
 
 	for (row = 0; row < max_rows; row++, hb++) {
 		struct tcp_metrics_block __rcu **pp;
+		bool match;
+
 		spin_lock_bh(&tcp_metrics_lock);
 		pp = &hb->chain;
 		for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
-			if (net_eq(tm_net(tm), net)) {
+			match = net ? net_eq(tm_net(tm), net) :
+				!atomic_read(&tm_net(tm)->count);
+			if (match) {
 				*pp = tm->tcpm_next;
 				kfree_rcu(tm, rcu_head);
 			} else {
@@ -1018,14 +1022,14 @@ static int __net_init tcp_net_metrics_init(struct net *net)
 	return 0;
 }
 
-static void __net_exit tcp_net_metrics_exit(struct net *net)
+static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
 {
-	tcp_metrics_flush_all(net);
+	tcp_metrics_flush_all(NULL);
 }
 
 static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
-	.init	=	tcp_net_metrics_init,
-	.exit	=	tcp_net_metrics_exit,
+	.init		=	tcp_net_metrics_init,
+	.exit_batch	=	tcp_net_metrics_exit_batch,
 };
 
 void __init tcp_metrics_init(void)
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 218cfcc77650..ee113ff15fd0 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
 
 static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
 {
-	return  min(tp->snd_ssthresh, tp->snd_cwnd-1);
+	return  min(tp->snd_ssthresh, tp->snd_cwnd);
 }
 
 static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5676237d2b0f..eb0359bdaa11 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1212,8 +1212,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
 	if (likely(partial)) {
 		up->forward_deficit += size;
 		size = up->forward_deficit;
-		if (size < (sk->sk_rcvbuf >> 2) &&
-		    !skb_queue_empty(&up->reader_queue))
+		if (size < (sk->sk_rcvbuf >> 2))
 			return;
 	} else {
 		size += up->forward_deficit;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 96861c702c06..837418ff2d4b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.disable_policy		= 0,
 };
 
-/* Check if a valid qdisc is available */
-static inline bool addrconf_qdisc_ok(const struct net_device *dev)
+/* Check if link is ready: is it up and is a valid qdisc available */
+static inline bool addrconf_link_ready(const struct net_device *dev)
 {
-	return !qdisc_tx_is_noop(dev);
+	return netif_oper_up(dev) && !qdisc_tx_is_noop(dev);
 }
 
 static void addrconf_del_rs_timer(struct inet6_dev *idev)
@@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 
 	ndev->token = in6addr_any;
 
-	if (netif_running(dev) && addrconf_qdisc_ok(dev))
+	if (netif_running(dev) && addrconf_link_ready(dev))
 		ndev->if_flags |= IF_READY;
 
 	ipv6_mc_init_dev(ndev);
@@ -3297,7 +3297,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
 		struct rt6_info *rt, *prev;
 
 		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
-		if (unlikely(IS_ERR(rt)))
+		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 
 		/* ifp->rt can be accessed outside of rtnl */
@@ -3403,7 +3403,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			/* restore routes for permanent addresses */
 			addrconf_permanent_addr(dev);
 
-			if (!addrconf_qdisc_ok(dev)) {
+			if (!addrconf_link_ready(dev)) {
 				/* device is not ready yet. */
 				pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
 					dev->name);
@@ -3418,7 +3418,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 				run_pending = 1;
 			}
 		} else if (event == NETDEV_CHANGE) {
-			if (!addrconf_qdisc_ok(dev)) {
+			if (!addrconf_link_ready(dev)) {
 				/* device is still not ready. */
 				break;
 			}
@@ -6618,9 +6618,9 @@ void addrconf_cleanup(void)
 	unregister_pernet_subsys(&addrconf_ops);
 	ipv6_addr_label_cleanup();
 
-	rtnl_lock();
+	rtnl_af_unregister(&inet6_ops);
 
-	__rtnl_af_unregister(&inet6_ops);
+	rtnl_lock();
 
 	/* clean dev list */
 	for_each_netdev(&init_net, dev) {
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index b055bc79f56d..c6311d7108f6 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -30,7 +30,6 @@
  * Policy Table
  */
 struct ip6addrlbl_entry {
-	possible_net_t lbl_net;
 	struct in6_addr prefix;
 	int prefixlen;
 	int ifindex;
@@ -41,19 +40,6 @@ struct ip6addrlbl_entry {
 	struct rcu_head rcu;
 };
 
-static struct ip6addrlbl_table
-{
-	struct hlist_head head;
-	spinlock_t lock;
-	u32 seq;
-} ip6addrlbl_table;
-
-static inline
-struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl)
-{
-	return read_pnet(&lbl->lbl_net);
-}
-
 /*
  * Default policy table (RFC6724 + extensions)
  *
@@ -148,13 +134,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p)
 }
 
 /* Find label */
-static bool __ip6addrlbl_match(struct net *net,
-			       const struct ip6addrlbl_entry *p,
+static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p,
 			       const struct in6_addr *addr,
 			       int addrtype, int ifindex)
 {
-	if (!net_eq(ip6addrlbl_net(p), net))
-		return false;
 	if (p->ifindex && p->ifindex != ifindex)
 		return false;
 	if (p->addrtype && p->addrtype != addrtype)
@@ -169,8 +152,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net,
 						  int type, int ifindex)
 {
 	struct ip6addrlbl_entry *p;
-	hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) {
-		if (__ip6addrlbl_match(net, p, addr, type, ifindex))
+
+	hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
+		if (__ip6addrlbl_match(p, addr, type, ifindex))
 			return p;
 	}
 	return NULL;
@@ -196,8 +180,7 @@ u32 ipv6_addr_label(struct net *net,
 }
 
 /* allocate one entry */
-static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
-						 const struct in6_addr *prefix,
+static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix,
 						 int prefixlen, int ifindex,
 						 u32 label)
 {
@@ -236,24 +219,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
 	newp->addrtype = addrtype;
 	newp->label = label;
 	INIT_HLIST_NODE(&newp->list);
-	write_pnet(&newp->lbl_net, net);
 	refcount_set(&newp->refcnt, 1);
 	return newp;
 }
 
 /* add a label */
-static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
+static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
+			    int replace)
 {
-	struct hlist_node *n;
 	struct ip6addrlbl_entry *last = NULL, *p = NULL;
+	struct hlist_node *n;
 	int ret = 0;
 
 	ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp,
 		  replace);
 
-	hlist_for_each_entry_safe(p, n,	&ip6addrlbl_table.head, list) {
+	hlist_for_each_entry_safe(p, n,	&net->ipv6.ip6addrlbl_table.head, list) {
 		if (p->prefixlen == newp->prefixlen &&
-		    net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
 		    p->ifindex == newp->ifindex &&
 		    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
 			if (!replace) {
@@ -273,10 +255,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
 	if (last)
 		hlist_add_behind_rcu(&newp->list, &last->list);
 	else
-		hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
+		hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head);
 out:
 	if (!ret)
-		ip6addrlbl_table.seq++;
+		net->ipv6.ip6addrlbl_table.seq++;
 	return ret;
 }
 
@@ -292,12 +274,12 @@ static int ip6addrlbl_add(struct net *net,
 		  __func__, prefix, prefixlen, ifindex, (unsigned int)label,
 		  replace);
 
-	newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label);
+	newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label);
 	if (IS_ERR(newp))
 		return PTR_ERR(newp);
-	spin_lock(&ip6addrlbl_table.lock);
-	ret = __ip6addrlbl_add(newp, replace);
-	spin_unlock(&ip6addrlbl_table.lock);
+	spin_lock(&net->ipv6.ip6addrlbl_table.lock);
+	ret = __ip6addrlbl_add(net, newp, replace);
+	spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
 	if (ret)
 		ip6addrlbl_free(newp);
 	return ret;
@@ -315,9 +297,8 @@ static int __ip6addrlbl_del(struct net *net,
 	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
 		  __func__, prefix, prefixlen, ifindex);
 
-	hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) {
+	hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
 		if (p->prefixlen == prefixlen &&
-		    net_eq(ip6addrlbl_net(p), net) &&
 		    p->ifindex == ifindex &&
 		    ipv6_addr_equal(&p->prefix, prefix)) {
 			hlist_del_rcu(&p->list);
@@ -340,9 +321,9 @@ static int ip6addrlbl_del(struct net *net,
 		  __func__, prefix, prefixlen, ifindex);
 
 	ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
-	spin_lock(&ip6addrlbl_table.lock);
+	spin_lock(&net->ipv6.ip6addrlbl_table.lock);
 	ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex);
-	spin_unlock(&ip6addrlbl_table.lock);
+	spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
 	return ret;
 }
 
@@ -354,6 +335,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net)
 
 	ADDRLABEL(KERN_DEBUG "%s\n", __func__);
 
+	spin_lock_init(&net->ipv6.ip6addrlbl_table.lock);
+	INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head);
+
 	for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
 		int ret = ip6addrlbl_add(net,
 					 ip6addrlbl_init_table[i].prefix,
@@ -373,14 +357,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net)
 	struct hlist_node *n;
 
 	/* Remove all labels belonging to the exiting net */
-	spin_lock(&ip6addrlbl_table.lock);
-	hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) {
-		if (net_eq(ip6addrlbl_net(p), net)) {
-			hlist_del_rcu(&p->list);
-			ip6addrlbl_put(p);
-		}
+	spin_lock(&net->ipv6.ip6addrlbl_table.lock);
+	hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+		hlist_del_rcu(&p->list);
+		ip6addrlbl_put(p);
 	}
-	spin_unlock(&ip6addrlbl_table.lock);
+	spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
 }
 
 static struct pernet_operations ipv6_addr_label_ops = {
@@ -390,8 +372,6 @@ static struct pernet_operations ipv6_addr_label_ops = {
 
 int __init ipv6_addr_label_init(void)
 {
-	spin_lock_init(&ip6addrlbl_table.lock);
-
 	return register_pernet_subsys(&ipv6_addr_label_ops);
 }
 
@@ -510,11 +490,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	int err;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) {
-		if (idx >= s_idx &&
-		    net_eq(ip6addrlbl_net(p), net)) {
+	hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
+		if (idx >= s_idx) {
 			err = ip6addrlbl_fill(skb, p,
-					      ip6addrlbl_table.seq,
+					      net->ipv6.ip6addrlbl_table.seq,
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWADDRLABEL,
@@ -571,7 +550,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
 	if (p && !ip6addrlbl_hold(p))
 		p = NULL;
-	lseq = ip6addrlbl_table.seq;
+	lseq = net->ipv6.ip6addrlbl_table.seq;
 	rcu_read_unlock();
 
 	if (!p) {
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 305e2ed730bf..115d60919f72 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -99,7 +99,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
 				break;
 			hdrlen = 8;
 		} else if (nexthdr == NEXTHDR_AUTH)
-			hdrlen = (hp->hdrlen+2)<<2;
+			hdrlen = ipv6_authlen(hp);
 		else
 			hdrlen = ipv6_optlen(hp);
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 1602b491b281..241841fb479c 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1156,19 +1156,21 @@ err_alloc_dev:
 	return err;
 }
 
-static void __net_exit ip6gre_exit_net(struct net *net)
+static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list)
 {
+	struct net *net;
 	LIST_HEAD(list);
 
 	rtnl_lock();
-	ip6gre_destroy_tunnels(net, &list);
+	list_for_each_entry(net, net_list, exit_list)
+		ip6gre_destroy_tunnels(net, &list);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
 }
 
 static struct pernet_operations ip6gre_net_ops = {
 	.init = ip6gre_init_net,
-	.exit = ip6gre_exit_net,
+	.exit_batch = ip6gre_exit_batch_net,
 	.id   = &ip6gre_net_id,
 	.size = sizeof(struct ip6gre_net),
 };
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a1c24443cd9e..825548680b1e 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -2168,17 +2168,16 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
 	.priority	=	1,
 };
 
-static void __net_exit ip6_tnl_destroy_tunnels(struct net *net)
+static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list)
 {
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 	struct net_device *dev, *aux;
 	int h;
 	struct ip6_tnl *t;
-	LIST_HEAD(list);
 
 	for_each_netdev_safe(net, dev, aux)
 		if (dev->rtnl_link_ops == &ip6_link_ops)
-			unregister_netdevice_queue(dev, &list);
+			unregister_netdevice_queue(dev, list);
 
 	for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) {
 		t = rtnl_dereference(ip6n->tnls_r_l[h]);
@@ -2187,12 +2186,10 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net)
 			 * been added to the list by the previous loop.
 			 */
 			if (!net_eq(dev_net(t->dev), net))
-				unregister_netdevice_queue(t->dev, &list);
+				unregister_netdevice_queue(t->dev, list);
 			t = rtnl_dereference(t->next);
 		}
 	}
-
-	unregister_netdevice_many(&list);
 }
 
 static int __net_init ip6_tnl_init_net(struct net *net)
@@ -2236,16 +2233,21 @@ err_alloc_dev:
 	return err;
 }
 
-static void __net_exit ip6_tnl_exit_net(struct net *net)
+static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list)
 {
+	struct net *net;
+	LIST_HEAD(list);
+
 	rtnl_lock();
-	ip6_tnl_destroy_tunnels(net);
+	list_for_each_entry(net, net_list, exit_list)
+		ip6_tnl_destroy_tunnels(net, &list);
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
 }
 
 static struct pernet_operations ip6_tnl_net_ops = {
 	.init = ip6_tnl_init_net,
-	.exit = ip6_tnl_exit_net,
+	.exit_batch = ip6_tnl_exit_batch_net,
 	.id   = &ip6_tnl_net_id,
 	.size = sizeof(struct ip6_tnl_net),
 };
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index bcdc2d557de1..dbb74f3c57a7 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1053,23 +1053,22 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
 	.get_link_net	= ip6_tnl_get_link_net,
 };
 
-static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
+static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n,
+					    struct list_head *list)
 {
 	int h;
 	struct ip6_tnl *t;
-	LIST_HEAD(list);
 
 	for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
 		t = rtnl_dereference(ip6n->tnls_r_l[h]);
 		while (t) {
-			unregister_netdevice_queue(t->dev, &list);
+			unregister_netdevice_queue(t->dev, list);
 			t = rtnl_dereference(t->next);
 		}
 	}
 
 	t = rtnl_dereference(ip6n->tnls_wc[0]);
-	unregister_netdevice_queue(t->dev, &list);
-	unregister_netdevice_many(&list);
+	unregister_netdevice_queue(t->dev, list);
 }
 
 static int __net_init vti6_init_net(struct net *net)
@@ -1109,18 +1108,24 @@ err_alloc_dev:
 	return err;
 }
 
-static void __net_exit vti6_exit_net(struct net *net)
+static void __net_exit vti6_exit_batch_net(struct list_head *net_list)
 {
-	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+	struct vti6_net *ip6n;
+	struct net *net;
+	LIST_HEAD(list);
 
 	rtnl_lock();
-	vti6_destroy_tunnels(ip6n);
+	list_for_each_entry(net, net_list, exit_list) {
+		ip6n = net_generic(net, vti6_net_id);
+		vti6_destroy_tunnels(ip6n, &list);
+	}
+	unregister_netdevice_many(&list);
 	rtnl_unlock();
 }
 
 static struct pernet_operations vti6_net_ops = {
 	.init = vti6_init_net,
-	.exit = vti6_exit_net,
+	.exit_batch = vti6_exit_batch_net,
 	.id   = &vti6_net_id,
 	.size = sizeof(struct vti6_net),
 };
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a5e466d4e093..b9404feabd78 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		retv = 0;
 		break;
 
+	case IPV6_FREEBIND:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		/* we also don't have a separate freebind bit for IPV6 */
+		inet_sk(sk)->freebind = valbool;
+		retv = 0;
+		break;
+
 	case IPV6_RECVORIGDSTADDR:
 		if (optlen < sizeof(int))
 			goto e_inval;
@@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = inet_sk(sk)->transparent;
 		break;
 
+	case IPV6_FREEBIND:
+		val = inet_sk(sk)->freebind;
+		break;
+
 	case IPV6_RECVORIGDSTADDR:
 		val = np->rxopt.bits.rxorigdstaddr;
 		break;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index ac912bb21747..a799f5258614 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1848,19 +1848,22 @@ err_alloc_dev:
 	return err;
 }
 
-static void __net_exit sit_exit_net(struct net *net)
+static void __net_exit sit_exit_batch_net(struct list_head *net_list)
 {
 	LIST_HEAD(list);
+	struct net *net;
 
 	rtnl_lock();
-	sit_destroy_tunnels(net, &list);
+	list_for_each_entry(net, net_list, exit_list)
+		sit_destroy_tunnels(net, &list);
+
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
 }
 
 static struct pernet_operations sit_net_ops = {
 	.init = sit_init_net,
-	.exit = sit_exit_net,
+	.exit_batch = sit_exit_batch_net,
 	.id   = &sit_net_id,
 	.size = sizeof(struct sit_net),
 };
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index af4e76ac88ff..0b750a22c4b9 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1650,7 +1650,7 @@ static int kcm_clone(struct socket *osock, struct kcm_clone *info,
 	}
 
 	newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
-	if (unlikely(IS_ERR(newfile))) {
+	if (IS_ERR(newfile)) {
 		err = PTR_ERR(newfile);
 		goto out_sock_alloc_fail;
 	}
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index 1e1c9b20bab7..2fb703d70803 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -623,13 +623,18 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
 	u8 iv[16];
 	struct scatterlist src;
 	SKCIPHER_REQUEST_ON_STACK(req, key->tfm0);
-	int err;
+	int err, datalen;
+	unsigned char *data;
 
 	llsec_geniv(iv, sec->params.hwaddr, &hdr->sec);
-	sg_init_one(&src, skb->data, skb->len);
+	/* Compute data payload offset and data length */
+	data = skb_mac_header(skb) + skb->mac_len;
+	datalen = skb_tail_pointer(skb) - data;
+	sg_init_one(&src, data, datalen);
+
 	skcipher_request_set_tfm(req, key->tfm0);
 	skcipher_request_set_callback(req, 0, NULL, NULL);
-	skcipher_request_set_crypt(req, &src, &src, skb->len, iv);
+	skcipher_request_set_crypt(req, &src, &src, datalen, iv);
 	err = crypto_skcipher_encrypt(req);
 	skcipher_request_zero(req);
 	return err;
@@ -713,7 +718,8 @@ int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb)
 	if (hlen < 0 || hdr.fc.type != IEEE802154_FC_TYPE_DATA)
 		return -EINVAL;
 
-	if (!hdr.fc.security_enabled || hdr.sec.level == 0) {
+	if (!hdr.fc.security_enabled ||
+	    (hdr.sec.level == IEEE802154_SCF_SECLEVEL_NONE)) {
 		skb_push(skb, hlen);
 		return 0;
 	}
diff --git a/net/nfc/core.c b/net/nfc/core.c
index 5cf33df888c3..e5e23c2cbe74 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -1094,9 +1094,8 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
 	dev->targets_generation = 1;
 
 	if (ops->check_presence) {
-		init_timer(&dev->check_pres_timer);
-		dev->check_pres_timer.data = (unsigned long)dev;
-		dev->check_pres_timer.function = nfc_check_pres_timeout;
+		setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout,
+			    (unsigned long)dev);
 
 		INIT_WORK(&dev->check_pres_work, nfc_check_pres_work);
 	}
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index b740fef0acc5..a8a6e7814e09 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -1004,9 +1004,8 @@ int nfc_hci_register_device(struct nfc_hci_dev *hdev)
 
 	INIT_WORK(&hdev->msg_tx_work, nfc_hci_msg_tx_work);
 
-	init_timer(&hdev->cmd_timer);
-	hdev->cmd_timer.data = (unsigned long)hdev;
-	hdev->cmd_timer.function = nfc_hci_cmd_timeout;
+	setup_timer(&hdev->cmd_timer, nfc_hci_cmd_timeout,
+		    (unsigned long)hdev);
 
 	skb_queue_head_init(&hdev->rx_hcp_frags);
 
diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c
index 17e59a009ce6..58df37eae1e8 100644
--- a/net/nfc/hci/llc_shdlc.c
+++ b/net/nfc/hci/llc_shdlc.c
@@ -763,17 +763,14 @@ static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
 	mutex_init(&shdlc->state_mutex);
 	shdlc->state = SHDLC_DISCONNECTED;
 
-	init_timer(&shdlc->connect_timer);
-	shdlc->connect_timer.data = (unsigned long)shdlc;
-	shdlc->connect_timer.function = llc_shdlc_connect_timeout;
+	setup_timer(&shdlc->connect_timer, llc_shdlc_connect_timeout,
+		    (unsigned long)shdlc);
 
-	init_timer(&shdlc->t1_timer);
-	shdlc->t1_timer.data = (unsigned long)shdlc;
-	shdlc->t1_timer.function = llc_shdlc_t1_timeout;
+	setup_timer(&shdlc->t1_timer, llc_shdlc_t1_timeout,
+		    (unsigned long)shdlc);
 
-	init_timer(&shdlc->t2_timer);
-	shdlc->t2_timer.data = (unsigned long)shdlc;
-	shdlc->t2_timer.function = llc_shdlc_t2_timeout;
+	setup_timer(&shdlc->t2_timer, llc_shdlc_t2_timeout,
+		    (unsigned long)shdlc);
 
 	shdlc->w = SHDLC_MAX_WINDOW;
 	shdlc->srej_support = SHDLC_SREJ_SUPPORT;
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index 02eef5cf3cce..7988185072e5 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -1573,9 +1573,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev)
 	INIT_LIST_HEAD(&local->list);
 	kref_init(&local->ref);
 	mutex_init(&local->sdp_lock);
-	init_timer(&local->link_timer);
-	local->link_timer.data = (unsigned long) local;
-	local->link_timer.function = nfc_llcp_symm_timer;
+	setup_timer(&local->link_timer, nfc_llcp_symm_timer,
+		    (unsigned long)local);
 
 	skb_queue_head_init(&local->tx_queue);
 	INIT_WORK(&local->tx_work, nfc_llcp_tx_work);
@@ -1601,9 +1600,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev)
 
 	mutex_init(&local->sdreq_lock);
 	INIT_HLIST_HEAD(&local->pending_sdreqs);
-	init_timer(&local->sdreq_timer);
-	local->sdreq_timer.data = (unsigned long) local;
-	local->sdreq_timer.function = nfc_llcp_sdreq_timer;
+	setup_timer(&local->sdreq_timer, nfc_llcp_sdreq_timer,
+		    (unsigned long)local);
 	INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work);
 
 	list_add(&local->list, &llcp_devices);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 0389398fa4ab..2e5e7a41d8ef 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
 
 	rtnl_lock();
 	err = netdev_master_upper_dev_link(vport->dev,
-					   get_dpdev(vport->dp), NULL, NULL);
+					   get_dpdev(vport->dp),
+					   NULL, NULL, NULL);
 	if (err)
 		goto error_unlock;
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bec01a3daf5b..3f5caa3fbd06 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -544,9 +544,7 @@ static void prb_init_blk_timer(struct packet_sock *po,
 		struct tpacket_kbdq_core *pkc,
 		void (*func) (unsigned long))
 {
-	init_timer(&pkc->retire_blk_timer);
-	pkc->retire_blk_timer.data = (long)po;
-	pkc->retire_blk_timer.function = func;
+	setup_timer(&pkc->retire_blk_timer, func, (long)po);
 	pkc->retire_blk_timer.expires = jiffies;
 }
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707eb2c96..5ef8ce8c83d4 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index d89ebafd2239..700b345b07f9 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -17,13 +17,14 @@
 #include <linux/errno.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
+#include <linux/idr.h>
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 
 struct basic_head {
-	u32			hgenerator;
 	struct list_head	flist;
+	struct idr		handle_idr;
 	struct rcu_head		rcu;
 };
 
@@ -78,6 +79,7 @@ static int basic_init(struct tcf_proto *tp)
 	if (head == NULL)
 		return -ENOBUFS;
 	INIT_LIST_HEAD(&head->flist);
+	idr_init(&head->handle_idr);
 	rcu_assign_pointer(tp->root, head);
 	return 0;
 }
@@ -99,8 +101,10 @@ static void basic_destroy(struct tcf_proto *tp)
 	list_for_each_entry_safe(f, n, &head->flist, link) {
 		list_del_rcu(&f->link);
 		tcf_unbind_filter(tp, &f->res);
+		idr_remove_ext(&head->handle_idr, f->handle);
 		call_rcu(&f->rcu, basic_delete_filter);
 	}
+	idr_destroy(&head->handle_idr);
 	kfree_rcu(head, rcu);
 }
 
@@ -111,6 +115,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last)
 
 	list_del_rcu(&f->link);
 	tcf_unbind_filter(tp, &f->res);
+	idr_remove_ext(&head->handle_idr, f->handle);
 	call_rcu(&f->rcu, basic_delete_filter);
 	*last = list_empty(&head->flist);
 	return 0;
@@ -154,6 +159,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 	struct nlattr *tb[TCA_BASIC_MAX + 1];
 	struct basic_filter *fold = (struct basic_filter *) *arg;
 	struct basic_filter *fnew;
+	unsigned long idr_index;
 
 	if (tca[TCA_OPTIONS] == NULL)
 		return -EINVAL;
@@ -176,33 +182,33 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		goto errout;
 
-	err = -EINVAL;
 	if (handle) {
 		fnew->handle = handle;
-	} else if (fold) {
-		fnew->handle = fold->handle;
+		if (!fold) {
+			err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
+					    handle, handle + 1, GFP_KERNEL);
+			if (err)
+				goto errout;
+		}
 	} else {
-		unsigned int i = 0x80000000;
-		do {
-			if (++head->hgenerator == 0x7FFFFFFF)
-				head->hgenerator = 1;
-		} while (--i > 0 && basic_get(tp, head->hgenerator));
-
-		if (i <= 0) {
-			pr_err("Insufficient number of handles\n");
+		err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
+				    1, 0x7FFFFFFF, GFP_KERNEL);
+		if (err)
 			goto errout;
-		}
-
-		fnew->handle = head->hgenerator;
+		fnew->handle = idr_index;
 	}
 
 	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
-	if (err < 0)
+	if (err < 0) {
+		if (!fold)
+			idr_remove_ext(&head->handle_idr, fnew->handle);
 		goto errout;
+	}
 
 	*arg = fnew;
 
 	if (fold) {
+		idr_replace_ext(&head->handle_idr, fnew, fnew->handle);
 		list_replace_rcu(&fold->link, &fnew->link);
 		tcf_unbind_filter(tp, &fold->res);
 		call_rcu(&fold->rcu, basic_delete_filter);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 520c5027646a..6c6b21f6ba62 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/idr.h>
 
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
@@ -32,7 +33,7 @@ MODULE_DESCRIPTION("TC BPF based classifier");
 
 struct cls_bpf_head {
 	struct list_head plist;
-	u32 hgen;
+	struct idr handle_idr;
 	struct rcu_head rcu;
 };
 
@@ -99,11 +100,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		} else if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
@@ -238,6 +239,7 @@ static int cls_bpf_init(struct tcf_proto *tp)
 		return -ENOBUFS;
 
 	INIT_LIST_HEAD_RCU(&head->plist);
+	idr_init(&head->handle_idr);
 	rcu_assign_pointer(tp->root, head);
 
 	return 0;
@@ -264,6 +266,9 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
 
 static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
 {
+	struct cls_bpf_head *head = rtnl_dereference(tp->root);
+
+	idr_remove_ext(&head->handle_idr, prog->handle);
 	cls_bpf_stop_offload(tp, prog);
 	list_del_rcu(&prog->link);
 	tcf_unbind_filter(tp, &prog->res);
@@ -287,6 +292,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp)
 	list_for_each_entry_safe(prog, tmp, &head->plist, link)
 		__cls_bpf_delete(tp, prog);
 
+	idr_destroy(&head->handle_idr);
 	kfree_rcu(head, rcu);
 }
 
@@ -421,27 +427,6 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 	return 0;
 }
 
-static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
-				   struct cls_bpf_head *head)
-{
-	unsigned int i = 0x80000000;
-	u32 handle;
-
-	do {
-		if (++head->hgen == 0x7FFFFFFF)
-			head->hgen = 1;
-	} while (--i > 0 && cls_bpf_get(tp, head->hgen));
-
-	if (unlikely(i == 0)) {
-		pr_err("Insufficient number of handles\n");
-		handle = 0;
-	} else {
-		handle = head->hgen;
-	}
-
-	return handle;
-}
-
 static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 			  struct tcf_proto *tp, unsigned long base,
 			  u32 handle, struct nlattr **tca,
@@ -451,6 +436,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_bpf_prog *oldprog = *arg;
 	struct nlattr *tb[TCA_BPF_MAX + 1];
 	struct cls_bpf_prog *prog;
+	unsigned long idr_index;
 	int ret;
 
 	if (tca[TCA_OPTIONS] == NULL)
@@ -476,21 +462,30 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		}
 	}
 
-	if (handle == 0)
-		prog->handle = cls_bpf_grab_new_handle(tp, head);
-	else
+	if (handle == 0) {
+		ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
+				    1, 0x7FFFFFFF, GFP_KERNEL);
+		if (ret)
+			goto errout;
+		prog->handle = idr_index;
+	} else {
+		if (!oldprog) {
+			ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
+					    handle, handle + 1, GFP_KERNEL);
+			if (ret)
+				goto errout;
+		}
 		prog->handle = handle;
-	if (prog->handle == 0) {
-		ret = -EINVAL;
-		goto errout;
 	}
 
 	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
 	if (ret < 0)
-		goto errout;
+		goto errout_idr;
 
 	ret = cls_bpf_offload(tp, prog, oldprog);
 	if (ret) {
+		if (!oldprog)
+			idr_remove_ext(&head->handle_idr, prog->handle);
 		__cls_bpf_delete_prog(prog);
 		return ret;
 	}
@@ -499,6 +494,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
 
 	if (oldprog) {
+		idr_replace_ext(&head->handle_idr, prog, handle);
 		list_replace_rcu(&oldprog->link, &prog->link);
 		tcf_unbind_filter(tp, &oldprog->res);
 		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
@@ -509,6 +505,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 	*arg = prog;
 	return 0;
 
+errout_idr:
+	if (!oldprog)
+		idr_remove_ext(&head->handle_idr, prog->handle);
 errout:
 	tcf_exts_destroy(&prog->exts);
 	kfree(prog);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index d230cb4c8094..db831ac708f6 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -152,37 +152,12 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	struct cls_fl_filter *f;
 	struct fl_flow_key skb_key;
 	struct fl_flow_key skb_mkey;
-	struct ip_tunnel_info *info;
 
 	if (!atomic_read(&head->ht.nelems))
 		return -1;
 
 	fl_clear_masked_range(&skb_key, &head->mask);
 
-	info = skb_tunnel_info(skb);
-	if (info) {
-		struct ip_tunnel_key *key = &info->key;
-
-		switch (ip_tunnel_info_af(info)) {
-		case AF_INET:
-			skb_key.enc_control.addr_type =
-				FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-			skb_key.enc_ipv4.src = key->u.ipv4.src;
-			skb_key.enc_ipv4.dst = key->u.ipv4.dst;
-			break;
-		case AF_INET6:
-			skb_key.enc_control.addr_type =
-				FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-			skb_key.enc_ipv6.src = key->u.ipv6.src;
-			skb_key.enc_ipv6.dst = key->u.ipv6.dst;
-			break;
-		}
-
-		skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
-		skb_key.enc_tp.src = key->tp_src;
-		skb_key.enc_tp.dst = key->tp_dst;
-	}
-
 	skb_key.indev_ifindex = skb->skb_iif;
 	/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
 	 * so do it rather here.
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 10b8d851fc6b..094d224411a9 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -46,6 +46,7 @@
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 #include <linux/netdevice.h>
+#include <linux/idr.h>
 
 struct tc_u_knode {
 	struct tc_u_knode __rcu	*next;
@@ -82,6 +83,7 @@ struct tc_u_hnode {
 	struct tc_u_common	*tp_c;
 	int			refcnt;
 	unsigned int		divisor;
+	struct idr		handle_idr;
 	struct rcu_head		rcu;
 	/* The 'ht' field MUST be the last field in structure to allow for
 	 * more entries allocated at end of structure.
@@ -93,7 +95,7 @@ struct tc_u_common {
 	struct tc_u_hnode __rcu	*hlist;
 	struct Qdisc		*q;
 	int			refcnt;
-	u32			hgenerator;
+	struct idr		handle_idr;
 	struct hlist_node	hnode;
 	struct rcu_head		rcu;
 };
@@ -311,19 +313,19 @@ static void *u32_get(struct tcf_proto *tp, u32 handle)
 	return u32_lookup_key(ht, handle);
 }
 
-static u32 gen_new_htid(struct tc_u_common *tp_c)
+static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr)
 {
-	int i = 0x800;
+	unsigned long idr_index;
+	int err;
 
-	/* hgenerator only used inside rtnl lock it is safe to increment
+	/* This is only used inside rtnl lock it is safe to increment
 	 * without read _copy_ update semantics
 	 */
-	do {
-		if (++tp_c->hgenerator == 0x7FF)
-			tp_c->hgenerator = 1;
-	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
-
-	return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
+	err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index,
+			    1, 0x7FF, GFP_KERNEL);
+	if (err)
+		return 0;
+	return (u32)(idr_index | 0x800) << 20;
 }
 
 static struct hlist_head *tc_u_common_hash;
@@ -366,8 +368,9 @@ static int u32_init(struct tcf_proto *tp)
 		return -ENOBUFS;
 
 	root_ht->refcnt++;
-	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
+	root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
 	root_ht->prio = tp->prio;
+	idr_init(&root_ht->handle_idr);
 
 	if (tp_c == NULL) {
 		tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
@@ -377,6 +380,7 @@ static int u32_init(struct tcf_proto *tp)
 		}
 		tp_c->q = tp->q;
 		INIT_HLIST_NODE(&tp_c->hnode);
+		idr_init(&tp_c->handle_idr);
 
 		h = tc_u_hash(tp);
 		hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
@@ -565,6 +569,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 					 rtnl_dereference(n->next));
 			tcf_unbind_filter(tp, &n->res);
 			u32_remove_hw_knode(tp, n->handle);
+			idr_remove_ext(&ht->handle_idr, n->handle);
 			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
 		}
 	}
@@ -586,6 +591,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
 		if (phn == ht) {
 			u32_clear_hw_hnode(tp, ht);
+			idr_destroy(&ht->handle_idr);
+			idr_remove_ext(&tp_c->handle_idr, ht->handle);
 			RCU_INIT_POINTER(*hn, ht->next);
 			kfree_rcu(ht, rcu);
 			return 0;
@@ -633,6 +640,7 @@ static void u32_destroy(struct tcf_proto *tp)
 			kfree_rcu(ht, rcu);
 		}
 
+		idr_destroy(&tp_c->handle_idr);
 		kfree(tp_c);
 	}
 
@@ -701,27 +709,21 @@ ret:
 	return ret;
 }
 
-#define NR_U32_NODE (1<<12)
-static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
+static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid)
 {
-	struct tc_u_knode *n;
-	unsigned long i;
-	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
-					GFP_KERNEL);
-	if (!bitmap)
-		return handle | 0xFFF;
-
-	for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
-	     n;
-	     n = rtnl_dereference(n->next))
-		set_bit(TC_U32_NODE(n->handle), bitmap);
-
-	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
-	if (i >= NR_U32_NODE)
-		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);
+	unsigned long idr_index;
+	u32 start = htid | 0x800;
+	u32 max = htid | 0xFFF;
+	u32 min = htid;
+
+	if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index,
+			  start, max + 1, GFP_KERNEL)) {
+		if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index,
+				  min + 1, max + 1, GFP_KERNEL))
+			return max;
+	}
 
-	kfree(bitmap);
-	return handle | (i >= NR_U32_NODE ? 0xFFF : i);
+	return (u32)idr_index;
 }
 
 static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
@@ -806,6 +808,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
 		if (pins->handle == n->handle)
 			break;
 
+	idr_replace_ext(&ht->handle_idr, n, n->handle);
 	RCU_INIT_POINTER(n->next, pins->next);
 	rcu_assign_pointer(*ins, n);
 }
@@ -937,22 +940,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return -EINVAL;
 		if (TC_U32_KEY(handle))
 			return -EINVAL;
-		if (handle == 0) {
-			handle = gen_new_htid(tp->data);
-			if (handle == 0)
-				return -ENOMEM;
-		}
 		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
 		if (ht == NULL)
 			return -ENOBUFS;
+		if (handle == 0) {
+			handle = gen_new_htid(tp->data, ht);
+			if (handle == 0) {
+				kfree(ht);
+				return -ENOMEM;
+			}
+		} else {
+			err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL,
+					    handle, handle + 1, GFP_KERNEL);
+			if (err) {
+				kfree(ht);
+				return err;
+			}
+		}
 		ht->tp_c = tp_c;
 		ht->refcnt = 1;
 		ht->divisor = divisor;
 		ht->handle = handle;
 		ht->prio = tp->prio;
+		idr_init(&ht->handle_idr);
 
 		err = u32_replace_hw_hnode(tp, ht, flags);
 		if (err) {
+			idr_remove_ext(&tp_c->handle_idr, handle);
 			kfree(ht);
 			return err;
 		}
@@ -986,24 +1000,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
 			return -EINVAL;
 		handle = htid | TC_U32_NODE(handle);
+		err = idr_alloc_ext(&ht->handle_idr, NULL, NULL,
+				    handle, handle + 1,
+				    GFP_KERNEL);
+		if (err)
+			return err;
 	} else
 		handle = gen_new_kid(ht, htid);
 
-	if (tb[TCA_U32_SEL] == NULL)
-		return -EINVAL;
+	if (tb[TCA_U32_SEL] == NULL) {
+		err = -EINVAL;
+		goto erridr;
+	}
 
 	s = nla_data(tb[TCA_U32_SEL]);
 
 	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
-	if (n == NULL)
-		return -ENOBUFS;
+	if (n == NULL) {
+		err = -ENOBUFS;
+		goto erridr;
+	}
 
 #ifdef CONFIG_CLS_U32_PERF
 	size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
 	n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
 	if (!n->pf) {
-		kfree(n);
-		return -ENOBUFS;
+		err = -ENOBUFS;
+		goto errfree;
 	}
 #endif
 
@@ -1066,9 +1089,12 @@ errhw:
 errout:
 	tcf_exts_destroy(&n->exts);
 #ifdef CONFIG_CLS_U32_PERF
+errfree:
 	free_percpu(n->pf);
 #endif
 	kfree(n);
+erridr:
+	idr_remove_ext(&ht->handle_idr, handle);
 	return err;
 }
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c6deb74e3d2f..aa82116ed10c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1500,7 +1500,6 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 	int s_idx, s_q_idx;
 	struct net_device *dev;
 	const struct nlmsghdr *nlh = cb->nlh;
-	struct tcmsg *tcm = nlmsg_data(nlh);
 	struct nlattr *tca[TCA_MAX + 1];
 	int err;
 
@@ -1510,7 +1509,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 	idx = 0;
 	ASSERT_RTNL();
 
-	err = nlmsg_parse(nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL);
+	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
 	if (err < 0)
 		return err;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index bf8c81e07c70..a0a198768aad 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -689,10 +689,8 @@ void qdisc_reset(struct Qdisc *qdisc)
 }
 EXPORT_SYMBOL(qdisc_reset);
 
-static void qdisc_rcu_free(struct rcu_head *head)
+static void qdisc_free(struct Qdisc *qdisc)
 {
-	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
-
 	if (qdisc_is_percpu_stats(qdisc)) {
 		free_percpu(qdisc->cpu_bstats);
 		free_percpu(qdisc->cpu_qstats);
@@ -725,11 +723,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 
 	kfree_skb_list(qdisc->gso_skb);
 	kfree_skb(qdisc->skb_bad_txq);
-	/*
-	 * gen_estimator est_timer() might access qdisc->q.lock,
-	 * wait a RCU grace period before freeing qdisc.
-	 */
-	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
+	qdisc_free(qdisc);
 }
 EXPORT_SYMBOL(qdisc_destroy);
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 7e148376ba52..c6d7ae81b41f 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -142,6 +142,7 @@ struct htb_class {
 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
 
 	unsigned int drops ____cacheline_aligned_in_smp;
+	unsigned int		overlimits;
 };
 
 struct htb_level {
@@ -533,6 +534,9 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
 	if (new_mode == cl->cmode)
 		return;
 
+	if (new_mode == HTB_CANT_SEND)
+		cl->overlimits++;
+
 	if (cl->prio_activity) {	/* not necessary: speed optimization */
 		if (cl->cmode != HTB_CANT_SEND)
 			htb_deactivate_prios(q, cl);
@@ -1143,6 +1147,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 	struct htb_class *cl = (struct htb_class *)arg;
 	struct gnet_stats_queue qs = {
 		.drops = cl->drops,
+		.overlimits = cl->overlimits,
 	};
 	__u32 qlen = 0;
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b1266e75ca43..5a4f10080290 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -146,7 +146,6 @@ struct netem_sched_data {
  */
 struct netem_skb_cb {
 	psched_time_t	time_to_send;
-	ktime_t		tstamp_save;
 };
 
 
@@ -362,12 +361,13 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche
 static void tfifo_reset(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
-	struct rb_node *p;
+	struct rb_node *p = rb_first(&q->t_root);
 
-	while ((p = rb_first(&q->t_root))) {
+	while (p) {
 		struct sk_buff *skb = netem_rb_to_skb(p);
 
-		rb_erase(p, &q->t_root);
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &q->t_root);
 		rtnl_kfree_skbs(skb, skb);
 	}
 }
@@ -561,7 +561,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		}
 
 		cb->time_to_send = now + delay;
-		cb->tstamp_save = skb->tstamp;
 		++q->counter;
 		tfifo_enqueue(skb, sch);
 	} else {
@@ -629,7 +628,10 @@ deliver:
 			qdisc_qstats_backlog_dec(sch, skb);
 			skb->next = NULL;
 			skb->prev = NULL;
-			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
+			/* skb->dev shares skb->rbnode area,
+			 * we need to restore its value.
+			 */
+			skb->dev = qdisc_dev(sch);
 
 #ifdef CONFIG_NET_CLS_ACT
 			/*
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 70f1b570bab9..bf90c5397719 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
-	  offload.o
+	  offload.o stream_sched.o stream_sched_prio.o \
+	  stream_sched_rr.o
 
 sctp_probe-y := probe.o
 
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 3afac275ee82..7b261afc47b9 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -311,10 +311,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 
 		if (chunk->sent_count) {
 			chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
-			streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
+			streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
 		} else {
 			chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
-			streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
+			streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
 		}
 		return 1;
 	} else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
@@ -323,7 +323,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
 			&chunk->asoc->stream.out[chunk->sinfo.sinfo_stream];
 
 		chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
-		streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
+		streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
 		return 1;
 	} else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) &&
 		   chunk->msg->expires_at &&
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 2966ff400755..4db012aa25f7 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -50,6 +50,7 @@
 
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
 
 /* Declare internal functions here.  */
 static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn);
@@ -72,32 +73,38 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
 
 /* Add data to the front of the queue. */
 static inline void sctp_outq_head_data(struct sctp_outq *q,
-					struct sctp_chunk *ch)
+				       struct sctp_chunk *ch)
 {
+	struct sctp_stream_out_ext *oute;
+	__u16 stream;
+
 	list_add(&ch->list, &q->out_chunk_list);
 	q->out_qlen += ch->skb->len;
+
+	stream = sctp_chunk_stream_no(ch);
+	oute = q->asoc->stream.out[stream].ext;
+	list_add(&ch->stream_list, &oute->outq);
 }
 
 /* Take data from the front of the queue. */
 static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
 {
-	struct sctp_chunk *ch = NULL;
-
-	if (!list_empty(&q->out_chunk_list)) {
-		struct list_head *entry = q->out_chunk_list.next;
-
-		ch = list_entry(entry, struct sctp_chunk, list);
-		list_del_init(entry);
-		q->out_qlen -= ch->skb->len;
-	}
-	return ch;
+	return q->sched->dequeue(q);
 }
+
 /* Add data chunk to the end of the queue. */
 static inline void sctp_outq_tail_data(struct sctp_outq *q,
 				       struct sctp_chunk *ch)
 {
+	struct sctp_stream_out_ext *oute;
+	__u16 stream;
+
 	list_add_tail(&ch->list, &q->out_chunk_list);
 	q->out_qlen += ch->skb->len;
+
+	stream = sctp_chunk_stream_no(ch);
+	oute = q->asoc->stream.out[stream].ext;
+	list_add_tail(&ch->stream_list, &oute->outq);
 }
 
 /*
@@ -207,6 +214,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
 	INIT_LIST_HEAD(&q->retransmit);
 	INIT_LIST_HEAD(&q->sacked);
 	INIT_LIST_HEAD(&q->abandoned);
+	sctp_sched_set_sched(asoc, SCTP_SS_FCFS);
 }
 
 /* Free the outqueue structure and any related pending chunks.
@@ -258,6 +266,7 @@ static void __sctp_outq_teardown(struct sctp_outq *q)
 
 	/* Throw away any leftover data chunks. */
 	while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+		sctp_sched_dequeue_done(q, chunk);
 
 		/* Mark as send failure. */
 		sctp_chunk_fail(chunk, q->error);
@@ -366,7 +375,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
 		streamout = &asoc->stream.out[chk->sinfo.sinfo_stream];
 		asoc->sent_cnt_removable--;
 		asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
-		streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
+		streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
 
 		if (!chk->tsn_gap_acked) {
 			if (chk->transport)
@@ -391,20 +400,21 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 	struct sctp_outq *q = &asoc->outqueue;
 	struct sctp_chunk *chk, *temp;
 
+	q->sched->unsched_all(&asoc->stream);
+
 	list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) {
 		if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
 		    chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
 			continue;
 
-		list_del_init(&chk->list);
-		q->out_qlen -= chk->skb->len;
+		sctp_sched_dequeue_common(q, chk);
 		asoc->sent_cnt_removable--;
 		asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
 		if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) {
 			struct sctp_stream_out *streamout =
 				&asoc->stream.out[chk->sinfo.sinfo_stream];
 
-			streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
+			streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
 		}
 
 		msg_len -= SCTP_DATA_SNDSIZE(chk) +
@@ -415,6 +425,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 			break;
 	}
 
+	q->sched->sched_all(&asoc->stream);
+
 	return msg_len;
 }
 
@@ -1033,22 +1045,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
 			__u32 sid = ntohs(chunk->subh.data_hdr->stream);
 
-			/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
-			 * stream identifier.
-			 */
-			if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) {
-
-				/* Mark as failed send. */
-				sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
-				if (asoc->peer.prsctp_capable &&
-				    SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
-					asoc->sent_cnt_removable--;
-				sctp_chunk_free(chunk);
-				continue;
-			}
-
 			/* Has this chunk expired? */
 			if (sctp_chunk_abandoned(chunk)) {
+				sctp_sched_dequeue_done(q, chunk);
 				sctp_chunk_fail(chunk, 0);
 				sctp_chunk_free(chunk);
 				continue;
@@ -1070,6 +1069,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				new_transport = asoc->peer.active_path;
 			if (new_transport->state == SCTP_UNCONFIRMED) {
 				WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
+				sctp_sched_dequeue_done(q, chunk);
 				sctp_chunk_fail(chunk, 0);
 				sctp_chunk_free(chunk);
 				continue;
@@ -1133,6 +1133,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				else
 					asoc->stats.oodchunks++;
 
+				/* Only now it's safe to consider this
+				 * chunk as sent, sched-wise.
+				 */
+				sctp_sched_dequeue_done(q, chunk);
+
 				break;
 
 			default:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index e6a2974e020e..402bfbb888cd 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -50,6 +50,7 @@
 #include <net/sock.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
 
 static int sctp_cmd_interpreter(enum sctp_event event_type,
 				union sctp_subtype subtype,
@@ -1089,6 +1090,8 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc,
 
 	list_for_each_entry(chunk, &msg->chunks, frag_list)
 		sctp_outq_tail(&asoc->outqueue, chunk, gfp);
+
+	asoc->outqueue.sched->enqueue(&asoc->outqueue, msg);
 }
 
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index d4730ada7f32..88c28421ec15 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -79,6 +79,7 @@
 #include <net/sock.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
 
 /* Forward declarations for internal helper functions. */
 static int sctp_writeable(struct sock *sk);
@@ -1927,6 +1928,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		goto out_free;
 	}
 
+	/* Allocate sctp_stream_out_ext if not already done */
+	if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) {
+		err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream);
+		if (err)
+			goto out_free;
+	}
+
 	if (sctp_wspace(asoc) < msg_len)
 		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
 
@@ -3907,6 +3915,64 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_scheduler(struct sock *sk,
+				     char __user *optval,
+				     unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_assoc_value params;
+	int retval = -EINVAL;
+
+	if (optlen < sizeof(params))
+		goto out;
+
+	optlen = sizeof(params);
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_value > SCTP_SS_MAX)
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_sched_set_sched(asoc, params.assoc_value);
+
+out:
+	return retval;
+}
+
+static int sctp_setsockopt_scheduler_value(struct sock *sk,
+					   char __user *optval,
+					   unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_stream_value params;
+	int retval = -EINVAL;
+
+	if (optlen < sizeof(params))
+		goto out;
+
+	optlen = sizeof(params);
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc)
+		goto out;
+
+	retval = sctp_sched_set_value(asoc, params.stream_id,
+				      params.stream_value, GFP_KERNEL);
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4088,6 +4154,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_ADD_STREAMS:
 		retval = sctp_setsockopt_add_streams(sk, optval, optlen);
 		break;
+	case SCTP_STREAM_SCHEDULER:
+		retval = sctp_setsockopt_scheduler(sk, optval, optlen);
+		break;
+	case SCTP_STREAM_SCHEDULER_VALUE:
+		retval = sctp_setsockopt_scheduler_value(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -6645,7 +6717,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len,
 					   char __user *optval,
 					   int __user *optlen)
 {
-	struct sctp_stream_out *streamout;
+	struct sctp_stream_out_ext *streamoute;
 	struct sctp_association *asoc;
 	struct sctp_prstatus params;
 	int retval = -EINVAL;
@@ -6668,21 +6740,29 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len,
 	if (!asoc || params.sprstat_sid >= asoc->stream.outcnt)
 		goto out;
 
-	streamout = &asoc->stream.out[params.sprstat_sid];
+	streamoute = asoc->stream.out[params.sprstat_sid].ext;
+	if (!streamoute) {
+		/* Not allocated yet, means all stats are 0 */
+		params.sprstat_abandoned_unsent = 0;
+		params.sprstat_abandoned_sent = 0;
+		retval = 0;
+		goto out;
+	}
+
 	if (policy == SCTP_PR_SCTP_NONE) {
 		params.sprstat_abandoned_unsent = 0;
 		params.sprstat_abandoned_sent = 0;
 		for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
 			params.sprstat_abandoned_unsent +=
-				streamout->abandoned_unsent[policy];
+				streamoute->abandoned_unsent[policy];
 			params.sprstat_abandoned_sent +=
-				streamout->abandoned_sent[policy];
+				streamoute->abandoned_sent[policy];
 		}
 	} else {
 		params.sprstat_abandoned_unsent =
-			streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)];
+			streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)];
 		params.sprstat_abandoned_sent =
-			streamout->abandoned_sent[__SCTP_PR_INDEX(policy)];
+			streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)];
 	}
 
 	if (put_user(len, optlen) || copy_to_user(optval, &params, len)) {
@@ -6778,6 +6858,85 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_scheduler(struct sock *sk, int len,
+				     char __user *optval,
+				     int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	params.assoc_value = sctp_sched_get_sched(asoc);
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
+static int sctp_getsockopt_scheduler_value(struct sock *sk, int len,
+					   char __user *optval,
+					   int __user *optlen)
+{
+	struct sctp_stream_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = sctp_sched_get_value(asoc, params.stream_id,
+				      &params.stream_value);
+	if (retval)
+		goto out;
+
+	if (put_user(len, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(optval, &params, len)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -6960,6 +7119,14 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_enable_strreset(sk, len, optval,
 							 optlen);
 		break;
+	case SCTP_STREAM_SCHEDULER:
+		retval = sctp_getsockopt_scheduler(sk, len, optval,
+						   optlen);
+		break;
+	case SCTP_STREAM_SCHEDULER_VALUE:
+		retval = sctp_getsockopt_scheduler_value(sk, len, optval,
+							 optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 63ea15503714..5ea33a2c453b 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -32,44 +32,181 @@
  *    Xin Long <lucien.xin@gmail.com>
  */
 
+#include <linux/list.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Migrates chunks from stream queues to new stream queues if needed,
+ * but not across associations. Also, removes those chunks to streams
+ * higher than the new max.
+ */
+static void sctp_stream_outq_migrate(struct sctp_stream *stream,
+				     struct sctp_stream *new, __u16 outcnt)
+{
+	struct sctp_association *asoc;
+	struct sctp_chunk *ch, *temp;
+	struct sctp_outq *outq;
+	int i;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	outq = &asoc->outqueue;
+
+	list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) {
+		__u16 sid = sctp_chunk_stream_no(ch);
+
+		if (sid < outcnt)
+			continue;
+
+		sctp_sched_dequeue_common(outq, ch);
+		/* No need to call dequeue_done here because
+		 * the chunks are not scheduled by now.
+		 */
+
+		/* Mark as failed send. */
+		sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM);
+		if (asoc->peer.prsctp_capable &&
+		    SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags))
+			asoc->sent_cnt_removable--;
+
+		sctp_chunk_free(ch);
+	}
+
+	if (new) {
+		/* Here we actually move the old ext stuff into the new
+		 * buffer, because we want to keep it. Then
+		 * sctp_stream_update will swap ->out pointers.
+		 */
+		for (i = 0; i < outcnt; i++) {
+			kfree(new->out[i].ext);
+			new->out[i].ext = stream->out[i].ext;
+			stream->out[i].ext = NULL;
+		}
+	}
+
+	for (i = outcnt; i < stream->outcnt; i++)
+		kfree(stream->out[i].ext);
+}
+
+static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
+				 gfp_t gfp)
+{
+	struct sctp_stream_out *out;
+
+	out = kmalloc_array(outcnt, sizeof(*out), gfp);
+	if (!out)
+		return -ENOMEM;
+
+	if (stream->out) {
+		memcpy(out, stream->out, min(outcnt, stream->outcnt) *
+					 sizeof(*out));
+		kfree(stream->out);
+	}
+
+	if (outcnt > stream->outcnt)
+		memset(out + stream->outcnt, 0,
+		       (outcnt - stream->outcnt) * sizeof(*out));
+
+	stream->out = out;
+
+	return 0;
+}
+
+static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
+				gfp_t gfp)
+{
+	struct sctp_stream_in *in;
+
+	in = kmalloc_array(incnt, sizeof(*stream->in), gfp);
+
+	if (!in)
+		return -ENOMEM;
+
+	if (stream->in) {
+		memcpy(in, stream->in, min(incnt, stream->incnt) *
+				       sizeof(*in));
+		kfree(stream->in);
+	}
+
+	if (incnt > stream->incnt)
+		memset(in + stream->incnt, 0,
+		       (incnt - stream->incnt) * sizeof(*in));
+
+	stream->in = in;
+
+	return 0;
+}
 
 int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
 		     gfp_t gfp)
 {
-	int i;
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	int i, ret = 0;
+
+	gfp |= __GFP_NOWARN;
 
 	/* Initial stream->out size may be very big, so free it and alloc
-	 * a new one with new outcnt to save memory.
+	 * a new one with new outcnt to save memory if needed.
 	 */
-	kfree(stream->out);
+	if (outcnt == stream->outcnt)
+		goto in;
 
-	stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp);
-	if (!stream->out)
-		return -ENOMEM;
+	/* Filter out chunks queued on streams that won't exist anymore */
+	sched->unsched_all(stream);
+	sctp_stream_outq_migrate(stream, NULL, outcnt);
+	sched->sched_all(stream);
+
+	i = sctp_stream_alloc_out(stream, outcnt, gfp);
+	if (i)
+		return i;
 
 	stream->outcnt = outcnt;
 	for (i = 0; i < stream->outcnt; i++)
 		stream->out[i].state = SCTP_STREAM_OPEN;
 
+	sched->init(stream);
+
+in:
 	if (!incnt)
-		return 0;
+		goto out;
 
-	stream->in = kcalloc(incnt, sizeof(*stream->in), gfp);
-	if (!stream->in) {
-		kfree(stream->out);
-		stream->out = NULL;
-		return -ENOMEM;
+	i = sctp_stream_alloc_in(stream, incnt, gfp);
+	if (i) {
+		ret = -ENOMEM;
+		goto free;
 	}
 
 	stream->incnt = incnt;
+	goto out;
 
-	return 0;
+free:
+	sched->free(stream);
+	kfree(stream->out);
+	stream->out = NULL;
+out:
+	return ret;
+}
+
+int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
+{
+	struct sctp_stream_out_ext *soute;
+
+	soute = kzalloc(sizeof(*soute), GFP_KERNEL);
+	if (!soute)
+		return -ENOMEM;
+	stream->out[sid].ext = soute;
+
+	return sctp_sched_init_sid(stream, sid, GFP_KERNEL);
 }
 
 void sctp_stream_free(struct sctp_stream *stream)
 {
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+	int i;
+
+	sched->free(stream);
+	for (i = 0; i < stream->outcnt; i++)
+		kfree(stream->out[i].ext);
 	kfree(stream->out);
 	kfree(stream->in);
 }
@@ -87,6 +224,10 @@ void sctp_stream_clear(struct sctp_stream *stream)
 
 void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
 {
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+
+	sched->unsched_all(stream);
+	sctp_stream_outq_migrate(stream, new, new->outcnt);
 	sctp_stream_free(stream);
 
 	stream->out = new->out;
@@ -94,6 +235,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
 	stream->outcnt = new->outcnt;
 	stream->incnt  = new->incnt;
 
+	sched->sched_all(stream);
+
 	new->out = NULL;
 	new->in  = NULL;
 }
@@ -270,15 +413,9 @@ int sctp_send_add_streams(struct sctp_association *asoc,
 	}
 
 	if (out) {
-		struct sctp_stream_out *streamout;
-
-		streamout = krealloc(stream->out, outcnt * sizeof(*streamout),
-				     GFP_KERNEL);
-		if (!streamout)
+		retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL);
+		if (retval)
 			goto out;
-
-		memset(streamout + stream->outcnt, 0, out * sizeof(*streamout));
-		stream->out = streamout;
 	}
 
 	chunk = sctp_make_strreset_addstrm(asoc, out, in);
@@ -601,7 +738,6 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out(
 	struct sctp_strreset_addstrm *addstrm = param.v;
 	struct sctp_stream *stream = &asoc->stream;
 	__u32 result = SCTP_STRRESET_DENIED;
-	struct sctp_stream_in *streamin;
 	__u32 request_seq, incnt;
 	__u16 in, i;
 
@@ -648,13 +784,9 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out(
 	if (!in || incnt > SCTP_MAX_STREAM)
 		goto out;
 
-	streamin = krealloc(stream->in, incnt * sizeof(*streamin),
-			    GFP_ATOMIC);
-	if (!streamin)
+	if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC))
 		goto out;
 
-	memset(streamin + stream->incnt, 0, in * sizeof(*streamin));
-	stream->in = streamin;
 	stream->incnt = incnt;
 
 	result = SCTP_STRRESET_PERFORMED;
@@ -676,10 +808,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in(
 	struct sctp_strreset_addstrm *addstrm = param.v;
 	struct sctp_stream *stream = &asoc->stream;
 	__u32 result = SCTP_STRRESET_DENIED;
-	struct sctp_stream_out *streamout;
 	struct sctp_chunk *chunk = NULL;
 	__u32 request_seq, outcnt;
 	__u16 out, i;
+	int ret;
 
 	request_seq = ntohl(addstrm->request_seq);
 	if (TSN_lt(asoc->strreset_inseq, request_seq) ||
@@ -708,14 +840,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in(
 	if (!out || outcnt > SCTP_MAX_STREAM)
 		goto out;
 
-	streamout = krealloc(stream->out, outcnt * sizeof(*streamout),
-			     GFP_ATOMIC);
-	if (!streamout)
+	ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC);
+	if (ret)
 		goto out;
 
-	memset(streamout + stream->outcnt, 0, out * sizeof(*streamout));
-	stream->out = streamout;
-
 	chunk = sctp_make_strreset_addstrm(asoc, out, 0);
 	if (!chunk)
 		goto out;
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
new file mode 100644
index 000000000000..03513a9fa110
--- /dev/null
+++ b/net/sctp/stream_sched.c
@@ -0,0 +1,275 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresched(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* First Come First Serve (a.k.a. FIFO)
+ * RFC DRAFT ndata Section 3.1
+ */
+static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid,
+			       __u16 value, gfp_t gfp)
+{
+	return 0;
+}
+
+static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid,
+			       __u16 *value)
+{
+	*value = 0;
+	return 0;
+}
+
+static int sctp_sched_fcfs_init(struct sctp_stream *stream)
+{
+	return 0;
+}
+
+static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid,
+				    gfp_t gfp)
+{
+	return 0;
+}
+
+static void sctp_sched_fcfs_free(struct sctp_stream *stream)
+{
+}
+
+static void sctp_sched_fcfs_enqueue(struct sctp_outq *q,
+				    struct sctp_datamsg *msg)
+{
+}
+
+static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_chunk *ch = NULL;
+	struct list_head *entry;
+
+	if (list_empty(&q->out_chunk_list))
+		goto out;
+
+	if (stream->out_curr) {
+		ch = list_entry(stream->out_curr->ext->outq.next,
+				struct sctp_chunk, stream_list);
+	} else {
+		entry = q->out_chunk_list.next;
+		ch = list_entry(entry, struct sctp_chunk, list);
+	}
+
+	sctp_sched_dequeue_common(q, ch);
+
+out:
+	return ch;
+}
+
+static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q,
+					 struct sctp_chunk *chunk)
+{
+}
+
+static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream)
+{
+}
+
+static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream)
+{
+}
+
+static struct sctp_sched_ops sctp_sched_fcfs = {
+	.set = sctp_sched_fcfs_set,
+	.get = sctp_sched_fcfs_get,
+	.init = sctp_sched_fcfs_init,
+	.init_sid = sctp_sched_fcfs_init_sid,
+	.free = sctp_sched_fcfs_free,
+	.enqueue = sctp_sched_fcfs_enqueue,
+	.dequeue = sctp_sched_fcfs_dequeue,
+	.dequeue_done = sctp_sched_fcfs_dequeue_done,
+	.sched_all = sctp_sched_fcfs_sched_all,
+	.unsched_all = sctp_sched_fcfs_unsched_all,
+};
+
+/* API to other parts of the stack */
+
+extern struct sctp_sched_ops sctp_sched_prio;
+extern struct sctp_sched_ops sctp_sched_rr;
+
+struct sctp_sched_ops *sctp_sched_ops[] = {
+	&sctp_sched_fcfs,
+	&sctp_sched_prio,
+	&sctp_sched_rr,
+};
+
+int sctp_sched_set_sched(struct sctp_association *asoc,
+			 enum sctp_sched_type sched)
+{
+	struct sctp_sched_ops *n = sctp_sched_ops[sched];
+	struct sctp_sched_ops *old = asoc->outqueue.sched;
+	struct sctp_datamsg *msg = NULL;
+	struct sctp_chunk *ch;
+	int i, ret = 0;
+
+	if (old == n)
+		return ret;
+
+	if (sched > SCTP_SS_MAX)
+		return -EINVAL;
+
+	if (old) {
+		old->free(&asoc->stream);
+
+		/* Give the next scheduler a clean slate. */
+		for (i = 0; i < asoc->stream.outcnt; i++) {
+			void *p = asoc->stream.out[i].ext;
+
+			if (!p)
+				continue;
+
+			p += offsetofend(struct sctp_stream_out_ext, outq);
+			memset(p, 0, sizeof(struct sctp_stream_out_ext) -
+				     offsetofend(struct sctp_stream_out_ext, outq));
+		}
+	}
+
+	asoc->outqueue.sched = n;
+	n->init(&asoc->stream);
+	for (i = 0; i < asoc->stream.outcnt; i++) {
+		if (!asoc->stream.out[i].ext)
+			continue;
+
+		ret = n->init_sid(&asoc->stream, i, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	/* We have to requeue all chunks already queued. */
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		if (ch->msg == msg)
+			continue;
+		msg = ch->msg;
+		n->enqueue(&asoc->outqueue, msg);
+	}
+
+	return ret;
+
+err:
+	n->free(&asoc->stream);
+	asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */
+
+	return ret;
+}
+
+int sctp_sched_get_sched(struct sctp_association *asoc)
+{
+	int i;
+
+	for (i = 0; i <= SCTP_SS_MAX; i++)
+		if (asoc->outqueue.sched == sctp_sched_ops[i])
+			return i;
+
+	return 0;
+}
+
+int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 value, gfp_t gfp)
+{
+	if (sid >= asoc->stream.outcnt)
+		return -EINVAL;
+
+	if (!asoc->stream.out[sid].ext) {
+		int ret;
+
+		ret = sctp_stream_init_ext(&asoc->stream, sid);
+		if (ret)
+			return ret;
+	}
+
+	return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp);
+}
+
+int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
+			 __u16 *value)
+{
+	if (sid >= asoc->stream.outcnt)
+		return -EINVAL;
+
+	if (!asoc->stream.out[sid].ext)
+		return 0;
+
+	return asoc->outqueue.sched->get(&asoc->stream, sid, value);
+}
+
+void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
+{
+	if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) {
+		struct sctp_stream_out *sout;
+		__u16 sid;
+
+		/* datamsg is not finish, so save it as current one,
+		 * in case application switch scheduler or a higher
+		 * priority stream comes in.
+		 */
+		sid = sctp_chunk_stream_no(ch);
+		sout = &q->asoc->stream.out[sid];
+		q->asoc->stream.out_curr = sout;
+		return;
+	}
+
+	q->asoc->stream.out_curr = NULL;
+	q->sched->dequeue_done(q, ch);
+}
+
+/* Auxiliary functions for the schedulers */
+void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch)
+{
+	list_del_init(&ch->list);
+	list_del_init(&ch->stream_list);
+	q->out_qlen -= ch->skb->len;
+}
+
+int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp)
+{
+	struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+
+	INIT_LIST_HEAD(&stream->out[sid].ext->outq);
+	return sched->init_sid(stream, sid, gfp);
+}
+
+struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+
+	return asoc->outqueue.sched;
+}
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
new file mode 100644
index 000000000000..384dbf3c8760
--- /dev/null
+++ b/net/sctp/stream_sched_prio.c
@@ -0,0 +1,347 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresched(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Priority handling
+ * RFC DRAFT ndata section 3.4
+ */
+
+static void sctp_sched_prio_unsched_all(struct sctp_stream *stream);
+
+static struct sctp_stream_priorities *sctp_sched_prio_new_head(
+			struct sctp_stream *stream, int prio, gfp_t gfp)
+{
+	struct sctp_stream_priorities *p;
+
+	p = kmalloc(sizeof(*p), gfp);
+	if (!p)
+		return NULL;
+
+	INIT_LIST_HEAD(&p->prio_sched);
+	INIT_LIST_HEAD(&p->active);
+	p->next = NULL;
+	p->prio = prio;
+
+	return p;
+}
+
+static struct sctp_stream_priorities *sctp_sched_prio_get_head(
+			struct sctp_stream *stream, int prio, gfp_t gfp)
+{
+	struct sctp_stream_priorities *p;
+	int i;
+
+	/* Look into scheduled priorities first, as they are sorted and
+	 * we can find it fast IF it's scheduled.
+	 */
+	list_for_each_entry(p, &stream->prio_list, prio_sched) {
+		if (p->prio == prio)
+			return p;
+		if (p->prio > prio)
+			break;
+	}
+
+	/* No luck. So we search on all streams now. */
+	for (i = 0; i < stream->outcnt; i++) {
+		if (!stream->out[i].ext)
+			continue;
+
+		p = stream->out[i].ext->prio_head;
+		if (!p)
+			/* Means all other streams won't be initialized
+			 * as well.
+			 */
+			break;
+		if (p->prio == prio)
+			return p;
+	}
+
+	/* If not even there, allocate a new one. */
+	return sctp_sched_prio_new_head(stream, prio, gfp);
+}
+
+static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p)
+{
+	struct list_head *pos;
+
+	pos = p->next->prio_list.next;
+	if (pos == &p->active)
+		pos = pos->next;
+	p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list);
+}
+
+static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute)
+{
+	bool scheduled = false;
+
+	if (!list_empty(&soute->prio_list)) {
+		struct sctp_stream_priorities *prio_head = soute->prio_head;
+
+		/* Scheduled */
+		scheduled = true;
+
+		if (prio_head->next == soute)
+			/* Try to move to the next stream */
+			sctp_sched_prio_next_stream(prio_head);
+
+		list_del_init(&soute->prio_list);
+
+		/* Also unsched the priority if this was the last stream */
+		if (list_empty(&prio_head->active)) {
+			list_del_init(&prio_head->prio_sched);
+			/* If there is no stream left, clear next */
+			prio_head->next = NULL;
+		}
+	}
+
+	return scheduled;
+}
+
+static void sctp_sched_prio_sched(struct sctp_stream *stream,
+				  struct sctp_stream_out_ext *soute)
+{
+	struct sctp_stream_priorities *prio, *prio_head;
+
+	prio_head = soute->prio_head;
+
+	/* Nothing to do if already scheduled */
+	if (!list_empty(&soute->prio_list))
+		return;
+
+	/* Schedule the stream. If there is a next, we schedule the new
+	 * one before it, so it's the last in round robin order.
+	 * If there isn't, we also have to schedule the priority.
+	 */
+	if (prio_head->next) {
+		list_add(&soute->prio_list, prio_head->next->prio_list.prev);
+		return;
+	}
+
+	list_add(&soute->prio_list, &prio_head->active);
+	prio_head->next = soute;
+
+	list_for_each_entry(prio, &stream->prio_list, prio_sched) {
+		if (prio->prio > prio_head->prio) {
+			list_add(&prio_head->prio_sched, prio->prio_sched.prev);
+			return;
+		}
+	}
+
+	list_add_tail(&prio_head->prio_sched, &stream->prio_list);
+}
+
+static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
+			       __u16 prio, gfp_t gfp)
+{
+	struct sctp_stream_out *sout = &stream->out[sid];
+	struct sctp_stream_out_ext *soute = sout->ext;
+	struct sctp_stream_priorities *prio_head, *old;
+	bool reschedule = false;
+	int i;
+
+	prio_head = sctp_sched_prio_get_head(stream, prio, gfp);
+	if (!prio_head)
+		return -ENOMEM;
+
+	reschedule = sctp_sched_prio_unsched(soute);
+	old = soute->prio_head;
+	soute->prio_head = prio_head;
+	if (reschedule)
+		sctp_sched_prio_sched(stream, soute);
+
+	if (!old)
+		/* Happens when we set the priority for the first time */
+		return 0;
+
+	for (i = 0; i < stream->outcnt; i++) {
+		soute = stream->out[i].ext;
+		if (soute && soute->prio_head == old)
+			/* It's still in use, nothing else to do here. */
+			return 0;
+	}
+
+	/* No hits, we are good to free it. */
+	kfree(old);
+
+	return 0;
+}
+
+static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid,
+			       __u16 *value)
+{
+	*value = stream->out[sid].ext->prio_head->prio;
+	return 0;
+}
+
+static int sctp_sched_prio_init(struct sctp_stream *stream)
+{
+	INIT_LIST_HEAD(&stream->prio_list);
+
+	return 0;
+}
+
+static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid,
+				    gfp_t gfp)
+{
+	INIT_LIST_HEAD(&stream->out[sid].ext->prio_list);
+	return sctp_sched_prio_set(stream, sid, 0, gfp);
+}
+
+static void sctp_sched_prio_free(struct sctp_stream *stream)
+{
+	struct sctp_stream_priorities *prio, *n;
+	LIST_HEAD(list);
+	int i;
+
+	/* As we don't keep a list of priorities, to avoid multiple
+	 * frees we have to do it in 3 steps:
+	 *   1. unsched everyone, so the lists are free to use in 2.
+	 *   2. build the list of the priorities
+	 *   3. free the list
+	 */
+	sctp_sched_prio_unsched_all(stream);
+	for (i = 0; i < stream->outcnt; i++) {
+		if (!stream->out[i].ext)
+			continue;
+		prio = stream->out[i].ext->prio_head;
+		if (prio && list_empty(&prio->prio_sched))
+			list_add(&prio->prio_sched, &list);
+	}
+	list_for_each_entry_safe(prio, n, &list, prio_sched) {
+		list_del_init(&prio->prio_sched);
+		kfree(prio);
+	}
+}
+
+static void sctp_sched_prio_enqueue(struct sctp_outq *q,
+				    struct sctp_datamsg *msg)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *ch;
+	__u16 sid;
+
+	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
+	sid = sctp_chunk_stream_no(ch);
+	stream = &q->asoc->stream;
+	sctp_sched_prio_sched(stream, stream->out[sid].ext);
+}
+
+static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_priorities *prio;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch = NULL;
+
+	/* Bail out quickly if queue is empty */
+	if (list_empty(&q->out_chunk_list))
+		goto out;
+
+	/* Find which chunk is next. It's easy, it's either the current
+	 * one or the first chunk on the next active stream.
+	 */
+	if (stream->out_curr) {
+		soute = stream->out_curr->ext;
+	} else {
+		prio = list_entry(stream->prio_list.next,
+				  struct sctp_stream_priorities, prio_sched);
+		soute = prio->next;
+	}
+	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
+	sctp_sched_dequeue_common(q, ch);
+
+out:
+	return ch;
+}
+
+static void sctp_sched_prio_dequeue_done(struct sctp_outq *q,
+					 struct sctp_chunk *ch)
+{
+	struct sctp_stream_priorities *prio;
+	struct sctp_stream_out_ext *soute;
+	__u16 sid;
+
+	/* Last chunk on that msg, move to the next stream on
+	 * this priority.
+	 */
+	sid = sctp_chunk_stream_no(ch);
+	soute = q->asoc->stream.out[sid].ext;
+	prio = soute->prio_head;
+
+	sctp_sched_prio_next_stream(prio);
+
+	if (list_empty(&soute->outq))
+		sctp_sched_prio_unsched(soute);
+}
+
+static void sctp_sched_prio_sched_all(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+	struct sctp_stream_out *sout;
+	struct sctp_chunk *ch;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		__u16 sid;
+
+		sid = sctp_chunk_stream_no(ch);
+		sout = &stream->out[sid];
+		if (sout->ext)
+			sctp_sched_prio_sched(stream, sout->ext);
+	}
+}
+
+static void sctp_sched_prio_unsched_all(struct sctp_stream *stream)
+{
+	struct sctp_stream_priorities *p, *tmp;
+	struct sctp_stream_out_ext *soute, *souttmp;
+
+	list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched)
+		list_for_each_entry_safe(soute, souttmp, &p->active, prio_list)
+			sctp_sched_prio_unsched(soute);
+}
+
+struct sctp_sched_ops sctp_sched_prio = {
+	.set = sctp_sched_prio_set,
+	.get = sctp_sched_prio_get,
+	.init = sctp_sched_prio_init,
+	.init_sid = sctp_sched_prio_init_sid,
+	.free = sctp_sched_prio_free,
+	.enqueue = sctp_sched_prio_enqueue,
+	.dequeue = sctp_sched_prio_dequeue,
+	.dequeue_done = sctp_sched_prio_dequeue_done,
+	.sched_all = sctp_sched_prio_sched_all,
+	.unsched_all = sctp_sched_prio_unsched_all,
+};
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
new file mode 100644
index 000000000000..7612a438c5b9
--- /dev/null
+++ b/net/sctp/stream_sched_rr.c
@@ -0,0 +1,201 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresched(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Priority handling
+ * RFC DRAFT ndata section 3.2
+ */
+static void sctp_sched_rr_unsched_all(struct sctp_stream *stream);
+
+static void sctp_sched_rr_next_stream(struct sctp_stream *stream)
+{
+	struct list_head *pos;
+
+	pos = stream->rr_next->rr_list.next;
+	if (pos == &stream->rr_list)
+		pos = pos->next;
+	stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list);
+}
+
+static void sctp_sched_rr_unsched(struct sctp_stream *stream,
+				  struct sctp_stream_out_ext *soute)
+{
+	if (stream->rr_next == soute)
+		/* Try to move to the next stream */
+		sctp_sched_rr_next_stream(stream);
+
+	list_del_init(&soute->rr_list);
+
+	/* If we have no other stream queued, clear next */
+	if (list_empty(&stream->rr_list))
+		stream->rr_next = NULL;
+}
+
+static void sctp_sched_rr_sched(struct sctp_stream *stream,
+				struct sctp_stream_out_ext *soute)
+{
+	if (!list_empty(&soute->rr_list))
+		/* Already scheduled. */
+		return;
+
+	/* Schedule the stream */
+	list_add_tail(&soute->rr_list, &stream->rr_list);
+
+	if (!stream->rr_next)
+		stream->rr_next = soute;
+}
+
+static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid,
+			     __u16 prio, gfp_t gfp)
+{
+	return 0;
+}
+
+static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid,
+			     __u16 *value)
+{
+	return 0;
+}
+
+static int sctp_sched_rr_init(struct sctp_stream *stream)
+{
+	INIT_LIST_HEAD(&stream->rr_list);
+	stream->rr_next = NULL;
+
+	return 0;
+}
+
+static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid,
+				  gfp_t gfp)
+{
+	INIT_LIST_HEAD(&stream->out[sid].ext->rr_list);
+
+	return 0;
+}
+
+static void sctp_sched_rr_free(struct sctp_stream *stream)
+{
+	sctp_sched_rr_unsched_all(stream);
+}
+
+static void sctp_sched_rr_enqueue(struct sctp_outq *q,
+				  struct sctp_datamsg *msg)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *ch;
+	__u16 sid;
+
+	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
+	sid = sctp_chunk_stream_no(ch);
+	stream = &q->asoc->stream;
+	sctp_sched_rr_sched(stream, stream->out[sid].ext);
+}
+
+static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch = NULL;
+
+	/* Bail out quickly if queue is empty */
+	if (list_empty(&q->out_chunk_list))
+		goto out;
+
+	/* Find which chunk is next */
+	if (stream->out_curr)
+		soute = stream->out_curr->ext;
+	else
+		soute = stream->rr_next;
+	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
+
+	sctp_sched_dequeue_common(q, ch);
+
+out:
+	return ch;
+}
+
+static void sctp_sched_rr_dequeue_done(struct sctp_outq *q,
+				       struct sctp_chunk *ch)
+{
+	struct sctp_stream_out_ext *soute;
+	__u16 sid;
+
+	/* Last chunk on that msg, move to the next stream */
+	sid = sctp_chunk_stream_no(ch);
+	soute = q->asoc->stream.out[sid].ext;
+
+	sctp_sched_rr_next_stream(&q->asoc->stream);
+
+	if (list_empty(&soute->outq))
+		sctp_sched_rr_unsched(&q->asoc->stream, soute);
+}
+
+static void sctp_sched_rr_sched_all(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		__u16 sid;
+
+		sid = sctp_chunk_stream_no(ch);
+		soute = stream->out[sid].ext;
+		if (soute)
+			sctp_sched_rr_sched(stream, soute);
+	}
+}
+
+static void sctp_sched_rr_unsched_all(struct sctp_stream *stream)
+{
+	struct sctp_stream_out_ext *soute, *tmp;
+
+	list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list)
+		sctp_sched_rr_unsched(stream, soute);
+}
+
+struct sctp_sched_ops sctp_sched_rr = {
+	.set = sctp_sched_rr_set,
+	.get = sctp_sched_rr_get,
+	.init = sctp_sched_rr_init,
+	.init_sid = sctp_sched_rr_init_sid,
+	.free = sctp_sched_rr_free,
+	.enqueue = sctp_sched_rr_enqueue,
+	.dequeue = sctp_sched_rr_dequeue,
+	.dequeue_done = sctp_sched_rr_dequeue_done,
+	.sched_all = sctp_sched_rr_sched_all,
+	.unsched_all = sctp_sched_rr_unsched_all,
+};
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index a7294edbc221..5ef97e5a5f78 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -62,10 +62,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 	bh_unlock_sock(&smc->sk);
 }
 
-int smc_cdc_get_free_slot(struct smc_link *link,
+int smc_cdc_get_free_slot(struct smc_connection *conn,
 			  struct smc_wr_buf **wr_buf,
 			  struct smc_cdc_tx_pend **pend)
 {
+	struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
 	return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
 				       (struct smc_wr_tx_pend_priv **)pend);
 }
@@ -118,8 +120,7 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
-				   &pend);
+	rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
 	if (rc)
 		return rc;
 
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 8e1d76f26007..56f883d1159c 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -206,7 +206,8 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
 
 struct smc_cdc_tx_pend;
 
-int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
+int smc_cdc_get_free_slot(struct smc_connection *conn,
+			  struct smc_wr_buf **wr_buf,
 			  struct smc_cdc_tx_pend **pend);
 void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
 int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 3866573288dd..ec49bc3c3283 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -396,8 +396,7 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 	int rc;
 
 	spin_lock_bh(&conn->send_lock);
-	rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
-				   &pend);
+	rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
 	if (rc < 0) {
 		if (rc == -EBUSY) {
 			struct smc_sock *smc =
@@ -466,8 +465,7 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmbe_size / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
-		rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
-					   &wr_buf, &pend);
+		rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend);
 		if (!rc)
 			rc = smc_cdc_msg_send(conn, wr_buf, pend);
 		if (rc < 0) {
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 10ae7823a19d..0206155bff53 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -21,7 +21,6 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/list.h>
-#include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/net.h>
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index cf17c7932a6e..ebc2ad69b62c 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -177,6 +177,11 @@ HOSTLOADLIBES_syscall_tp += -lelf
 LLC ?= llc
 CLANG ?= clang
 
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+HOSTCC = $(CROSS_COMPILE)gcc
+endif
+
 # Trick to allow make to be run from this directory
 all:
 	$(MAKE) -C ../../ $(CURDIR)/
@@ -225,7 +230,7 @@ $(obj)/%.o: $(src)/%.c
 	$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
 		-I$(srctree)/tools/testing/selftests/bpf/ \
 		-D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
-		-Wno-compare-distinct-pointer-types \
+		-D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
 		-Wno-gnu-variable-sized-type-not-at-end \
 		-Wno-address-of-packed-member -Wno-tautological-compare \
 		-Wno-unknown-warning-option \
diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
index 79f9a58f1872..5f27e4faca50 100644
--- a/samples/bpf/README.rst
+++ b/samples/bpf/README.rst
@@ -64,3 +64,13 @@ It is also possible to point make to the newly compiled 'llc' or
 'clang' command via redefining LLC or CLANG on the make command line::
 
  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+
+Cross compiling samples
+-----------------------
+In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH
+environment variables before calling make. This will direct make to build
+samples for the cross target.
+
+export ARCH=arm64
+export CROSS_COMPILE="aarch64-linux-gnu-"
+make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 6aa50098dfb8..18b1c8dd0391 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -221,6 +221,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
 			int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
 
 			map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
+							maps[i].name,
 							maps[i].def.key_size,
 							inner_map_fd,
 							maps[i].def.max_entries,
@@ -228,6 +229,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
 							numa_node);
 		} else {
 			map_fd[i] = bpf_create_map_node(maps[i].def.type,
+							maps[i].name,
 							maps[i].def.key_size,
 							maps[i].def.value_size,
 							maps[i].def.max_entries,
diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c
index 9d1be9426401..88bdcf4b1670 100644
--- a/samples/bpf/cgroup_helpers.c
+++ b/samples/bpf/cgroup_helpers.c
@@ -56,7 +56,7 @@ int setup_cgroup_environment(void)
 		return 1;
 	}
 
-	if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) {
+	if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) && errno != EBUSY) {
 		log_err("mount cgroup2");
 		return 1;
 	}
@@ -163,7 +163,7 @@ int create_and_get_cgroup(char *path)
 
 	format_cgroup_path(cgroup_path, path);
 	if (mkdir(cgroup_path, 0777) && errno != EEXIST) {
-		log_err("mkdiring cgroup");
+		log_err("mkdiring cgroup %s .. %s", path, cgroup_path);
 		return 0;
 	}
 
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 098c857f1eda..2b2ffb97018b 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -266,7 +266,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kprobe/sys_getpgrp")
+SEC("kprobe/sys_getppid")
 int stress_array_map_lookup(struct pt_regs *ctx)
 {
 	u32 key = 1, i;
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index f388254896f6..519d9af4b04a 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -137,6 +137,7 @@ static void do_test_lru(enum test_type test, int cpu)
 
 			inner_lru_map_fds[cpu] =
 				bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH,
+						    test_map_names[INNER_LRU_HASH_PREALLOC],
 						    sizeof(uint32_t),
 						    sizeof(long),
 						    inner_lru_hash_size, 0,
@@ -282,7 +283,7 @@ static void test_array_lookup(int cpu)
 
 	start_time = time_get_ns();
 	for (i = 0; i < max_cnt; i++)
-		syscall(__NR_getpgrp, 0);
+		syscall(__NR_getppid, 0);
 	printf("%d:array_lookup %lld lookups per sec\n",
 	       cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
 }
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 3049b1f26267..3e8232cc04a8 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -30,7 +30,7 @@
 
 #define FOO		"/foo"
 #define BAR		"/foo/bar/"
-#define PING_CMD	"ping -c1 -w1 127.0.0.1"
+#define PING_CMD	"ping -c1 -w1 127.0.0.1 > /dev/null"
 
 char bpf_log_buf[BPF_LOG_BUF_SIZE];
 
@@ -55,8 +55,7 @@ static int prog_load(int verdict)
 	return ret;
 }
 
-
-int main(int argc, char **argv)
+static int test_foo_bar(void)
 {
 	int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0;
 
@@ -189,8 +188,223 @@ out:
 	close(bar);
 	cleanup_cgroup_environment();
 	if (!rc)
-		printf("PASS\n");
+		printf("### override:PASS\n");
 	else
-		printf("FAIL\n");
+		printf("### override:FAIL\n");
 	return rc;
 }
+
+static int map_fd = -1;
+
+static int prog_load_cnt(int verdict, int val)
+{
+	if (map_fd < 0)
+		map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (map_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		return -1;
+	}
+
+	struct bpf_insn prog[] = {
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+		BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+		BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+	int ret;
+
+	ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+			       prog, insns_cnt, "GPL", 0,
+			       bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+	if (ret < 0) {
+		log_err("Loading program");
+		printf("Output from verifier:\n%s\n-------\n", bpf_log_buf);
+		return 0;
+	}
+	return ret;
+}
+
+
+static int test_multiprog(void)
+{
+	__u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id;
+	int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0;
+	int drop_prog, allow_prog[6] = {}, rc = 0;
+	unsigned long long value;
+	int i = 0;
+
+	for (i = 0; i < 6; i++) {
+		allow_prog[i] = prog_load_cnt(1, 1 << i);
+		if (!allow_prog[i])
+			goto err;
+	}
+	drop_prog = prog_load_cnt(0, 1);
+	if (!drop_prog)
+		goto err;
+
+	if (setup_cgroup_environment())
+		goto err;
+
+	cg1 = create_and_get_cgroup("/cg1");
+	if (!cg1)
+		goto err;
+	cg2 = create_and_get_cgroup("/cg1/cg2");
+	if (!cg2)
+		goto err;
+	cg3 = create_and_get_cgroup("/cg1/cg2/cg3");
+	if (!cg3)
+		goto err;
+	cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4");
+	if (!cg4)
+		goto err;
+	cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5");
+	if (!cg5)
+		goto err;
+
+	if (join_cgroup("/cg1/cg2/cg3/cg4/cg5"))
+		goto err;
+
+	if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+		log_err("Attaching prog to cg1");
+		goto err;
+	}
+	if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+		log_err("Unexpected success attaching the same prog to cg1");
+		goto err;
+	}
+	if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) {
+		log_err("Attaching prog2 to cg1");
+		goto err;
+	}
+	if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) {
+		log_err("Attaching prog to cg2");
+		goto err;
+	}
+	if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) {
+		log_err("Attaching prog to cg3");
+		goto err;
+	}
+	if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 1)) {
+		log_err("Attaching prog to cg4");
+		goto err;
+	}
+	if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) {
+		log_err("Attaching prog to cg5");
+		goto err;
+	}
+	assert(system(PING_CMD) == 0);
+	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+	assert(value == 1 + 2 + 8 + 32);
+
+	/* query the number of effective progs in cg5 */
+	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+			      NULL, NULL, &prog_cnt) == 0);
+	assert(prog_cnt == 4);
+	/* retrieve prog_ids of effective progs in cg5 */
+	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+			      &attach_flags, prog_ids, &prog_cnt) == 0);
+	assert(prog_cnt == 4);
+	assert(attach_flags == 0);
+	saved_prog_id = prog_ids[0];
+	/* check enospc handling */
+	prog_ids[0] = 0;
+	prog_cnt = 2;
+	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+			      &attach_flags, prog_ids, &prog_cnt) == -1 &&
+	       errno == ENOSPC);
+	assert(prog_cnt == 4);
+	/* check that prog_ids are returned even when buffer is too small */
+	assert(prog_ids[0] == saved_prog_id);
+	/* retrieve prog_id of single attached prog in cg5 */
+	prog_ids[0] = 0;
+	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+			      NULL, prog_ids, &prog_cnt) == 0);
+	assert(prog_cnt == 1);
+	assert(prog_ids[0] == saved_prog_id);
+
+	/* detach bottom program and ping again */
+	if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) {
+		log_err("Detaching prog from cg5");
+		goto err;
+	}
+	value = 0;
+	assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+	assert(system(PING_CMD) == 0);
+	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+	assert(value == 1 + 2 + 8 + 16);
+
+	/* detach 3rd from bottom program and ping again */
+	errno = 0;
+	if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) {
+		log_err("Unexpected success on detach from cg3");
+		goto err;
+	}
+	if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) {
+		log_err("Detaching from cg3");
+		goto err;
+	}
+	value = 0;
+	assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+	assert(system(PING_CMD) == 0);
+	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+	assert(value == 1 + 2 + 16);
+
+	/* detach 2nd from bottom program and ping again */
+	if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) {
+		log_err("Detaching prog from cg4");
+		goto err;
+	}
+	value = 0;
+	assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+	assert(system(PING_CMD) == 0);
+	assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+	assert(value == 1 + 2 + 4);
+
+	prog_cnt = 4;
+	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+			      &attach_flags, prog_ids, &prog_cnt) == 0);
+	assert(prog_cnt == 3);
+	assert(attach_flags == 0);
+	assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+			      NULL, prog_ids, &prog_cnt) == 0);
+	assert(prog_cnt == 0);
+	goto out;
+err:
+	rc = 1;
+
+out:
+	for (i = 0; i < 6; i++)
+		if (allow_prog[i] > 0)
+			close(allow_prog[i]);
+	close(cg1);
+	close(cg2);
+	close(cg3);
+	close(cg4);
+	close(cg5);
+	cleanup_cgroup_environment();
+	if (!rc)
+		printf("### multi:PASS\n");
+	else
+		printf("### multi:FAIL\n");
+	return rc;
+}
+
+int main(int argc, char **argv)
+{
+	int rc = 0;
+
+	rc = test_foo_bar();
+	if (rc)
+		return rc;
+
+	return test_multiprog();
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index b51b4f5e3257..c5ab8b776973 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -61,7 +61,7 @@ static void usage(char *argv[])
 }
 
 #define NANOSEC_PER_SEC 1000000000 /* 10^9 */
-__u64 gettime(void)
+static __u64 gettime(void)
 {
 	struct timespec t;
 	int res;
@@ -206,7 +206,7 @@ static void stats_poll(int interval, bool err_only)
 	}
 }
 
-void print_bpf_prog_info(void)
+static void print_bpf_prog_info(void)
 {
 	int i;
 
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index e4d90e50f6fe..57263f2f8f2f 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -206,5 +206,12 @@ int main(void)
 	DEVID_FIELD(fsl_mc_device_id, vendor);
 	DEVID_FIELD(fsl_mc_device_id, obj_type);
 
+	DEVID(tb_service_id);
+	DEVID_FIELD(tb_service_id, match_flags);
+	DEVID_FIELD(tb_service_id, protocol_key);
+	DEVID_FIELD(tb_service_id, protocol_id);
+	DEVID_FIELD(tb_service_id, protocol_version);
+	DEVID_FIELD(tb_service_id, protocol_revision);
+
 	return 0;
 }
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 29d6699d5a06..6ef6e63f96fd 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1301,6 +1301,31 @@ static int do_fsl_mc_entry(const char *filename, void *symval,
 }
 ADD_TO_DEVTABLE("fslmc", fsl_mc_device_id, do_fsl_mc_entry);
 
+/* Looks like: tbsvc:kSpNvNrN */
+static int do_tbsvc_entry(const char *filename, void *symval, char *alias)
+{
+	DEF_FIELD(symval, tb_service_id, match_flags);
+	DEF_FIELD_ADDR(symval, tb_service_id, protocol_key);
+	DEF_FIELD(symval, tb_service_id, protocol_id);
+	DEF_FIELD(symval, tb_service_id, protocol_version);
+	DEF_FIELD(symval, tb_service_id, protocol_revision);
+
+	strcpy(alias, "tbsvc:");
+	if (match_flags & TBSVC_MATCH_PROTOCOL_KEY)
+		sprintf(alias + strlen(alias), "k%s", *protocol_key);
+	else
+		strcat(alias + strlen(alias), "k*");
+	ADD(alias, "p", match_flags & TBSVC_MATCH_PROTOCOL_ID, protocol_id);
+	ADD(alias, "v", match_flags & TBSVC_MATCH_PROTOCOL_VERSION,
+	    protocol_version);
+	ADD(alias, "r", match_flags & TBSVC_MATCH_PROTOCOL_REVISION,
+	    protocol_revision);
+
+	add_wildcard(alias);
+	return 1;
+}
+ADD_TO_DEVTABLE("tbsvc", tb_service_id, do_tbsvc_entry);
+
 /* Does namelen bytes of name exactly match the symbol? */
 static bool sym_is(const char *name, unsigned namelen, const char *symbol)
 {
diff --git a/tools/Makefile b/tools/Makefile
index 9dfede37c8ff..df6fcb293fbc 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -19,7 +19,7 @@ help:
 	@echo '  kvm_stat               - top-like utility for displaying kvm statistics'
 	@echo '  leds                   - LEDs  tools'
 	@echo '  liblockdep             - user-space wrapper for kernel locking-validator'
-	@echo '  net                    - misc networking tools'
+	@echo '  bpf                    - misc BPF tools'
 	@echo '  perf                   - Linux performance measurement and analysis tool'
 	@echo '  selftests              - various kernel selftests'
 	@echo '  spi                    - spi tools'
@@ -57,7 +57,7 @@ acpi: FORCE
 cpupower: FORCE
 	$(call descend,power/$@)
 
-cgroup firewire hv guest spi usb virtio vm net iio gpio objtool leds: FORCE
+cgroup firewire hv guest spi usb virtio vm bpf iio gpio objtool leds: FORCE
 	$(call descend,$@)
 
 liblockdep: FORCE
@@ -91,7 +91,7 @@ kvm_stat: FORCE
 
 all: acpi cgroup cpupower gpio hv firewire liblockdep \
 		perf selftests spi turbostat usb \
-		virtio vm net x86_energy_perf_policy \
+		virtio vm bpf x86_energy_perf_policy \
 		tmon freefall iio objtool kvm_stat
 
 acpi_install:
@@ -100,7 +100,7 @@ acpi_install:
 cpupower_install:
 	$(call descend,power/$(@:_install=),install)
 
-cgroup_install firewire_install gpio_install hv_install iio_install perf_install spi_install usb_install virtio_install vm_install net_install objtool_install:
+cgroup_install firewire_install gpio_install hv_install iio_install perf_install spi_install usb_install virtio_install vm_install bpf_install objtool_install:
 	$(call descend,$(@:_install=),install)
 
 liblockdep_install:
@@ -124,7 +124,7 @@ kvm_stat_install:
 install: acpi_install cgroup_install cpupower_install gpio_install \
 		hv_install firewire_install iio_install liblockdep_install \
 		perf_install selftests_install turbostat_install usb_install \
-		virtio_install vm_install net_install x86_energy_perf_policy_install \
+		virtio_install vm_install bpf_install x86_energy_perf_policy_install \
 		tmon_install freefall_install objtool_install kvm_stat_install
 
 acpi_clean:
@@ -133,7 +133,7 @@ acpi_clean:
 cpupower_clean:
 	$(call descend,power/cpupower,clean)
 
-cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
+cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean:
 	$(call descend,$(@:_clean=),clean)
 
 liblockdep_clean:
@@ -169,7 +169,7 @@ build_clean:
 
 clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
 		perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \
-		vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
+		vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
 		freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
 		gpio_clean objtool_clean leds_clean
 
diff --git a/tools/net/Makefile b/tools/bpf/Makefile
index ddf888010652..325a35e1c28e 100644
--- a/tools/net/Makefile
+++ b/tools/bpf/Makefile
@@ -3,6 +3,7 @@ prefix = /usr
 CC = gcc
 LEX = flex
 YACC = bison
+MAKE = make
 
 CFLAGS += -Wall -O2
 CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include
@@ -13,7 +14,7 @@ CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include
 %.lex.c: %.l
 	$(LEX) -o $@ $<
 
-all : bpf_jit_disasm bpf_dbg bpf_asm
+all: bpf_jit_disasm bpf_dbg bpf_asm bpftool
 
 bpf_jit_disasm : CFLAGS += -DPACKAGE='bpf_jit_disasm'
 bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl
@@ -26,10 +27,21 @@ bpf_asm : LDLIBS =
 bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o
 bpf_exp.lex.o : bpf_exp.yacc.c
 
-clean :
+clean: bpftool_clean
 	rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.*
 
-install :
+install: bpftool_install
 	install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm
 	install bpf_dbg $(prefix)/bin/bpf_dbg
 	install bpf_asm $(prefix)/bin/bpf_asm
+
+bpftool:
+	$(MAKE) -C bpftool
+
+bpftool_install:
+	$(MAKE) -C bpftool install
+
+bpftool_clean:
+	$(MAKE) -C bpftool clean
+
+.PHONY: bpftool FORCE
diff --git a/tools/net/bpf_asm.c b/tools/bpf/bpf_asm.c
index c15aef097b04..c15aef097b04 100644
--- a/tools/net/bpf_asm.c
+++ b/tools/bpf/bpf_asm.c
diff --git a/tools/net/bpf_dbg.c b/tools/bpf/bpf_dbg.c
index 4f254bcc4423..4f254bcc4423 100644
--- a/tools/net/bpf_dbg.c
+++ b/tools/bpf/bpf_dbg.c
diff --git a/tools/net/bpf_exp.l b/tools/bpf/bpf_exp.l
index bd83149e7be0..bd83149e7be0 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/bpf/bpf_exp.l
diff --git a/tools/net/bpf_exp.y b/tools/bpf/bpf_exp.y
index 56ba1de50784..56ba1de50784 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/bpf/bpf_exp.y
diff --git a/tools/net/bpf_jit_disasm.c b/tools/bpf/bpf_jit_disasm.c
index 422d9abd666a..422d9abd666a 100644
--- a/tools/net/bpf_jit_disasm.c
+++ b/tools/bpf/bpf_jit_disasm.c
diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile
new file mode 100644
index 000000000000..bde77d7c4390
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/Makefile
@@ -0,0 +1,34 @@
+include ../../../scripts/Makefile.include
+include ../../../scripts/utilities.mak
+
+INSTALL ?= install
+RM ?= rm -f
+
+# Make the path relative to DESTDIR, not prefix
+ifndef DESTDIR
+prefix?=$(HOME)
+endif
+mandir ?= $(prefix)/share/man
+man8dir = $(mandir)/man8
+
+MAN8_RST = $(wildcard *.rst)
+
+_DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST))
+DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8))
+
+man: man8
+man8: $(DOC_MAN8)
+
+$(OUTPUT)%.8: %.rst
+	rst2man $< > $@
+
+clean:
+	$(call QUIET_CLEAN, Documentation) $(RM) $(DOC_MAN8)
+
+install: man
+	$(call QUIET_INSTALL, Documentation-man) \
+		$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir); \
+		$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir);
+
+.PHONY: man man8 clean install
+.DEFAULT_GOAL := man
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
new file mode 100644
index 000000000000..ff63e89e4b6c
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -0,0 +1,110 @@
+================
+bpftool-map
+================
+-------------------------------------------------------------------------------
+tool for inspection and simple manipulation of eBPF maps
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** **map** *COMMAND*
+
+	*COMMANDS* :=
+	{ show | dump | update | lookup | getnext | delete | pin | help }
+
+MAP COMMANDS
+=============
+
+|	**bpftool** map show   [*MAP*]
+|	**bpftool** map dump    *MAP*
+|	**bpftool** map update  *MAP*  key *BYTES*   value *VALUE* [*UPDATE_FLAGS*]
+|	**bpftool** map lookup  *MAP*  key *BYTES*
+|	**bpftool** map getnext *MAP* [key *BYTES*]
+|	**bpftool** map delete  *MAP*  key *BYTES*
+|	**bpftool** map pin     *MAP*  *FILE*
+|	**bpftool** map help
+|
+|	*MAP* := { id MAP_ID | pinned FILE }
+|	*VALUE* := { BYTES | MAP | PROGRAM }
+|	*UPDATE_FLAGS* := { any | exist | noexist }
+
+DESCRIPTION
+===========
+	**bpftool map show**   [*MAP*]
+		  Show information about loaded maps.  If *MAP* is specified
+		  show information only about given map, otherwise list all
+		  maps currently loaded on the system.
+
+		  Output will start with map ID followed by map type and
+		  zero or more named attributes (depending on kernel version).
+
+	**bpftool map dump**    *MAP*
+		  Dump all entries in a given *MAP*.
+
+	**bpftool map update**  *MAP*  **key** *BYTES*   **value** *VALUE* [*UPDATE_FLAGS*]
+		  Update map entry for a given *KEY*.
+
+		  *UPDATE_FLAGS* can be one of: **any** update existing entry
+		  or add if doesn't exit; **exist** update only if entry already
+		  exists; **noexist** update only if entry doesn't exist.
+
+	**bpftool map lookup**  *MAP*  **key** *BYTES*
+		  Lookup **key** in the map.
+
+	**bpftool map getnext** *MAP* [**key** *BYTES*]
+		  Get next key.  If *key* is not specified, get first key.
+
+	**bpftool map delete**  *MAP*  **key** *BYTES*
+		  Remove entry from the map.
+
+	**bpftool map pin**     *MAP*  *FILE*
+		  Pin map *MAP* as *FILE*.
+
+		  Note: *FILE* must be located in *bpffs* mount.
+
+	**bpftool map help**
+		  Print short help message.
+
+EXAMPLES
+========
+**# bpftool map show**
+::
+
+  10: hash  name some_map  flags 0x0
+	key 4B  value 8B  max_entries 2048  memlock 167936B
+
+**# bpftool map update id 10 key 13 00 07 00 value 02 00 00 00 01 02 03 04**
+
+**# bpftool map lookup id 10 key 0 1 2 3**
+
+::
+
+  key: 00 01 02 03 value: 00 01 02 03 04 05 06 07
+
+
+**# bpftool map dump id 10**
+::
+
+  key: 00 01 02 03  value: 00 01 02 03 04 05 06 07
+  key: 0d 00 07 00  value: 02 00 00 00 01 02 03 04
+  Found 2 elements
+
+**# bpftool map getnext id 10 key 0 1 2 3**
+::
+
+  key:
+  00 01 02 03
+  next key:
+  0d 00 07 00
+
+|
+| **# mount -t bpf none /sys/fs/bpf/**
+| **# bpftool map pin id 10 /sys/fs/bpf/map**
+| **# bpftool map del pinned /sys/fs/bpf/map key 13 00 07 00**
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-prog**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
new file mode 100644
index 000000000000..57fc4b9924ea
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -0,0 +1,79 @@
+================
+bpftool-prog
+================
+-------------------------------------------------------------------------------
+tool for inspection and simple manipulation of eBPF progs
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+|	**bpftool** prog show [*PROG*]
+|	**bpftool** prog dump xlated *PROG*  file *FILE*
+|	**bpftool** prog dump jited  *PROG* [file *FILE*] [opcodes]
+|	**bpftool** prog pin *PROG* *FILE*
+|	**bpftool** prog help
+|
+|	*PROG* := { id *PROG_ID* | pinned *FILE* | tag *PROG_TAG* }
+
+DESCRIPTION
+===========
+	**bpftool prog show** [*PROG*]
+		  Show information about loaded programs.  If *PROG* is
+		  specified show information only about given program, otherwise
+		  list all programs currently loaded on the system.
+
+		  Output will start with program ID followed by program type and
+		  zero or more named attributes (depending on kernel version).
+
+	**bpftool prog dump xlated** *PROG*  **file** *FILE*
+		  Dump eBPF instructions of the program from the kernel to a
+		  file.
+
+	**bpftool prog dump jited**  *PROG* [**file** *FILE*] [**opcodes**]
+		  Dump jited image (host machine code) of the program.
+		  If *FILE* is specified image will be written to a file,
+		  otherwise it will be disassembled and printed to stdout.
+
+		  **opcodes** controls if raw opcodes will be printed.
+
+	**bpftool prog pin** *PROG* *FILE*
+		  Pin program *PROG* as *FILE*.
+
+		  Note: *FILE* must be located in *bpffs* mount.
+
+	**bpftool prog help**
+		  Print short help message.
+
+EXAMPLES
+========
+**# bpftool prog show**
+::
+
+  10: xdp  name some_prog  tag 00:5a:3d:21:23:62:0c:8b
+	loaded_at Sep 29/20:11  uid 0
+	xlated 528B  jited 370B  memlock 4096B  map_ids 10
+
+|
+| **# bpftool prog dump xlated id 10 file /tmp/t**
+| **# ls -l /tmp/t**
+|   -rw------- 1 root root 560 Jul 22 01:42 /tmp/t
+
+|
+| **# bpftool prog dum jited pinned /sys/fs/bpf/prog**
+
+::
+
+    push   %rbp
+    mov    %rsp,%rbp
+    sub    $0x228,%rsp
+    sub    $0x28,%rbp
+    mov    %rbx,0x0(%rbp)
+
+
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
new file mode 100644
index 000000000000..f1df1893fb54
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -0,0 +1,34 @@
+================
+BPFTOOL
+================
+-------------------------------------------------------------------------------
+tool for inspection and simple manipulation of eBPF programs and maps
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** *OBJECT* { *COMMAND* | help }
+
+	**bpftool** batch file *FILE*
+
+	*OBJECT* := { **map** | **program** }
+
+	*MAP-COMMANDS* :=
+	{ show | dump | update | lookup | getnext | delete | pin | help }
+
+	*PROG-COMMANDS* := { show | dump jited | dump xlated | pin | help }
+
+DESCRIPTION
+===========
+	*bpftool* allows for inspection and simple modification of BPF objects
+	on the system.
+
+	Note that format of the output of all tools is not guaranteed to be
+	stable and should not be depended upon.
+
+SEE ALSO
+========
+	**bpftool-map**\ (8), **bpftool-prog**\ (8)
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
new file mode 100644
index 000000000000..8705ee44664d
--- /dev/null
+++ b/tools/bpf/bpftool/Makefile
@@ -0,0 +1,86 @@
+include ../../scripts/Makefile.include
+
+include ../../scripts/utilities.mak
+
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(CURDIR)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
+endif
+
+ifneq ($(objtree),)
+#$(info Determined 'objtree' to be $(objtree))
+endif
+
+ifneq ($(OUTPUT),)
+#$(info Determined 'OUTPUT' to be $(OUTPUT))
+# Adding $(OUTPUT) as a directory to look for source files,
+# because use generated output files as sources dependency
+# for flex/bison parsers.
+VPATH += $(OUTPUT)
+export VPATH
+endif
+
+ifeq ($(V),1)
+  Q =
+else
+  Q = @
+endif
+
+BPF_DIR	= $(srctree)/tools/lib/bpf/
+
+ifneq ($(OUTPUT),)
+  BPF_PATH=$(OUTPUT)
+else
+  BPF_PATH=$(BPF_DIR)
+endif
+
+LIBBPF = $(BPF_PATH)libbpf.a
+
+$(LIBBPF): FORCE
+	$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT)
+
+$(LIBBPF)-clean:
+	$(call QUIET_CLEAN, libbpf)
+	$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) clean >/dev/null
+
+prefix = /usr
+
+CC = gcc
+
+CFLAGS += -O2
+CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow
+CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf
+LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
+
+include $(wildcard *.d)
+
+all: $(OUTPUT)bpftool
+
+SRCS=$(wildcard *.c)
+OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS))
+
+$(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
+	$(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
+
+$(OUTPUT)%.o: %.c
+	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
+
+clean: $(LIBBPF)-clean
+	$(call QUIET_CLEAN, bpftool)
+	$(Q)rm -rf $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+
+install:
+	install $(OUTPUT)bpftool $(prefix)/sbin/bpftool
+
+doc:
+	$(Q)$(MAKE) -C Documentation/
+
+doc-install:
+	$(Q)$(MAKE) -C Documentation/ install
+
+FORCE:
+
+.PHONY: all clean FORCE
+.DEFAULT_GOAL := all
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
new file mode 100644
index 000000000000..df8396a0c400
--- /dev/null
+++ b/tools/bpf/bpftool/common.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Author: Jakub Kicinski <kubakici@wp.pl> */
+
+#include <errno.h>
+#include <libgen.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/limits.h>
+#include <linux/magic.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+static bool is_bpffs(char *path)
+{
+	struct statfs st_fs;
+
+	if (statfs(path, &st_fs) < 0)
+		return false;
+
+	return (unsigned long)st_fs.f_type == BPF_FS_MAGIC;
+}
+
+int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type)
+{
+	enum bpf_obj_type type;
+	int fd;
+
+	fd = bpf_obj_get(path);
+	if (fd < 0) {
+		err("bpf obj get (%s): %s\n", path,
+		    errno == EACCES && !is_bpffs(dirname(path)) ?
+		    "directory not in bpf file system (bpffs)" :
+		    strerror(errno));
+		return -1;
+	}
+
+	type = get_fd_type(fd);
+	if (type < 0) {
+		close(fd);
+		return type;
+	}
+	if (type != exp_type) {
+		err("incorrect object type: %s\n", get_fd_type_name(type));
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32))
+{
+	unsigned int id;
+	char *endptr;
+	int err;
+	int fd;
+
+	if (!is_prefix(*argv, "id")) {
+		err("expected 'id' got %s\n", *argv);
+		return -1;
+	}
+	NEXT_ARG();
+
+	id = strtoul(*argv, &endptr, 0);
+	if (*endptr) {
+		err("can't parse %s as ID\n", *argv);
+		return -1;
+	}
+	NEXT_ARG();
+
+	if (argc != 1)
+		usage();
+
+	fd = get_fd_by_id(id);
+	if (fd < 0) {
+		err("can't get prog by id (%u): %s\n", id, strerror(errno));
+		return -1;
+	}
+
+	err = bpf_obj_pin(fd, *argv);
+	close(fd);
+	if (err) {
+		err("can't pin the object (%s): %s\n", *argv,
+		    errno == EACCES && !is_bpffs(dirname(*argv)) ?
+		    "directory not in bpf file system (bpffs)" :
+		    strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+const char *get_fd_type_name(enum bpf_obj_type type)
+{
+	static const char * const names[] = {
+		[BPF_OBJ_UNKNOWN]	= "unknown",
+		[BPF_OBJ_PROG]		= "prog",
+		[BPF_OBJ_MAP]		= "map",
+	};
+
+	if (type < 0 || type >= ARRAY_SIZE(names) || !names[type])
+		return names[BPF_OBJ_UNKNOWN];
+
+	return names[type];
+}
+
+int get_fd_type(int fd)
+{
+	char path[PATH_MAX];
+	char buf[512];
+	ssize_t n;
+
+	snprintf(path, sizeof(path), "/proc/%d/fd/%d", getpid(), fd);
+
+	n = readlink(path, buf, sizeof(buf));
+	if (n < 0) {
+		err("can't read link type: %s\n", strerror(errno));
+		return -1;
+	}
+	if (n == sizeof(path)) {
+		err("can't read link type: path too long!\n");
+		return -1;
+	}
+
+	if (strstr(buf, "bpf-map"))
+		return BPF_OBJ_MAP;
+	else if (strstr(buf, "bpf-prog"))
+		return BPF_OBJ_PROG;
+
+	return BPF_OBJ_UNKNOWN;
+}
+
+char *get_fdinfo(int fd, const char *key)
+{
+	char path[PATH_MAX];
+	char *line = NULL;
+	size_t line_n = 0;
+	ssize_t n;
+	FILE *fdi;
+
+	snprintf(path, sizeof(path), "/proc/%d/fdinfo/%d", getpid(), fd);
+
+	fdi = fopen(path, "r");
+	if (!fdi) {
+		err("can't open fdinfo: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	while ((n = getline(&line, &line_n, fdi))) {
+		char *value;
+		int len;
+
+		if (!strstr(line, key))
+			continue;
+
+		fclose(fdi);
+
+		value = strchr(line, '\t');
+		if (!value || !value[1]) {
+			err("malformed fdinfo!?\n");
+			free(line);
+			return NULL;
+		}
+		value++;
+
+		len = strlen(value);
+		memmove(line, value, len);
+		line[len - 1] = '\0';
+
+		return line;
+	}
+
+	err("key '%s' not found in fdinfo\n", key);
+	free(line);
+	fclose(fdi);
+	return NULL;
+}
diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c
new file mode 100644
index 000000000000..70e480b59e9d
--- /dev/null
+++ b/tools/bpf/bpftool/jit_disasm.c
@@ -0,0 +1,87 @@
+/*
+ * Based on:
+ *
+ * Minimal BPF JIT image disassembler
+ *
+ * Disassembles BPF JIT compiler emitted opcodes back to asm insn's for
+ * debugging or verification purposes.
+ *
+ * Copyright 2013 Daniel Borkmann <daniel@iogearbox.net>
+ * Licensed under the GNU General Public License, version 2.0 (GPLv2)
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include <string.h>
+#include <bfd.h>
+#include <dis-asm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+static void get_exec_path(char *tpath, size_t size)
+{
+	ssize_t len;
+	char *path;
+
+	snprintf(tpath, size, "/proc/%d/exe", (int) getpid());
+	tpath[size - 1] = 0;
+
+	path = strdup(tpath);
+	assert(path);
+
+	len = readlink(path, tpath, size - 1);
+	assert(len > 0);
+	tpath[len] = 0;
+
+	free(path);
+}
+
+void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes)
+{
+	disassembler_ftype disassemble;
+	struct disassemble_info info;
+	int count, i, pc = 0;
+	char tpath[256];
+	bfd *bfdf;
+
+	if (!len)
+		return;
+
+	memset(tpath, 0, sizeof(tpath));
+	get_exec_path(tpath, sizeof(tpath));
+
+	bfdf = bfd_openr(tpath, NULL);
+	assert(bfdf);
+	assert(bfd_check_format(bfdf, bfd_object));
+
+	init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf);
+	info.arch = bfd_get_arch(bfdf);
+	info.mach = bfd_get_mach(bfdf);
+	info.buffer = image;
+	info.buffer_length = len;
+
+	disassemble_init_for_target(&info);
+
+	disassemble = disassembler(bfdf);
+	assert(disassemble);
+
+	do {
+		printf("%4x:\t", pc);
+
+		count = disassemble(pc, &info);
+
+		if (opcodes) {
+			printf("\n\t");
+			for (i = 0; i < count; ++i)
+				printf("%02x ", (uint8_t) image[pc + i]);
+		}
+		printf("\n");
+
+		pc += count;
+	} while (count > 0 && pc < len);
+
+	bfd_close(bfdf);
+}
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
new file mode 100644
index 000000000000..e02d00d6e00b
--- /dev/null
+++ b/tools/bpf/bpftool/main.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Author: Jakub Kicinski <kubakici@wp.pl> */
+
+#include <bfd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <linux/bpf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+const char *bin_name;
+static int last_argc;
+static char **last_argv;
+static int (*last_do_help)(int argc, char **argv);
+
+void usage(void)
+{
+	last_do_help(last_argc - 1, last_argv + 1);
+
+	exit(-1);
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s OBJECT { COMMAND | help }\n"
+		"       %s batch file FILE\n"
+		"\n"
+		"       OBJECT := { prog | map }\n",
+		bin_name, bin_name);
+
+	return 0;
+}
+
+int cmd_select(const struct cmd *cmds, int argc, char **argv,
+	       int (*help)(int argc, char **argv))
+{
+	unsigned int i;
+
+	last_argc = argc;
+	last_argv = argv;
+	last_do_help = help;
+
+	if (argc < 1 && cmds[0].func)
+		return cmds[0].func(argc, argv);
+
+	for (i = 0; cmds[i].func; i++)
+		if (is_prefix(*argv, cmds[i].cmd))
+			return cmds[i].func(argc - 1, argv + 1);
+
+	help(argc - 1, argv + 1);
+
+	return -1;
+}
+
+bool is_prefix(const char *pfx, const char *str)
+{
+	if (!pfx)
+		return false;
+	if (strlen(str) < strlen(pfx))
+		return false;
+
+	return !memcmp(str, pfx, strlen(pfx));
+}
+
+void print_hex(void *arg, unsigned int n, const char *sep)
+{
+	unsigned char *data = arg;
+	unsigned int i;
+
+	for (i = 0; i < n; i++) {
+		const char *pfx = "";
+
+		if (!i)
+			/* nothing */;
+		else if (!(i % 16))
+			printf("\n");
+		else if (!(i % 8))
+			printf("  ");
+		else
+			pfx = sep;
+
+		printf("%s%02hhx", i ? pfx : "", data[i]);
+	}
+}
+
+static int do_batch(int argc, char **argv);
+
+static const struct cmd cmds[] = {
+	{ "help",	do_help },
+	{ "batch",	do_batch },
+	{ "prog",	do_prog },
+	{ "map",	do_map },
+	{ 0 }
+};
+
+static int do_batch(int argc, char **argv)
+{
+	unsigned int lines = 0;
+	char *n_argv[4096];
+	char buf[65536];
+	int n_argc;
+	FILE *fp;
+	int err;
+
+	if (argc < 2) {
+		err("too few parameters for batch\n");
+		return -1;
+	} else if (!is_prefix(*argv, "file")) {
+		err("expected 'file', got: %s\n", *argv);
+		return -1;
+	} else if (argc > 2) {
+		err("too many parameters for batch\n");
+		return -1;
+	}
+	NEXT_ARG();
+
+	fp = fopen(*argv, "r");
+	if (!fp) {
+		err("Can't open file (%s): %s\n", *argv, strerror(errno));
+		return -1;
+	}
+
+	while (fgets(buf, sizeof(buf), fp)) {
+		if (strlen(buf) == sizeof(buf) - 1) {
+			errno = E2BIG;
+			break;
+		}
+
+		n_argc = 0;
+		n_argv[n_argc] = strtok(buf, " \t\n");
+
+		while (n_argv[n_argc]) {
+			n_argc++;
+			if (n_argc == ARRAY_SIZE(n_argv)) {
+				err("line %d has too many arguments, skip\n",
+				    lines);
+				n_argc = 0;
+				break;
+			}
+			n_argv[n_argc] = strtok(NULL, " \t\n");
+		}
+
+		if (!n_argc)
+			continue;
+
+		err = cmd_select(cmds, n_argc, n_argv, do_help);
+		if (err)
+			goto err_close;
+
+		lines++;
+	}
+
+	if (errno && errno != ENOENT) {
+		perror("reading batch file failed");
+		err = -1;
+	} else {
+		info("processed %d lines\n", lines);
+		err = 0;
+	}
+err_close:
+	fclose(fp);
+
+	return err;
+}
+
+int main(int argc, char **argv)
+{
+	bin_name = argv[0];
+	NEXT_ARG();
+
+	bfd_init();
+
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
new file mode 100644
index 000000000000..85d2d7870a58
--- /dev/null
+++ b/tools/bpf/bpftool/main.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Author: Jakub Kicinski <kubakici@wp.pl> */
+
+#ifndef __BPF_TOOL_H
+#define __BPF_TOOL_H
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <linux/bpf.h>
+
+#define ARRAY_SIZE(a)	(sizeof(a) / sizeof(a[0]))
+
+#define err(msg...)	fprintf(stderr, "Error: " msg)
+#define warn(msg...)	fprintf(stderr, "Warning: " msg)
+#define info(msg...)	fprintf(stderr, msg)
+
+#define ptr_to_u64(ptr)	((__u64)(unsigned long)(ptr))
+
+#define min(a, b)							\
+	({ typeof(a) _a = (a); typeof(b) _b = (b); _a > _b ? _b : _a; })
+#define max(a, b)							\
+	({ typeof(a) _a = (a); typeof(b) _b = (b); _a < _b ? _b : _a; })
+
+#define NEXT_ARG()	({ argc--; argv++; if (argc < 0) usage(); })
+#define NEXT_ARGP()	({ (*argc)--; (*argv)++; if (*argc < 0) usage(); })
+#define BAD_ARG()	({ err("what is '%s'?\n", *argv); -1; })
+
+#define BPF_TAG_FMT	"%02hhx:%02hhx:%02hhx:%02hhx:"	\
+			"%02hhx:%02hhx:%02hhx:%02hhx"
+
+#define HELP_SPEC_PROGRAM						\
+	"PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }"
+
+enum bpf_obj_type {
+	BPF_OBJ_UNKNOWN,
+	BPF_OBJ_PROG,
+	BPF_OBJ_MAP,
+};
+
+extern const char *bin_name;
+
+bool is_prefix(const char *pfx, const char *str);
+void print_hex(void *arg, unsigned int n, const char *sep);
+void usage(void) __attribute__((noreturn));
+
+struct cmd {
+	const char *cmd;
+	int (*func)(int argc, char **argv);
+};
+
+int cmd_select(const struct cmd *cmds, int argc, char **argv,
+	       int (*help)(int argc, char **argv));
+
+int get_fd_type(int fd);
+const char *get_fd_type_name(enum bpf_obj_type type);
+char *get_fdinfo(int fd, const char *key);
+int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type);
+int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32));
+
+int do_prog(int argc, char **arg);
+int do_map(int argc, char **arg);
+
+int prog_parse_fd(int *argc, char ***argv);
+
+void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes);
+
+#endif
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
new file mode 100644
index 000000000000..0528a5379e6c
--- /dev/null
+++ b/tools/bpf/bpftool/map.c
@@ -0,0 +1,744 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Author: Jakub Kicinski <kubakici@wp.pl> */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+static const char * const map_type_name[] = {
+	[BPF_MAP_TYPE_UNSPEC]		= "unspec",
+	[BPF_MAP_TYPE_HASH]		= "hash",
+	[BPF_MAP_TYPE_ARRAY]		= "array",
+	[BPF_MAP_TYPE_PROG_ARRAY]	= "prog_array",
+	[BPF_MAP_TYPE_PERF_EVENT_ARRAY]	= "perf_event_array",
+	[BPF_MAP_TYPE_PERCPU_HASH]	= "percpu_hash",
+	[BPF_MAP_TYPE_PERCPU_ARRAY]	= "percpu_array",
+	[BPF_MAP_TYPE_STACK_TRACE]	= "stack_trace",
+	[BPF_MAP_TYPE_CGROUP_ARRAY]	= "cgroup_array",
+	[BPF_MAP_TYPE_LRU_HASH]		= "lru_hash",
+	[BPF_MAP_TYPE_LRU_PERCPU_HASH]	= "lru_percpu_hash",
+	[BPF_MAP_TYPE_LPM_TRIE]		= "lpm_trie",
+	[BPF_MAP_TYPE_ARRAY_OF_MAPS]	= "array_of_maps",
+	[BPF_MAP_TYPE_HASH_OF_MAPS]	= "hash_of_maps",
+	[BPF_MAP_TYPE_DEVMAP]		= "devmap",
+	[BPF_MAP_TYPE_SOCKMAP]		= "sockmap",
+};
+
+static unsigned int get_possible_cpus(void)
+{
+	static unsigned int result;
+	char buf[128];
+	long int n;
+	char *ptr;
+	int fd;
+
+	if (result)
+		return result;
+
+	fd = open("/sys/devices/system/cpu/possible", O_RDONLY);
+	if (fd < 0) {
+		err("can't open sysfs possible cpus\n");
+		exit(-1);
+	}
+
+	n = read(fd, buf, sizeof(buf));
+	if (n < 2) {
+		err("can't read sysfs possible cpus\n");
+		exit(-1);
+	}
+	close(fd);
+
+	if (n == sizeof(buf)) {
+		err("read sysfs possible cpus overflow\n");
+		exit(-1);
+	}
+
+	ptr = buf;
+	n = 0;
+	while (*ptr && *ptr != '\n') {
+		unsigned int a, b;
+
+		if (sscanf(ptr, "%u-%u", &a, &b) == 2) {
+			n += b - a + 1;
+
+			ptr = strchr(ptr, '-') + 1;
+		} else if (sscanf(ptr, "%u", &a) == 1) {
+			n++;
+		} else {
+			assert(0);
+		}
+
+		while (isdigit(*ptr))
+			ptr++;
+		if (*ptr == ',')
+			ptr++;
+	}
+
+	result = n;
+
+	return result;
+}
+
+static bool map_is_per_cpu(__u32 type)
+{
+	return type == BPF_MAP_TYPE_PERCPU_HASH ||
+	       type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	       type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+static bool map_is_map_of_maps(__u32 type)
+{
+	return type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+	       type == BPF_MAP_TYPE_HASH_OF_MAPS;
+}
+
+static bool map_is_map_of_progs(__u32 type)
+{
+	return type == BPF_MAP_TYPE_PROG_ARRAY;
+}
+
+static void *alloc_value(struct bpf_map_info *info)
+{
+	if (map_is_per_cpu(info->type))
+		return malloc(info->value_size * get_possible_cpus());
+	else
+		return malloc(info->value_size);
+}
+
+static int map_parse_fd(int *argc, char ***argv)
+{
+	int fd;
+
+	if (is_prefix(**argv, "id")) {
+		unsigned int id;
+		char *endptr;
+
+		NEXT_ARGP();
+
+		id = strtoul(**argv, &endptr, 0);
+		if (*endptr) {
+			err("can't parse %s as ID\n", **argv);
+			return -1;
+		}
+		NEXT_ARGP();
+
+		fd = bpf_map_get_fd_by_id(id);
+		if (fd < 0)
+			err("get map by id (%u): %s\n", id, strerror(errno));
+		return fd;
+	} else if (is_prefix(**argv, "pinned")) {
+		char *path;
+
+		NEXT_ARGP();
+
+		path = **argv;
+		NEXT_ARGP();
+
+		return open_obj_pinned_any(path, BPF_OBJ_MAP);
+	}
+
+	err("expected 'id' or 'pinned', got: '%s'?\n", **argv);
+	return -1;
+}
+
+static int
+map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
+{
+	int err;
+	int fd;
+
+	fd = map_parse_fd(argc, argv);
+	if (fd < 0)
+		return -1;
+
+	err = bpf_obj_get_info_by_fd(fd, info, info_len);
+	if (err) {
+		err("can't get map info: %s\n", strerror(errno));
+		close(fd);
+		return err;
+	}
+
+	return fd;
+}
+
+static void print_entry(struct bpf_map_info *info, unsigned char *key,
+			unsigned char *value)
+{
+	if (!map_is_per_cpu(info->type)) {
+		bool single_line, break_names;
+
+		break_names = info->key_size > 16 || info->value_size > 16;
+		single_line = info->key_size + info->value_size <= 24 &&
+			!break_names;
+
+		printf("key:%c", break_names ? '\n' : ' ');
+		print_hex(key, info->key_size, " ");
+
+		printf(single_line ? "  " : "\n");
+
+		printf("value:%c", break_names ? '\n' : ' ');
+		print_hex(value, info->value_size, " ");
+
+		printf("\n");
+	} else {
+		unsigned int i, n;
+
+		n = get_possible_cpus();
+
+		printf("key:\n");
+		print_hex(key, info->key_size, " ");
+		printf("\n");
+		for (i = 0; i < n; i++) {
+			printf("value (CPU %02d):%c",
+			       i, info->value_size > 16 ? '\n' : ' ');
+			print_hex(value + i * info->value_size,
+				  info->value_size, " ");
+			printf("\n");
+		}
+	}
+}
+
+static char **parse_bytes(char **argv, const char *name, unsigned char *val,
+			  unsigned int n)
+{
+	unsigned int i = 0;
+	char *endptr;
+
+	while (i < n && argv[i]) {
+		val[i] = strtoul(argv[i], &endptr, 0);
+		if (*endptr) {
+			err("error parsing byte: %s\n", argv[i]);
+			break;
+		}
+		i++;
+	}
+
+	if (i != n) {
+		err("%s expected %d bytes got %d\n", name, n, i);
+		return NULL;
+	}
+
+	return argv + i;
+}
+
+static int parse_elem(char **argv, struct bpf_map_info *info,
+		      void *key, void *value, __u32 key_size, __u32 value_size,
+		      __u32 *flags, __u32 **value_fd)
+{
+	if (!*argv) {
+		if (!key && !value)
+			return 0;
+		err("did not find %s\n", key ? "key" : "value");
+		return -1;
+	}
+
+	if (is_prefix(*argv, "key")) {
+		if (!key) {
+			if (key_size)
+				err("duplicate key\n");
+			else
+				err("unnecessary key\n");
+			return -1;
+		}
+
+		argv = parse_bytes(argv + 1, "key", key, key_size);
+		if (!argv)
+			return -1;
+
+		return parse_elem(argv, info, NULL, value, key_size, value_size,
+				  flags, value_fd);
+	} else if (is_prefix(*argv, "value")) {
+		int fd;
+
+		if (!value) {
+			if (value_size)
+				err("duplicate value\n");
+			else
+				err("unnecessary value\n");
+			return -1;
+		}
+
+		argv++;
+
+		if (map_is_map_of_maps(info->type)) {
+			int argc = 2;
+
+			if (value_size != 4) {
+				err("value smaller than 4B for map in map?\n");
+				return -1;
+			}
+			if (!argv[0] || !argv[1]) {
+				err("not enough value arguments for map in map\n");
+				return -1;
+			}
+
+			fd = map_parse_fd(&argc, &argv);
+			if (fd < 0)
+				return -1;
+
+			*value_fd = value;
+			**value_fd = fd;
+		} else if (map_is_map_of_progs(info->type)) {
+			int argc = 2;
+
+			if (value_size != 4) {
+				err("value smaller than 4B for map of progs?\n");
+				return -1;
+			}
+			if (!argv[0] || !argv[1]) {
+				err("not enough value arguments for map of progs\n");
+				return -1;
+			}
+
+			fd = prog_parse_fd(&argc, &argv);
+			if (fd < 0)
+				return -1;
+
+			*value_fd = value;
+			**value_fd = fd;
+		} else {
+			argv = parse_bytes(argv, "value", value, value_size);
+			if (!argv)
+				return -1;
+		}
+
+		return parse_elem(argv, info, key, NULL, key_size, value_size,
+				  flags, NULL);
+	} else if (is_prefix(*argv, "any") || is_prefix(*argv, "noexist") ||
+		   is_prefix(*argv, "exist")) {
+		if (!flags) {
+			err("flags specified multiple times: %s\n", *argv);
+			return -1;
+		}
+
+		if (is_prefix(*argv, "any"))
+			*flags = BPF_ANY;
+		else if (is_prefix(*argv, "noexist"))
+			*flags = BPF_NOEXIST;
+		else if (is_prefix(*argv, "exist"))
+			*flags = BPF_EXIST;
+
+		return parse_elem(argv + 1, info, key, value, key_size,
+				  value_size, NULL, value_fd);
+	}
+
+	err("expected key or value, got: %s\n", *argv);
+	return -1;
+}
+
+static int show_map_close(int fd, struct bpf_map_info *info)
+{
+	char *memlock;
+
+	memlock = get_fdinfo(fd, "memlock");
+	close(fd);
+
+	printf("%u: ", info->id);
+	if (info->type < ARRAY_SIZE(map_type_name))
+		printf("%s  ", map_type_name[info->type]);
+	else
+		printf("type %u  ", info->type);
+
+	if (*info->name)
+		printf("name %s  ", info->name);
+
+	printf("flags 0x%x\n", info->map_flags);
+	printf("\tkey %uB  value %uB  max_entries %u",
+	       info->key_size, info->value_size, info->max_entries);
+
+	if (memlock)
+		printf("  memlock %sB", memlock);
+	free(memlock);
+
+	printf("\n");
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	struct bpf_map_info info = {};
+	__u32 len = sizeof(info);
+	__u32 id = 0;
+	int err;
+	int fd;
+
+	if (argc == 2) {
+		fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+		if (fd < 0)
+			return -1;
+
+		return show_map_close(fd, &info);
+	}
+
+	if (argc)
+		return BAD_ARG();
+
+	while (true) {
+		err = bpf_map_get_next_id(id, &id);
+		if (err) {
+			if (errno == ENOENT)
+				break;
+			err("can't get next map: %s\n", strerror(errno));
+			if (errno == EINVAL)
+				err("kernel too old?\n");
+			return -1;
+		}
+
+		fd = bpf_map_get_fd_by_id(id);
+		if (fd < 0) {
+			err("can't get map by id (%u): %s\n",
+			    id, strerror(errno));
+			return -1;
+		}
+
+		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		if (err) {
+			err("can't get map info: %s\n", strerror(errno));
+			close(fd);
+			return -1;
+		}
+
+		show_map_close(fd, &info);
+	}
+
+	return errno == ENOENT ? 0 : -1;
+}
+
+static int do_dump(int argc, char **argv)
+{
+	void *key, *value, *prev_key;
+	unsigned int num_elems = 0;
+	struct bpf_map_info info = {};
+	__u32 len = sizeof(info);
+	int err;
+	int fd;
+
+	if (argc != 2)
+		usage();
+
+	fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+	if (fd < 0)
+		return -1;
+
+	if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) {
+		err("Dumping maps of maps and program maps not supported\n");
+		close(fd);
+		return -1;
+	}
+
+	key = malloc(info.key_size);
+	value = alloc_value(&info);
+	if (!key || !value) {
+		err("mem alloc failed\n");
+		err = -1;
+		goto exit_free;
+	}
+
+	prev_key = NULL;
+	while (true) {
+		err = bpf_map_get_next_key(fd, prev_key, key);
+		if (err) {
+			if (errno == ENOENT)
+				err = 0;
+			break;
+		}
+
+		if (!bpf_map_lookup_elem(fd, key, value)) {
+			print_entry(&info, key, value);
+		} else {
+			info("can't lookup element with key: ");
+			print_hex(key, info.key_size, " ");
+			printf("\n");
+		}
+
+		prev_key = key;
+		num_elems++;
+	}
+
+	printf("Found %u element%s\n", num_elems, num_elems != 1 ? "s" : "");
+
+exit_free:
+	free(key);
+	free(value);
+	close(fd);
+
+	return err;
+}
+
+static int do_update(int argc, char **argv)
+{
+	struct bpf_map_info info = {};
+	__u32 len = sizeof(info);
+	__u32 *value_fd = NULL;
+	__u32 flags = BPF_ANY;
+	void *key, *value;
+	int fd, err;
+
+	if (argc < 2)
+		usage();
+
+	fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+	if (fd < 0)
+		return -1;
+
+	key = malloc(info.key_size);
+	value = alloc_value(&info);
+	if (!key || !value) {
+		err("mem alloc failed");
+		err = -1;
+		goto exit_free;
+	}
+
+	err = parse_elem(argv, &info, key, value, info.key_size,
+			 info.value_size, &flags, &value_fd);
+	if (err)
+		goto exit_free;
+
+	err = bpf_map_update_elem(fd, key, value, flags);
+	if (err) {
+		err("update failed: %s\n", strerror(errno));
+		goto exit_free;
+	}
+
+exit_free:
+	if (value_fd)
+		close(*value_fd);
+	free(key);
+	free(value);
+	close(fd);
+
+	return err;
+}
+
+static int do_lookup(int argc, char **argv)
+{
+	struct bpf_map_info info = {};
+	__u32 len = sizeof(info);
+	void *key, *value;
+	int err;
+	int fd;
+
+	if (argc < 2)
+		usage();
+
+	fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+	if (fd < 0)
+		return -1;
+
+	key = malloc(info.key_size);
+	value = alloc_value(&info);
+	if (!key || !value) {
+		err("mem alloc failed");
+		err = -1;
+		goto exit_free;
+	}
+
+	err = parse_elem(argv, &info, key, NULL, info.key_size, 0, NULL, NULL);
+	if (err)
+		goto exit_free;
+
+	err = bpf_map_lookup_elem(fd, key, value);
+	if (!err) {
+		print_entry(&info, key, value);
+	} else if (errno == ENOENT) {
+		printf("key:\n");
+		print_hex(key, info.key_size, " ");
+		printf("\n\nNot found\n");
+	} else {
+		err("lookup failed: %s\n", strerror(errno));
+	}
+
+exit_free:
+	free(key);
+	free(value);
+	close(fd);
+
+	return err;
+}
+
+static int do_getnext(int argc, char **argv)
+{
+	struct bpf_map_info info = {};
+	__u32 len = sizeof(info);
+	void *key, *nextkey;
+	int err;
+	int fd;
+
+	if (argc < 2)
+		usage();
+
+	fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+	if (fd < 0)
+		return -1;
+
+	key = malloc(info.key_size);
+	nextkey = malloc(info.key_size);
+	if (!key || !nextkey) {
+		err("mem alloc failed");
+		err = -1;
+		goto exit_free;
+	}
+
+	if (argc) {
+		err = parse_elem(argv, &info, key, NULL, info.key_size, 0,
+				 NULL, NULL);
+		if (err)
+			goto exit_free;
+	} else {
+		free(key);
+		key = NULL;
+	}
+
+	err = bpf_map_get_next_key(fd, key, nextkey);
+	if (err) {
+		err("can't get next key: %s\n", strerror(errno));
+		goto exit_free;
+	}
+
+	if (key) {
+		printf("key:\n");
+		print_hex(key, info.key_size, " ");
+		printf("\n");
+	} else {
+		printf("key: None\n");
+	}
+
+	printf("next key:\n");
+	print_hex(nextkey, info.key_size, " ");
+	printf("\n");
+
+exit_free:
+	free(nextkey);
+	free(key);
+	close(fd);
+
+	return err;
+}
+
+static int do_delete(int argc, char **argv)
+{
+	struct bpf_map_info info = {};
+	__u32 len = sizeof(info);
+	void *key;
+	int err;
+	int fd;
+
+	if (argc < 2)
+		usage();
+
+	fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+	if (fd < 0)
+		return -1;
+
+	key = malloc(info.key_size);
+	if (!key) {
+		err("mem alloc failed");
+		err = -1;
+		goto exit_free;
+	}
+
+	err = parse_elem(argv, &info, key, NULL, info.key_size, 0, NULL, NULL);
+	if (err)
+		goto exit_free;
+
+	err = bpf_map_delete_elem(fd, key);
+	if (err)
+		err("delete failed: %s\n", strerror(errno));
+
+exit_free:
+	free(key);
+	close(fd);
+
+	return err;
+}
+
+static int do_pin(int argc, char **argv)
+{
+	return do_pin_any(argc, argv, bpf_map_get_fd_by_id);
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s show   [MAP]\n"
+		"       %s %s dump    MAP\n"
+		"       %s %s update  MAP  key BYTES value VALUE [UPDATE_FLAGS]\n"
+		"       %s %s lookup  MAP  key BYTES\n"
+		"       %s %s getnext MAP [key BYTES]\n"
+		"       %s %s delete  MAP  key BYTES\n"
+		"       %s %s pin     MAP  FILE\n"
+		"       %s %s help\n"
+		"\n"
+		"       MAP := { id MAP_ID | pinned FILE }\n"
+		"       " HELP_SPEC_PROGRAM "\n"
+		"       VALUE := { BYTES | MAP | PROG }\n"
+		"       UPDATE_FLAGS := { any | exist | noexist }\n"
+		"",
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+		bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "help",	do_help },
+	{ "dump",	do_dump },
+	{ "update",	do_update },
+	{ "lookup",	do_lookup },
+	{ "getnext",	do_getnext },
+	{ "delete",	do_delete },
+	{ "pin",	do_pin },
+	{ 0 }
+};
+
+int do_map(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
new file mode 100644
index 000000000000..421ba89ce86a
--- /dev/null
+++ b/tools/bpf/bpftool/prog.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Author: Jakub Kicinski <kubakici@wp.pl> */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+static const char * const prog_type_name[] = {
+	[BPF_PROG_TYPE_UNSPEC]		= "unspec",
+	[BPF_PROG_TYPE_SOCKET_FILTER]	= "socket_filter",
+	[BPF_PROG_TYPE_KPROBE]		= "kprobe",
+	[BPF_PROG_TYPE_SCHED_CLS]	= "sched_cls",
+	[BPF_PROG_TYPE_SCHED_ACT]	= "sched_act",
+	[BPF_PROG_TYPE_TRACEPOINT]	= "tracepoint",
+	[BPF_PROG_TYPE_XDP]		= "xdp",
+	[BPF_PROG_TYPE_PERF_EVENT]	= "perf_event",
+	[BPF_PROG_TYPE_CGROUP_SKB]	= "cgroup_skb",
+	[BPF_PROG_TYPE_CGROUP_SOCK]	= "cgroup_sock",
+	[BPF_PROG_TYPE_LWT_IN]		= "lwt_in",
+	[BPF_PROG_TYPE_LWT_OUT]		= "lwt_out",
+	[BPF_PROG_TYPE_LWT_XMIT]	= "lwt_xmit",
+	[BPF_PROG_TYPE_SOCK_OPS]	= "sock_ops",
+	[BPF_PROG_TYPE_SK_SKB]		= "sk_skb",
+};
+
+static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
+{
+	struct timespec real_time_ts, boot_time_ts;
+	time_t wallclock_secs;
+	struct tm load_tm;
+
+	buf[--size] = '\0';
+
+	if (clock_gettime(CLOCK_REALTIME, &real_time_ts) ||
+	    clock_gettime(CLOCK_BOOTTIME, &boot_time_ts)) {
+		perror("Can't read clocks");
+		snprintf(buf, size, "%llu", nsecs / 1000000000);
+		return;
+	}
+
+	wallclock_secs = (real_time_ts.tv_sec - boot_time_ts.tv_sec) +
+		nsecs / 1000000000;
+
+	if (!localtime_r(&wallclock_secs, &load_tm)) {
+		snprintf(buf, size, "%llu", nsecs / 1000000000);
+		return;
+	}
+
+	strftime(buf, size, "%b %d/%H:%M", &load_tm);
+}
+
+static int prog_fd_by_tag(unsigned char *tag)
+{
+	struct bpf_prog_info info = {};
+	__u32 len = sizeof(info);
+	unsigned int id = 0;
+	int err;
+	int fd;
+
+	while (true) {
+		err = bpf_prog_get_next_id(id, &id);
+		if (err) {
+			err("%s\n", strerror(errno));
+			return -1;
+		}
+
+		fd = bpf_prog_get_fd_by_id(id);
+		if (fd < 0) {
+			err("can't get prog by id (%u): %s\n",
+			    id, strerror(errno));
+			return -1;
+		}
+
+		err = bpf_obj_get_info_by_fd(fd, &info, &len);
+		if (err) {
+			err("can't get prog info (%u): %s\n",
+			    id, strerror(errno));
+			close(fd);
+			return -1;
+		}
+
+		if (!memcmp(tag, info.tag, BPF_TAG_SIZE))
+			return fd;
+
+		close(fd);
+	}
+}
+
+int prog_parse_fd(int *argc, char ***argv)
+{
+	int fd;
+
+	if (is_prefix(**argv, "id")) {
+		unsigned int id;
+		char *endptr;
+
+		NEXT_ARGP();
+
+		id = strtoul(**argv, &endptr, 0);
+		if (*endptr) {
+			err("can't parse %s as ID\n", **argv);
+			return -1;
+		}
+		NEXT_ARGP();
+
+		fd = bpf_prog_get_fd_by_id(id);
+		if (fd < 0)
+			err("get by id (%u): %s\n", id, strerror(errno));
+		return fd;
+	} else if (is_prefix(**argv, "tag")) {
+		unsigned char tag[BPF_TAG_SIZE];
+
+		NEXT_ARGP();
+
+		if (sscanf(**argv, BPF_TAG_FMT, tag, tag + 1, tag + 2,
+			   tag + 3, tag + 4, tag + 5, tag + 6, tag + 7)
+		    != BPF_TAG_SIZE) {
+			err("can't parse tag\n");
+			return -1;
+		}
+		NEXT_ARGP();
+
+		return prog_fd_by_tag(tag);
+	} else if (is_prefix(**argv, "pinned")) {
+		char *path;
+
+		NEXT_ARGP();
+
+		path = **argv;
+		NEXT_ARGP();
+
+		return open_obj_pinned_any(path, BPF_OBJ_PROG);
+	}
+
+	err("expected 'id', 'tag' or 'pinned', got: '%s'?\n", **argv);
+	return -1;
+}
+
+static void show_prog_maps(int fd, u32 num_maps)
+{
+	struct bpf_prog_info info = {};
+	__u32 len = sizeof(info);
+	__u32 map_ids[num_maps];
+	unsigned int i;
+	int err;
+
+	info.nr_map_ids = num_maps;
+	info.map_ids = ptr_to_u64(map_ids);
+
+	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	if (err || !info.nr_map_ids)
+		return;
+
+	printf("  map_ids ");
+	for (i = 0; i < info.nr_map_ids; i++)
+		printf("%u%s", map_ids[i],
+		       i == info.nr_map_ids - 1 ? "" : ",");
+}
+
+static int show_prog(int fd)
+{
+	struct bpf_prog_info info = {};
+	__u32 len = sizeof(info);
+	char *memlock;
+	int err;
+
+	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	if (err) {
+		err("can't get prog info: %s\n", strerror(errno));
+		return -1;
+	}
+
+	printf("%u: ", info.id);
+	if (info.type < ARRAY_SIZE(prog_type_name))
+		printf("%s  ", prog_type_name[info.type]);
+	else
+		printf("type %u  ", info.type);
+
+	if (*info.name)
+		printf("name %s  ", info.name);
+
+	printf("tag ");
+	print_hex(info.tag, BPF_TAG_SIZE, ":");
+	printf("\n");
+
+	if (info.load_time) {
+		char buf[32];
+
+		print_boot_time(info.load_time, buf, sizeof(buf));
+
+		/* Piggy back on load_time, since 0 uid is a valid one */
+		printf("\tloaded_at %s  uid %u\n", buf, info.created_by_uid);
+	}
+
+	printf("\txlated %uB", info.xlated_prog_len);
+
+	if (info.jited_prog_len)
+		printf("  jited %uB", info.jited_prog_len);
+	else
+		printf("  not jited");
+
+	memlock = get_fdinfo(fd, "memlock");
+	if (memlock)
+		printf("  memlock %sB", memlock);
+	free(memlock);
+
+	if (info.nr_map_ids)
+		show_prog_maps(fd, info.nr_map_ids);
+
+	printf("\n");
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{	__u32 id = 0;
+	int err;
+	int fd;
+
+	if (argc == 2) {
+		fd = prog_parse_fd(&argc, &argv);
+		if (fd < 0)
+			return -1;
+
+		return show_prog(fd);
+	}
+
+	if (argc)
+		return BAD_ARG();
+
+	while (true) {
+		err = bpf_prog_get_next_id(id, &id);
+		if (err) {
+			if (errno == ENOENT)
+				break;
+			err("can't get next program: %s\n", strerror(errno));
+			if (errno == EINVAL)
+				err("kernel too old?\n");
+			return -1;
+		}
+
+		fd = bpf_prog_get_fd_by_id(id);
+		if (fd < 0) {
+			err("can't get prog by id (%u): %s\n",
+			    id, strerror(errno));
+			return -1;
+		}
+
+		err = show_prog(fd);
+		close(fd);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int do_dump(int argc, char **argv)
+{
+	struct bpf_prog_info info = {};
+	__u32 len = sizeof(info);
+	bool can_disasm = false;
+	unsigned int buf_size;
+	char *filepath = NULL;
+	bool opcodes = false;
+	unsigned char *buf;
+	__u32 *member_len;
+	__u64 *member_ptr;
+	ssize_t n;
+	int err;
+	int fd;
+
+	if (is_prefix(*argv, "jited")) {
+		member_len = &info.jited_prog_len;
+		member_ptr = &info.jited_prog_insns;
+		can_disasm = true;
+	} else if (is_prefix(*argv, "xlated")) {
+		member_len = &info.xlated_prog_len;
+		member_ptr = &info.xlated_prog_insns;
+	} else {
+		err("expected 'xlated' or 'jited', got: %s\n", *argv);
+		return -1;
+	}
+	NEXT_ARG();
+
+	if (argc < 2)
+		usage();
+
+	fd = prog_parse_fd(&argc, &argv);
+	if (fd < 0)
+		return -1;
+
+	if (is_prefix(*argv, "file")) {
+		NEXT_ARG();
+		if (!argc) {
+			err("expected file path\n");
+			return -1;
+		}
+
+		filepath = *argv;
+		NEXT_ARG();
+	} else if (is_prefix(*argv, "opcodes")) {
+		opcodes = true;
+		NEXT_ARG();
+	}
+
+	if (!filepath && !can_disasm) {
+		err("expected 'file' got %s\n", *argv);
+		return -1;
+	}
+	if (argc) {
+		usage();
+		return -1;
+	}
+
+	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	if (err) {
+		err("can't get prog info: %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (!*member_len) {
+		info("no instructions returned\n");
+		close(fd);
+		return 0;
+	}
+
+	buf_size = *member_len;
+
+	buf = malloc(buf_size);
+	if (!buf) {
+		err("mem alloc failed\n");
+		close(fd);
+		return -1;
+	}
+
+	memset(&info, 0, sizeof(info));
+
+	*member_ptr = ptr_to_u64(buf);
+	*member_len = buf_size;
+
+	err = bpf_obj_get_info_by_fd(fd, &info, &len);
+	close(fd);
+	if (err) {
+		err("can't get prog info: %s\n", strerror(errno));
+		goto err_free;
+	}
+
+	if (*member_len > buf_size) {
+		info("too many instructions returned\n");
+		goto err_free;
+	}
+
+	if (filepath) {
+		fd = open(filepath, O_WRONLY | O_CREAT | O_TRUNC, 0600);
+		if (fd < 0) {
+			err("can't open file %s: %s\n", filepath,
+			    strerror(errno));
+			goto err_free;
+		}
+
+		n = write(fd, buf, *member_len);
+		close(fd);
+		if (n != *member_len) {
+			err("error writing output file: %s\n",
+			    n < 0 ? strerror(errno) : "short write");
+			goto err_free;
+		}
+	} else {
+		disasm_print_insn(buf, *member_len, opcodes);
+	}
+
+	free(buf);
+
+	return 0;
+
+err_free:
+	free(buf);
+	return -1;
+}
+
+static int do_pin(int argc, char **argv)
+{
+	return do_pin_any(argc, argv, bpf_prog_get_fd_by_id);
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s show [PROG]\n"
+		"       %s %s dump xlated PROG  file FILE\n"
+		"       %s %s dump jited  PROG [file FILE] [opcodes]\n"
+		"       %s %s pin   PROG FILE\n"
+		"       %s %s help\n"
+		"\n"
+		"       " HELP_SPEC_PROGRAM "\n"
+		"",
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+		bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "dump",	do_dump },
+	{ "pin",	do_pin },
+	{ 0 }
+};
+
+int do_prog(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 43ab5c402f98..cb2b9f95160a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_cmd {
 	BPF_PROG_GET_FD_BY_ID,
 	BPF_MAP_GET_FD_BY_ID,
 	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
 };
 
 enum bpf_map_type {
@@ -143,11 +144,47 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
- * to the given target_fd cgroup the descendent cgroup will be able to
- * override effective bpf program that was inherited from this cgroup
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
@@ -175,6 +212,11 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -188,6 +230,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
+		__u8	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -210,6 +253,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
+		__u8		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -249,6 +293,15 @@ union bpf_attr {
 		__u32		info_len;
 		__aligned_u64	info;
 	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -582,6 +635,12 @@ union bpf_attr {
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
+ *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -638,6 +697,7 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -715,7 +775,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -723,6 +783,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -783,6 +846,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
@@ -801,6 +865,11 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -810,6 +879,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 1d6907d379c9..5128677e4117 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -46,6 +46,8 @@
 # endif
 #endif
 
+#define min(x, y) ((x) < (y) ? (x) : (y))
+
 static inline __u64 ptr_to_u64(const void *ptr)
 {
 	return (__u64) (unsigned long) ptr;
@@ -57,10 +59,11 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
 	return syscall(__NR_bpf, cmd, attr, size);
 }
 
-int bpf_create_map_node(enum bpf_map_type map_type, int key_size,
-			int value_size, int max_entries, __u32 map_flags,
-			int node)
+int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
+			int key_size, int value_size, int max_entries,
+			__u32 map_flags, int node)
 {
+	__u32 name_len = name ? strlen(name) : 0;
 	union bpf_attr attr;
 
 	memset(&attr, '\0', sizeof(attr));
@@ -70,6 +73,8 @@ int bpf_create_map_node(enum bpf_map_type map_type, int key_size,
 	attr.value_size = value_size;
 	attr.max_entries = max_entries;
 	attr.map_flags = map_flags;
+	memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
+
 	if (node >= 0) {
 		attr.map_flags |= BPF_F_NUMA_NODE;
 		attr.numa_node = node;
@@ -81,14 +86,23 @@ int bpf_create_map_node(enum bpf_map_type map_type, int key_size,
 int bpf_create_map(enum bpf_map_type map_type, int key_size,
 		   int value_size, int max_entries, __u32 map_flags)
 {
-	return bpf_create_map_node(map_type, key_size, value_size,
+	return bpf_create_map_node(map_type, NULL, key_size, value_size,
 				   max_entries, map_flags, -1);
 }
 
-int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size,
-			       int inner_map_fd, int max_entries,
+int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
+			int key_size, int value_size, int max_entries,
+			__u32 map_flags)
+{
+	return bpf_create_map_node(map_type, name, key_size, value_size,
+				   max_entries, map_flags, -1);
+}
+
+int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
+			       int key_size, int inner_map_fd, int max_entries,
 			       __u32 map_flags, int node)
 {
+	__u32 name_len = name ? strlen(name) : 0;
 	union bpf_attr attr;
 
 	memset(&attr, '\0', sizeof(attr));
@@ -99,6 +113,8 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size,
 	attr.inner_map_fd = inner_map_fd;
 	attr.max_entries = max_entries;
 	attr.map_flags = map_flags;
+	memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
+
 	if (node >= 0) {
 		attr.map_flags |= BPF_F_NUMA_NODE;
 		attr.numa_node = node;
@@ -107,19 +123,24 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size,
 	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
-int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
-			  int inner_map_fd, int max_entries, __u32 map_flags)
+int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
+			  int key_size, int inner_map_fd, int max_entries,
+			  __u32 map_flags)
 {
-	return bpf_create_map_in_map_node(map_type, key_size, inner_map_fd,
-					  max_entries, map_flags, -1);
+	return bpf_create_map_in_map_node(map_type, name, key_size,
+					  inner_map_fd, max_entries, map_flags,
+					  -1);
 }
 
-int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
-		     size_t insns_cnt, const char *license,
-		     __u32 kern_version, char *log_buf, size_t log_buf_sz)
+int bpf_load_program_name(enum bpf_prog_type type, const char *name,
+			  const struct bpf_insn *insns,
+			  size_t insns_cnt, const char *license,
+			  __u32 kern_version, char *log_buf,
+			  size_t log_buf_sz)
 {
 	int fd;
 	union bpf_attr attr;
+	__u32 name_len = name ? strlen(name) : 0;
 
 	bzero(&attr, sizeof(attr));
 	attr.prog_type = type;
@@ -130,6 +151,7 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	attr.log_size = 0;
 	attr.log_level = 0;
 	attr.kern_version = kern_version;
+	memcpy(attr.prog_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
 
 	fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 	if (fd >= 0 || !log_buf || !log_buf_sz)
@@ -143,6 +165,15 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 	return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 }
 
+int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
+		     size_t insns_cnt, const char *license,
+		     __u32 kern_version, char *log_buf,
+		     size_t log_buf_sz)
+{
+	return bpf_load_program_name(type, NULL, insns, insns_cnt, license,
+				     kern_version, log_buf, log_buf_sz);
+}
+
 int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 		       size_t insns_cnt, int strict_alignment,
 		       const char *license, __u32 kern_version,
@@ -260,6 +291,38 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
 	return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
 }
 
+int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type)
+{
+	union bpf_attr attr;
+
+	bzero(&attr, sizeof(attr));
+	attr.target_fd	 = target_fd;
+	attr.attach_bpf_fd = prog_fd;
+	attr.attach_type = type;
+
+	return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
+}
+
+int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
+		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt)
+{
+	union bpf_attr attr;
+	int ret;
+
+	bzero(&attr, sizeof(attr));
+	attr.query.target_fd	= target_fd;
+	attr.query.attach_type	= type;
+	attr.query.query_flags	= query_flags;
+	attr.query.prog_cnt	= *prog_cnt;
+	attr.query.prog_ids	= ptr_to_u64(prog_ids);
+
+	ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr));
+	if (attach_flags)
+		*attach_flags = attr.query.attach_flags;
+	*prog_cnt = attr.query.prog_cnt;
+	return ret;
+}
+
 int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
 		      void *data_out, __u32 *size_out, __u32 *retval,
 		      __u32 *duration)
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index b8ea5843c39e..6534889e2b2f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -24,19 +24,28 @@
 #include <linux/bpf.h>
 #include <stddef.h>
 
-int bpf_create_map_node(enum bpf_map_type map_type, int key_size,
-			int value_size, int max_entries, __u32 map_flags,
-			int node);
+int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
+			int key_size, int value_size, int max_entries,
+			__u32 map_flags, int node);
+int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
+			int key_size, int value_size, int max_entries,
+			__u32 map_flags);
 int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 		   int max_entries, __u32 map_flags);
-int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size,
-			       int inner_map_fd, int max_entries,
+int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
+			       int key_size, int inner_map_fd, int max_entries,
 			       __u32 map_flags, int node);
-int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
-			  int inner_map_fd, int max_entries, __u32 map_flags);
+int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
+			  int key_size, int inner_map_fd, int max_entries,
+			  __u32 map_flags);
 
 /* Recommend log buffer size */
 #define BPF_LOG_BUF_SIZE 65536
+int bpf_load_program_name(enum bpf_prog_type type, const char *name,
+			  const struct bpf_insn *insns,
+			  size_t insns_cnt, const char *license,
+			  __u32 kern_version, char *log_buf,
+			  size_t log_buf_sz);
 int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 		     size_t insns_cnt, const char *license,
 		     __u32 kern_version, char *log_buf,
@@ -57,6 +66,7 @@ int bpf_obj_get(const char *pathname);
 int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
 		    unsigned int flags);
 int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
+int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type);
 int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
 		      void *data_out, __u32 *size_out, __u32 *retval,
 		      __u32 *duration);
@@ -65,5 +75,6 @@ int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
 int bpf_prog_get_fd_by_id(__u32 id);
 int bpf_map_get_fd_by_id(__u32 id);
 int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
-
+int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
+		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt);
 #endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 35f6dfcdc565..4f402dcdf372 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -171,6 +171,7 @@ int libbpf_strerror(int err, char *buf, size_t size)
 struct bpf_program {
 	/* Index in elf obj file, for relocation use. */
 	int idx;
+	char *name;
 	char *section_name;
 	struct bpf_insn *insns;
 	size_t insns_cnt;
@@ -283,6 +284,7 @@ static void bpf_program__exit(struct bpf_program *prog)
 	prog->clear_priv = NULL;
 
 	bpf_program__unload(prog);
+	zfree(&prog->name);
 	zfree(&prog->section_name);
 	zfree(&prog->insns);
 	zfree(&prog->reloc_desc);
@@ -293,26 +295,27 @@ static void bpf_program__exit(struct bpf_program *prog)
 }
 
 static int
-bpf_program__init(void *data, size_t size, char *name, int idx,
-		    struct bpf_program *prog)
+bpf_program__init(void *data, size_t size, char *section_name, int idx,
+		  struct bpf_program *prog)
 {
 	if (size < sizeof(struct bpf_insn)) {
-		pr_warning("corrupted section '%s'\n", name);
+		pr_warning("corrupted section '%s'\n", section_name);
 		return -EINVAL;
 	}
 
 	bzero(prog, sizeof(*prog));
 
-	prog->section_name = strdup(name);
+	prog->section_name = strdup(section_name);
 	if (!prog->section_name) {
-		pr_warning("failed to alloc name for prog %s\n",
-			   name);
+		pr_warning("failed to alloc name for prog under section %s\n",
+			   section_name);
 		goto errout;
 	}
 
 	prog->insns = malloc(size);
 	if (!prog->insns) {
-		pr_warning("failed to alloc insns for %s\n", name);
+		pr_warning("failed to alloc insns for prog under section %s\n",
+			   section_name);
 		goto errout;
 	}
 	prog->insns_cnt = size / sizeof(struct bpf_insn);
@@ -331,12 +334,12 @@ errout:
 
 static int
 bpf_object__add_program(struct bpf_object *obj, void *data, size_t size,
-			char *name, int idx)
+			char *section_name, int idx)
 {
 	struct bpf_program prog, *progs;
 	int nr_progs, err;
 
-	err = bpf_program__init(data, size, name, idx, &prog);
+	err = bpf_program__init(data, size, section_name, idx, &prog);
 	if (err)
 		return err;
 
@@ -350,8 +353,8 @@ bpf_object__add_program(struct bpf_object *obj, void *data, size_t size,
 		 * is still valid, so don't need special treat for
 		 * bpf_close_object().
 		 */
-		pr_warning("failed to alloc a new program '%s'\n",
-			   name);
+		pr_warning("failed to alloc a new program under section '%s'\n",
+			   section_name);
 		bpf_program__exit(&prog);
 		return -ENOMEM;
 	}
@@ -364,6 +367,54 @@ bpf_object__add_program(struct bpf_object *obj, void *data, size_t size,
 	return 0;
 }
 
+static int
+bpf_object__init_prog_names(struct bpf_object *obj)
+{
+	Elf_Data *symbols = obj->efile.symbols;
+	struct bpf_program *prog;
+	size_t pi, si;
+
+	for (pi = 0; pi < obj->nr_programs; pi++) {
+		char *name = NULL;
+
+		prog = &obj->programs[pi];
+
+		for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name;
+		     si++) {
+			GElf_Sym sym;
+
+			if (!gelf_getsym(symbols, si, &sym))
+				continue;
+			if (sym.st_shndx != prog->idx)
+				continue;
+
+			name = elf_strptr(obj->efile.elf,
+					  obj->efile.strtabidx,
+					  sym.st_name);
+			if (!name) {
+				pr_warning("failed to get sym name string for prog %s\n",
+					   prog->section_name);
+				return -LIBBPF_ERRNO__LIBELF;
+			}
+		}
+
+		if (!name) {
+			pr_warning("failed to find sym for prog %s\n",
+				   prog->section_name);
+			return -EINVAL;
+		}
+
+		prog->name = strdup(name);
+		if (!prog->name) {
+			pr_warning("failed to allocate memory for prog sym %s\n",
+				   name);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 static struct bpf_object *bpf_object__new(const char *path,
 					  void *obj_buf,
 					  size_t obj_buf_sz)
@@ -766,8 +817,12 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 		pr_warning("Corrupted ELF file: index of strtab invalid\n");
 		return LIBBPF_ERRNO__FORMAT;
 	}
-	if (obj->efile.maps_shndx >= 0)
+	if (obj->efile.maps_shndx >= 0) {
 		err = bpf_object__init_maps(obj);
+		if (err)
+			goto out;
+	}
+	err = bpf_object__init_prog_names(obj);
 out:
 	return err;
 }
@@ -870,11 +925,12 @@ bpf_object__create_maps(struct bpf_object *obj)
 		struct bpf_map_def *def = &obj->maps[i].def;
 		int *pfd = &obj->maps[i].fd;
 
-		*pfd = bpf_create_map(def->type,
-				      def->key_size,
-				      def->value_size,
-				      def->max_entries,
-				      0);
+		*pfd = bpf_create_map_name(def->type,
+					   obj->maps[i].name,
+					   def->key_size,
+					   def->value_size,
+					   def->max_entries,
+					   0);
 		if (*pfd < 0) {
 			size_t j;
 			int err = *pfd;
@@ -982,7 +1038,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 }
 
 static int
-load_program(enum bpf_prog_type type, struct bpf_insn *insns,
+load_program(enum bpf_prog_type type, const char *name, struct bpf_insn *insns,
 	     int insns_cnt, char *license, u32 kern_version, int *pfd)
 {
 	int ret;
@@ -995,8 +1051,8 @@ load_program(enum bpf_prog_type type, struct bpf_insn *insns,
 	if (!log_buf)
 		pr_warning("Alloc log buffer for bpf loader error, continue without log\n");
 
-	ret = bpf_load_program(type, insns, insns_cnt, license,
-			       kern_version, log_buf, BPF_LOG_BUF_SIZE);
+	ret = bpf_load_program_name(type, name, insns, insns_cnt, license,
+				    kern_version, log_buf, BPF_LOG_BUF_SIZE);
 
 	if (ret >= 0) {
 		*pfd = ret;
@@ -1021,9 +1077,9 @@ load_program(enum bpf_prog_type type, struct bpf_insn *insns,
 		if (type != BPF_PROG_TYPE_KPROBE) {
 			int fd;
 
-			fd = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns,
-					      insns_cnt, license, kern_version,
-					      NULL, 0);
+			fd = bpf_load_program_name(BPF_PROG_TYPE_KPROBE, name,
+						   insns, insns_cnt, license,
+						   kern_version, NULL, 0);
 			if (fd >= 0) {
 				close(fd);
 				ret = -LIBBPF_ERRNO__PROGTYPE;
@@ -1067,8 +1123,8 @@ bpf_program__load(struct bpf_program *prog,
 			pr_warning("Program '%s' is inconsistent: nr(%d) != 1\n",
 				   prog->section_name, prog->instances.nr);
 		}
-		err = load_program(prog->type, prog->insns, prog->insns_cnt,
-				   license, kern_version, &fd);
+		err = load_program(prog->type, prog->name, prog->insns,
+				   prog->insns_cnt, license, kern_version, &fd);
 		if (!err)
 			prog->instances.fds[0] = fd;
 		goto out;
@@ -1096,7 +1152,8 @@ bpf_program__load(struct bpf_program *prog,
 			continue;
 		}
 
-		err = load_program(prog->type, result.new_insn_ptr,
+		err = load_program(prog->type, prog->name,
+				   result.new_insn_ptr,
 				   result.new_insn_cnt,
 				   license, kern_version, &fd);
 
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f4b23d697448..924af8d79bde 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -15,9 +15,10 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_align
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
-	test_pkt_md_access.o test_xdp_redirect.o sockmap_parse_prog.o sockmap_verdict_prog.o
+	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
+	sockmap_verdict_prog.o
 
-TEST_PROGS := test_kmod.sh test_xdp_redirect.sh
+TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh
 
 include ../lib.mk
 
@@ -34,8 +35,20 @@ $(BPFOBJ): force
 	$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
 
 CLANG ?= clang
+LLC   ?= llc
+
+PROBE := $(shell llc -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+
+# Let newer LLVM versions transparently probe the kernel for availability
+# of full BPF instruction set.
+ifeq ($(PROBE),)
+  CPU ?= probe
+else
+  CPU ?= generic
+endif
 
 %.o: %.c
 	$(CLANG) -I. -I./include/uapi -I../../../include/uapi \
-		-Wno-compare-distinct-pointer-types \
-		-O2 -target bpf -c $< -o $@
+		 -Wno-compare-distinct-pointer-types          \
+		 -O2 -target bpf -emit-llvm -c $< -o - |      \
+	$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 36fb9161b34a..a56053db26f5 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -62,6 +62,8 @@ static unsigned long long (*bpf_get_prandom_u32)(void) =
 	(void *) BPF_FUNC_get_prandom_u32;
 static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
 	(void *) BPF_FUNC_xdp_adjust_head;
+static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) =
+	(void *) BPF_FUNC_xdp_adjust_meta;
 static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
 			     int optlen) =
 	(void *) BPF_FUNC_setsockopt;
@@ -109,7 +111,47 @@ static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
 static int (*bpf_skb_change_head)(void *, int len, int flags) =
 	(void *) BPF_FUNC_skb_change_head;
 
+/* Scan the ARCH passed in from ARCH env variable (see Makefile) */
+#if defined(__TARGET_ARCH_x86)
+	#define bpf_target_x86
+	#define bpf_target_defined
+#elif defined(__TARGET_ARCH_s930x)
+	#define bpf_target_s930x
+	#define bpf_target_defined
+#elif defined(__TARGET_ARCH_arm64)
+	#define bpf_target_arm64
+	#define bpf_target_defined
+#elif defined(__TARGET_ARCH_mips)
+	#define bpf_target_mips
+	#define bpf_target_defined
+#elif defined(__TARGET_ARCH_powerpc)
+	#define bpf_target_powerpc
+	#define bpf_target_defined
+#elif defined(__TARGET_ARCH_sparc)
+	#define bpf_target_sparc
+	#define bpf_target_defined
+#else
+	#undef bpf_target_defined
+#endif
+
+/* Fall back to what the compiler says */
+#ifndef bpf_target_defined
 #if defined(__x86_64__)
+	#define bpf_target_x86
+#elif defined(__s390x__)
+	#define bpf_target_s930x
+#elif defined(__aarch64__)
+	#define bpf_target_arm64
+#elif defined(__mips__)
+	#define bpf_target_mips
+#elif defined(__powerpc__)
+	#define bpf_target_powerpc
+#elif defined(__sparc__)
+	#define bpf_target_sparc
+#endif
+#endif
+
+#if defined(bpf_target_x86)
 
 #define PT_REGS_PARM1(x) ((x)->di)
 #define PT_REGS_PARM2(x) ((x)->si)
@@ -122,7 +164,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) =
 #define PT_REGS_SP(x) ((x)->sp)
 #define PT_REGS_IP(x) ((x)->ip)
 
-#elif defined(__s390x__)
+#elif defined(bpf_target_s390x)
 
 #define PT_REGS_PARM1(x) ((x)->gprs[2])
 #define PT_REGS_PARM2(x) ((x)->gprs[3])
@@ -135,7 +177,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) =
 #define PT_REGS_SP(x) ((x)->gprs[15])
 #define PT_REGS_IP(x) ((x)->psw.addr)
 
-#elif defined(__aarch64__)
+#elif defined(bpf_target_arm64)
 
 #define PT_REGS_PARM1(x) ((x)->regs[0])
 #define PT_REGS_PARM2(x) ((x)->regs[1])
@@ -148,7 +190,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) =
 #define PT_REGS_SP(x) ((x)->sp)
 #define PT_REGS_IP(x) ((x)->pc)
 
-#elif defined(__mips__)
+#elif defined(bpf_target_mips)
 
 #define PT_REGS_PARM1(x) ((x)->regs[4])
 #define PT_REGS_PARM2(x) ((x)->regs[5])
@@ -161,7 +203,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) =
 #define PT_REGS_SP(x) ((x)->regs[29])
 #define PT_REGS_IP(x) ((x)->cp0_epc)
 
-#elif defined(__powerpc__)
+#elif defined(bpf_target_powerpc)
 
 #define PT_REGS_PARM1(x) ((x)->gpr[3])
 #define PT_REGS_PARM2(x) ((x)->gpr[4])
@@ -172,7 +214,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) =
 #define PT_REGS_SP(x) ((x)->sp)
 #define PT_REGS_IP(x) ((x)->nip)
 
-#elif defined(__sparc__)
+#elif defined(bpf_target_sparc)
 
 #define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0])
 #define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1])
@@ -182,6 +224,8 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) =
 #define PT_REGS_RET(x) ((x)->u_regs[UREG_I7])
 #define PT_REGS_RC(x) ((x)->u_regs[UREG_I0])
 #define PT_REGS_SP(x) ((x)->u_regs[UREG_FP])
+
+/* Should this also be a bpf_target check for the sparc case? */
 #if defined(__arch64__)
 #define PT_REGS_IP(x) ((x)->tpc)
 #else
@@ -190,10 +234,10 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) =
 
 #endif
 
-#ifdef __powerpc__
+#ifdef bpf_target_powerpc
 #define BPF_KPROBE_READ_RET_IP(ip, ctx)		({ (ip) = (ctx)->link; })
 #define BPF_KRETPROBE_READ_RET_IP		BPF_KPROBE_READ_RET_IP
-#elif defined(__sparc__)
+#elif bpf_target_sparc
 #define BPF_KPROBE_READ_RET_IP(ip, ctx)		({ (ip) = PT_REGS_RET(ctx); })
 #define BPF_KRETPROBE_READ_RET_IP		BPF_KPROBE_READ_RET_IP
 #else
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
index e97565243d59..6fedb9fec36b 100644
--- a/tools/testing/selftests/bpf/test_lpm_map.c
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -31,6 +31,10 @@ struct tlpm_node {
 	uint8_t key[];
 };
 
+static struct tlpm_node *tlpm_match(struct tlpm_node *list,
+				    const uint8_t *key,
+				    size_t n_bits);
+
 static struct tlpm_node *tlpm_add(struct tlpm_node *list,
 				  const uint8_t *key,
 				  size_t n_bits)
@@ -38,9 +42,17 @@ static struct tlpm_node *tlpm_add(struct tlpm_node *list,
 	struct tlpm_node *node;
 	size_t n;
 
+	n = (n_bits + 7) / 8;
+
+	/* 'overwrite' an equivalent entry if one already exists */
+	node = tlpm_match(list, key, n_bits);
+	if (node && node->n_bits == n_bits) {
+		memcpy(node->key, key, n);
+		return list;
+	}
+
 	/* add new entry with @key/@n_bits to @list and return new head */
 
-	n = (n_bits + 7) / 8;
 	node = malloc(sizeof(*node) + n);
 	assert(node);
 
@@ -92,6 +104,34 @@ static struct tlpm_node *tlpm_match(struct tlpm_node *list,
 	return best;
 }
 
+static struct tlpm_node *tlpm_delete(struct tlpm_node *list,
+				     const uint8_t *key,
+				     size_t n_bits)
+{
+	struct tlpm_node *best = tlpm_match(list, key, n_bits);
+	struct tlpm_node *node;
+
+	if (!best || best->n_bits != n_bits)
+		return list;
+
+	if (best == list) {
+		node = best->next;
+		free(best);
+		return node;
+	}
+
+	for (node = list; node; node = node->next) {
+		if (node->next == best) {
+			node->next = best->next;
+			free(best);
+			return list;
+		}
+	}
+	/* should never get here */
+	assert(0);
+	return list;
+}
+
 static void test_lpm_basic(void)
 {
 	struct tlpm_node *list = NULL, *t1, *t2;
@@ -114,6 +154,13 @@ static void test_lpm_basic(void)
 	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 15));
 	assert(!tlpm_match(list, (uint8_t[]){ 0x7f, 0xff }, 16));
 
+	list = tlpm_delete(list, (uint8_t[]){ 0xff, 0xff }, 16);
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+
+	list = tlpm_delete(list, (uint8_t[]){ 0xff }, 8);
+	assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+
 	tlpm_clear(list);
 }
 
@@ -158,7 +205,7 @@ static void test_lpm_order(void)
 
 static void test_lpm_map(int keysize)
 {
-	size_t i, j, n_matches, n_nodes, n_lookups;
+	size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups;
 	struct tlpm_node *t, *list = NULL;
 	struct bpf_lpm_trie_key *key;
 	uint8_t *data, *value;
@@ -170,6 +217,7 @@ static void test_lpm_map(int keysize)
 	 */
 
 	n_matches = 0;
+	n_matches_after_delete = 0;
 	n_nodes = 1 << 8;
 	n_lookups = 1 << 16;
 
@@ -223,15 +271,54 @@ static void test_lpm_map(int keysize)
 		}
 	}
 
+	/* Remove the first half of the elements in the tlpm and the
+	 * corresponding nodes from the bpf-lpm.  Then run the same
+	 * large number of random lookups in both and make sure they match.
+	 * Note: we need to count the number of nodes actually inserted
+	 * since there may have been duplicates.
+	 */
+	for (i = 0, t = list; t; i++, t = t->next)
+		;
+	for (j = 0; j < i / 2; ++j) {
+		key->prefixlen = list->n_bits;
+		memcpy(key->data, list->key, keysize);
+		r = bpf_map_delete_elem(map, key);
+		assert(!r);
+		list = tlpm_delete(list, list->key, list->n_bits);
+		assert(list);
+	}
+	for (i = 0; i < n_lookups; ++i) {
+		for (j = 0; j < keysize; ++j)
+			data[j] = rand() & 0xff;
+
+		t = tlpm_match(list, data, 8 * keysize);
+
+		key->prefixlen = 8 * keysize;
+		memcpy(key->data, data, keysize);
+		r = bpf_map_lookup_elem(map, key, value);
+		assert(!r || errno == ENOENT);
+		assert(!t == !!r);
+
+		if (t) {
+			++n_matches_after_delete;
+			assert(t->n_bits == value[keysize]);
+			for (j = 0; j < t->n_bits; ++j)
+				assert((t->key[j / 8] & (1 << (7 - j % 8))) ==
+				       (value[j / 8] & (1 << (7 - j % 8))));
+		}
+	}
+
 	close(map);
 	tlpm_clear(list);
 
 	/* With 255 random nodes in the map, we are pretty likely to match
 	 * something on every lookup. For statistics, use this:
 	 *
-	 *     printf("  nodes: %zu\n"
-	 *            "lookups: %zu\n"
-	 *            "matches: %zu\n", n_nodes, n_lookups, n_matches);
+	 *     printf("          nodes: %zu\n"
+	 *            "        lookups: %zu\n"
+	 *            "        matches: %zu\n"
+	 *            "matches(delete): %zu\n",
+	 *            n_nodes, n_lookups, n_matches, n_matches_after_delete);
 	 */
 }
 
@@ -331,6 +418,108 @@ static void test_lpm_ipaddr(void)
 	close(map_fd_ipv6);
 }
 
+static void test_lpm_delete(void)
+{
+	struct bpf_lpm_trie_key *key;
+	size_t key_size;
+	int map_fd;
+	__u64 value;
+
+	key_size = sizeof(*key) + sizeof(__u32);
+	key = alloca(key_size);
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+				key_size, sizeof(value),
+				100, BPF_F_NO_PREALLOC);
+	assert(map_fd >= 0);
+
+	/* Add nodes:
+	 * 192.168.0.0/16   (1)
+	 * 192.168.0.0/24   (2)
+	 * 192.168.128.0/24 (3)
+	 * 192.168.1.0/24   (4)
+	 *
+	 *         (1)
+	 *        /   \
+         *     (IM)    (3)
+	 *    /   \
+         *   (2)  (4)
+	 */
+	value = 1;
+	key->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	value = 2;
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	value = 3;
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	value = 4;
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	/* remove non-existent node */
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "10.0.0.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 &&
+		errno == ENOENT);
+
+	/* assert initial lookup */
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.0.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 2);
+
+	/* remove leaf node */
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.0.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 1);
+
+	/* remove leaf (and intermediary) node */
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.1.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 1);
+
+	/* remove root node */
+	key->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.128.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 3);
+
+	/* remove last node */
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.128.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 &&
+		errno == ENOENT);
+
+	close(map_fd);
+}
+
 int main(void)
 {
 	struct rlimit limit  = { RLIM_INFINITY, RLIM_INFINITY };
@@ -353,6 +542,8 @@ int main(void)
 
 	test_lpm_ipaddr();
 
+	test_lpm_delete();
+
 	printf("test_lpm: OK\n");
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 11ee25cea227..69427531408d 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -10,6 +10,7 @@
 #include <string.h>
 #include <assert.h>
 #include <stdlib.h>
+#include <time.h>
 
 #include <linux/types.h>
 typedef __u16 __sum16;
@@ -19,6 +20,8 @@ typedef __u16 __sum16;
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/tcp.h>
+#include <linux/filter.h>
+#include <linux/unistd.h>
 
 #include <sys/wait.h>
 #include <sys/resource.h>
@@ -273,16 +276,26 @@ static void test_bpf_obj_id(void)
 	const int nr_iters = 2;
 	const char *file = "./test_obj_id.o";
 	const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
+	const char *expected_prog_name = "test_obj_id";
+	const char *expected_map_name = "test_map_id";
+	const __u64 nsec_per_sec = 1000000000;
 
 	struct bpf_object *objs[nr_iters];
 	int prog_fds[nr_iters], map_fds[nr_iters];
 	/* +1 to test for the info_len returned by kernel */
 	struct bpf_prog_info prog_infos[nr_iters + 1];
 	struct bpf_map_info map_infos[nr_iters + 1];
+	/* Each prog only uses one map. +1 to test nr_map_ids
+	 * returned by kernel.
+	 */
+	__u32 map_ids[nr_iters + 1];
 	char jited_insns[128], xlated_insns[128], zeros[128];
 	__u32 i, next_id, info_len, nr_id_found, duration = 0;
+	struct timespec real_time_ts, boot_time_ts;
 	int sysctl_fd, jit_enabled = 0, err = 0;
 	__u64 array_value;
+	uid_t my_uid = getuid();
+	time_t now, load_time;
 
 	sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
 	if (sysctl_fd != -1) {
@@ -307,6 +320,7 @@ static void test_bpf_obj_id(void)
 	/* Check bpf_obj_get_info_by_fd() */
 	bzero(zeros, sizeof(zeros));
 	for (i = 0; i < nr_iters; i++) {
+		now = time(NULL);
 		err = bpf_prog_load(file, BPF_PROG_TYPE_SOCKET_FILTER,
 				    &objs[i], &prog_fds[i]);
 		/* test_obj_id.o is a dumb prog. It should never fail
@@ -316,6 +330,38 @@ static void test_bpf_obj_id(void)
 			error_cnt++;
 		assert(!err);
 
+		/* Insert a magic value to the map */
+		map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
+		assert(map_fds[i] >= 0);
+		err = bpf_map_update_elem(map_fds[i], &array_key,
+					  &array_magic_value, 0);
+		assert(!err);
+
+		/* Check getting map info */
+		info_len = sizeof(struct bpf_map_info) * 2;
+		bzero(&map_infos[i], info_len);
+		err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i],
+					     &info_len);
+		if (CHECK(err ||
+			  map_infos[i].type != BPF_MAP_TYPE_ARRAY ||
+			  map_infos[i].key_size != sizeof(__u32) ||
+			  map_infos[i].value_size != sizeof(__u64) ||
+			  map_infos[i].max_entries != 1 ||
+			  map_infos[i].map_flags != 0 ||
+			  info_len != sizeof(struct bpf_map_info) ||
+			  strcmp((char *)map_infos[i].name, expected_map_name),
+			  "get-map-info(fd)",
+			  "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X name %s(%s)\n",
+			  err, errno,
+			  map_infos[i].type, BPF_MAP_TYPE_ARRAY,
+			  info_len, sizeof(struct bpf_map_info),
+			  map_infos[i].key_size,
+			  map_infos[i].value_size,
+			  map_infos[i].max_entries,
+			  map_infos[i].map_flags,
+			  map_infos[i].name, expected_map_name))
+			goto done;
+
 		/* Check getting prog info */
 		info_len = sizeof(struct bpf_prog_info) * 2;
 		bzero(&prog_infos[i], info_len);
@@ -325,8 +371,16 @@ static void test_bpf_obj_id(void)
 		prog_infos[i].jited_prog_len = sizeof(jited_insns);
 		prog_infos[i].xlated_prog_insns = ptr_to_u64(xlated_insns);
 		prog_infos[i].xlated_prog_len = sizeof(xlated_insns);
+		prog_infos[i].map_ids = ptr_to_u64(map_ids + i);
+		prog_infos[i].nr_map_ids = 2;
+		err = clock_gettime(CLOCK_REALTIME, &real_time_ts);
+		assert(!err);
+		err = clock_gettime(CLOCK_BOOTTIME, &boot_time_ts);
+		assert(!err);
 		err = bpf_obj_get_info_by_fd(prog_fds[i], &prog_infos[i],
 					     &info_len);
+		load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec)
+			+ (prog_infos[i].load_time / nsec_per_sec);
 		if (CHECK(err ||
 			  prog_infos[i].type != BPF_PROG_TYPE_SOCKET_FILTER ||
 			  info_len != sizeof(struct bpf_prog_info) ||
@@ -334,9 +388,14 @@ static void test_bpf_obj_id(void)
 			  (jit_enabled &&
 			   !memcmp(jited_insns, zeros, sizeof(zeros))) ||
 			  !prog_infos[i].xlated_prog_len ||
-			  !memcmp(xlated_insns, zeros, sizeof(zeros)),
+			  !memcmp(xlated_insns, zeros, sizeof(zeros)) ||
+			  load_time < now - 60 || load_time > now + 60 ||
+			  prog_infos[i].created_by_uid != my_uid ||
+			  prog_infos[i].nr_map_ids != 1 ||
+			  *(int *)prog_infos[i].map_ids != map_infos[i].id ||
+			  strcmp((char *)prog_infos[i].name, expected_prog_name),
 			  "get-prog-info(fd)",
-			  "err %d errno %d i %d type %d(%d) info_len %u(%lu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d\n",
+			  "err %d errno %d i %d type %d(%d) info_len %u(%lu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n",
 			  err, errno, i,
 			  prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER,
 			  info_len, sizeof(struct bpf_prog_info),
@@ -344,36 +403,12 @@ static void test_bpf_obj_id(void)
 			  prog_infos[i].jited_prog_len,
 			  prog_infos[i].xlated_prog_len,
 			  !!memcmp(jited_insns, zeros, sizeof(zeros)),
-			  !!memcmp(xlated_insns, zeros, sizeof(zeros))))
-			goto done;
-
-		map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
-		assert(map_fds[i] >= 0);
-		err = bpf_map_update_elem(map_fds[i], &array_key,
-					  &array_magic_value, 0);
-		assert(!err);
-
-		/* Check getting map info */
-		info_len = sizeof(struct bpf_map_info) * 2;
-		bzero(&map_infos[i], info_len);
-		err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i],
-					     &info_len);
-		if (CHECK(err ||
-			  map_infos[i].type != BPF_MAP_TYPE_ARRAY ||
-			  map_infos[i].key_size != sizeof(__u32) ||
-			  map_infos[i].value_size != sizeof(__u64) ||
-			  map_infos[i].max_entries != 1 ||
-			  map_infos[i].map_flags != 0 ||
-			  info_len != sizeof(struct bpf_map_info),
-			  "get-map-info(fd)",
-			  "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X\n",
-			  err, errno,
-			  map_infos[i].type, BPF_MAP_TYPE_ARRAY,
-			  info_len, sizeof(struct bpf_map_info),
-			  map_infos[i].key_size,
-			  map_infos[i].value_size,
-			  map_infos[i].max_entries,
-			  map_infos[i].map_flags))
+			  !!memcmp(xlated_insns, zeros, sizeof(zeros)),
+			  load_time, now,
+			  prog_infos[i].created_by_uid, my_uid,
+			  prog_infos[i].nr_map_ids, 1,
+			  *(int *)prog_infos[i].map_ids, map_infos[i].id,
+			  prog_infos[i].name, expected_prog_name))
 			goto done;
 	}
 
@@ -382,6 +417,7 @@ static void test_bpf_obj_id(void)
 	next_id = 0;
 	while (!bpf_prog_get_next_id(next_id, &next_id)) {
 		struct bpf_prog_info prog_info = {};
+		__u32 saved_map_id;
 		int prog_fd;
 
 		info_len = sizeof(prog_info);
@@ -404,16 +440,33 @@ static void test_bpf_obj_id(void)
 
 		nr_id_found++;
 
+		/* Negative test:
+		 * prog_info.nr_map_ids = 1
+		 * prog_info.map_ids = NULL
+		 */
+		prog_info.nr_map_ids = 1;
+		err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
+		if (CHECK(!err || errno != EFAULT,
+			  "get-prog-fd-bad-nr-map-ids", "err %d errno %d(%d)",
+			  err, errno, EFAULT))
+			break;
+		bzero(&prog_info, sizeof(prog_info));
+		info_len = sizeof(prog_info);
+
+		saved_map_id = *(int *)(prog_infos[i].map_ids);
+		prog_info.map_ids = prog_infos[i].map_ids;
+		prog_info.nr_map_ids = 2;
 		err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
 		prog_infos[i].jited_prog_insns = 0;
 		prog_infos[i].xlated_prog_insns = 0;
 		CHECK(err || info_len != sizeof(struct bpf_prog_info) ||
-		      memcmp(&prog_info, &prog_infos[i], info_len),
+		      memcmp(&prog_info, &prog_infos[i], info_len) ||
+		      *(int *)prog_info.map_ids != saved_map_id,
 		      "get-prog-info(next_id->fd)",
-		      "err %d errno %d info_len %u(%lu) memcmp %d\n",
+		      "err %d errno %d info_len %u(%lu) memcmp %d map_id %u(%u)\n",
 		      err, errno, info_len, sizeof(struct bpf_prog_info),
-		      memcmp(&prog_info, &prog_infos[i], info_len));
-
+		      memcmp(&prog_info, &prog_infos[i], info_len),
+		      *(int *)prog_info.map_ids, saved_map_id);
 		close(prog_fd);
 	}
 	CHECK(nr_id_found != nr_iters,
@@ -495,6 +548,75 @@ static void test_pkt_md_access(void)
 	bpf_object__close(obj);
 }
 
+static void test_obj_name(void)
+{
+	struct {
+		const char *name;
+		int success;
+		int expected_errno;
+	} tests[] = {
+		{ "", 1, 0 },
+		{ "_123456789ABCDE", 1, 0 },
+		{ "_123456789ABCDEF", 0, EINVAL },
+		{ "_123456789ABCD\n", 0, EINVAL },
+	};
+	struct bpf_insn prog[] = {
+		BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	__u32 duration = 0;
+	int i;
+
+	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
+		size_t name_len = strlen(tests[i].name) + 1;
+		union bpf_attr attr;
+		size_t ncopy;
+		int fd;
+
+		/* test different attr.prog_name during BPF_PROG_LOAD */
+		ncopy = name_len < sizeof(attr.prog_name) ?
+			name_len : sizeof(attr.prog_name);
+		bzero(&attr, sizeof(attr));
+		attr.prog_type = BPF_PROG_TYPE_SCHED_CLS;
+		attr.insn_cnt = 2;
+		attr.insns = ptr_to_u64(prog);
+		attr.license = ptr_to_u64("");
+		memcpy(attr.prog_name, tests[i].name, ncopy);
+
+		fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+		CHECK((tests[i].success && fd < 0) ||
+		      (!tests[i].success && fd != -1) ||
+		      (!tests[i].success && errno != tests[i].expected_errno),
+		      "check-bpf-prog-name",
+		      "fd %d(%d) errno %d(%d)\n",
+		       fd, tests[i].success, errno, tests[i].expected_errno);
+
+		if (fd != -1)
+			close(fd);
+
+		/* test different attr.map_name during BPF_MAP_CREATE */
+		ncopy = name_len < sizeof(attr.map_name) ?
+			name_len : sizeof(attr.map_name);
+		bzero(&attr, sizeof(attr));
+		attr.map_type = BPF_MAP_TYPE_ARRAY;
+		attr.key_size = 4;
+		attr.value_size = 4;
+		attr.max_entries = 1;
+		attr.map_flags = 0;
+		memcpy(attr.map_name, tests[i].name, ncopy);
+		fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+		CHECK((tests[i].success && fd < 0) ||
+		      (!tests[i].success && fd != -1) ||
+		      (!tests[i].success && errno != tests[i].expected_errno),
+		      "check-bpf-map-name",
+		      "fd %d(%d) errno %d(%d)\n",
+		      fd, tests[i].success, errno, tests[i].expected_errno);
+
+		if (fd != -1)
+			close(fd);
+	}
+}
+
 int main(void)
 {
 	struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
@@ -507,6 +629,7 @@ int main(void)
 	test_tcp_estats();
 	test_bpf_obj_id();
 	test_pkt_md_access();
+	test_obj_name();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 26f3250bdcd2..cc91d0159f43 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -6645,6 +6645,325 @@ static struct bpf_test tests[] = {
 		.errstr = "BPF_END uses reserved fields",
 		.result = REJECT,
 	},
+	{
+		"meta access, test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 8),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet, off=-8",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test4",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_4),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test5",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_4, 3),
+			BPF_MOV64_IMM(BPF_REG_2, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_xdp_adjust_meta),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "R3 !read_ok",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test6",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test7",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test8",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test9",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 1),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test10",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_MOV64_IMM(BPF_REG_5, 42),
+			BPF_MOV64_IMM(BPF_REG_6, 24),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
+			BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_5, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test11",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_MOV64_IMM(BPF_REG_5, 42),
+			BPF_MOV64_IMM(BPF_REG_6, 24),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
+			BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_5, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"meta access, test12",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct xdp_md, data_meta)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_4, 5),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"bpf_exit with invalid return code. test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0xffffffff)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0x3)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test5",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x2; 0x0)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test6",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 is not a known value (ctx)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test7",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
+			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has unknown scalar value",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
@@ -6692,7 +7011,7 @@ static int create_map_in_map(void)
 		return inner_map_fd;
 	}
 
-	outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS,
+	outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL,
 					     sizeof(int), inner_map_fd, 1, 0);
 	if (outer_map_fd < 0)
 		printf("Failed to create array of maps '%s'!\n",
diff --git a/tools/testing/selftests/bpf/test_xdp_meta.c b/tools/testing/selftests/bpf/test_xdp_meta.c
new file mode 100644
index 000000000000..8d0182650653
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_meta.c
@@ -0,0 +1,53 @@
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/pkt_cls.h>
+
+#include "bpf_helpers.h"
+
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
+#define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem
+
+SEC("t")
+int ing_cls(struct __sk_buff *ctx)
+{
+	__u8 *data, *data_meta, *data_end;
+	__u32 diff = 0;
+
+	data_meta = ctx_ptr(ctx, data_meta);
+	data_end  = ctx_ptr(ctx, data_end);
+	data      = ctx_ptr(ctx, data);
+
+	if (data + ETH_ALEN > data_end ||
+	    data_meta + round_up(ETH_ALEN, 4) > data)
+		return TC_ACT_SHOT;
+
+	diff |= ((__u32 *)data_meta)[0] ^ ((__u32 *)data)[0];
+	diff |= ((__u16 *)data_meta)[2] ^ ((__u16 *)data)[2];
+
+	return diff ? TC_ACT_SHOT : TC_ACT_OK;
+}
+
+SEC("x")
+int ing_xdp(struct xdp_md *ctx)
+{
+	__u8 *data, *data_meta, *data_end;
+	int ret;
+
+	ret = bpf_xdp_adjust_meta(ctx, -round_up(ETH_ALEN, 4));
+	if (ret < 0)
+		return XDP_DROP;
+
+	data_meta = ctx_ptr(ctx, data_meta);
+	data_end  = ctx_ptr(ctx, data_end);
+	data      = ctx_ptr(ctx, data);
+
+	if (data + ETH_ALEN > data_end ||
+	    data_meta + round_up(ETH_ALEN, 4) > data)
+		return XDP_DROP;
+
+	__builtin_memcpy(data_meta, data, ETH_ALEN);
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh
new file mode 100755
index 000000000000..307aa856cee3
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_meta.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+cleanup()
+{
+	if [ "$?" = "0" ]; then
+		echo "selftests: test_xdp_meta [PASS]";
+	else
+		echo "selftests: test_xdp_meta [FAILED]";
+	fi
+
+	set +e
+	ip netns del ns1 2> /dev/null
+	ip netns del ns2 2> /dev/null
+}
+
+ip link set dev lo xdp off 2>/dev/null > /dev/null
+if [ $? -ne 0 ];then
+	echo "selftests: [SKIP] Could not run test without the ip xdp support"
+	exit 0
+fi
+set -e
+
+ip netns add ns1
+ip netns add ns2
+
+trap cleanup 0 2 3 6 9
+
+ip link add veth1 type veth peer name veth2
+
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+
+ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth1
+ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth2
+
+ip netns exec ns1 tc qdisc add dev veth1 clsact
+ip netns exec ns2 tc qdisc add dev veth2 clsact
+
+ip netns exec ns1 tc filter add dev veth1 ingress bpf da obj test_xdp_meta.o sec t
+ip netns exec ns2 tc filter add dev veth2 ingress bpf da obj test_xdp_meta.o sec t
+
+ip netns exec ns1 ip link set dev veth1 xdp obj test_xdp_meta.o sec x
+ip netns exec ns2 ip link set dev veth2 xdp obj test_xdp_meta.o sec x
+
+ip netns exec ns1 ip link set dev veth1 up
+ip netns exec ns2 ip link set dev veth2 up
+
+ip netns exec ns1 ping -c 1 10.1.1.22
+ip netns exec ns2 ping -c 1 10.1.1.11
+
+exit 0
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 57b5ff576240..e8c86c416ed0 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -15,6 +15,14 @@ check_err()
 	fi
 }
 
+# same but inverted -- used when command must fail for test to pass
+check_fail()
+{
+	if [ $1 -eq 0 ]; then
+		ret=1
+	fi
+}
+
 kci_add_dummy()
 {
 	ip link add name "$devdummy" type dummy
@@ -235,6 +243,196 @@ kci_test_addrlabel()
 	echo "PASS: ipv6 addrlabel"
 }
 
+kci_test_ifalias()
+{
+	ret=0
+	namewant=$(uuidgen)
+	syspathname="/sys/class/net/$devdummy/ifalias"
+
+	ip link set dev "$devdummy" alias "$namewant"
+	check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: cannot set interface alias of $devdummy to $namewant"
+		return 1
+	fi
+
+	ip link show "$devdummy" | grep -q "alias $namewant"
+	check_err $?
+
+	if [ -r "$syspathname" ] ; then
+		read namehave < "$syspathname"
+		if [ "$namewant" != "$namehave" ]; then
+			echo "FAIL: did set ifalias $namewant but got $namehave"
+			return 1
+		fi
+
+		namewant=$(uuidgen)
+		echo "$namewant" > "$syspathname"
+	        ip link show "$devdummy" | grep -q "alias $namewant"
+		check_err $?
+
+		# sysfs interface allows to delete alias again
+		echo "" > "$syspathname"
+
+	        ip link show "$devdummy" | grep -q "alias $namewant"
+		check_fail $?
+
+		for i in $(seq 1 100); do
+			uuidgen > "$syspathname" &
+		done
+
+		wait
+
+		# re-add the alias -- kernel should free mem when dummy dev is removed
+		ip link set dev "$devdummy" alias "$namewant"
+		check_err $?
+	fi
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: set interface alias $devdummy to $namewant"
+		return 1
+	fi
+
+	echo "PASS: set ifalias $namewant for $devdummy"
+}
+
+kci_test_vrf()
+{
+	vrfname="test-vrf"
+	ret=0
+
+	ip link show type vrf 2>/dev/null
+	if [ $? -ne 0 ]; then
+		echo "SKIP: vrf: iproute2 too old"
+		return 0
+	fi
+
+	ip link add "$vrfname" type vrf table 10
+	check_err $?
+	if [ $ret -ne 0 ];then
+		echo "FAIL: can't add vrf interface, skipping test"
+		return 0
+	fi
+
+	ip -br link show type vrf | grep -q "$vrfname"
+	check_err $?
+	if [ $ret -ne 0 ];then
+		echo "FAIL: created vrf device not found"
+		return 1
+	fi
+
+	ip link set dev "$vrfname" up
+	check_err $?
+
+	ip link set dev "$devdummy" master "$vrfname"
+	check_err $?
+	ip link del dev "$vrfname"
+	check_err $?
+
+	if [ $ret -ne 0 ];then
+		echo "FAIL: vrf"
+		return 1
+	fi
+
+	echo "PASS: vrf"
+}
+
+kci_test_encap_vxlan()
+{
+	ret=0
+	vxlan="test-vxlan0"
+	vlan="test-vlan0"
+	testns="$1"
+
+	ip netns exec "$testns" ip link add "$vxlan" type vxlan id 42 group 239.1.1.1 \
+		dev "$devdummy" dstport 4789 2>/dev/null
+	if [ $? -ne 0 ]; then
+		echo "FAIL: can't add vxlan interface, skipping test"
+		return 0
+	fi
+	check_err $?
+
+	ip netns exec "$testns" ip addr add 10.2.11.49/24 dev "$vxlan"
+	check_err $?
+
+	ip netns exec "$testns" ip link set up dev "$vxlan"
+	check_err $?
+
+	ip netns exec "$testns" ip link add link "$vxlan" name "$vlan" type vlan id 1
+	check_err $?
+
+	ip netns exec "$testns" ip link del "$vxlan"
+	check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: vxlan"
+		return 1
+	fi
+	echo "PASS: vxlan"
+}
+
+kci_test_encap_fou()
+{
+	ret=0
+	name="test-fou"
+	testns="$1"
+
+	ip fou help 2>&1 |grep -q 'Usage: ip fou'
+	if [ $? -ne 0 ];then
+		echo "SKIP: fou: iproute2 too old"
+		return 1
+	fi
+
+	ip netns exec "$testns" ip fou add port 7777 ipproto 47 2>/dev/null
+	if [ $? -ne 0 ];then
+		echo "FAIL: can't add fou port 7777, skipping test"
+		return 1
+	fi
+
+	ip netns exec "$testns" ip fou add port 8888 ipproto 4
+	check_err $?
+
+	ip netns exec "$testns" ip fou del port 9999 2>/dev/null
+	check_fail $?
+
+	ip netns exec "$testns" ip fou del port 7777
+	check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: fou"
+		return 1
+	fi
+
+	echo "PASS: fou"
+}
+
+# test various encap methods, use netns to avoid unwanted interference
+kci_test_encap()
+{
+	testns="testns"
+	ret=0
+
+	ip netns add "$testns"
+	if [ $? -ne 0 ]; then
+		echo "SKIP encap tests: cannot add net namespace $testns"
+		return 1
+	fi
+
+	ip netns exec "$testns" ip link set lo up
+	check_err $?
+
+	ip netns exec "$testns" ip link add name "$devdummy" type dummy
+	check_err $?
+	ip netns exec "$testns" ip link set "$devdummy" up
+	check_err $?
+
+	kci_test_encap_vxlan "$testns"
+	kci_test_encap_fou "$testns"
+
+	ip netns del "$testns"
+}
+
 kci_test_rtnl()
 {
 	kci_add_dummy
@@ -249,6 +447,9 @@ kci_test_rtnl()
 	kci_test_gre
 	kci_test_bridge
 	kci_test_addrlabel
+	kci_test_ifalias
+	kci_test_vrf
+	kci_test_encap
 
 	kci_del_dummy
 }