24 files changed, 229 insertions, 107 deletions
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 96066665e376..336774a535c3 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -657,12 +657,15 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 	if (compat_mode) {
 		if (a->type == TCA_OLD_COMPAT)
 			err = gnet_stats_start_copy_compat(skb, 0,
-				TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d);
+							   TCA_STATS,
+							   TCA_XSTATS,
+							   &p->tcfc_lock, &d,
+							   TCA_PAD);
 		else
 			return 0;
 	} else
 		err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
-					    &p->tcfc_lock, &d);
+					    &p->tcfc_lock, &d, TCA_ACT_PAD);
 
 	if (err < 0)
 		goto errout;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 8c9f1f0459ab..c7123e01c2ca 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -53,9 +53,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
+		bpf_compute_data_end(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
+		bpf_compute_data_end(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
@@ -156,7 +158,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
 	tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse);
 	tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires);
 
-	if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm))
+	if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm,
+			  TCA_ACT_BPF_PAD))
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index c0ed93ce2391..2ba700c765e0 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -163,7 +163,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
 	t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(ci->tcf_tm.expires);
-	if (nla_put(skb, TCA_CONNMARK_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
+			  TCA_CONNMARK_PAD))
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index d22426cdebc0..28e934ed038a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -549,7 +549,7 @@ static int tcf_csum_dump(struct sk_buff *skb,
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
-	if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 887fc1f209ff..ec5cc8435238 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -148,6 +148,20 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 	return action;
 }
 
+static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+				  u64 lastuse)
+{ /* .stats_update hook: fold caller-supplied counters into this action */
+	struct tcf_gact *gact = a->priv;
+	int action = READ_ONCE(gact->tcf_action); /* sample the verdict once */
+	struct tcf_t *tm = &gact->tcf_tm;
+
+	_bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets);
+	if (action == TC_ACT_SHOT) /* SHOT verdict: every packet is also a drop */
+		this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
+
+	tm->lastuse = lastuse; /* stored as given, no conversion here */
+}
+
 static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
@@ -177,7 +191,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 	t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
-	if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
 		goto nla_put_failure;
 	return skb->len;
 
@@ -207,6 +221,7 @@ static struct tc_action_ops act_gact_ops = {
 	.type		=	TCA_ACT_GACT,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_gact,
+	.stats_update	=	tcf_gact_stats_update,
 	.dump		=	tcf_gact_dump,
 	.init		=	tcf_gact_init,
 	.walk		=	tcf_gact_walker,
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 343d011aa818..658046dfe02d 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -556,7 +556,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 	t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
-	if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD))
 		goto nla_put_failure;
 
 	if (!is_zero_ether_addr(ife->eth_dst)) {
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8b5270008a6e..9f002ada7074 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -280,7 +280,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 	tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
 	tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
 	tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
-	if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm))
+	if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
 		goto nla_put_failure;
 	kfree(t);
 	return skb->len;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 8f3948dd38b8..128942bc9e42 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -36,14 +36,15 @@ static DEFINE_SPINLOCK(mirred_list_lock);
 static void tcf_mirred_release(struct tc_action *a, int bind)
 {
 	struct tcf_mirred *m = to_mirred(a);
-	struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1);
+	struct net_device *dev; /* fetched below, under mirred_list_lock */
 
 	/* We could be called either in a RCU callback or with RTNL lock held. */
 	spin_lock_bh(&mirred_list_lock);
 	list_del(&m->tcfm_list);
-	spin_unlock_bh(&mirred_list_lock);
+	dev = rcu_dereference_protected(m->tcfm_dev, 1);
 	if (dev)
 		dev_put(dev);
+	spin_unlock_bh(&mirred_list_lock); /* lock also covers the dev_put() */
 }
 
 static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
@@ -221,7 +222,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
 	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
-	if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
 		goto nla_put_failure;
 	return skb->len;
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 0f65cdfbfb1d..c0a879f940de 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -267,7 +267,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
-	if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
 		goto nla_put_failure;
 
 	return skb->len;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 429c3ab65142..c6e18f230af6 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -203,7 +203,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
-	if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
 		goto nla_put_failure;
 	kfree(opt);
 	return skb->len;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 3a33fb648a6d..e42f8daca147 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -161,7 +161,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
-	if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
 		goto nla_put_failure;
 	return skb->len;
 
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 69da5a8f0034..e928802966bc 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -171,7 +171,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
-	if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
 		goto nla_put_failure;
 	return skb->len;
 
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index c45f926dafb9..ac4adc812c12 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -185,7 +185,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 	t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(v->tcf_tm.expires);
-	if (nla_put(skb, TCA_VLAN_TM, sizeof(t), &t))
+	if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
 		goto nla_put_failure;
 	return skb->len;
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 425fe6a0eda3..7b342c779da7 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -96,9 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
+			bpf_compute_data_end(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
+			bpf_compute_data_end(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2181ffc76638..730aacafc22d 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -210,6 +210,25 @@ static void fl_hw_replace_filter(struct tcf_proto *tp,
 	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
 }
 
+static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
+{ /* sends a TC_CLSFLOWER_STATS request for filter f to the device */
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc; /* only .type and .cls_flower are set below */
+
+	if (!tc_should_offload(dev, 0)) /* bail out unless the helper approves */
+		return;
+
+	offload.command = TC_CLSFLOWER_STATS;
+	offload.cookie = (unsigned long)f; /* filter pointer doubles as cookie */
+	offload.exts = &f->exts;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); /* result ignored */
+}
+
 static bool fl_destroy(struct tcf_proto *tp, bool force)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -662,6 +681,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 			goto nla_put_failure;
 	}
 
+	fl_hw_update_stats(tp, f);
+
 	if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
 			    mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
 			    sizeof(key->eth.dst)) ||
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 563cdad76448..079b43b3c5d2 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -134,6 +134,11 @@ next_knode:
 		j = 0;
 #endif
 
+		if (tc_skip_sw(n->flags)) { /* flagged node: not evaluated here */
+			n = rcu_dereference_bh(n->next); /* step to the next knode */
+			goto next_knode;
+		}
+
 #ifdef CONFIG_CLS_U32_MARK
 		if ((skb->mark & n->mask) != n->val) {
 			n = rcu_dereference_bh(n->next);
@@ -443,13 +448,14 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	}
 }
 
-static void u32_replace_hw_hnode(struct tcf_proto *tp,
+static int u32_replace_hw_hnode(struct tcf_proto *tp,
 				 struct tc_u_hnode *h,
 				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
 	struct tc_to_netdev offload;
+	int err;
 
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
@@ -460,9 +466,13 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp,
 		offload.cls_u32->hnode.handle = h->handle;
 		offload.cls_u32->hnode.prio = h->prio;
 
-		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
-					      tp->protocol, &offload);
+		err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+						    tp->protocol, &offload);
+		if (tc_skip_sw(flags)) /* driver result matters only in this case */
+			return err;
 	}
+
+	return 0; /* every other path reports success, whatever err was */
 }
 
 static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
@@ -485,13 +495,14 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	}
 }
 
-static void u32_replace_hw_knode(struct tcf_proto *tp,
+static int u32_replace_hw_knode(struct tcf_proto *tp,
 				 struct tc_u_knode *n,
 				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
 	struct tc_to_netdev offload;
+	int err;
 
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
@@ -512,9 +523,13 @@ static void u32_replace_hw_knode(struct tcf_proto *tp,
 		if (n->ht_down)
 			offload.cls_u32->knode.link_handle = n->ht_down->handle;
 
-		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
-					      tp->protocol, &offload);
+		err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+						    tp->protocol, &offload);
+		if (tc_skip_sw(flags)) /* driver result matters only in this case */
+			return err;
 	}
+
+	return 0; /* every other path reports success, whatever err was */
 }
 
 static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
@@ -845,8 +860,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_U32_FLAGS])
+	if (tb[TCA_U32_FLAGS]) {
 		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+		if (!tc_flags_valid(flags)) /* reject unsupported flag combinations */
+			return -EINVAL; /* err is >= 0 here and would read as success */
+	}
 
 	n = (struct tc_u_knode *)*arg;
 	if (n) {
@@ -871,10 +889,15 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return err;
 		}
 
+		err = u32_replace_hw_knode(tp, new, flags); /* hardware first */
+		if (err) {
+			u32_destroy_key(tp, new, false); /* 'new' not yet swapped in */
+			return err;
+		}
+
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
-		u32_replace_hw_knode(tp, new, flags);
 		return 0;
 	}
 
@@ -978,6 +1001,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		struct tc_u_knode __rcu **ins;
 		struct tc_u_knode *pins;
 
+		err = u32_replace_hw_knode(tp, n, flags); /* before linking n into ht */
+		if (err)
+			goto errhw; /* shares the cleanup path used for n */
+
 		ins = &ht->ht[TC_U32_HASH(handle)];
 		for (pins = rtnl_dereference(*ins); pins;
 		     ins = &pins->next, pins = rtnl_dereference(*ins))
@@ -986,11 +1013,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-		u32_replace_hw_knode(tp, n, flags);
 		*arg = (unsigned long)n;
 		return 0;
 	}
 
+errhw: /* reached when u32_replace_hw_knode() fails; falls into the cleanup below */
 #ifdef CONFIG_CLS_U32_MARK
 	free_percpu(n->pcpu_success);
 errout:
@@ -1140,9 +1167,10 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 				gpf->kcnts[i] += pf->kcnts[i];
 		}
 
-		if (nla_put(skb, TCA_U32_PCNT,
-			    sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
-			    gpf)) {
+		if (nla_put_64bit(skb, TCA_U32_PCNT,
+				  sizeof(struct tc_u32_pcnt) +
+				  n->sel.nkeys * sizeof(u64),
+				  gpf, TCA_U32_PAD)) {
 			kfree(gpf);
 			goto nla_put_failure;
 		}
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index f2aabc0089da..a309a07ccb35 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -796,7 +796,7 @@ struct meta_type_ops {
 	int	(*dump)(struct sk_buff *, struct meta_value *, int);
 };
 
-static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
+static const struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
 	[TCF_META_TYPE_VAR] = {
 		.destroy = meta_var_destroy,
 		.compare = meta_var_compare,
@@ -812,7 +812,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
 	}
 };
 
-static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
+static inline const struct meta_type_ops *meta_type_ops(struct meta_value *v)
 {
 	return &__meta_type_ops[meta_type(v)];
 }
@@ -870,7 +870,7 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
 static void meta_delete(struct meta_match *meta)
 {
 	if (meta) {
-		struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
+		const struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
 
 		if (ops && ops->destroy) {
 			ops->destroy(&meta->lvalue);
@@ -964,7 +964,7 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
 {
 	struct meta_match *meta = (struct meta_match *) em->data;
 	struct tcf_meta_hdr hdr;
-	struct meta_type_ops *ops;
+	const struct meta_type_ops *ops;
 
 	memset(&hdr, 0, sizeof(hdr));
 	memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 3b180ff72f79..64f71a2155f3 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1365,7 +1365,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 		goto nla_put_failure;
 
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
-					 qdisc_root_sleeping_lock(q), &d) < 0)
+					 qdisc_root_sleeping_lock(q), &d,
+					 TCA_PAD) < 0)
 		goto nla_put_failure;
 
 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
@@ -1679,7 +1680,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 		goto nla_put_failure;
 
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
-					 qdisc_root_sleeping_lock(q), &d) < 0)
+					 qdisc_root_sleeping_lock(q), &d,
+					 TCA_PAD) < 0)
 		goto nla_put_failure;
 
 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 9b7e2980ee5c..dddf3bb65a32 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -49,6 +49,8 @@
 #include <linux/prefetch.h>
 #include <net/pkt_sched.h>
 #include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
 
 
 #define DEFAULT_CODEL_LIMIT 1000
@@ -64,20 +66,33 @@ struct codel_sched_data {
  * to dequeue a packet from queue. Note: backlog is handled in
  * codel, we dont need to reduce it here.
  */
-static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
 {
+	struct Qdisc *sch = ctx; /* opaque ctx is used as the Qdisc */
 	struct sk_buff *skb = __skb_dequeue(&sch->q);
 
+	if (skb) /* backlog is reduced here, only when a packet came out */
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+
 	prefetch(&skb->end); /* we'll need skb_shinfo() */
 	return skb;
 }
 
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+	struct Qdisc *sch = ctx; /* opaque ctx is used as the Qdisc */
+
+	qdisc_drop(skb, sch); /* drop callback just forwards to qdisc_drop() */
+}
+
 static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
 {
 	struct codel_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 
-	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
+	skb = codel_dequeue(sch, &sch->qstats.backlog, &q->params, &q->vars,
+			    &q->stats, qdisc_pkt_len, codel_get_enqueue_time,
+			    drop_func, dequeue_func); /* helpers passed as callbacks */
 
 	/* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
 	 * or HTB crashes. Defer it for next round.
@@ -173,9 +188,10 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->limit = DEFAULT_CODEL_LIMIT;
 
-	codel_params_init(&q->params, sch);
+	codel_params_init(&q->params);
 	codel_vars_init(&q->vars);
 	codel_stats_init(&q->stats);
+	q->params.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
 		int err = codel_change(sch, opt);
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index d3fc8f9dd3d4..6883a8971562 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -24,6 +24,8 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/codel.h>
+#include <net/codel_impl.h>
+#include <net/codel_qdisc.h>
 
 /*	Fair Queue CoDel.
  *
@@ -57,8 +59,12 @@ struct fq_codel_sched_data {
 	u32		flows_cnt;	/* number of flows */
 	u32		perturbation;	/* hash perturbation */
 	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
+	u32		drop_batch_size;
+	u32		memory_limit;
 	struct codel_params cparams;
 	struct codel_stats cstats;
+	u32		memory_usage;
+	u32		drop_overmemory;
 	u32		drop_overlimit;
 	u32		new_flow_count;
 
@@ -133,17 +139,21 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
 	skb->next = NULL;
 }
 
-static unsigned int fq_codel_drop(struct Qdisc *sch)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 	unsigned int maxbacklog = 0, idx = 0, i, len;
 	struct fq_codel_flow *flow;
+	unsigned int threshold;
+	unsigned int mem = 0;
 
-	/* Queue is full! Find the fat flow and drop packet from it.
+	/* Queue is full! Find the fat flow and drop packet(s) from it.
 	 * This might sound expensive, but with 1024 flows, we scan
 	 * 4KB of memory, and we dont need to handle a complex tree
 	 * in fast path (packet queue/enqueue) with many cache misses.
+	 * In stress mode, we'll try to drop 64 packets from the flow,
+	 * amortizing this linear lookup to one cache line per drop.
 	 */
 	for (i = 0; i < q->flows_cnt; i++) {
 		if (q->backlogs[i] > maxbacklog) {
@@ -151,15 +161,26 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
 			idx = i;
 		}
 	}
+
+	/* Our goal is to drop half of this fat flow backlog */
+	threshold = maxbacklog >> 1;
+
 	flow = &q->flows[idx];
-	skb = dequeue_head(flow);
-	len = qdisc_pkt_len(skb);
+	len = 0; /* bytes dropped so far */
+	i = 0; /* packets dropped so far */
+	do {
+		skb = dequeue_head(flow);
+		len += qdisc_pkt_len(skb);
+		mem += skb->truesize;
+		kfree_skb(skb);
+	} while (++i < max_packets && len < threshold); /* at least one skb goes */
+
+	flow->dropped += i;
 	q->backlogs[idx] -= len;
-	sch->q.qlen--;
-	qdisc_qstats_drop(sch);
-	qdisc_qstats_backlog_dec(sch, skb);
-	kfree_skb(skb);
-	flow->dropped++;
+	q->memory_usage -= mem; /* truesize of everything freed above */
+	sch->qstats.drops += i;
+	sch->qstats.backlog -= len;
+	sch->q.qlen -= i;
 	return idx;
 }
 
@@ -168,16 +189,17 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
 	unsigned int prev_backlog;
 
 	prev_backlog = sch->qstats.backlog;
-	fq_codel_drop(sch);
+	fq_codel_drop(sch, 1U);
 	return prev_backlog - sch->qstats.backlog;
 }
 
 static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
-	unsigned int idx, prev_backlog;
+	unsigned int idx, prev_backlog, prev_qlen;
 	struct fq_codel_flow *flow;
 	int uninitialized_var(ret);
+	bool memory_limited;
 
 	idx = fq_codel_classify(skb, sch, &ret);
 	if (idx == 0) {
@@ -200,28 +222,38 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		flow->deficit = q->quantum;
 		flow->dropped = 0;
 	}
-	if (++sch->q.qlen <= sch->limit)
+	q->memory_usage += skb->truesize; /* charged before the limit checks */
+	memory_limited = q->memory_usage > q->memory_limit;
+	if (++sch->q.qlen <= sch->limit && !memory_limited)
 		return NET_XMIT_SUCCESS;
 
 	prev_backlog = sch->qstats.backlog;
-	q->drop_overlimit++;
-	/* Return Congestion Notification only if we dropped a packet
-	 * from this flow.
+	prev_qlen = sch->q.qlen;
+
+	/* fq_codel_drop() is quite expensive, as it performs a linear search
+	 * in q->backlogs[] to find a fat flow.
+	 * So instead of dropping a single packet, drop half of its backlog
+	 * with a 64 packets limit to not add a too big cpu spike here.
 	 */
-	if (fq_codel_drop(sch) == idx)
-		return NET_XMIT_CN;
+	ret = fq_codel_drop(sch, q->drop_batch_size); /* index of trimmed flow */
+
+	q->drop_overlimit += prev_qlen - sch->q.qlen; /* packets just dropped */
+	if (memory_limited)
+		q->drop_overmemory += prev_qlen - sch->q.qlen;
+	/* As we dropped packet(s), better let upper stack know this */
+	qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,
+				  prev_backlog - sch->qstats.backlog);
 
-	/* As we dropped a packet, better let upper stack know this */
-	qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
-	return NET_XMIT_SUCCESS;
+	return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS; /* CN if own flow hit */
 }
 
 /* This is the specific function called from codel_dequeue()
  * to dequeue a packet from queue. Note: backlog is handled in
  * codel, we dont need to reduce it here.
  */
-static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
 {
+	struct Qdisc *sch = ctx; /* opaque ctx is used as the Qdisc */
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	struct fq_codel_flow *flow;
 	struct sk_buff *skb = NULL;
@@ -230,11 +262,20 @@ static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
 	if (flow->head) {
 		skb = dequeue_head(flow);
 		q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
+		q->memory_usage -= skb->truesize; /* undo the enqueue-time charge */
 		sch->q.qlen--;
+		sch->qstats.backlog -= qdisc_pkt_len(skb); /* qdisc-wide backlog */
 	}
 	return skb;
 }
 
+static void drop_func(struct sk_buff *skb, void *ctx)
+{
+	struct Qdisc *sch = ctx; /* opaque ctx is used as the Qdisc */
+
+	qdisc_drop(skb, sch); /* drop callback just forwards to qdisc_drop() */
+}
+
 static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -263,8 +304,9 @@ begin:
 	prev_ecn_mark = q->cstats.ecn_mark;
 	prev_backlog = sch->qstats.backlog;
 
-	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
-			    dequeue);
+	skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
+			    &flow->cvars, &q->cstats, qdisc_pkt_len,
+			    codel_get_enqueue_time, drop_func, dequeue_func);
 
 	flow->dropped += q->cstats.drop_count - prev_drop_count;
 	flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
@@ -313,6 +355,7 @@ static void fq_codel_reset(struct Qdisc *sch)
 	}
 	memset(q->backlogs, 0, q->flows_cnt * sizeof(u32));
 	sch->q.qlen = 0;
+	q->memory_usage = 0;
 }
 
 static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
@@ -323,6 +366,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
 	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
+	[TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },
+	[TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 },
 };
 
 static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -374,7 +419,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
 	if (tb[TCA_FQ_CODEL_QUANTUM])
 		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
 
-	while (sch->q.qlen > sch->limit) {
+	if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) /* lower bound of one packet */
+		q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));
+
+	if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) /* upper bound of 1U << 31 */
+		q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT]));
+
+	while (sch->q.qlen > sch->limit ||
+	       q->memory_usage > q->memory_limit) { /* shrink to the new limits */
 		struct sk_buff *skb = fq_codel_dequeue(sch);
 
 		q->cstats.drop_len += qdisc_pkt_len(skb);
@@ -419,13 +471,16 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->limit = 10*1024;
 	q->flows_cnt = 1024;
+	q->memory_limit = 32 << 20; /* 32 MBytes */
+	q->drop_batch_size = 64;
 	q->quantum = psched_mtu(qdisc_dev(sch));
 	q->perturbation = prandom_u32();
 	INIT_LIST_HEAD(&q->new_flows);
 	INIT_LIST_HEAD(&q->old_flows);
-	codel_params_init(&q->cparams, sch);
+	codel_params_init(&q->cparams);
 	codel_stats_init(&q->cstats);
 	q->cparams.ecn = true;
+	q->cparams.mtu = psched_mtu(qdisc_dev(sch));
 
 	if (opt) {
 		int err = fq_codel_change(sch, opt);
@@ -476,6 +531,10 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
 			q->cparams.ecn) ||
 	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
 			q->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,
+			q->drop_batch_size) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT,
+			q->memory_limit) ||
 	    nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
 			q->flows_cnt))
 		goto nla_put_failure;
@@ -504,6 +563,8 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
 	st.qdisc_stats.new_flow_count = q->new_flow_count;
 	st.qdisc_stats.ce_mark = q->cstats.ce_mark;
+	st.qdisc_stats.memory_usage  = q->memory_usage;
+	st.qdisc_stats.drop_overmemory = q->drop_overmemory;
 
 	list_for_each(pos, &q->new_flows)
 		st.qdisc_stats.new_flows_len++;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 80742edea96f..269dd71b3828 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -108,35 +108,6 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
 	return skb;
 }
 
-static inline int handle_dev_cpu_collision(struct sk_buff *skb,
-					   struct netdev_queue *dev_queue,
-					   struct Qdisc *q)
-{
-	int ret;
-
-	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
-		/*
-		 * Same CPU holding the lock. It may be a transient
-		 * configuration error, when hard_start_xmit() recurses. We
-		 * detect it by checking xmit owner and drop the packet when
-		 * deadloop is detected. Return OK to try the next skb.
-		 */
-		kfree_skb_list(skb);
-		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
-				     dev_queue->dev->name);
-		ret = qdisc_qlen(q);
-	} else {
-		/*
-		 * Another cpu is holding lock, requeue & delay xmits for
-		 * some time.
-		 */
-		__this_cpu_inc(softnet_data.cpu_collision);
-		ret = dev_requeue_skb(skb, q);
-	}
-
-	return ret;
-}
-
 /*
  * Transmit possibly several skbs, and handle the return status as
  * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
@@ -174,9 +145,6 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
 		ret = qdisc_qlen(q);
-	} else if (ret == NETDEV_TX_LOCKED) {
-		/* Driver try lock failed */
-		ret = handle_dev_cpu_collision(skb, txq, q);
 	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */
 		if (unlikely(ret != NETDEV_TX_BUSY))
@@ -259,13 +227,12 @@ unsigned long dev_trans_start(struct net_device *dev)
 
 	if (is_vlan_dev(dev))
 		dev = vlan_dev_real_dev(dev);
-	res = dev->trans_start;
-	for (i = 0; i < dev->num_tx_queues; i++) {
+	res = netdev_get_tx_queue(dev, 0)->trans_start; /* seed with queue 0 */
+	for (i = 1; i < dev->num_tx_queues; i++) { /* keep the latest of the rest */
 		val = netdev_get_tx_queue(dev, i)->trans_start;
 		if (val && time_after(val, res))
 			res = val;
 	}
-	dev->trans_start = res;
 
 	return res;
 }
@@ -288,10 +255,7 @@ static void dev_watchdog(unsigned long arg)
 				struct netdev_queue *txq;
 
 				txq = netdev_get_tx_queue(dev, i);
-				/*
-				 * old device drivers set dev->trans_start
-				 */
-				trans_start = txq->trans_start ? : dev->trans_start;
+				trans_start = txq->trans_start;
 				if (netif_xmit_stopped(txq) &&
 				    time_after(jiffies, (trans_start +
 							 dev->watchdog_timeo))) {
@@ -807,7 +771,7 @@ void dev_activate(struct net_device *dev)
 		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
 
 	if (need_watchdog) {
-		dev->trans_start = jiffies;
+		netif_trans_update(dev);
 		dev_watchdog_up(dev);
 	}
 }
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 87b02ed3d5f2..f6bf5818ed4d 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1122,10 +1122,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
-	    nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
+	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
+			      TCA_HTB_PAD))
 		goto nla_put_failure;
 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
-	    nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
+	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
+			      TCA_HTB_PAD))
 		goto nla_put_failure;
 
 	return nla_nest_end(skb, nest);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 4befe97a9034..205bed00dd34 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1051,7 +1051,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto nla_put_failure;
 
 	if (q->rate >= (1ULL << 32)) {
-		if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
+		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
+				      TCA_NETEM_PAD))
 			goto nla_put_failure;
 		rate.rate = ~0U;
 	} else {
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index c2fbde742f37..83b90b584fae 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -472,11 +472,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
 	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
-	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
+	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
+			      TCA_TBF_PAD))
 		goto nla_put_failure;
 	if (tbf_peak_present(q) &&
 	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
-	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
+	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
+			      TCA_TBF_PAD))
 		goto nla_put_failure;
 
 	return nla_nest_end(skb, nest);