From b592843c6723a850be70bf9618578082f3b73851 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Nov 2018 17:33:34 -0800
Subject: net: sched: add an offload dump helper

Qdisc dump operation of offload-capable qdiscs performs a few
extra steps which are identical among all the qdiscs.  Add
a helper to share this code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c  | 21 +++++++++++++++++++++
 net/sched/sch_prio.c | 16 +---------------
 net/sched/sch_red.c  | 17 +----------------
 3 files changed, 23 insertions(+), 31 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ca3b0f46de53..e534825d3d3a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -810,6 +810,27 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
 }
 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 
+int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
+			      void *type_data)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	int err;
+
+	sch->flags &= ~TCQ_F_OFFLOADED;
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return 0;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+	if (err == -EOPNOTSUPP)
+		return 0;
+
+	if (!err)
+		sch->flags |= TCQ_F_OFFLOADED;
+
+	return err;
+}
+EXPORT_SYMBOL(qdisc_offload_dump_helper);
+
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 			 u32 portid, u32 seq, u16 flags, int event)
 {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index f8af98621179..4bdd04c30ead 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -251,7 +251,6 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt,
 
 static int prio_dump_offload(struct Qdisc *sch)
 {
-	struct net_device *dev = qdisc_dev(sch);
 	struct tc_prio_qopt_offload hw_stats = {
 		.command = TC_PRIO_STATS,
 		.handle = sch->handle,
@@ -263,21 +262,8 @@ static int prio_dump_offload(struct Qdisc *sch)
 			},
 		},
 	};
-	int err;
-
-	sch->flags &= ~TCQ_F_OFFLOADED;
-	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
-		return 0;
-
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
-					    &hw_stats);
-	if (err == -EOPNOTSUPP)
-		return 0;
-
-	if (!err)
-		sch->flags |= TCQ_F_OFFLOADED;
 
-	return err;
+	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats);
 }
 
 static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3ce6c0a2c493..d5e441194397 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -281,7 +281,6 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
 
 static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 {
-	struct net_device *dev = qdisc_dev(sch);
 	struct tc_red_qopt_offload hw_stats = {
 		.command = TC_RED_STATS,
 		.handle = sch->handle,
@@ -291,22 +290,8 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 			.stats.qstats = &sch->qstats,
 		},
 	};
-	int err;
-
-	sch->flags &= ~TCQ_F_OFFLOADED;
-
-	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
-		return 0;
-
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-					    &hw_stats);
-	if (err == -EOPNOTSUPP)
-		return 0;
-
-	if (!err)
-		sch->flags |= TCQ_F_OFFLOADED;
 
-	return err;
+	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_RED, &hw_stats);
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
cgit v1.2.3


From dad54c0fab31c2ed813cb54bb024c65d6467d0b9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Nov 2018 17:33:35 -0800
Subject: net: sched: red: remove unnecessary red_dump_offload_stats parameter

Offload dump helper does not use opt parameter, remove it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index d5e441194397..2bf1d2fabc48 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -279,7 +279,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
 	return red_change(sch, opt, extack);
 }
 
-static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch)
 {
 	struct tc_red_qopt_offload hw_stats = {
 		.command = TC_RED_STATS,
@@ -309,7 +309,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	};
 	int err;
 
-	err = red_dump_offload_stats(sch, &opt);
+	err = red_dump_offload_stats(sch);
 	if (err)
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From 58f8927399eac8f2ac3c7289a2856b8b5516ba30 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Nov 2018 17:33:36 -0800
Subject: net: sched: set TCQ_F_OFFLOADED flag for MQ

PRIO and RED mark the qdisc with TCQ_F_OFFLOADED upon successful offload,
make MQ do the same.  The consistency will help with consistent
graft callback behaviour.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_mq.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f20f3a0f8424..1db5c1bf6ddd 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -38,9 +38,8 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
 }
 
-static void mq_offload_stats(struct Qdisc *sch)
+static int mq_offload_stats(struct Qdisc *sch)
 {
-	struct net_device *dev = qdisc_dev(sch);
 	struct tc_mq_qopt_offload opt = {
 		.command = TC_MQ_STATS,
 		.handle = sch->handle,
@@ -50,8 +49,7 @@ static void mq_offload_stats(struct Qdisc *sch)
 		},
 	};
 
-	if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
-		dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt);
 }
 
 static void mq_destroy(struct Qdisc *sch)
@@ -171,9 +169,8 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
-	mq_offload_stats(sch);
 
-	return 0;
+	return mq_offload_stats(sch);
 }
 
 static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
-- 
cgit v1.2.3


From bfaee9113f30abfa1f77ecb5e4a6f53a9d4c690c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Nov 2018 17:33:37 -0800
Subject: net: sched: add an offload graft helper

Qdisc graft operation of offload-capable qdiscs performs a few
extra steps which are identical among all the qdiscs.  Add
a helper to share this code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c  | 29 +++++++++++++++++++++++++++++
 net/sched/sch_prio.c | 27 +++------------------------
 2 files changed, 32 insertions(+), 24 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index e534825d3d3a..4b3af41cc1d7 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -831,6 +831,35 @@ int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
 }
 EXPORT_SYMBOL(qdisc_offload_dump_helper);
 
+void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
+				struct Qdisc *new, struct Qdisc *old,
+				enum tc_setup_type type, void *type_data,
+				struct netlink_ext_ack *extack)
+{
+	bool any_qdisc_is_offloaded;
+	int err;
+
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+
+	/* Don't report error if the graft is part of destroy operation. */
+	if (!err || !new || new == &noop_qdisc)
+		return;
+
+	/* Don't report error if the parent, the old child and the new
+	 * one are not offloaded.
+	 */
+	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
+	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
+	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
+
+	if (any_qdisc_is_offloaded)
+		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
+}
+EXPORT_SYMBOL(qdisc_offload_graft_helper);
+
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 			 u32 portid, u32 seq, u16 flags, int event)
 {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 4bdd04c30ead..63a90c5055ee 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -295,43 +295,22 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	struct tc_prio_qopt_offload graft_offload;
-	struct net_device *dev = qdisc_dev(sch);
 	unsigned long band = arg - 1;
-	bool any_qdisc_is_offloaded;
-	int err;
 
 	if (new == NULL)
 		new = &noop_qdisc;
 
 	*old = qdisc_replace(sch, new, &q->queues[band]);
 
-	if (!tc_can_offload(dev))
-		return 0;
-
 	graft_offload.handle = sch->handle;
 	graft_offload.parent = sch->parent;
 	graft_offload.graft_params.band = band;
 	graft_offload.graft_params.child_handle = new->handle;
 	graft_offload.command = TC_PRIO_GRAFT;
 
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
-					    &graft_offload);
-
-	/* Don't report error if the graft is part of destroy operation. */
-	if (err && new != &noop_qdisc) {
-		/* Don't report error if the parent, the old child and the new
-		 * one are not offloaded.
-		 */
-		any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
-		any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED;
-		if (*old)
-			any_qdisc_is_offloaded |= (*old)->flags &
-						   TCQ_F_OFFLOADED;
-
-		if (any_qdisc_is_offloaded)
-			NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
-	}
-
+	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+				   TC_SETUP_QDISC_PRIO, &graft_offload,
+				   extack);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9da93ece59f4a3e1544dfa2aa53e91f9e724abc6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Nov 2018 17:33:38 -0800
Subject: net: sched: refactor grafting Qdiscs with a parent

The code for grafting Qdiscs when there is a parent has two needless
indentation levels, and breaks the "keep the success path unindented"
guideline.  Refactor.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 4b3af41cc1d7..f55bc50cd0a9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1007,7 +1007,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 {
 	struct Qdisc *q = old;
 	struct net *net = dev_net(dev);
-	int err = 0;
 
 	if (parent == NULL) {
 		unsigned int i, num_q, ingress;
@@ -1062,28 +1061,29 @@ skip:
 			dev_activate(dev);
 	} else {
 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+		unsigned long cl;
+		int err;
 
 		/* Only support running class lockless if parent is lockless */
 		if (new && (new->flags & TCQ_F_NOLOCK) &&
 		    parent && !(parent->flags & TCQ_F_NOLOCK))
 			new->flags &= ~TCQ_F_NOLOCK;
 
-		err = -EOPNOTSUPP;
-		if (cops && cops->graft) {
-			unsigned long cl = cops->find(parent, classid);
+		if (!cops || !cops->graft)
+			return -EOPNOTSUPP;
 
-			if (cl) {
-				err = cops->graft(parent, cl, new, &old,
-						  extack);
-			} else {
-				NL_SET_ERR_MSG(extack, "Specified class not found");
-				err = -ENOENT;
-			}
+		cl = cops->find(parent, classid);
+		if (!cl) {
+			NL_SET_ERR_MSG(extack, "Specified class not found");
+			return -ENOENT;
 		}
-		if (!err)
-			notify_and_destroy(net, skb, n, classid, old, new);
+
+		err = cops->graft(parent, cl, new, &old, extack);
+		if (err)
+			return err;
+		notify_and_destroy(net, skb, n, classid, old, new);
 	}
-	return err;
+	return 0;
 }
 
 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
-- 
cgit v1.2.3


From 0c8d13ac96070000da33f394f45e9c19638483c5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Nov 2018 17:33:39 -0800
Subject: net: sched: red: delay destroying child qdisc on replace

Move destroying of the old child qdisc outside of the sch_tree_lock()
section.  This should improve the software qdisc replace but is even
more important for offloads.  Firstly calling offloads under a spin
lock is best avoided.  Secondly the destroy event of existing child
would have been sent to the offload device before the replace, causing
confusion.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 2bf1d2fabc48..7682f7a618a1 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -193,10 +193,10 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
 static int red_change(struct Qdisc *sch, struct nlattr *opt,
 		      struct netlink_ext_ack *extack)
 {
+	struct Qdisc *old_child = NULL, *child = NULL;
 	struct red_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_RED_MAX + 1];
 	struct tc_red_qopt *ctl;
-	struct Qdisc *child = NULL;
 	int err;
 	u32 max_P;
 
@@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 	if (child) {
 		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
 					  q->qdisc->qstats.backlog);
-		qdisc_put(q->qdisc);
+		old_child = q->qdisc;
 		q->qdisc = child;
 	}
 
@@ -252,7 +252,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 		red_start_of_idle_period(&q->vars);
 
 	sch_tree_unlock(sch);
+
 	red_offload(sch, true);
+
+	if (old_child)
+		qdisc_put(old_child);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7b8e0b6e659983154c8d7e756cdb833d89a3d4d7 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Nov 2018 17:33:40 -0800
Subject: net: sched: prio: delay destroying child qdiscs on change

Move destroying of the old child qdiscs outside of the sch_tree_lock()
section.  This should improve the software qdisc replace but is even
more important for offloads.  Calling offloads under a spin lock is
best avoided, and child's destroy would be called under sch_tree_lock().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_prio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 63a90c5055ee..cdf68706e40f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -220,7 +220,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 
 		qdisc_tree_reduce_backlog(child, child->q.qlen,
 					  child->qstats.backlog);
-		qdisc_put(child);
 	}
 
 	for (i = oldbands; i < q->bands; i++) {
@@ -230,6 +229,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 	}
 
 	sch_tree_unlock(sch);
+
+	for (i = q->bands; i < oldbands; i++)
+		qdisc_put(q->queues[i]);
 	return 0;
 }
 
-- 
cgit v1.2.3


From b1817524c028a5a5284f21358185c74790001e0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= <mirq-linux@rere.qmqm.pl>
Date: Fri, 9 Nov 2018 00:18:02 +0100
Subject: net/core: use __vlan_hwaccel helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This removes assumptions about VLAN_TAG_PRESENT bit.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_vlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index ba677d54a7af..93fdaf707313 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -63,7 +63,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
 		/* extract existing tag (and guarantee no hw-accel tag) */
 		if (skb_vlan_tag_present(skb)) {
 			tci = skb_vlan_tag_get(skb);
-			skb->vlan_tci = 0;
+			__vlan_hwaccel_clear_tag(skb);
 		} else {
 			/* in-payload vlan tag, pop it */
 			err = __skb_vlan_pop(skb, &tci);
-- 
cgit v1.2.3


From 190852a55edbe138503259ea1bb40c08be221d75 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 8 Nov 2018 19:50:38 -0800
Subject: net: sched: red: inform offloads about harddrop setting

To mirror software behaviour on offload more precisely inform
the drivers about the state of the harddrop flag.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7682f7a618a1..a1d08bdd9357 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -167,6 +167,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		opt.set.max = q->parms.qth_max >> q->parms.Wlog;
 		opt.set.probability = q->parms.max_P;
 		opt.set.is_ecn = red_use_ecn(q);
+		opt.set.is_harddrop = red_use_harddrop(q);
 		opt.set.qstats = &sch->qstats;
 	} else {
 		opt.command = TC_RED_DESTROY;
-- 
cgit v1.2.3


From 7f76fa36754b08d9709ae50cd0a9477a6f998b21 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Fri, 9 Nov 2018 21:21:26 -0800
Subject: net: sched: register callbacks for indirect tc block binds

Currently drivers can register to receive TC block bind/unbind callbacks
by implementing the setup_tc ndo in any of their given netdevs. However,
drivers may also be interested in binds to higher level devices (e.g.
tunnel drivers) to potentially offload filters applied to them.

Introduce indirect block devs which allows drivers to register callbacks
for block binds on other devices. The callback is triggered when the
device is bound to a block, allowing the driver to register for rules
applied to that block using already available functions.

Freeing an indirect block callback will trigger an unbind event (if
necessary) to direct the driver to remove any offloaded rules and unreg
any block rule callbacks. It is the responsibility of the implementing
driver to clean any registered indirect block callbacks before exiting,
if the block it still active at such a time.

Allow registering an indirect block dev callback for a device that is
already bound to a block. In this case (if it is an ingress block),
register and also trigger the callback meaning that any already installed
rules can be replayed to the calling driver.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 255 insertions(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f427a1e00e7e..d92f44ac4c39 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -25,6 +25,7 @@
 #include <linux/kmod.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/rhashtable.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/netlink.h>
@@ -365,6 +366,245 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 	}
 }
 
+static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
+{
+	const struct Qdisc_class_ops *cops;
+	struct Qdisc *qdisc;
+
+	if (!dev_ingress_queue(dev))
+		return NULL;
+
+	qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
+	if (!qdisc)
+		return NULL;
+
+	cops = qdisc->ops->cl_ops;
+	if (!cops)
+		return NULL;
+
+	if (!cops->tcf_block)
+		return NULL;
+
+	return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
+}
+
+static struct rhashtable indr_setup_block_ht;
+
+struct tc_indr_block_dev {
+	struct rhash_head ht_node;
+	struct net_device *dev;
+	unsigned int refcnt;
+	struct list_head cb_list;
+	struct tcf_block *block;
+};
+
+struct tc_indr_block_cb {
+	struct list_head list;
+	void *cb_priv;
+	tc_indr_block_bind_cb_t *cb;
+	void *cb_ident;
+};
+
+static const struct rhashtable_params tc_indr_setup_block_ht_params = {
+	.key_offset	= offsetof(struct tc_indr_block_dev, dev),
+	.head_offset	= offsetof(struct tc_indr_block_dev, ht_node),
+	.key_len	= sizeof(struct net_device *),
+};
+
+static struct tc_indr_block_dev *
+tc_indr_block_dev_lookup(struct net_device *dev)
+{
+	return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
+				      tc_indr_setup_block_ht_params);
+}
+
+static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
+{
+	struct tc_indr_block_dev *indr_dev;
+
+	indr_dev = tc_indr_block_dev_lookup(dev);
+	if (indr_dev)
+		goto inc_ref;
+
+	indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+	if (!indr_dev)
+		return NULL;
+
+	INIT_LIST_HEAD(&indr_dev->cb_list);
+	indr_dev->dev = dev;
+	indr_dev->block = tc_dev_ingress_block(dev);
+	if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+				   tc_indr_setup_block_ht_params)) {
+		kfree(indr_dev);
+		return NULL;
+	}
+
+inc_ref:
+	indr_dev->refcnt++;
+	return indr_dev;
+}
+
+static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
+{
+	if (--indr_dev->refcnt)
+		return;
+
+	rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+			       tc_indr_setup_block_ht_params);
+	kfree(indr_dev);
+}
+
+static struct tc_indr_block_cb *
+tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
+			tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct tc_indr_block_cb *indr_block_cb;
+
+	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+		if (indr_block_cb->cb == cb &&
+		    indr_block_cb->cb_ident == cb_ident)
+			return indr_block_cb;
+	return NULL;
+}
+
+static struct tc_indr_block_cb *
+tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
+		     tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct tc_indr_block_cb *indr_block_cb;
+
+	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+	if (indr_block_cb)
+		return ERR_PTR(-EEXIST);
+
+	indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
+	if (!indr_block_cb)
+		return ERR_PTR(-ENOMEM);
+
+	indr_block_cb->cb_priv = cb_priv;
+	indr_block_cb->cb = cb;
+	indr_block_cb->cb_ident = cb_ident;
+	list_add(&indr_block_cb->list, &indr_dev->cb_list);
+
+	return indr_block_cb;
+}
+
+static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
+{
+	list_del(&indr_block_cb->list);
+	kfree(indr_block_cb);
+}
+
+static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
+				  struct tc_indr_block_cb *indr_block_cb,
+				  enum tc_block_command command)
+{
+	struct tc_block_offload bo = {
+		.command	= command,
+		.binder_type	= TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
+		.block		= indr_dev->block,
+	};
+
+	if (!indr_dev->block)
+		return;
+
+	indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+			  &bo);
+}
+
+int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct tc_indr_block_cb *indr_block_cb;
+	struct tc_indr_block_dev *indr_dev;
+	int err;
+
+	indr_dev = tc_indr_block_dev_get(dev);
+	if (!indr_dev)
+		return -ENOMEM;
+
+	indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
+	err = PTR_ERR_OR_ZERO(indr_block_cb);
+	if (err)
+		goto err_dev_put;
+
+	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_BIND);
+	return 0;
+
+err_dev_put:
+	tc_indr_block_dev_put(indr_dev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
+
+int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+			      tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	int err;
+
+	rtnl_lock();
+	err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
+	rtnl_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
+
+void __tc_indr_block_cb_unregister(struct net_device *dev,
+				   tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct tc_indr_block_cb *indr_block_cb;
+	struct tc_indr_block_dev *indr_dev;
+
+	indr_dev = tc_indr_block_dev_lookup(dev);
+	if (!indr_dev)
+		return;
+
+	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+	if (!indr_block_cb)
+		return;
+
+	/* Send unbind message if required to free any block cbs. */
+	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, TC_BLOCK_UNBIND);
+	tc_indr_block_cb_del(indr_block_cb);
+	tc_indr_block_dev_put(indr_dev);
+}
+EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
+
+void tc_indr_block_cb_unregister(struct net_device *dev,
+				 tc_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	rtnl_lock();
+	__tc_indr_block_cb_unregister(dev, cb, cb_ident);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
+
+static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
+			       struct tcf_block_ext_info *ei,
+			       enum tc_block_command command,
+			       struct netlink_ext_ack *extack)
+{
+	struct tc_indr_block_cb *indr_block_cb;
+	struct tc_indr_block_dev *indr_dev;
+	struct tc_block_offload bo = {
+		.command	= command,
+		.binder_type	= ei->binder_type,
+		.block		= block,
+		.extack		= extack,
+	};
+
+	indr_dev = tc_indr_block_dev_lookup(dev);
+	if (!indr_dev)
+		return;
+
+	indr_dev->block = command == TC_BLOCK_BIND ? block : NULL;
+
+	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+		indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+				  &bo);
+}
+
 static bool tcf_block_offload_in_use(struct tcf_block *block)
 {
 	return block->offloadcnt;
@@ -406,12 +646,17 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
 	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
 	if (err == -EOPNOTSUPP)
 		goto no_offload_dev_inc;
-	return err;
+	if (err)
+		return err;
+
+	tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
+	return 0;
 
 no_offload_dev_inc:
 	if (tcf_block_offload_in_use(block))
 		return -EOPNOTSUPP;
 	block->nooffloaddevcnt++;
+	tc_indr_block_call(block, dev, ei, TC_BLOCK_BIND, extack);
 	return 0;
 }
 
@@ -421,6 +666,8 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 	struct net_device *dev = q->dev_queue->dev;
 	int err;
 
+	tc_indr_block_call(block, dev, ei, TC_BLOCK_UNBIND, NULL);
+
 	if (!dev->netdev_ops->ndo_setup_tc)
 		goto no_offload_dev_dec;
 	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
@@ -2355,6 +2602,11 @@ static int __init tc_filter_init(void)
 	if (err)
 		goto err_register_pernet_subsys;
 
+	err = rhashtable_init(&indr_setup_block_ht,
+			      &tc_indr_setup_block_ht_params);
+	if (err)
+		goto err_rhash_setup_block_ht;
+
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
@@ -2366,6 +2618,8 @@ static int __init tc_filter_init(void)
 
 	return 0;
 
+err_rhash_setup_block_ht:
+	unregister_pernet_subsys(&tcf_net_ops);
 err_register_pernet_subsys:
 	destroy_workqueue(tc_filter_wq);
 	return err;
-- 
cgit v1.2.3


From 48872c11b77271ef9b070bdc50afe6655c4eb9aa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 11 Nov 2018 09:11:31 -0800
Subject: net_sched: sch_fq: add dctcp-like marking

Similar to 80ba92fa1a92 ("codel: add ce_threshold attribute")

After EDT adoption, it became easier to implement DCTCP-like CE marking.

In many cases, queues are not building in the network fabric but on
the hosts themselves.

If packets leaving fq missed their Earliest Departure Time by XXX usec,
we mark them with ECN CE. This gives a feedback (after one RTT) to
the sender to slow down and find better operating mode.

Example :

tc qd replace dev eth0 root fq ce_threshold 2.5ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4b1af706896c..3671eab91107 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -94,6 +94,7 @@ struct fq_sched_data {
 	u32		flow_refill_delay;
 	u32		flow_plimit;	/* max packets per flow */
 	unsigned long	flow_max_rate;	/* optional max rate per flow */
+	u64		ce_threshold;
 	u32		orphan_mask;	/* mask for orphaned skb */
 	u32		low_rate_threshold;
 	struct rb_root	*fq_root;
@@ -107,6 +108,7 @@ struct fq_sched_data {
 	u64		stat_gc_flows;
 	u64		stat_internal_packets;
 	u64		stat_throttled;
+	u64		stat_ce_mark;
 	u64		stat_flows_plimit;
 	u64		stat_pkts_too_long;
 	u64		stat_allocation_errors;
@@ -454,6 +456,11 @@ begin:
 			fq_flow_set_throttled(q, f);
 			goto begin;
 		}
+		if (time_next_packet &&
+		    (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+			INET_ECN_set_ce(skb);
+			q->stat_ce_mark++;
+		}
 	}
 
 	skb = fq_dequeue_head(sch, f);
@@ -650,6 +657,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
 	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
 	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
+	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
 };
 
 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -729,6 +737,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_FQ_ORPHAN_MASK])
 		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
 
+	if (tb[TCA_FQ_CE_THRESHOLD])
+		q->ce_threshold = (u64)NSEC_PER_USEC *
+				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+
 	if (!err) {
 		sch_tree_unlock(sch);
 		err = fq_resize(sch, fq_log);
@@ -779,6 +791,10 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 	q->fq_trees_log		= ilog2(1024);
 	q->orphan_mask		= 1024 - 1;
 	q->low_rate_threshold	= 550000 / 8;
+
+	/* Default ce_threshold of 4294 seconds */
+	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;
+
 	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
 
 	if (opt)
@@ -792,6 +808,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 ce_threshold = q->ce_threshold;
 	struct nlattr *opts;
 
 	opts = nla_nest_start(skb, TCA_OPTIONS);
@@ -800,6 +817,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
 
+	do_div(ce_threshold, NSEC_PER_USEC);
+
 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
@@ -812,6 +831,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
 	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
 			q->low_rate_threshold) ||
+	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;
 
@@ -841,6 +861,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.throttled_flows	  = q->throttled_flows;
 	st.unthrottle_latency_ns  = min_t(unsigned long,
 					  q->unthrottle_latency_ns, ~0U);
+	st.ce_mark		  = q->stat_ce_mark;
 	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));
-- 
cgit v1.2.3


From 98b0e5f6842a9982a793f0837b1bd1495542a3d8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 12 Nov 2018 14:58:10 -0800
Subject: net: sched: provide notification for graft on root

Drivers are currently not notified when a Qdisc is grafted as root.
This requires special casing Qdiscs added with parent = TC_H_ROOT in
the driver.  Also there is no notification sent to the driver when
an existing Qdisc is grafted as root.

Add this very simple notifications, drivers should now be able to
track their Qdisc tree fully.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f55bc50cd0a9..9c88cec7e8a2 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -860,6 +860,21 @@ void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
 }
 EXPORT_SYMBOL(qdisc_offload_graft_helper);
 
+static void qdisc_offload_graft_root(struct net_device *dev,
+				     struct Qdisc *new, struct Qdisc *old,
+				     struct netlink_ext_ack *extack)
+{
+	struct tc_root_qopt_offload graft_offload = {
+		.command	= TC_ROOT_GRAFT,
+		.handle		= new ? new->handle : 0,
+		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
+				  (old && old->flags & TCQ_F_INGRESS),
+	};
+
+	qdisc_offload_graft_helper(dev, NULL, new, old,
+				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
+}
+
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 			 u32 portid, u32 seq, u16 flags, int event)
 {
@@ -1026,6 +1041,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		if (dev->flags & IFF_UP)
 			dev_deactivate(dev);
 
+		qdisc_offload_graft_root(dev, new, old, extack);
+
 		if (new && new->ops->attach)
 			goto skip;
 
-- 
cgit v1.2.3


From bf2a752bea027ec5a0bc5b4042d78b32715ad198 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 12 Nov 2018 14:58:13 -0800
Subject: net: sched: red: offload a graft notification

Drivers offloading Qdiscs should have reasonable certainty
the offloaded behaviour matches the SW path.  This is impossible
if the driver does not know about all Qdiscs or when Qdiscs move
and are reused.  Send a graft notification from RED.  The drivers
are expected to simply stop offloading the Qdisc, if a non-standard
child is ever grafted onto it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a1d08bdd9357..4b5ca172ee2d 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -367,6 +367,21 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl,
 	return 0;
 }
 
+static void red_graft_offload(struct Qdisc *sch,
+			      struct Qdisc *new, struct Qdisc *old,
+			      struct netlink_ext_ack *extack)
+{
+	struct tc_red_qopt_offload graft_offload = {
+		.handle		= sch->handle,
+		.parent		= sch->parent,
+		.child_handle	= new->handle,
+		.command	= TC_RED_GRAFT,
+	};
+
+	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
+				   TC_SETUP_QDISC_RED, &graft_offload, extack);
+}
+
 static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 		     struct Qdisc **old, struct netlink_ext_ack *extack)
 {
@@ -376,6 +391,8 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 		new = &noop_qdisc;
 
 	*old = qdisc_replace(sch, new, &q->qdisc);
+
+	red_graft_offload(sch, new, *old, extack);
 	return 0;
 }
 
-- 
cgit v1.2.3


From d577a3d279c3c60adabdcc4b7a414d37dea7b8b2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 12 Nov 2018 14:58:14 -0800
Subject: net: sched: mq: offload a graft notification

Drivers offloading Qdiscs should have reasonable certainty
the offloaded behaviour matches the SW path.  This is impossible
if the driver does not know about all Qdiscs or when Qdiscs move
and are reused.  Send a graft notification from MQ.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_mq.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 1db5c1bf6ddd..203659bc3906 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -193,6 +193,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 		    struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+	struct tc_mq_qopt_offload graft_offload;
 	struct net_device *dev = qdisc_dev(sch);
 
 	if (dev->flags & IFF_UP)
@@ -203,6 +204,14 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
 		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
 	if (dev->flags & IFF_UP)
 		dev_activate(dev);
+
+	graft_offload.handle = sch->handle;
+	graft_offload.graft_params.queue = cl - 1;
+	graft_offload.graft_params.child_handle = new ? new->handle : 0;
+	graft_offload.command = TC_MQ_GRAFT;
+
+	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+				   TC_SETUP_QDISC_MQ, &graft_offload, extack);
 	return 0;
 }
 
-- 
cgit v1.2.3


From c0b7490b19f6ab43c3c4ef82c8d5ed3bf19a8913 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 12 Nov 2018 14:58:16 -0800
Subject: net: sched: red: notify drivers about RED's limit parameter

RED qdisc's limit parameter changes the behaviour of the qdisc,
for instance if it's set to 0 qdisc will drop all the packets.

When replace operation happens and parameter is set to non-0
a new fifo qdisc will be instantiated and replace the old child
qdisc which will be destroyed.

Drivers need to know the parameter, even if they don't impose
the actual limit to be able to reliably reconstruct the Qdisc
hierarchy.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_red.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 4b5ca172ee2d..9df9942340ea 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -166,6 +166,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
 		opt.set.min = q->parms.qth_min >> q->parms.Wlog;
 		opt.set.max = q->parms.qth_max >> q->parms.Wlog;
 		opt.set.probability = q->parms.max_P;
+		opt.set.limit = q->limit;
 		opt.set.is_ecn = red_use_ecn(q);
 		opt.set.is_harddrop = red_use_harddrop(q);
 		opt.set.qstats = &sch->qstats;
-- 
cgit v1.2.3


From 5c72299fba9df407c2f2994e194edebf878996ee Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Mon, 12 Nov 2018 16:15:55 -0800
Subject: net: sched: cls_flower: Classify packets using port ranges

Added support in tc flower for filtering based on port ranges.

Example:
1. Match on a port range:
-------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 1 ref 1 bind 1 installed 85 sec used 3 sec
        Action statistics:
        Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

2. Match on IP address and port range:
--------------------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 2 ref 1 bind 1 installed 58 sec used 2 sec
        Action statistics:
        Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

v4:
1. Added condition before setting port key.
2. Organized setting and dumping port range keys into functions
   and added validation of input range.

v3:
1. Moved new fields in UAPI enum to the end of enum.
2. Removed couple of empty lines.

v2:
Addressed Jiri's comments:
1. Added separate functions for dst and src comparisons.
2. Removed endpoint enum.
3. Added new bit TCA_FLOWER_FLAGS_RANGE to decide normal/range
  lookup.
4. Cleaned up fl_lookup function.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 149 insertions(+), 6 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c6c327874abc..85e9f8e1da10 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,8 @@ struct fl_flow_key {
 	struct flow_dissector_key_ip ip;
 	struct flow_dissector_key_ip enc_ip;
 	struct flow_dissector_key_enc_opts enc_opts;
+	struct flow_dissector_key_ports tp_min;
+	struct flow_dissector_key_ports tp_max;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -65,6 +67,7 @@ struct fl_flow_mask_range {
 struct fl_flow_mask {
 	struct fl_flow_key key;
 	struct fl_flow_mask_range range;
+	u32 flags;
 	struct rhash_head ht_node;
 	struct rhashtable ht;
 	struct rhashtable_params filter_ht_params;
@@ -179,13 +182,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
 	memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
-				       struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+				  struct fl_flow_key *key,
+				  struct fl_flow_key *mkey)
+{
+	__be16 min_mask, max_mask, min_val, max_val;
+
+	min_mask = htons(filter->mask->key.tp_min.dst);
+	max_mask = htons(filter->mask->key.tp_max.dst);
+	min_val = htons(filter->key.tp_min.dst);
+	max_val = htons(filter->key.tp_max.dst);
+
+	if (min_mask && max_mask) {
+		if (htons(key->tp.dst) < min_val ||
+		    htons(key->tp.dst) > max_val)
+			return false;
+
+		/* skb does not have min and max values */
+		mkey->tp_min.dst = filter->mkey.tp_min.dst;
+		mkey->tp_max.dst = filter->mkey.tp_max.dst;
+	}
+	return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+				  struct fl_flow_key *key,
+				  struct fl_flow_key *mkey)
+{
+	__be16 min_mask, max_mask, min_val, max_val;
+
+	min_mask = htons(filter->mask->key.tp_min.src);
+	max_mask = htons(filter->mask->key.tp_max.src);
+	min_val = htons(filter->key.tp_min.src);
+	max_val = htons(filter->key.tp_max.src);
+
+	if (min_mask && max_mask) {
+		if (htons(key->tp.src) < min_val ||
+		    htons(key->tp.src) > max_val)
+			return false;
+
+		/* skb does not have min and max values */
+		mkey->tp_min.src = filter->mkey.tp_min.src;
+		mkey->tp_max.src = filter->mkey.tp_max.src;
+	}
+	return true;
+}
+
+static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask,
+					 struct fl_flow_key *mkey)
 {
 	return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
 				      mask->filter_ht_params);
 }
 
+static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask,
+					     struct fl_flow_key *mkey,
+					     struct fl_flow_key *key)
+{
+	struct cls_fl_filter *filter, *f;
+
+	list_for_each_entry_rcu(filter, &mask->filters, list) {
+		if (!fl_range_port_dst_cmp(filter, key, mkey))
+			continue;
+
+		if (!fl_range_port_src_cmp(filter, key, mkey))
+			continue;
+
+		f = __fl_lookup(mask, mkey);
+		if (f)
+			return f;
+	}
+	return NULL;
+}
+
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
+				       struct fl_flow_key *mkey,
+				       struct fl_flow_key *key)
+{
+	if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
+		return fl_lookup_range(mask, mkey, key);
+
+	return __fl_lookup(mask, mkey);
+}
+
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
@@ -208,7 +287,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 		fl_set_masked_key(&skb_mkey, &skb_key, mask);
 
-		f = fl_lookup(mask, &skb_mkey);
+		f = fl_lookup(mask, &skb_mkey, &skb_key);
 		if (f && !tc_skip_sw(f->flags)) {
 			*res = f->res;
 			return tcf_exts_exec(skb, &f->exts, res);
@@ -514,6 +593,31 @@ static void fl_set_key_val(struct nlattr **tb,
 		memcpy(mask, nla_data(tb[mask_type]), len);
 }
 
+static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
+				 struct fl_flow_key *mask)
+{
+	fl_set_key_val(tb, &key->tp_min.dst,
+		       TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
+	fl_set_key_val(tb, &key->tp_max.dst,
+		       TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
+	fl_set_key_val(tb, &key->tp_min.src,
+		       TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
+	fl_set_key_val(tb, &key->tp_max.src,
+		       TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
+		       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
+
+	if ((mask->tp_min.dst && mask->tp_max.dst &&
+	     htons(key->tp_max.dst) <= htons(key->tp_min.dst)) ||
+	     (mask->tp_min.src && mask->tp_max.src &&
+	      htons(key->tp_max.src) <= htons(key->tp_min.src)))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int fl_set_key_mpls(struct nlattr **tb,
 			   struct flow_dissector_key_mpls *key_val,
 			   struct flow_dissector_key_mpls *key_mask)
@@ -921,6 +1025,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			       sizeof(key->arp.tha));
 	}
 
+	if (key->basic.ip_proto == IPPROTO_TCP ||
+	    key->basic.ip_proto == IPPROTO_UDP ||
+	    key->basic.ip_proto == IPPROTO_SCTP) {
+		ret = fl_set_key_port_range(tb, key, mask);
+		if (ret)
+			return ret;
+	}
+
 	if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
 	    tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
 		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -1038,8 +1150,9 @@ static void fl_init_dissector(struct flow_dissector *dissector,
 			     FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
-	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
-			     FLOW_DISSECTOR_KEY_PORTS, tp);
+	if (FL_KEY_IS_MASKED(mask, tp) ||
+	    FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max))
+		FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_IP, ip);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1086,6 +1199,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
 
 	fl_mask_copy(newmask, mask);
 
+	if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
+	    (newmask->key.tp_min.src && newmask->key.tp_max.src))
+		newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
+
 	err = fl_init_mask_hashtable(newmask);
 	if (err)
 		goto errout_free;
@@ -1239,7 +1356,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		goto errout_idr;
 
 	if (!tc_skip_sw(fnew->flags)) {
-		if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
+		if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) {
 			err = -EEXIST;
 			goto errout_mask;
 		}
@@ -1476,6 +1593,26 @@ static int fl_dump_key_val(struct sk_buff *skb,
 	return 0;
 }
 
+static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key,
+				  struct fl_flow_key *mask)
+{
+	if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN,
+			    &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_min.dst)) ||
+	    fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX,
+			    &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_max.dst)) ||
+	    fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN,
+			    &mask->tp_min.src, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_min.src)) ||
+	    fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX,
+			    &mask->tp_max.src, TCA_FLOWER_UNSPEC,
+			    sizeof(key->tp_max.src)))
+		return -1;
+
+	return 0;
+}
+
 static int fl_dump_key_mpls(struct sk_buff *skb,
 			    struct flow_dissector_key_mpls *mpls_key,
 			    struct flow_dissector_key_mpls *mpls_mask)
@@ -1812,6 +1949,12 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
 				  sizeof(key->arp.tha))))
 		goto nla_put_failure;
 
+	if ((key->basic.ip_proto == IPPROTO_TCP ||
+	     key->basic.ip_proto == IPPROTO_UDP ||
+	     key->basic.ip_proto == IPPROTO_SCTP) &&
+	     fl_dump_key_port_range(skb, key, mask))
+		goto nla_put_failure;
+
 	if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
 	    (fl_dump_key_val(skb, &key->enc_ipv4.src,
 			    TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
-- 
cgit v1.2.3


From 3fcbdaee3b5c4a7f8ea4c7c11fecc009d2a1e7e4 Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Wed, 14 Nov 2018 17:26:32 -0800
Subject: etf: Cancel timer if there are no pending skbs

There is no point in firing the qdisc watchdog if there are no future
skbs pending in the queue and the watchdog had been set previously.

Signed-off-by: Jesus Sanchez-Palencia <jesus.s.palencia@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_etf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 1538d6fa8165..fa85b24ac794 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -117,8 +117,10 @@ static void reset_watchdog(struct Qdisc *sch)
 	struct sk_buff *skb = etf_peek_timesortedlist(sch);
 	ktime_t next;
 
-	if (!skb)
+	if (!skb) {
+		qdisc_watchdog_cancel(&q->watchdog);
 		return;
+	}
 
 	next = ktime_sub_ns(skb->tstamp, q->delta);
 	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
-- 
cgit v1.2.3


From 09fd4860ea258d9666f852c213875e6bcdb1204e Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Wed, 14 Nov 2018 17:26:33 -0800
Subject: etf: Use cached rb_root

ETF's peek() operation is heavily used so use an rb_root_cached instead
and leverage rb_first_cached() which will run in O(1) instead of
O(log n).

Even if on 'timesortedlist_clear()' we could be using rb_erase(), we
choose to use rb_erase_cached(), because if in the future we allow
runtime changes to ETF parameters, and need to do a '_clear()', this
might cause some hard to debug issues.

Signed-off-by: Jesus Sanchez-Palencia <jesus.s.palencia@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_etf.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index fa85b24ac794..52452b546564 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -30,7 +30,7 @@ struct etf_sched_data {
 	int queue;
 	s32 delta; /* in ns */
 	ktime_t last; /* The txtime of the last skb sent to the netdevice. */
-	struct rb_root head;
+	struct rb_root_cached head;
 	struct qdisc_watchdog watchdog;
 	ktime_t (*get_time)(void);
 };
@@ -104,7 +104,7 @@ static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
 	struct etf_sched_data *q = qdisc_priv(sch);
 	struct rb_node *p;
 
-	p = rb_first(&q->head);
+	p = rb_first_cached(&q->head);
 	if (!p)
 		return NULL;
 
@@ -156,8 +156,9 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 				      struct sk_buff **to_free)
 {
 	struct etf_sched_data *q = qdisc_priv(sch);
-	struct rb_node **p = &q->head.rb_node, *parent = NULL;
+	struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL;
 	ktime_t txtime = nskb->tstamp;
+	bool leftmost = true;
 
 	if (!is_packet_valid(sch, nskb)) {
 		report_sock_error(nskb, EINVAL,
@@ -170,13 +171,15 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 
 		parent = *p;
 		skb = rb_to_skb(parent);
-		if (ktime_after(txtime, skb->tstamp))
+		if (ktime_after(txtime, skb->tstamp)) {
 			p = &parent->rb_right;
-		else
+			leftmost = false;
+		} else {
 			p = &parent->rb_left;
+		}
 	}
 	rb_link_node(&nskb->rbnode, parent, p);
-	rb_insert_color(&nskb->rbnode, &q->head);
+	rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost);
 
 	qdisc_qstats_backlog_inc(sch, nskb);
 	sch->q.qlen++;
@@ -192,7 +195,7 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
 {
 	struct etf_sched_data *q = qdisc_priv(sch);
 
-	rb_erase(&skb->rbnode, &q->head);
+	rb_erase_cached(&skb->rbnode, &q->head);
 
 	/* The rbnode field in the skb re-uses these fields, now that
 	 * we are done with the rbnode, reset them.
@@ -388,14 +391,14 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt,
 static void timesortedlist_clear(struct Qdisc *sch)
 {
 	struct etf_sched_data *q = qdisc_priv(sch);
-	struct rb_node *p = rb_first(&q->head);
+	struct rb_node *p = rb_first_cached(&q->head);
 
 	while (p) {
 		struct sk_buff *skb = rb_to_skb(p);
 
 		p = rb_next(p);
 
-		rb_erase(&skb->rbnode, &q->head);
+		rb_erase_cached(&skb->rbnode, &q->head);
 		rtnl_kfree_skbs(skb, skb);
 		sch->q.qlen--;
 	}
-- 
cgit v1.2.3


From cbeeb8efec821188c770f582be345ed7b04a0b60 Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Wed, 14 Nov 2018 17:26:34 -0800
Subject: etf: Split timersortedlist_erase()

This is just a refactor that will simplify the implementation of the
next patch in this series which will drop all expired packets on the
dequeue flow.

Signed-off-by: Jesus Sanchez-Palencia <jesus.s.palencia@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_etf.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index 52452b546564..bfe04748d5f0 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -190,10 +190,10 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 	return NET_XMIT_SUCCESS;
 }
 
-static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
-				 bool drop)
+static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *to_free = NULL;
 
 	rb_erase_cached(&skb->rbnode, &q->head);
 
@@ -206,19 +206,33 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
 
 	qdisc_qstats_backlog_dec(sch, skb);
 
-	if (drop) {
-		struct sk_buff *to_free = NULL;
+	report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
 
-		report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+	qdisc_drop(skb, sch, &to_free);
+	kfree_skb_list(to_free);
+	qdisc_qstats_overlimit(sch);
 
-		qdisc_drop(skb, sch, &to_free);
-		kfree_skb_list(to_free);
-		qdisc_qstats_overlimit(sch);
-	} else {
-		qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+}
 
-		q->last = skb->tstamp;
-	}
+static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	rb_erase_cached(&skb->rbnode, &q->head);
+
+	/* The rbnode field in the skb re-uses these fields, now that
+	 * we are done with the rbnode, reset them.
+	 */
+	skb->next = NULL;
+	skb->prev = NULL;
+	skb->dev = qdisc_dev(sch);
+
+	qdisc_qstats_backlog_dec(sch, skb);
+
+	qdisc_bstats_update(sch, skb);
+
+	q->last = skb->tstamp;
 
 	sch->q.qlen--;
 }
@@ -237,7 +251,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
 
 	/* Drop if packet has expired while in queue. */
 	if (ktime_before(skb->tstamp, now)) {
-		timesortedlist_erase(sch, skb, true);
+		timesortedlist_drop(sch, skb);
 		skb = NULL;
 		goto out;
 	}
@@ -246,7 +260,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
 	 * txtime from deadline to (now + delta).
 	 */
 	if (q->deadline_mode) {
-		timesortedlist_erase(sch, skb, false);
+		timesortedlist_remove(sch, skb);
 		skb->tstamp = now;
 		goto out;
 	}
@@ -255,7 +269,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
 
 	/* Dequeue only if now is within the [txtime - delta, txtime] range. */
 	if (ktime_after(now, next))
-		timesortedlist_erase(sch, skb, false);
+		timesortedlist_remove(sch, skb);
 	else
 		skb = NULL;
 
-- 
cgit v1.2.3


From 37342bdaf5b363cf2e1bd170ce7d1de34ecf57e7 Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Wed, 14 Nov 2018 17:26:35 -0800
Subject: etf: Drop all expired packets

Currently on dequeue() ETF only drops the first expired packet, which
causes a problem if the next packet is already expired. When this
happens, the watchdog will be configured with a time in the past, fire
straight way and the packet will finally be dropped once the dequeue()
function of the qdisc is called again.

We can save quite a few cycles and improve the overall behavior of the
qdisc if we drop all expired packets if the next packet is expired.
This should allow ETF to recover faster from bad situations. But
packet drops are still a very serious warning that the requirements
imposed on the system aren't reasonable.

This was inspired by how the implementation of hrtimers use the
rb_tree inside the kernel.

Signed-off-by: Jesus Sanchez-Palencia <jesus.s.palencia@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_etf.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index bfe04748d5f0..1150f22983df 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -190,29 +190,35 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
 	return NET_XMIT_SUCCESS;
 }
 
-static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb)
+static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb,
+				ktime_t now)
 {
 	struct etf_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *to_free = NULL;
+	struct sk_buff *tmp = NULL;
 
-	rb_erase_cached(&skb->rbnode, &q->head);
+	skb_rbtree_walk_from_safe(skb, tmp) {
+		if (ktime_after(skb->tstamp, now))
+			break;
 
-	/* The rbnode field in the skb re-uses these fields, now that
-	 * we are done with the rbnode, reset them.
-	 */
-	skb->next = NULL;
-	skb->prev = NULL;
-	skb->dev = qdisc_dev(sch);
+		rb_erase_cached(&skb->rbnode, &q->head);
 
-	qdisc_qstats_backlog_dec(sch, skb);
+		/* The rbnode field in the skb re-uses these fields, now that
+		 * we are done with the rbnode, reset them.
+		 */
+		skb->next = NULL;
+		skb->prev = NULL;
+		skb->dev = qdisc_dev(sch);
 
-	report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+		report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
 
-	qdisc_drop(skb, sch, &to_free);
-	kfree_skb_list(to_free);
-	qdisc_qstats_overlimit(sch);
+		qdisc_qstats_backlog_dec(sch, skb);
+		qdisc_drop(skb, sch, &to_free);
+		qdisc_qstats_overlimit(sch);
+		sch->q.qlen--;
+	}
 
-	sch->q.qlen--;
+	kfree_skb_list(to_free);
 }
 
 static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb)
@@ -251,7 +257,7 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
 
 	/* Drop if packet has expired while in queue. */
 	if (ktime_before(skb->tstamp, now)) {
-		timesortedlist_drop(sch, skb);
+		timesortedlist_drop(sch, skb, now);
 		skb = NULL;
 		goto out;
 	}
-- 
cgit v1.2.3


From 255f4803ecc490319596d73f24ed775f99923a53 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:45 -0800
Subject: net: sched: gred: separate error and non-error path in gred_change()

We will soon want to add more code to the non-error path, separate
it from the error handling flow.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 4a042abf844c..22110be9d285 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -423,12 +423,11 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 
 	max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
 
-	err = -EINVAL;
 	ctl = nla_data(tb[TCA_GRED_PARMS]);
 	stab = nla_data(tb[TCA_GRED_STAB]);
 
 	if (ctl->DP >= table->DPs)
-		goto errout;
+		return -EINVAL;
 
 	if (gred_rio_mode(table)) {
 		if (ctl->prio == 0) {
@@ -450,7 +449,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 
 	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
 	if (err < 0)
-		goto errout_locked;
+		goto err_unlock_free;
 
 	if (gred_rio_mode(table)) {
 		gred_disable_wred_mode(table);
@@ -458,12 +457,13 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 			gred_enable_wred_mode(table);
 	}
 
-	err = 0;
+	sch_tree_unlock(sch);
+	kfree(prealloc);
+	return 0;
 
-errout_locked:
+err_unlock_free:
 	sch_tree_unlock(sch);
 	kfree(prealloc);
-errout:
 	return err;
 }
 
-- 
cgit v1.2.3


From 79c59fe01e70f595f0d67be76ae8309a32a3760d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:46 -0800
Subject: net: sched: gred: pass extack to nla_parse_nested()

In case netlink wants to provide parsing error pass extack
to nla_parse_nested().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 22110be9d285..9f6a4ddd262a 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -406,7 +406,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 	if (opt == NULL)
 		return -EINVAL;
 
-	err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+	err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack);
 	if (err < 0)
 		return err;
 
@@ -476,7 +476,7 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!opt)
 		return -EINVAL;
 
-	err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL);
+	err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, extack);
 	if (err < 0)
 		return err;
 
-- 
cgit v1.2.3


From 4777be08b8aab41286efdf5362a02f8e26d1a84e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:47 -0800
Subject: net: sched: gred: use extack to provide more details on configuration
 errors

Add extack messages to -EINVAL errors, to help users identify
their mistakes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 44 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 9f6a4ddd262a..3d7bd374b303 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -300,7 +300,8 @@ static inline void gred_destroy_vq(struct gred_sched_data *q)
 	kfree(q);
 }
 
-static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
+static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
+				 struct netlink_ext_ack *extack)
 {
 	struct gred_sched *table = qdisc_priv(sch);
 	struct tc_gred_sopt *sopt;
@@ -311,9 +312,19 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
 
 	sopt = nla_data(dps);
 
-	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 ||
-	    sopt->def_DP >= sopt->DPs)
+	if (sopt->DPs > MAX_DPs) {
+		NL_SET_ERR_MSG_MOD(extack, "number of virtual queues too high");
 		return -EINVAL;
+	}
+	if (sopt->DPs == 0) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "number of virtual queues can't be 0");
+		return -EINVAL;
+	}
+	if (sopt->def_DP >= sopt->DPs) {
+		NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count");
+		return -EINVAL;
+	}
 
 	sch_tree_lock(sch);
 	table->DPs = sopt->DPs;
@@ -352,13 +363,16 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
 static inline int gred_change_vq(struct Qdisc *sch, int dp,
 				 struct tc_gred_qopt *ctl, int prio,
 				 u8 *stab, u32 max_P,
-				 struct gred_sched_data **prealloc)
+				 struct gred_sched_data **prealloc,
+				 struct netlink_ext_ack *extack)
 {
 	struct gred_sched *table = qdisc_priv(sch);
 	struct gred_sched_data *q = table->tab[dp];
 
-	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
+	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) {
+		NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters");
 		return -EINVAL;
+	}
 
 	if (!q) {
 		table->tab[dp] = q = *prealloc;
@@ -413,21 +427,25 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) {
 		if (tb[TCA_GRED_LIMIT] != NULL)
 			sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
-		return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+		return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack);
 	}
 
 	if (tb[TCA_GRED_PARMS] == NULL ||
 	    tb[TCA_GRED_STAB] == NULL ||
-	    tb[TCA_GRED_LIMIT] != NULL)
+	    tb[TCA_GRED_LIMIT] != NULL) {
+		NL_SET_ERR_MSG_MOD(extack, "can't configure Qdisc and virtual queue at the same time");
 		return -EINVAL;
+	}
 
 	max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
 
 	ctl = nla_data(tb[TCA_GRED_PARMS]);
 	stab = nla_data(tb[TCA_GRED_STAB]);
 
-	if (ctl->DP >= table->DPs)
+	if (ctl->DP >= table->DPs) {
+		NL_SET_ERR_MSG_MOD(extack, "virtual queue index above virtual queue count");
 		return -EINVAL;
+	}
 
 	if (gred_rio_mode(table)) {
 		if (ctl->prio == 0) {
@@ -447,7 +465,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 	prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
 	sch_tree_lock(sch);
 
-	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
+	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc,
+			     extack);
 	if (err < 0)
 		goto err_unlock_free;
 
@@ -480,8 +499,11 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
 	if (err < 0)
 		return err;
 
-	if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
+	if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "virtual queue configuration can't be specified at initialization time");
 		return -EINVAL;
+	}
 
 	if (tb[TCA_GRED_LIMIT])
 		sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]);
@@ -489,7 +511,7 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
 		sch->limit = qdisc_dev(sch)->tx_queue_len
 		             * psched_mtu(qdisc_dev(sch));
 
-	return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
+	return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack);
 }
 
 static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
cgit v1.2.3


From 9f5cd0c8066997b77ba98fc5355faa425f14b381 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:48 -0800
Subject: net: sched: gred: store bytesin as a 64 bit value

32 bit counters for bytes are not really going to last long in modern
world.  Make sch_gred count bytes on a 64 bit counter.  It will still
get truncated during dump but follow up patch will add set of new
stat dump attributes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 3d7bd374b303..6f209c83ee7a 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -35,7 +35,7 @@ struct gred_sched;
 struct gred_sched_data {
 	u32		limit;		/* HARD maximal queue length	*/
 	u32		DP;		/* the drop parameters */
-	u32		bytesin;	/* bytes seen on virtualQ so far*/
+	u64		bytesin;	/* bytes seen on virtualQ so far*/
 	u32		packetsin;	/* packets seen on virtualQ so far*/
 	u32		backlog;	/* bytes on the virtualQ */
 	u8		prio;		/* the prio of this vq */
-- 
cgit v1.2.3


From 80e22e961dfd15530215f6f6dcd94cd8f65ba1ea Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:49 -0800
Subject: net: sched: gred: provide a better structured dump and expose stats

Currently all GRED's virtual queue data is dumped in a single
array in a single attribute.  This makes it pretty much impossible
to add new fields.  In order to expose more detailed stats add a
new set of attributes.  We can now expose the 64 bit value of bytesin
and all the mark stats which were not part of the original design.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 6f209c83ee7a..dc09a32c4b4f 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -404,6 +404,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
 	[TCA_GRED_DPS]		= { .len = sizeof(struct tc_gred_sopt) },
 	[TCA_GRED_MAX_P]	= { .type = NLA_U32 },
 	[TCA_GRED_LIMIT]	= { .type = NLA_U32 },
+	[TCA_GRED_VQ_LIST]	= { .type = NLA_REJECT },
 };
 
 static int gred_change(struct Qdisc *sch, struct nlattr *opt,
@@ -517,7 +518,7 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt,
 static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct gred_sched *table = qdisc_priv(sch);
-	struct nlattr *parms, *opts = NULL;
+	struct nlattr *parms, *vqs, *opts = NULL;
 	int i;
 	u32 max_p[MAX_DPs];
 	struct tc_gred_sopt sopt = {
@@ -544,6 +545,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put_u32(skb, TCA_GRED_LIMIT, sch->limit))
 		goto nla_put_failure;
 
+	/* Old style all-in-one dump of VQs */
 	parms = nla_nest_start(skb, TCA_GRED_PARMS);
 	if (parms == NULL)
 		goto nla_put_failure;
@@ -594,6 +596,55 @@ append_opt:
 
 	nla_nest_end(skb, parms);
 
+	/* Dump the VQs again, in more structured way */
+	vqs = nla_nest_start(skb, TCA_GRED_VQ_LIST);
+	if (!vqs)
+		goto nla_put_failure;
+
+	for (i = 0; i < MAX_DPs; i++) {
+		struct gred_sched_data *q = table->tab[i];
+		struct nlattr *vq;
+
+		if (!q)
+			continue;
+
+		vq = nla_nest_start(skb, TCA_GRED_VQ_ENTRY);
+		if (!vq)
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP))
+			goto nla_put_failure;
+
+		/* Stats */
+		if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin,
+				      TCA_GRED_VQ_PAD))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PACKETS, q->packetsin))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_BACKLOG,
+				gred_backlog(table, q, sch)))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_DROP,
+				q->stats.prob_drop))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PROB_MARK,
+				q->stats.prob_mark))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_DROP,
+				q->stats.forced_drop))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_FORCED_MARK,
+				q->stats.forced_mark))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_PDROP, q->stats.pdrop))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_GRED_VQ_STAT_OTHER, q->stats.other))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, vq);
+	}
+	nla_nest_end(skb, vqs);
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:
-- 
cgit v1.2.3


From 25fc1989077e71be9fdbe6b78670cf90df2fb789 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:50 -0800
Subject: net: sched: gred: store red flags per virtual queue

Right now ECN marking and HARD drop (the common RED flags) can only
be configured for the entire Qdisc.  In preparation for per-vq flags
store the values in the virtual queue structure.  Setting per-vq
flags will only be allowed when no flags are set for the entire Qdisc.
For the new flags we will also make sure undefined bits are 0.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index dc09a32c4b4f..47133106c7e2 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -29,12 +29,15 @@
 #define GRED_DEF_PRIO (MAX_DPs / 2)
 #define GRED_VQ_MASK (MAX_DPs - 1)
 
+#define GRED_VQ_RED_FLAGS	(TC_RED_ECN | TC_RED_HARDDROP)
+
 struct gred_sched_data;
 struct gred_sched;
 
 struct gred_sched_data {
 	u32		limit;		/* HARD maximal queue length	*/
 	u32		DP;		/* the drop parameters */
+	u32		red_flags;	/* virtualQ version of red_flags */
 	u64		bytesin;	/* bytes seen on virtualQ so far*/
 	u32		packetsin;	/* packets seen on virtualQ so far*/
 	u32		backlog;	/* bytes on the virtualQ */
@@ -139,14 +142,14 @@ static inline void gred_store_wred_set(struct gred_sched *table,
 	table->wred_set.qidlestart = q->vars.qidlestart;
 }
 
-static inline int gred_use_ecn(struct gred_sched *t)
+static int gred_use_ecn(struct gred_sched_data *q)
 {
-	return t->red_flags & TC_RED_ECN;
+	return q->red_flags & TC_RED_ECN;
 }
 
-static inline int gred_use_harddrop(struct gred_sched *t)
+static int gred_use_harddrop(struct gred_sched_data *q)
 {
-	return t->red_flags & TC_RED_HARDDROP;
+	return q->red_flags & TC_RED_HARDDROP;
 }
 
 static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -212,7 +215,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 	case RED_PROB_MARK:
 		qdisc_qstats_overlimit(sch);
-		if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+		if (!gred_use_ecn(q) || !INET_ECN_set_ce(skb)) {
 			q->stats.prob_drop++;
 			goto congestion_drop;
 		}
@@ -222,7 +225,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 	case RED_HARD_MARK:
 		qdisc_qstats_overlimit(sch);
-		if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+		if (gred_use_harddrop(q) || !gred_use_ecn(q) ||
 		    !INET_ECN_set_ce(skb)) {
 			q->stats.forced_drop++;
 			goto congestion_drop;
@@ -305,6 +308,7 @@ static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
 {
 	struct gred_sched *table = qdisc_priv(sch);
 	struct tc_gred_sopt *sopt;
+	bool red_flags_changed;
 	int i;
 
 	if (!dps)
@@ -329,6 +333,7 @@ static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
 	sch_tree_lock(sch);
 	table->DPs = sopt->DPs;
 	table->def = sopt->def_DP;
+	red_flags_changed = table->red_flags != sopt->flags;
 	table->red_flags = sopt->flags;
 
 	/*
@@ -348,6 +353,12 @@ static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
 		gred_disable_wred_mode(table);
 	}
 
+	if (red_flags_changed)
+		for (i = 0; i < table->DPs; i++)
+			if (table->tab[i])
+				table->tab[i]->red_flags =
+					table->red_flags & GRED_VQ_RED_FLAGS;
+
 	for (i = table->DPs; i < MAX_DPs; i++) {
 		if (table->tab[i]) {
 			pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
@@ -379,6 +390,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
 		*prealloc = NULL;
 		if (!q)
 			return -ENOMEM;
+		q->red_flags = table->red_flags & GRED_VQ_RED_FLAGS;
 	}
 
 	q->DP = dp;
-- 
cgit v1.2.3


From 72111015024f4eddb5aac400ddbe38a4f8f0279a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 14 Nov 2018 22:23:51 -0800
Subject: net: sched: gred: allow manipulating per-DP RED flags

Allow users to set and dump RED flags (ECN enabled and harddrop)
on per-virtual queue basis.  Validation of attributes is split
from changes to make sure we won't have to undo previous operations
when we find out configuration is invalid.

The objective is to allow changing per-Qdisc parameters without
overwriting the per-vq configured flags.

Old user space will not pass the TCA_GRED_VQ_FLAGS attribute and
per-Qdisc flags will always get propagated to the virtual queues.

New user space which wants to make use of per-vq flags should set
per-Qdisc flags to 0 and then configure per-vq flags as it
sees fit.  Once per-vq flags are set per-Qdisc flags can't be
changed to non-zero.  Vice versa - if the per-Qdisc flags are
non-zero the TCA_GRED_VQ_FLAGS attribute has to either be omitted
or set to the same value as per-Qdisc flags.

Update per-Qdisc parameters:
per-Qdisc | per-VQ | result
        0 |      0 | all vq flags updated
	0 |  non-0 | error (vq flags in use)
    non-0 |      0 | -- impossible --
    non-0 |  non-0 | all vq flags updated

Update per-VQ state (flags parameter not specified):
   no change to flags

Update per-VQ state (flags parameter set):
per-Qdisc | per-VQ | result
        0 |   any  | per-vq flags updated
    non-0 |      0 | -- impossible --
    non-0 |  non-0 | error (per-Qdisc flags in use)

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 143 insertions(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 47133106c7e2..8b8c325f48bc 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -152,6 +152,19 @@ static int gred_use_harddrop(struct gred_sched_data *q)
 	return q->red_flags & TC_RED_HARDDROP;
 }
 
+static bool gred_per_vq_red_flags_used(struct gred_sched *table)
+{
+	unsigned int i;
+
+	/* Local per-vq flags couldn't have been set unless global are 0 */
+	if (table->red_flags)
+		return false;
+	for (i = 0; i < MAX_DPs; i++)
+		if (table->tab[i] && table->tab[i]->red_flags)
+			return true;
+	return false;
+}
+
 static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			struct sk_buff **to_free)
 {
@@ -329,6 +342,10 @@ static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
 		NL_SET_ERR_MSG_MOD(extack, "default virtual queue above virtual queue count");
 		return -EINVAL;
 	}
+	if (sopt->flags && gred_per_vq_red_flags_used(table)) {
+		NL_SET_ERR_MSG_MOD(extack, "can't set per-Qdisc RED flags when per-virtual queue flags are used");
+		return -EINVAL;
+	}
 
 	sch_tree_lock(sch);
 	table->DPs = sopt->DPs;
@@ -410,15 +427,127 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
 	return 0;
 }
 
+static const struct nla_policy gred_vq_policy[TCA_GRED_VQ_MAX + 1] = {
+	[TCA_GRED_VQ_DP]	= { .type = NLA_U32 },
+	[TCA_GRED_VQ_FLAGS]	= { .type = NLA_U32 },
+};
+
+static const struct nla_policy gred_vqe_policy[TCA_GRED_VQ_ENTRY_MAX + 1] = {
+	[TCA_GRED_VQ_ENTRY]	= { .type = NLA_NESTED },
+};
+
 static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
 	[TCA_GRED_PARMS]	= { .len = sizeof(struct tc_gred_qopt) },
 	[TCA_GRED_STAB]		= { .len = 256 },
 	[TCA_GRED_DPS]		= { .len = sizeof(struct tc_gred_sopt) },
 	[TCA_GRED_MAX_P]	= { .type = NLA_U32 },
 	[TCA_GRED_LIMIT]	= { .type = NLA_U32 },
-	[TCA_GRED_VQ_LIST]	= { .type = NLA_REJECT },
+	[TCA_GRED_VQ_LIST]	= { .type = NLA_NESTED },
 };
 
+static void gred_vq_apply(struct gred_sched *table, const struct nlattr *entry)
+{
+	struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+	u32 dp;
+
+	nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy, NULL);
+
+	dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+
+	if (tb[TCA_GRED_VQ_FLAGS])
+		table->tab[dp]->red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+}
+
+static void gred_vqs_apply(struct gred_sched *table, struct nlattr *vqs)
+{
+	const struct nlattr *attr;
+	int rem;
+
+	nla_for_each_nested(attr, vqs, rem) {
+		switch (nla_type(attr)) {
+		case TCA_GRED_VQ_ENTRY:
+			gred_vq_apply(table, attr);
+			break;
+		}
+	}
+}
+
+static int gred_vq_validate(struct gred_sched *table, u32 cdp,
+			    const struct nlattr *entry,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_GRED_VQ_MAX + 1];
+	int err;
+	u32 dp;
+
+	err = nla_parse_nested(tb, TCA_GRED_VQ_MAX, entry, gred_vq_policy,
+			       extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_GRED_VQ_DP]) {
+		NL_SET_ERR_MSG_MOD(extack, "Virtual queue with no index specified");
+		return -EINVAL;
+	}
+	dp = nla_get_u32(tb[TCA_GRED_VQ_DP]);
+	if (dp >= table->DPs) {
+		NL_SET_ERR_MSG_MOD(extack, "Virtual queue with index out of bounds");
+		return -EINVAL;
+	}
+	if (dp != cdp && !table->tab[dp]) {
+		NL_SET_ERR_MSG_MOD(extack, "Virtual queue not yet instantiated");
+		return -EINVAL;
+	}
+
+	if (tb[TCA_GRED_VQ_FLAGS]) {
+		u32 red_flags = nla_get_u32(tb[TCA_GRED_VQ_FLAGS]);
+
+		if (table->red_flags && table->red_flags != red_flags) {
+			NL_SET_ERR_MSG_MOD(extack, "can't change per-virtual queue RED flags when per-Qdisc flags are used");
+			return -EINVAL;
+		}
+		if (red_flags & ~GRED_VQ_RED_FLAGS) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "invalid RED flags specified");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int gred_vqs_validate(struct gred_sched *table, u32 cdp,
+			     struct nlattr *vqs, struct netlink_ext_ack *extack)
+{
+	const struct nlattr *attr;
+	int rem, err;
+
+	err = nla_validate_nested(vqs, TCA_GRED_VQ_ENTRY_MAX,
+				  gred_vqe_policy, extack);
+	if (err < 0)
+		return err;
+
+	nla_for_each_nested(attr, vqs, rem) {
+		switch (nla_type(attr)) {
+		case TCA_GRED_VQ_ENTRY:
+			err = gred_vq_validate(table, cdp, attr, extack);
+			if (err)
+				return err;
+			break;
+		default:
+			NL_SET_ERR_MSG_MOD(extack, "GRED_VQ_LIST can contain only entry attributes");
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Trailing data after parsing virtual queue list");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 		       struct netlink_ext_ack *extack)
 {
@@ -460,6 +589,13 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 		return -EINVAL;
 	}
 
+	if (tb[TCA_GRED_VQ_LIST]) {
+		err = gred_vqs_validate(table, ctl->DP, tb[TCA_GRED_VQ_LIST],
+					extack);
+		if (err)
+			return err;
+	}
+
 	if (gred_rio_mode(table)) {
 		if (ctl->prio == 0) {
 			int def_prio = GRED_DEF_PRIO;
@@ -483,6 +619,9 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 	if (err < 0)
 		goto err_unlock_free;
 
+	if (tb[TCA_GRED_VQ_LIST])
+		gred_vqs_apply(table, tb[TCA_GRED_VQ_LIST]);
+
 	if (gred_rio_mode(table)) {
 		gred_disable_wred_mode(table);
 		if (gred_wred_mode_check(sch))
@@ -627,6 +766,9 @@ append_opt:
 		if (nla_put_u32(skb, TCA_GRED_VQ_DP, q->DP))
 			goto nla_put_failure;
 
+		if (nla_put_u32(skb, TCA_GRED_VQ_FLAGS, q->red_flags))
+			goto nla_put_failure;
+
 		/* Stats */
 		if (nla_put_u64_64bit(skb, TCA_GRED_VQ_STAT_BYTES, q->bytesin,
 				      TCA_GRED_VQ_PAD))
-- 
cgit v1.2.3


From 890d8d23ec3c9eca847be0593c0cf5f650b97271 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 19 Nov 2018 15:21:42 -0800
Subject: net: sched: gred: add basic Qdisc offload

Add basic offload for the GRED Qdisc.  Inform the drivers any
time Qdisc or virtual queue configuration changes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 8b8c325f48bc..908c9d1dfdf8 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <net/pkt_cls.h>
 #include <net/pkt_sched.h>
 #include <net/red.h>
 
@@ -311,6 +312,48 @@ static void gred_reset(struct Qdisc *sch)
 	}
 }
 
+static void gred_offload(struct Qdisc *sch, enum tc_gred_command command)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_gred_qopt_offload opt = {
+		.command	= command,
+		.handle		= sch->handle,
+		.parent		= sch->parent,
+	};
+
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return;
+
+	if (command == TC_GRED_REPLACE) {
+		unsigned int i;
+
+		opt.set.grio_on = gred_rio_mode(table);
+		opt.set.wred_on = gred_wred_mode(table);
+		opt.set.dp_cnt = table->DPs;
+		opt.set.dp_def = table->def;
+
+		for (i = 0; i < table->DPs; i++) {
+			struct gred_sched_data *q = table->tab[i];
+
+			if (!q)
+				continue;
+			opt.set.tab[i].present = true;
+			opt.set.tab[i].limit = q->limit;
+			opt.set.tab[i].prio = q->prio;
+			opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog;
+			opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog;
+			opt.set.tab[i].is_ecn = gred_use_ecn(q);
+			opt.set.tab[i].is_harddrop = gred_use_harddrop(q);
+			opt.set.tab[i].probability = q->parms.max_P;
+			opt.set.tab[i].backlog = &q->backlog;
+		}
+		opt.set.qstats = &sch->qstats;
+	}
+
+	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt);
+}
+
 static inline void gred_destroy_vq(struct gred_sched_data *q)
 {
 	kfree(q);
@@ -385,6 +428,7 @@ static int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps,
 		}
 	}
 
+	gred_offload(sch, TC_GRED_REPLACE);
 	return 0;
 }
 
@@ -630,6 +674,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt,
 
 	sch_tree_unlock(sch);
 	kfree(prealloc);
+
+	gred_offload(sch, TC_GRED_REPLACE);
 	return 0;
 
 err_unlock_free:
@@ -815,6 +861,7 @@ static void gred_destroy(struct Qdisc *sch)
 		if (table->tab[i])
 			gred_destroy_vq(table->tab[i]);
 	}
+	gred_offload(sch, TC_GRED_DESTROY);
 }
 
 static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
-- 
cgit v1.2.3


From e49efd5288bd6670cc05860fe04ef611c3887399 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 19 Nov 2018 15:21:43 -0800
Subject: net: sched: gred: support reporting stats from offloads

Allow drivers which offload GRED to report back statistics.  Since
A lot of GRED stats is fairly ad hoc in nature pass to drivers the
standard struct gnet_stats_basic/gnet_stats_queue pairs, and
untangle the values in the core.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_gred.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'net/sched')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 908c9d1dfdf8..234afbf9115b 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -354,6 +354,50 @@ static void gred_offload(struct Qdisc *sch, enum tc_gred_command command)
 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt);
 }
 
+static int gred_offload_dump_stats(struct Qdisc *sch)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	struct tc_gred_qopt_offload *hw_stats;
+	unsigned int i;
+	int ret;
+
+	hw_stats = kzalloc(sizeof(*hw_stats), GFP_KERNEL);
+	if (!hw_stats)
+		return -ENOMEM;
+
+	hw_stats->command = TC_GRED_STATS;
+	hw_stats->handle = sch->handle;
+	hw_stats->parent = sch->parent;
+
+	for (i = 0; i < MAX_DPs; i++)
+		if (table->tab[i])
+			hw_stats->stats.xstats[i] = &table->tab[i]->stats;
+
+	ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats);
+	/* Even if driver returns failure adjust the stats - in case offload
+	 * ended but driver still wants to adjust the values.
+	 */
+	for (i = 0; i < MAX_DPs; i++) {
+		if (!table->tab[i])
+			continue;
+		table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets;
+		table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes;
+		table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog;
+
+		_bstats_update(&sch->bstats,
+			       hw_stats->stats.bstats[i].bytes,
+			       hw_stats->stats.bstats[i].packets);
+		sch->qstats.qlen += hw_stats->stats.qstats[i].qlen;
+		sch->qstats.backlog += hw_stats->stats.qstats[i].backlog;
+		sch->qstats.drops += hw_stats->stats.qstats[i].drops;
+		sch->qstats.requeues += hw_stats->stats.qstats[i].requeues;
+		sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits;
+	}
+
+	kfree(hw_stats);
+	return ret;
+}
+
 static inline void gred_destroy_vq(struct gred_sched_data *q)
 {
 	kfree(q);
@@ -725,6 +769,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
 		.flags	= table->red_flags,
 	};
 
+	if (gred_offload_dump_stats(sch))
+		goto nla_put_failure;
+
 	opts = nla_nest_start(skb, TCA_OPTIONS);
 	if (opts == NULL)
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 068ceb3555397dbd82593fb505688c5bd200a4ad Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 19 Nov 2018 15:21:46 -0800
Subject: net: sched: cls_u32: add res to offload information

In case of egress offloads the class/flowid assigned by the filter
may be very important for offloaded Qdisc selection.  Provide this
info to drivers.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/sched')

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 4b28fd44576d..4c54bc440798 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -558,6 +558,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	cls_u32.knode.mask = 0;
 #endif
 	cls_u32.knode.sel = &n->sel;
+	cls_u32.knode.res = &n->res;
 	cls_u32.knode.exts = &n->exts;
 	if (n->ht_down)
 		cls_u32.knode.link_handle = ht->handle;
@@ -1206,6 +1207,7 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 		cls_u32.knode.mask = 0;
 #endif
 		cls_u32.knode.sel = &n->sel;
+		cls_u32.knode.res = &n->res;
 		cls_u32.knode.exts = &n->exts;
 		if (n->ht_down)
 			cls_u32.knode.link_handle = ht->handle;
-- 
cgit v1.2.3


From 6b015a523fa3ba57ccd9c72697163b560fcdedb4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Nov 2018 17:30:19 -0800
Subject: net_sched: sch_fq: avoid calling ktime_get_ns() if not needed

There are two cases were we can avoid calling ktime_get_ns() :

1) Queue is empty.
2) Internal queue is not empty.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 1da8864502d4..1a662f2bb7bb 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -414,16 +414,21 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 {
 	struct fq_sched_data *q = qdisc_priv(sch);
-	u64 now = ktime_get_ns();
 	struct fq_flow_head *head;
 	struct sk_buff *skb;
 	struct fq_flow *f;
 	unsigned long rate;
 	u32 plen;
+	u64 now;
+
+	if (!sch->q.qlen)
+		return NULL;
 
 	skb = fq_dequeue_head(sch, &q->internal);
 	if (skb)
 		goto out;
+
+	now = ktime_get_ns();
 	fq_check_throttled(q, now);
 begin:
 	head = &q->new_flows;
-- 
cgit v1.2.3


From 80ef0f22ceda1804cc572836722df7a33dec71c9 Mon Sep 17 00:00:00 2001
From: Adi Nissim <adin@mellanox.com>
Date: Sun, 2 Dec 2018 14:55:20 +0200
Subject: net/sched: act_tunnel_key: Allow key-less tunnels

Allow setting a tunnel without a tunnel key. This is required for
tunneling protocols, such as GRE, that define the key as an optional
field.

Signed-off-by: Adi Nissim <adin@mellanox.com>
Acked-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_tunnel_key.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 4cca8f274662..fad438c0f7a7 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -210,9 +210,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	struct tcf_tunnel_key *t;
 	bool exists = false;
 	__be16 dst_port = 0;
+	__be64 key_id = 0;
 	int opts_len = 0;
-	__be64 key_id;
-	__be16 flags;
+	__be16 flags = 0;
 	u8 tos, ttl;
 	int ret = 0;
 	int err;
@@ -246,15 +246,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	case TCA_TUNNEL_KEY_ACT_RELEASE:
 		break;
 	case TCA_TUNNEL_KEY_ACT_SET:
-		if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
-			NL_SET_ERR_MSG(extack, "Missing tunnel key id");
-			ret = -EINVAL;
-			goto err_out;
-		}
+		if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+			__be32 key32;
 
-		key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+			key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]);
+			key_id = key32_to_tunnel_id(key32);
+			flags = TUNNEL_KEY;
+		}
 
-		flags = TUNNEL_KEY | TUNNEL_CSUM;
+		flags |= TUNNEL_CSUM;
 		if (tb[TCA_TUNNEL_KEY_NO_CSUM] &&
 		    nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
 			flags &= ~TUNNEL_CSUM;
@@ -508,7 +508,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 		struct ip_tunnel_key *key = &info->key;
 		__be32 key_id = tunnel_id_to_key32(key->tun_id);
 
-		if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
+		if (((key->tun_flags & TUNNEL_KEY) &&
+		     nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) ||
 		    tunnel_key_dump_addresses(skb,
 					      &params->tcft_enc_metadata->u.tun_info) ||
 		    nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
-- 
cgit v1.2.3


From 1c25324caf8292573b2b519fa4957baefd0c9657 Mon Sep 17 00:00:00 2001
From: Adi Nissim <adin@mellanox.com>
Date: Sun, 2 Dec 2018 14:55:21 +0200
Subject: net/sched: act_tunnel_key: Don't dump dst port if it wasn't set

It's possible to set a tunnel without a destination port. However,
on dump(), a zero dst port is returned to user space even if it was not
set, fix that.

Note that so far it wasn't required, b/c key less tunnels were not
supported and the UDP tunnels do require destination port.

Signed-off-by: Adi Nissim <adin@mellanox.com>
Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_tunnel_key.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/sched')

diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index fad438c0f7a7..c3b90fadaff6 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -512,7 +512,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 		     nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) ||
 		    tunnel_key_dump_addresses(skb,
 					      &params->tcft_enc_metadata->u.tun_info) ||
-		    nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
+		    (key->tp_dst &&
+		      nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT,
+				   key->tp_dst)) ||
 		    nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
 			       !(key->tun_flags & TUNNEL_CSUM)) ||
 		    tunnel_key_opts_dump(skb, info))
-- 
cgit v1.2.3


From d66280b12bd7ad6345df4dee2ee1c20f5902242d Mon Sep 17 00:00:00 2001
From: Peter Oskolkov <posk@google.com>
Date: Tue, 4 Dec 2018 11:55:56 -0800
Subject: net: netem: use a list in addition to rbtree

When testing high-bandwidth TCP streams with large windows,
high latency, and low jitter, netem consumes a lot of CPU cycles
doing rbtree rebalancing.

This patch uses a linear list/queue in addition to the rbtree:
if an incoming packet is past the tail of the linear queue, it is
added there, otherwise it is inserted into the rbtree.

Without this patch, perf shows netem_enqueue, netem_dequeue,
and rb_* functions among the top offenders. With this patch,
only netem_enqueue is noticeable if jitter is low/absent.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 89 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 69 insertions(+), 20 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 2c38e3d07924..84658f60a872 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -77,6 +77,10 @@ struct netem_sched_data {
 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
 	struct rb_root t_root;
 
+	/* a linear queue; reduces rbtree rebalancing when jitter is low */
+	struct sk_buff	*t_head;
+	struct sk_buff	*t_tail;
+
 	/* optional qdisc for classful handling (NULL at netem init) */
 	struct Qdisc	*qdisc;
 
@@ -369,26 +373,39 @@ static void tfifo_reset(struct Qdisc *sch)
 		rb_erase(&skb->rbnode, &q->t_root);
 		rtnl_kfree_skbs(skb, skb);
 	}
+
+	rtnl_kfree_skbs(q->t_head, q->t_tail);
+	q->t_head = NULL;
+	q->t_tail = NULL;
 }
 
 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
-	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 
-	while (*p) {
-		struct sk_buff *skb;
-
-		parent = *p;
-		skb = rb_to_skb(parent);
-		if (tnext >= netem_skb_cb(skb)->time_to_send)
-			p = &parent->rb_right;
+	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
+		if (q->t_tail)
+			q->t_tail->next = nskb;
 		else
-			p = &parent->rb_left;
+			q->t_head = nskb;
+		q->t_tail = nskb;
+	} else {
+		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
+
+		while (*p) {
+			struct sk_buff *skb;
+
+			parent = *p;
+			skb = rb_to_skb(parent);
+			if (tnext >= netem_skb_cb(skb)->time_to_send)
+				p = &parent->rb_right;
+			else
+				p = &parent->rb_left;
+		}
+		rb_link_node(&nskb->rbnode, parent, p);
+		rb_insert_color(&nskb->rbnode, &q->t_root);
 	}
-	rb_link_node(&nskb->rbnode, parent, p);
-	rb_insert_color(&nskb->rbnode, &q->t_root);
 	sch->q.qlen++;
 }
 
@@ -530,9 +547,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				t_skb = skb_rb_last(&q->t_root);
 				t_last = netem_skb_cb(t_skb);
 				if (!last ||
-				    t_last->time_to_send > last->time_to_send) {
+				    t_last->time_to_send > last->time_to_send)
+					last = t_last;
+			}
+			if (q->t_tail) {
+				struct netem_skb_cb *t_last =
+					netem_skb_cb(q->t_tail);
+
+				if (!last ||
+				    t_last->time_to_send > last->time_to_send)
 					last = t_last;
-				}
 			}
 
 			if (last) {
@@ -611,11 +635,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
 	q->slot.bytes_left = q->slot_config.max_bytes;
 }
 
+static struct sk_buff *netem_peek(struct netem_sched_data *q)
+{
+	struct sk_buff *skb = skb_rb_first(&q->t_root);
+	u64 t1, t2;
+
+	if (!skb)
+		return q->t_head;
+	if (!q->t_head)
+		return skb;
+
+	t1 = netem_skb_cb(skb)->time_to_send;
+	t2 = netem_skb_cb(q->t_head)->time_to_send;
+	if (t1 < t2)
+		return skb;
+	return q->t_head;
+}
+
+static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+{
+	if (skb == q->t_head) {
+		q->t_head = skb->next;
+		if (!q->t_head)
+			q->t_tail = NULL;
+	} else {
+		rb_erase(&skb->rbnode, &q->t_root);
+	}
+}
+
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
-	struct rb_node *p;
 
 tfifo_dequeue:
 	skb = __qdisc_dequeue_head(&sch->q);
@@ -625,20 +676,18 @@ deliver:
 		qdisc_bstats_update(sch, skb);
 		return skb;
 	}
-	p = rb_first(&q->t_root);
-	if (p) {
+	skb = netem_peek(q);
+	if (skb) {
 		u64 time_to_send;
 		u64 now = ktime_get_ns();
 
-		skb = rb_to_skb(p);
-
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 			get_slot_next(q, now);
 
-		if (time_to_send <= now &&  q->slot.slot_next <= now) {
-			rb_erase(p, &q->t_root);
+		if (time_to_send <= now && q->slot.slot_next <= now) {
+			netem_erase_head(q, skb);
 			sch->q.qlen--;
 			qdisc_qstats_backlog_dec(sch, skb);
 			skb->next = NULL;
-- 
cgit v1.2.3


From 69bd48404f251b9c45a15799fdcfc87a7ad6ab8a Mon Sep 17 00:00:00 2001
From: Oz Shlomo <ozsh@mellanox.com>
Date: Tue, 6 Nov 2018 09:58:37 +0200
Subject: net/sched: Remove egdev mechanism

The egdev mechanism was replaced by the TC indirect block notifications
platform.

Signed-off-by: Oz Shlomo <ozsh@mellanox.com>
Reviewed-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Cc: John Hurley <john.hurley@netronome.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 net/sched/act_api.c | 221 ----------------------------------------------------
 net/sched/cls_api.c |  47 +----------
 2 files changed, 1 insertion(+), 267 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9c1b0729aebf..d4b8355737d8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -21,8 +21,6 @@
 #include <linux/kmod.h>
 #include <linux/err.h>
 #include <linux/module.h>
-#include <linux/rhashtable.h>
-#include <linux/list.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/sch_generic.h>
@@ -1522,227 +1520,8 @@ out_module_put:
 	return skb->len;
 }
 
-struct tcf_action_net {
-	struct rhashtable egdev_ht;
-};
-
-static unsigned int tcf_action_net_id;
-
-struct tcf_action_egdev_cb {
-	struct list_head list;
-	tc_setup_cb_t *cb;
-	void *cb_priv;
-};
-
-struct tcf_action_egdev {
-	struct rhash_head ht_node;
-	const struct net_device *dev;
-	unsigned int refcnt;
-	struct list_head cb_list;
-};
-
-static const struct rhashtable_params tcf_action_egdev_ht_params = {
-	.key_offset = offsetof(struct tcf_action_egdev, dev),
-	.head_offset = offsetof(struct tcf_action_egdev, ht_node),
-	.key_len = sizeof(const struct net_device *),
-};
-
-static struct tcf_action_egdev *
-tcf_action_egdev_lookup(const struct net_device *dev)
-{
-	struct net *net = dev_net(dev);
-	struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
-	return rhashtable_lookup_fast(&tan->egdev_ht, &dev,
-				      tcf_action_egdev_ht_params);
-}
-
-static struct tcf_action_egdev *
-tcf_action_egdev_get(const struct net_device *dev)
-{
-	struct tcf_action_egdev *egdev;
-	struct tcf_action_net *tan;
-
-	egdev = tcf_action_egdev_lookup(dev);
-	if (egdev)
-		goto inc_ref;
-
-	egdev = kzalloc(sizeof(*egdev), GFP_KERNEL);
-	if (!egdev)
-		return NULL;
-	INIT_LIST_HEAD(&egdev->cb_list);
-	egdev->dev = dev;
-	tan = net_generic(dev_net(dev), tcf_action_net_id);
-	rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node,
-			       tcf_action_egdev_ht_params);
-
-inc_ref:
-	egdev->refcnt++;
-	return egdev;
-}
-
-static void tcf_action_egdev_put(struct tcf_action_egdev *egdev)
-{
-	struct tcf_action_net *tan;
-
-	if (--egdev->refcnt)
-		return;
-	tan = net_generic(dev_net(egdev->dev), tcf_action_net_id);
-	rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node,
-			       tcf_action_egdev_ht_params);
-	kfree(egdev);
-}
-
-static struct tcf_action_egdev_cb *
-tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev,
-			   tc_setup_cb_t *cb, void *cb_priv)
-{
-	struct tcf_action_egdev_cb *egdev_cb;
-
-	list_for_each_entry(egdev_cb, &egdev->cb_list, list)
-		if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv)
-			return egdev_cb;
-	return NULL;
-}
-
-static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev,
-				    enum tc_setup_type type,
-				    void *type_data, bool err_stop)
-{
-	struct tcf_action_egdev_cb *egdev_cb;
-	int ok_count = 0;
-	int err;
-
-	list_for_each_entry(egdev_cb, &egdev->cb_list, list) {
-		err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv);
-		if (err) {
-			if (err_stop)
-				return err;
-		} else {
-			ok_count++;
-		}
-	}
-	return ok_count;
-}
-
-static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev,
-				   tc_setup_cb_t *cb, void *cb_priv)
-{
-	struct tcf_action_egdev_cb *egdev_cb;
-
-	egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
-	if (WARN_ON(egdev_cb))
-		return -EEXIST;
-	egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL);
-	if (!egdev_cb)
-		return -ENOMEM;
-	egdev_cb->cb = cb;
-	egdev_cb->cb_priv = cb_priv;
-	list_add(&egdev_cb->list, &egdev->cb_list);
-	return 0;
-}
-
-static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev,
-				    tc_setup_cb_t *cb, void *cb_priv)
-{
-	struct tcf_action_egdev_cb *egdev_cb;
-
-	egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv);
-	if (WARN_ON(!egdev_cb))
-		return;
-	list_del(&egdev_cb->list);
-	kfree(egdev_cb);
-}
-
-static int __tc_setup_cb_egdev_register(const struct net_device *dev,
-					tc_setup_cb_t *cb, void *cb_priv)
-{
-	struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev);
-	int err;
-
-	if (!egdev)
-		return -ENOMEM;
-	err = tcf_action_egdev_cb_add(egdev, cb, cb_priv);
-	if (err)
-		goto err_cb_add;
-	return 0;
-
-err_cb_add:
-	tcf_action_egdev_put(egdev);
-	return err;
-}
-int tc_setup_cb_egdev_register(const struct net_device *dev,
-			       tc_setup_cb_t *cb, void *cb_priv)
-{
-	int err;
-
-	rtnl_lock();
-	err = __tc_setup_cb_egdev_register(dev, cb, cb_priv);
-	rtnl_unlock();
-	return err;
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register);
-
-static void __tc_setup_cb_egdev_unregister(const struct net_device *dev,
-					   tc_setup_cb_t *cb, void *cb_priv)
-{
-	struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
-
-	if (WARN_ON(!egdev))
-		return;
-	tcf_action_egdev_cb_del(egdev, cb, cb_priv);
-	tcf_action_egdev_put(egdev);
-}
-void tc_setup_cb_egdev_unregister(const struct net_device *dev,
-				  tc_setup_cb_t *cb, void *cb_priv)
-{
-	rtnl_lock();
-	__tc_setup_cb_egdev_unregister(dev, cb, cb_priv);
-	rtnl_unlock();
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister);
-
-int tc_setup_cb_egdev_call(const struct net_device *dev,
-			   enum tc_setup_type type, void *type_data,
-			   bool err_stop)
-{
-	struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev);
-
-	if (!egdev)
-		return 0;
-	return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop);
-}
-EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call);
-
-static __net_init int tcf_action_net_init(struct net *net)
-{
-	struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
-	return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params);
-}
-
-static void __net_exit tcf_action_net_exit(struct net *net)
-{
-	struct tcf_action_net *tan = net_generic(net, tcf_action_net_id);
-
-	rhashtable_destroy(&tan->egdev_ht);
-}
-
-static struct pernet_operations tcf_action_net_ops = {
-	.init = tcf_action_net_init,
-	.exit = tcf_action_net_exit,
-	.id = &tcf_action_net_id,
-	.size = sizeof(struct tcf_action_net),
-};
-
 static int __init tc_action_init(void)
 {
-	int err;
-
-	err = register_pernet_subsys(&tcf_action_net_ops);
-	if (err)
-		return err;
-
 	rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index d92f44ac4c39..6207f265b87c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2515,55 +2515,10 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
 }
 EXPORT_SYMBOL(tcf_exts_dump_stats);
 
-static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
-				       enum tc_setup_type type,
-				       void *type_data, bool err_stop)
-{
-	int ok_count = 0;
-#ifdef CONFIG_NET_CLS_ACT
-	const struct tc_action *a;
-	struct net_device *dev;
-	int i, ret;
-
-	if (!tcf_exts_has_actions(exts))
-		return 0;
-
-	for (i = 0; i < exts->nr_actions; i++) {
-		a = exts->actions[i];
-		if (!a->ops->get_dev)
-			continue;
-		dev = a->ops->get_dev(a);
-		if (!dev)
-			continue;
-		ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop);
-		a->ops->put_dev(dev);
-		if (ret < 0)
-			return ret;
-		ok_count += ret;
-	}
-#endif
-	return ok_count;
-}
-
 int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
 		     enum tc_setup_type type, void *type_data, bool err_stop)
 {
-	int ok_count;
-	int ret;
-
-	ret = tcf_block_cb_call(block, type, type_data, err_stop);
-	if (ret < 0)
-		return ret;
-	ok_count = ret;
-
-	if (!exts || ok_count)
-		return ok_count;
-	ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop);
-	if (ret < 0)
-		return ret;
-	ok_count += ret;
-
-	return ok_count;
+	return tcf_block_cb_call(block, type, type_data, err_stop);
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
 
-- 
cgit v1.2.3


From aeb3fecde811d5392ed481d8558f5751ac542e77 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 11 Dec 2018 11:15:46 -0800
Subject: net_sched: fold tcf_block_cb_call() into tc_setup_cb_call()

After commit 69bd48404f25 ("net/sched: Remove egdev mechanism"),
tc_setup_cb_call() is nearly identical to tcf_block_cb_call(),
so we can just fold tcf_block_cb_call() into tc_setup_cb_call()
and remove its unused parameter 'exts'.

Fixes: 69bd48404f25 ("net/sched: Remove egdev mechanism")
Cc: Oz Shlomo <ozsh@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c      | 46 ++++++++++++++++++++--------------------------
 net/sched/cls_bpf.c      |  4 ++--
 net/sched/cls_flower.c   | 15 +++++----------
 net/sched/cls_matchall.c |  5 ++---
 net/sched/cls_u32.c      |  8 ++++----
 5 files changed, 33 insertions(+), 45 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 6207f265b87c..8ce2a0507970 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1270,29 +1270,6 @@ void tcf_block_cb_unregister(struct tcf_block *block,
 }
 EXPORT_SYMBOL(tcf_block_cb_unregister);
 
-static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
-			     void *type_data, bool err_stop)
-{
-	struct tcf_block_cb *block_cb;
-	int ok_count = 0;
-	int err;
-
-	/* Make sure all netdevs sharing this block are offload-capable. */
-	if (block->nooffloaddevcnt && err_stop)
-		return -EOPNOTSUPP;
-
-	list_for_each_entry(block_cb, &block->cb_list, list) {
-		err = block_cb->cb(type, type_data, block_cb->cb_priv);
-		if (err) {
-			if (err_stop)
-				return err;
-		} else {
-			ok_count++;
-		}
-	}
-	return ok_count;
-}
-
 /* Main classifier routine: scans classifier chain attached
  * to this qdisc, (optionally) tests for protocol and asks
  * specific classifiers.
@@ -2515,10 +2492,27 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
 }
 EXPORT_SYMBOL(tcf_exts_dump_stats);
 
-int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
-		     enum tc_setup_type type, void *type_data, bool err_stop)
+int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+		     void *type_data, bool err_stop)
 {
-	return tcf_block_cb_call(block, type, type_data, err_stop);
+	struct tcf_block_cb *block_cb;
+	int ok_count = 0;
+	int err;
+
+	/* Make sure all netdevs sharing this block are offload-capable. */
+	if (block->nooffloaddevcnt && err_stop)
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(block_cb, &block->cb_list, list) {
+		err = block_cb->cb(type, type_data, block_cb->cb_priv);
+		if (err) {
+			if (err_stop)
+				return err;
+		} else {
+			ok_count++;
+		}
+	}
+	return ok_count;
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index fa6fe2fe0f32..a95cb240a606 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -169,7 +169,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	if (oldprog)
 		tcf_block_offload_dec(block, &oldprog->gen_flags);
 
-	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
+	err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
 	if (prog) {
 		if (err < 0) {
 			cls_bpf_offload_cmd(tp, oldprog, prog, extack);
@@ -234,7 +234,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 	cls_bpf.name = prog->bpf_name;
 	cls_bpf.exts_integrated = prog->exts_integrated;
 
-	tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false);
 }
 
 static int cls_bpf_init(struct tcf_proto *tp)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 544811dded60..1eb2e2c31dd5 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -368,8 +368,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 	cls_flower.command = TC_CLSFLOWER_DESTROY;
 	cls_flower.cookie = (unsigned long) f;
 
-	tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
-			 &cls_flower, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
 	tcf_block_offload_dec(block, &f->flags);
 }
 
@@ -391,8 +390,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	cls_flower.exts = &f->exts;
 	cls_flower.classid = f->res.classid;
 
-	err = tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
-			       &cls_flower, skip_sw);
+	err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw);
 	if (err < 0) {
 		fl_hw_destroy_filter(tp, f, NULL);
 		return err;
@@ -418,8 +416,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	cls_flower.exts = &f->exts;
 	cls_flower.classid = f->res.classid;
 
-	tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
-			 &cls_flower, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
 }
 
 static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
@@ -1502,8 +1499,7 @@ static void fl_hw_create_tmplt(struct tcf_chain *chain,
 	/* We don't care if driver (any of them) fails to handle this
 	 * call. It serves just as a hint for it.
 	 */
-	tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
-			 &cls_flower, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
 }
 
 static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
@@ -1516,8 +1512,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
 	cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
 	cls_flower.cookie = (unsigned long) tmplt;
 
-	tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
-			 &cls_flower, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
 }
 
 static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 856fa79d4ffd..0e408ee9dcec 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -71,7 +71,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 	cls_mall.command = TC_CLSMATCHALL_DESTROY;
 	cls_mall.cookie = cookie;
 
-	tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
 	tcf_block_offload_dec(block, &head->flags);
 }
 
@@ -90,8 +90,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 	cls_mall.exts = &head->exts;
 	cls_mall.cookie = cookie;
 
-	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL,
-			       &cls_mall, skip_sw);
+	err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw);
 	if (err < 0) {
 		mall_destroy_hw_filter(tp, head, cookie, NULL);
 		return err;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 4c54bc440798..dcea21004604 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -491,7 +491,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	cls_u32.hnode.handle = h->handle;
 	cls_u32.hnode.prio = h->prio;
 
-	tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
 }
 
 static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
@@ -509,7 +509,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	cls_u32.hnode.handle = h->handle;
 	cls_u32.hnode.prio = h->prio;
 
-	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+	err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
 	if (err < 0) {
 		u32_clear_hw_hnode(tp, h, NULL);
 		return err;
@@ -533,7 +533,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
 	cls_u32.knode.handle = n->handle;
 
-	tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+	tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
 	tcf_block_offload_dec(block, &n->flags);
 }
 
@@ -563,7 +563,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 	if (n->ht_down)
 		cls_u32.knode.link_handle = ht->handle;
 
-	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+	err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
 	if (err < 0) {
 		u32_remove_hw_knode(tp, n, NULL);
 		return err;
-- 
cgit v1.2.3


From 2561f97267d656c9b2c62b32614870abb3eabfe6 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Thu, 13 Dec 2018 00:43:23 -0800
Subject: net: sched: simplify the qdisc_leaf code

Except for returning, the var leaf is not
used in the qdisc_leaf(). For simplicity, remove it.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/sched')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 9c88cec7e8a2..187a57e7d601 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -335,7 +335,6 @@ out:
 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 {
 	unsigned long cl;
-	struct Qdisc *leaf;
 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
 
 	if (cops == NULL)
@@ -344,8 +343,7 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
 
 	if (cl == 0)
 		return NULL;
-	leaf = cops->leaf(p, cl);
-	return leaf;
+	return cops->leaf(p, cl);
 }
 
 /* Find queueing discipline by name */
-- 
cgit v1.2.3