6 files changed, 126 insertions, 69 deletions
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 945fb34ae721..c79aca29505e 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -559,8 +559,11 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
 			struct tcf_result *res)
 {
 	struct tcf_csum *p = to_tcf_csum(a);
+	bool orig_vlan_tag_present = false;
+	unsigned int vlan_hdr_count = 0;
 	struct tcf_csum_params *params;
 	u32 update_flags;
+	__be16 protocol;
 	int action;
 
 	params = rcu_dereference_bh(p->params);
@@ -573,7 +576,9 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
 		goto drop;
 
 	update_flags = params->update_flags;
-	switch (tc_skb_protocol(skb)) {
+	protocol = tc_skb_protocol(skb);
+again:
+	switch (protocol) {
 	case cpu_to_be16(ETH_P_IP):
 		if (!tcf_csum_ipv4(skb, update_flags))
 			goto drop;
@@ -582,13 +587,35 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
 		if (!tcf_csum_ipv6(skb, update_flags))
 			goto drop;
 		break;
+	case cpu_to_be16(ETH_P_8021AD): /* fall through */
+	case cpu_to_be16(ETH_P_8021Q):
+		if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) {
+			protocol = skb->protocol;
+			orig_vlan_tag_present = true;
+		} else {
+			struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data;
+
+			protocol = vlan->h_vlan_encapsulated_proto;
+			skb_pull(skb, VLAN_HLEN);
+			skb_reset_network_header(skb);
+			vlan_hdr_count++;
+		}
+		goto again;
+	}
+
+out:
+	/* Restore the skb for the pulled VLAN tags */
+	while (vlan_hdr_count--) {
+		skb_push(skb, VLAN_HLEN);
+		skb_reset_network_header(skb);
 	}
 
 	return action;
 
 drop:
 	qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
-	return TC_ACT_SHOT;
+	action = TC_ACT_SHOT;
+	goto out;
 }
 
 static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 3404af72b4c1..fc2b884b170c 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -201,8 +201,14 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
 {
 	if (!p)
 		return;
-	if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
+	if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
+#ifdef CONFIG_DST_CACHE
+		struct ip_tunnel_info *info = &p->tcft_enc_metadata->u.tun_info;
+
+		dst_cache_destroy(&info->dst_cache);
+#endif
 		dst_release(&p->tcft_enc_metadata->dst);
+	}
 	kfree_rcu(p, rcu);
 }
 
@@ -384,7 +390,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 
 release_dst_cache:
 #ifdef CONFIG_DST_CACHE
-	dst_cache_destroy(&metadata->u.tun_info.dst_cache);
+	if (metadata)
+		dst_cache_destroy(&metadata->u.tun_info.dst_cache);
 #endif
 release_tun_meta:
 	dst_release(&metadata->dst);
@@ -401,15 +408,8 @@ static void tunnel_key_release(struct tc_action *a)
 {
 	struct tcf_tunnel_key *t = to_tunnel_key(a);
 	struct tcf_tunnel_key_params *params;
-#ifdef CONFIG_DST_CACHE
-	struct ip_tunnel_info *info;
-#endif
 
 	params = rcu_dereference_protected(t->params, 1);
-#ifdef CONFIG_DST_CACHE
-	info = &params->tcft_enc_metadata->u.tun_info;
-	dst_cache_destroy(&info->dst_cache);
-#endif
 	tunnel_key_release_params(params);
 }
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 6593c245f714..478095d50f95 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -238,18 +238,23 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held,
 		tcf_proto_destroy(tp, rtnl_held, extack);
 }
 
-static int walker_noop(struct tcf_proto *tp, void *d, struct tcf_walker *arg)
+static int walker_check_empty(struct tcf_proto *tp, void *fh,
+			      struct tcf_walker *arg)
 {
-	return -1;
+	if (fh) {
+		arg->nonempty = true;
+		return -1;
+	}
+	return 0;
 }
 
 static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held)
 {
-	struct tcf_walker walker = { .fn = walker_noop, };
+	struct tcf_walker walker = { .fn = walker_check_empty, };
 
 	if (tp->ops->walk) {
 		tp->ops->walk(tp, &walker, rtnl_held);
-		return !walker.stop;
+		return !walker.nonempty;
 	}
 	return true;
 }
@@ -2907,12 +2912,12 @@ errout_block_locked:
 /* called with RTNL */
 static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct tcf_chain *chain, *chain_prev;
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tca[TCA_MAX + 1];
 	struct Qdisc *q = NULL;
 	struct tcf_block *block;
 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
+	struct tcf_chain *chain;
 	long index_start;
 	long index;
 	u32 parent;
@@ -2975,11 +2980,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
 	index_start = cb->args[0];
 	index = 0;
 
-	for (chain = __tcf_get_next_chain(block, NULL);
-	     chain;
-	     chain_prev = chain,
-		     chain = __tcf_get_next_chain(block, chain),
-		     tcf_chain_put(chain_prev)) {
+	mutex_lock(&block->lock);
+	list_for_each_entry(chain, &block->chain_list, list) {
 		if ((tca[TCA_CHAIN] &&
 		     nla_get_u32(tca[TCA_CHAIN]) != chain->index))
 			continue;
@@ -2987,17 +2989,18 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
 			index++;
 			continue;
 		}
+		if (tcf_chain_held_by_acts_only(chain))
+			continue;
 		err = tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv,
 					 chain->index, net, skb, block,
 					 NETLINK_CB(cb->skb).portid,
 					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					 RTM_NEWCHAIN);
-		if (err <= 0) {
-			tcf_chain_put(chain);
+		if (err <= 0)
 			break;
-		}
 		index++;
 	}
+	mutex_unlock(&block->lock);
 
 	if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
 		tcf_block_refcnt_put(block, true);
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 456ee6e62dfa..ad036b00427d 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -363,7 +363,10 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg,
 	struct fw_head *head = rtnl_dereference(tp->root);
 	int h;
 
-	if (head == NULL || arg->stop)
+	if (head == NULL)
+		arg->stop = 1;
+
+	if (arg->stop)
 		return;
 
 	for (h = 0; h < HTSIZE; h++) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 38e5add14fab..7bcee8b8f803 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -559,7 +559,7 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = {
 };
 
 static struct netdev_queue noop_netdev_queue = {
-	.qdisc		=	&noop_qdisc,
+	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
 	.qdisc_sleeping	=	&noop_qdisc,
 };
 
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index d1429371592f..1cc0c7b74aa3 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -17,9 +17,7 @@
  * University of Oslo, Norway.
  *
  * References:
- * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
- * IEEE  Conference on High Performance Switching and Routing 2013 :
- * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
+ * RFC 8033: https://tools.ietf.org/html/rfc8033
  */
 
 #include <linux/module.h>
@@ -31,9 +29,9 @@
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
 
-#define QUEUE_THRESHOLD 10000
+#define QUEUE_THRESHOLD 16384
 #define DQCOUNT_INVALID -1
-#define MAX_PROB  0xffffffff
+#define MAX_PROB 0xffffffffffffffff
 #define PIE_SCALE 8
 
 /* parameters used */
@@ -49,14 +47,16 @@ struct pie_params {
 
 /* variables used */
 struct pie_vars {
-	u32 prob;		/* probability but scaled by u32 limit. */
+	u64 prob;		/* probability but scaled by u64 limit. */
 	psched_time_t burst_time;
 	psched_time_t qdelay;
 	psched_time_t qdelay_old;
 	u64 dq_count;		/* measured in bytes */
 	psched_time_t dq_tstamp;	/* drain rate */
+	u64 accu_prob;		/* accumulated drop probability */
 	u32 avg_dq_rate;	/* bytes per pschedtime tick,scaled */
 	u32 qlen_old;		/* in bytes */
+	u8 accu_prob_overflows;	/* overflows of accu_prob */
 };
 
 /* statistics gathering */
@@ -81,9 +81,9 @@ static void pie_params_init(struct pie_params *params)
 {
 	params->alpha = 2;
 	params->beta = 20;
-	params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC);	/* 30 ms */
+	params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC);	/* 15 ms */
 	params->limit = 1000;	/* default of 1000 packets */
-	params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC);	/* 20 ms */
+	params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC);	/* 15 ms */
 	params->ecn = false;
 	params->bytemode = false;
 }
@@ -91,16 +91,18 @@ static void pie_params_init(struct pie_params *params)
 static void pie_vars_init(struct pie_vars *vars)
 {
 	vars->dq_count = DQCOUNT_INVALID;
+	vars->accu_prob = 0;
 	vars->avg_dq_rate = 0;
-	/* default of 100 ms in pschedtime */
-	vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
+	/* default of 150 ms in pschedtime */
+	vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
+	vars->accu_prob_overflows = 0;
 }
 
 static bool drop_early(struct Qdisc *sch, u32 packet_size)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
-	u32 rnd;
-	u32 local_prob = q->vars.prob;
+	u64 rnd;
+	u64 local_prob = q->vars.prob;
 	u32 mtu = psched_mtu(qdisc_dev(sch));
 
 	/* If there is still burst allowance left skip random early drop */
@@ -124,13 +126,33 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
 	 * probablity. Smaller packets will have lower drop prob in this case
 	 */
 	if (q->params.bytemode && packet_size <= mtu)
-		local_prob = (local_prob / mtu) * packet_size;
+		local_prob = (u64)packet_size * div_u64(local_prob, mtu);
 	else
 		local_prob = q->vars.prob;
 
-	rnd = prandom_u32();
-	if (rnd < local_prob)
+	if (local_prob == 0) {
+		q->vars.accu_prob = 0;
+		q->vars.accu_prob_overflows = 0;
+	}
+
+	if (local_prob > MAX_PROB - q->vars.accu_prob)
+		q->vars.accu_prob_overflows++;
+
+	q->vars.accu_prob += local_prob;
+
+	if (q->vars.accu_prob_overflows == 0 &&
+	    q->vars.accu_prob < (MAX_PROB / 100) * 85)
+		return false;
+	if (q->vars.accu_prob_overflows == 8 &&
+	    q->vars.accu_prob >= MAX_PROB / 2)
+		return true;
+
+	prandom_bytes(&rnd, 8);
+	if (rnd < local_prob) {
+		q->vars.accu_prob = 0;
+		q->vars.accu_prob_overflows = 0;
 		return true;
+	}
 
 	return false;
 }
@@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 out:
 	q->stats.dropped++;
+	q->vars.accu_prob = 0;
+	q->vars.accu_prob_overflows = 0;
 	return qdisc_drop(skb, sch, to_free);
 }
 
@@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch)
 	u32 qlen = sch->qstats.backlog;	/* queue size in bytes */
 	psched_time_t qdelay = 0;	/* in pschedtime */
 	psched_time_t qdelay_old = q->vars.qdelay;	/* in pschedtime */
-	s32 delta = 0;		/* determines the change in probability */
-	u32 oldprob;
-	u32 alpha, beta;
+	s64 delta = 0;		/* determines the change in probability */
+	u64 oldprob;
+	u64 alpha, beta;
+	u32 power;
 	bool update_prob = true;
 
 	q->vars.qdelay_old = q->vars.qdelay;
@@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch)
 	 * value for alpha as 0.125. In this implementation, we use values 0-32
 	 * passed from user space to represent this. Also, alpha and beta have
 	 * unit of HZ and need to be scaled before they can used to update
-	 * probability. alpha/beta are updated locally below by 1) scaling them
-	 * appropriately 2) scaling down by 16 to come to 0-2 range.
-	 * Please see paper for details.
-	 *
-	 * We scale alpha and beta differently depending on whether we are in
-	 * light, medium or high dropping mode.
+	 * probability. alpha/beta are updated locally below by scaling down
+	 * by 16 to come to 0-2 range.
 	 */
-	if (q->vars.prob < MAX_PROB / 100) {
-		alpha =
-		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
-		beta =
-		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
-	} else if (q->vars.prob < MAX_PROB / 10) {
-		alpha =
-		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
-		beta =
-		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
-	} else {
-		alpha =
-		    (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
-		beta =
-		    (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+	alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+	beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+
+	/* We scale alpha and beta differently depending on how heavy the
+	 * congestion is. Please see RFC 8033 for details.
+	 */
+	if (q->vars.prob < MAX_PROB / 10) {
+		alpha >>= 1;
+		beta >>= 1;
+
+		power = 100;
+		while (q->vars.prob < div_u64(MAX_PROB, power) &&
+		       power <= 1000000) {
+			alpha >>= 2;
+			beta >>= 2;
+			power *= 10;
+		}
 	}
 
 	/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
-	delta += alpha * ((qdelay - q->params.target));
-	delta += beta * ((qdelay - qdelay_old));
+	delta += alpha * (u64)(qdelay - q->params.target);
+	delta += beta * (u64)(qdelay - qdelay_old);
 
 	oldprob = q->vars.prob;
 
 	/* to ensure we increase probability in steps of no more than 2% */
-	if (delta > (s32)(MAX_PROB / (100 / 2)) &&
+	if (delta > (s64)(MAX_PROB / (100 / 2)) &&
 	    q->vars.prob >= MAX_PROB / 10)
 		delta = (MAX_PROB / 100) * 2;
 
@@ -406,7 +429,8 @@ static void calculate_probability(struct Qdisc *sch)
 	 */
 
 	if (qdelay == 0 && qdelay_old == 0 && update_prob)
-		q->vars.prob = (q->vars.prob * 98) / 100;
+		/* Reduce drop probability to 98.4% */
+		q->vars.prob -= q->vars.prob / 64u;
 
 	q->vars.qdelay = qdelay;
 	q->vars.qlen_old = qlen;