diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/act_csum.c | 31 | ||||
-rw-r--r-- | net/sched/act_tunnel_key.c | 18 | ||||
-rw-r--r-- | net/sched/cls_api.c | 29 | ||||
-rw-r--r-- | net/sched/cls_fw.c | 5 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 2 | ||||
-rw-r--r-- | net/sched/sch_pie.c | 110 |
6 files changed, 126 insertions, 69 deletions
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 945fb34ae721..c79aca29505e 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -559,8 +559,11 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); + bool orig_vlan_tag_present = false; + unsigned int vlan_hdr_count = 0; struct tcf_csum_params *params; u32 update_flags; + __be16 protocol; int action; params = rcu_dereference_bh(p->params); @@ -573,7 +576,9 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, goto drop; update_flags = params->update_flags; - switch (tc_skb_protocol(skb)) { + protocol = tc_skb_protocol(skb); +again: + switch (protocol) { case cpu_to_be16(ETH_P_IP): if (!tcf_csum_ipv4(skb, update_flags)) goto drop; @@ -582,13 +587,35 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, if (!tcf_csum_ipv6(skb, update_flags)) goto drop; break; + case cpu_to_be16(ETH_P_8021AD): /* fall through */ + case cpu_to_be16(ETH_P_8021Q): + if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) { + protocol = skb->protocol; + orig_vlan_tag_present = true; + } else { + struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data; + + protocol = vlan->h_vlan_encapsulated_proto; + skb_pull(skb, VLAN_HLEN); + skb_reset_network_header(skb); + vlan_hdr_count++; + } + goto again; + } + +out: + /* Restore the skb for the pulled VLAN tags */ + while (vlan_hdr_count--) { + skb_push(skb, VLAN_HLEN); + skb_reset_network_header(skb); } return action; drop: qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); - return TC_ACT_SHOT; + action = TC_ACT_SHOT; + goto out; } static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 3404af72b4c1..fc2b884b170c 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -201,8 +201,14 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) { if (!p) return; - if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) + if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { +#ifdef CONFIG_DST_CACHE + struct ip_tunnel_info *info = &p->tcft_enc_metadata->u.tun_info; + + dst_cache_destroy(&info->dst_cache); +#endif dst_release(&p->tcft_enc_metadata->dst); + } kfree_rcu(p, rcu); } @@ -384,7 +390,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, release_dst_cache: #ifdef CONFIG_DST_CACHE - dst_cache_destroy(&metadata->u.tun_info.dst_cache); + if (metadata) + dst_cache_destroy(&metadata->u.tun_info.dst_cache); #endif release_tun_meta: dst_release(&metadata->dst); @@ -401,15 +408,8 @@ static void tunnel_key_release(struct tc_action *a) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; -#ifdef CONFIG_DST_CACHE - struct ip_tunnel_info *info; -#endif params = rcu_dereference_protected(t->params, 1); -#ifdef CONFIG_DST_CACHE - info = ¶ms->tcft_enc_metadata->u.tun_info; - dst_cache_destroy(&info->dst_cache); -#endif tunnel_key_release_params(params); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 6593c245f714..478095d50f95 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -238,18 +238,23 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, tcf_proto_destroy(tp, rtnl_held, extack); } -static int walker_noop(struct tcf_proto *tp, void *d, struct tcf_walker *arg) +static int walker_check_empty(struct tcf_proto *tp, void *fh, + struct tcf_walker *arg) { - return -1; + if (fh) { + arg->nonempty = true; + return -1; + } + return 0; } static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held) { - struct tcf_walker walker = { .fn = walker_noop, }; + struct tcf_walker walker = { .fn = walker_check_empty, }; if (tp->ops->walk) { tp->ops->walk(tp, &walker, rtnl_held); - return !walker.stop; + return !walker.nonempty; } return true; } @@ -2907,12 +2912,12 @@ errout_block_locked: /* called with RTNL */ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) { - struct tcf_chain *chain, *chain_prev; struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct tcf_chain *chain; long index_start; long index; u32 parent; @@ -2975,11 +2980,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index_start = cb->args[0]; index = 0; - for (chain = __tcf_get_next_chain(block, NULL); - chain; - chain_prev = chain, - chain = __tcf_get_next_chain(block, chain), - tcf_chain_put(chain_prev)) { + mutex_lock(&block->lock); + list_for_each_entry(chain, &block->chain_list, list) { if ((tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index)) continue; @@ -2987,17 +2989,18 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; continue; } + if (tcf_chain_held_by_acts_only(chain)) + continue; err = tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWCHAIN); - if (err <= 0) { - tcf_chain_put(chain); + if (err <= 0) break; - } index++; } + mutex_unlock(&block->lock); if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) tcf_block_refcnt_put(block, true); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 456ee6e62dfa..ad036b00427d 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -363,7 +363,10 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg, struct fw_head *head = rtnl_dereference(tp->root); int h; - if (head == NULL || arg->stop) + if (head == NULL) + arg->stop = 1; + + if (arg->stop) return; for (h = 0; h < HTSIZE; h++) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 38e5add14fab..7bcee8b8f803 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -559,7 +559,7 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = { }; static struct netdev_queue noop_netdev_queue = { - .qdisc = &noop_qdisc, + RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc), .qdisc_sleeping = &noop_qdisc, }; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index d1429371592f..1cc0c7b74aa3 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -17,9 +17,7 @@ * University of Oslo, Norway. * * References: - * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 - * IEEE Conference on High Performance Switching and Routing 2013 : - * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" + * RFC 8033: https://tools.ietf.org/html/rfc8033 */ #include <linux/module.h> @@ -31,9 +29,9 @@ #include <net/pkt_sched.h> #include <net/inet_ecn.h> -#define QUEUE_THRESHOLD 10000 +#define QUEUE_THRESHOLD 16384 #define DQCOUNT_INVALID -1 -#define MAX_PROB 0xffffffff +#define MAX_PROB 0xffffffffffffffff #define PIE_SCALE 8 /* parameters used */ @@ -49,14 +47,16 @@ struct pie_params { /* variables used */ struct pie_vars { - u32 prob; /* probability but scaled by u32 limit. */ + u64 prob; /* probability but scaled by u64 limit. */ psched_time_t burst_time; psched_time_t qdelay; psched_time_t qdelay_old; u64 dq_count; /* measured in bytes */ psched_time_t dq_tstamp; /* drain rate */ + u64 accu_prob; /* accumulated drop probability */ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ u32 qlen_old; /* in bytes */ + u8 accu_prob_overflows; /* overflows of accu_prob */ }; /* statistics gathering */ @@ -81,9 +81,9 @@ static void pie_params_init(struct pie_params *params) { params->alpha = 2; params->beta = 20; - params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ + params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */ params->limit = 1000; /* default of 1000 packets */ - params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ + params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */ params->ecn = false; params->bytemode = false; } @@ -91,16 +91,18 @@ static void pie_params_init(struct pie_params *params) static void pie_vars_init(struct pie_vars *vars) { vars->dq_count = DQCOUNT_INVALID; + vars->accu_prob = 0; vars->avg_dq_rate = 0; - /* default of 100 ms in pschedtime */ - vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); + /* default of 150 ms in pschedtime */ + vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC); + vars->accu_prob_overflows = 0; } static bool drop_early(struct Qdisc *sch, u32 packet_size) { struct pie_sched_data *q = qdisc_priv(sch); - u32 rnd; - u32 local_prob = q->vars.prob; + u64 rnd; + u64 local_prob = q->vars.prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ @@ -124,13 +126,33 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) * probablity. Smaller packets will have lower drop prob in this case */ if (q->params.bytemode && packet_size <= mtu) - local_prob = (local_prob / mtu) * packet_size; + local_prob = (u64)packet_size * div_u64(local_prob, mtu); else local_prob = q->vars.prob; - rnd = prandom_u32(); - if (rnd < local_prob) + if (local_prob == 0) { + q->vars.accu_prob = 0; + q->vars.accu_prob_overflows = 0; + } + + if (local_prob > MAX_PROB - q->vars.accu_prob) + q->vars.accu_prob_overflows++; + + q->vars.accu_prob += local_prob; + + if (q->vars.accu_prob_overflows == 0 && + q->vars.accu_prob < (MAX_PROB / 100) * 85) + return false; + if (q->vars.accu_prob_overflows == 8 && + q->vars.accu_prob >= MAX_PROB / 2) + return true; + + prandom_bytes(&rnd, 8); + if (rnd < local_prob) { + q->vars.accu_prob = 0; + q->vars.accu_prob_overflows = 0; return true; + } return false; } @@ -168,6 +190,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; + q->vars.accu_prob = 0; + q->vars.accu_prob_overflows = 0; return qdisc_drop(skb, sch, to_free); } @@ -317,9 +341,10 @@ static void calculate_probability(struct Qdisc *sch) u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ - s32 delta = 0; /* determines the change in probability */ - u32 oldprob; - u32 alpha, beta; + s64 delta = 0; /* determines the change in probability */ + u64 oldprob; + u64 alpha, beta; + u32 power; bool update_prob = true; q->vars.qdelay_old = q->vars.qdelay; @@ -339,38 +364,36 @@ static void calculate_probability(struct Qdisc *sch) * value for alpha as 0.125. In this implementation, we use values 0-32 * passed from user space to represent this. Also, alpha and beta have * unit of HZ and need to be scaled before they can used to update - * probability. alpha/beta are updated locally below by 1) scaling them - * appropriately 2) scaling down by 16 to come to 0-2 range. - * Please see paper for details. - * - * We scale alpha and beta differently depending on whether we are in - * light, medium or high dropping mode. + * probability. alpha/beta are updated locally below by scaling down + * by 16 to come to 0-2 range. */ - if (q->vars.prob < MAX_PROB / 100) { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; - } else if (q->vars.prob < MAX_PROB / 10) { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; - } else { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + + /* We scale alpha and beta differently depending on how heavy the + * congestion is. Please see RFC 8033 for details. + */ + if (q->vars.prob < MAX_PROB / 10) { + alpha >>= 1; + beta >>= 1; + + power = 100; + while (q->vars.prob < div_u64(MAX_PROB, power) && + power <= 1000000) { + alpha >>= 2; + beta >>= 2; + power *= 10; + } } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * ((qdelay - q->params.target)); - delta += beta * ((qdelay - qdelay_old)); + delta += alpha * (u64)(qdelay - q->params.target); + delta += beta * (u64)(qdelay - qdelay_old); oldprob = q->vars.prob; /* to ensure we increase probability in steps of no more than 2% */ - if (delta > (s32)(MAX_PROB / (100 / 2)) && + if (delta > (s64)(MAX_PROB / (100 / 2)) && q->vars.prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; @@ -406,7 +429,8 @@ static void calculate_probability(struct Qdisc *sch) */ if (qdelay == 0 && qdelay_old == 0 && update_prob) - q->vars.prob = (q->vars.prob * 98) / 100; + /* Reduce drop probability to 98.4% */ + q->vars.prob -= q->vars.prob / 64u; q->vars.qdelay = qdelay; q->vars.qlen_old = qlen; |