diff options
Diffstat (limited to 'net/sched')
-rw-r--r-- | net/sched/act_mirred.c | 2 | ||||
-rw-r--r-- | net/sched/sch_cbq.c | 2 | ||||
-rw-r--r-- | net/sched/sch_drr.c | 2 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 44 | ||||
-rw-r--r-- | net/sched/sch_hfsc.c | 2 | ||||
-rw-r--r-- | net/sched/sch_htb.c | 261 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 111 | ||||
-rw-r--r-- | net/sched/sch_qfq.c | 2 | ||||
-rw-r--r-- | net/sched/sch_tbf.c | 47 |
9 files changed, 297 insertions, 176 deletions
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5d676edc22a6..977c10e0631b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -243,7 +243,7 @@ nla_put_failure: static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; if (event == NETDEV_UNREGISTER) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 1bc210ffcba2..71a568862557 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -130,7 +130,7 @@ struct cbq_class { psched_time_t penalized; struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 759b308d1a8d..8302717ea303 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -25,7 +25,7 @@ struct drr_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct list_head alist; struct Qdisc *qdisc; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 20224086cc28..4626cef4b76e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -901,37 +901,33 @@ void dev_shutdown(struct net_device *dev) void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf) { - u64 factor; - u64 mult; - int shift; - memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; - r->rate_bps = (u64)conf->rate << 3; + r->rate_bytes_ps = conf->rate; r->mult = 1; /* - * Calibrate mult, shift so that token counting is accurate - * for smallest packet size (64 bytes). Token (time in ns) is - * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will - * work as long as the smallest packet transfer time can be - * accurately represented in nanosec. + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. */ - if (r->rate_bps > 0) { - /* - * Higher shift gives better accuracy. Find the largest - * shift such that mult fits in 32 bits. - */ - for (shift = 0; shift < 16; shift++) { - r->shift = shift; - factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); - mult = div64_u64(factor, r->rate_bps); - if (mult > UINT_MAX) + if (r->rate_bytes_ps > 0) { + u64 factor = NSEC_PER_SEC; + + for (;;) { + r->mult = div64_u64(factor, r->rate_bytes_ps); + if (r->mult & (1U << 31) || factor & (1ULL << 63)) break; + factor <<= 1; + r->shift++; } - - r->shift = shift - 1; - factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); - r->mult = div64_u64(factor, r->rate_bps); } } EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 9facea03faeb..c4075610502c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -114,7 +114,7 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index adaedd79389c..c2124ea29f45 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -65,6 +65,10 @@ static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis f module_param (htb_hysteresis, int, 0640); MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate"); +static int htb_rate_est = 0; /* htb classes have a default rate estimator */ +module_param(htb_rate_est, int, 0640); +MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes"); + /* used internaly to keep status of single class */ enum htb_cmode { HTB_CANT_SEND, /* class can't send and can't borrow */ @@ -72,95 +76,105 @@ enum htb_cmode { HTB_CAN_SEND /* class can send */ }; -/* interior & leaf nodes; props specific to leaves are marked L: */ +struct htb_prio { + union { + struct rb_root row; + struct rb_root feed; + }; + struct rb_node *ptr; + /* When class changes from state 1->2 and disconnects from + * parent's feed then we lost ptr value and start from the + * first child again. Here we store classid of the + * last valid ptr (used when ptr is NULL). + */ + u32 last_ptr_id; +}; + +/* interior & leaf nodes; props specific to leaves are marked L: + * To reduce false sharing, place mostly read fields at beginning, + * and mostly written ones at the end. + */ struct htb_class { struct Qdisc_class_common common; - /* general class parameters */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; - struct tc_htb_xstats xstats; /* our special stats */ - int refcnt; /* usage count of this class */ + struct psched_ratecfg rate; + struct psched_ratecfg ceil; + s64 buffer, cbuffer;/* token bucket depth/rate */ + s64 mbuffer; /* max wait time */ + int prio; /* these two are used only by leaves... */ + int quantum; /* but stored for parent-to-leaf return */ - /* topology */ - int level; /* our level (see above) */ - unsigned int children; - struct htb_class *parent; /* parent class */ + struct tcf_proto *filter_list; /* class attached filters */ + int filter_cnt; + int refcnt; /* usage count of this class */ - int prio; /* these two are used only by leaves... */ - int quantum; /* but stored for parent-to-leaf return */ + int level; /* our level (see above) */ + unsigned int children; + struct htb_class *parent; /* parent class */ + + struct gnet_stats_rate_est64 rate_est; + + /* + * Written often fields + */ + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct tc_htb_xstats xstats; /* our special stats */ + + /* token bucket parameters */ + s64 tokens, ctokens;/* current number of tokens */ + s64 t_c; /* checkpoint time */ union { struct htb_class_leaf { - struct Qdisc *q; - int deficit[TC_HTB_MAXDEPTH]; struct list_head drop_list; + int deficit[TC_HTB_MAXDEPTH]; + struct Qdisc *q; } leaf; struct htb_class_inner { - struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ - struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ - /* When class changes from state 1->2 and disconnects from - * parent's feed then we lost ptr value and start from the - * first child again. Here we store classid of the - * last valid ptr (used when ptr is NULL). - */ - u32 last_ptr_id[TC_HTB_NUMPRIO]; + struct htb_prio clprio[TC_HTB_NUMPRIO]; } inner; } un; - struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ - struct rb_node pq_node; /* node for event queue */ - s64 pq_key; + s64 pq_key; - int prio_activity; /* for which prios are we active */ - enum htb_cmode cmode; /* current mode of the class */ - - /* class attached filters */ - struct tcf_proto *filter_list; - int filter_cnt; + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + struct rb_node pq_node; /* node for event queue */ + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ +}; - /* token bucket parameters */ - struct psched_ratecfg rate; - struct psched_ratecfg ceil; - s64 buffer, cbuffer; /* token bucket depth/rate */ - s64 mbuffer; /* max wait time */ - s64 tokens, ctokens; /* current number of tokens */ - s64 t_c; /* checkpoint time */ +struct htb_level { + struct rb_root wait_pq; + struct htb_prio hprio[TC_HTB_NUMPRIO]; }; struct htb_sched { struct Qdisc_class_hash clhash; - struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ - - /* self list - roots of self generating tree */ - struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - int row_mask[TC_HTB_MAXDEPTH]; - struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; - u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + int defcls; /* class where unclassified flows go to */ + int rate2quantum; /* quant = rate / rate2quantum */ - /* self wait list - roots of wait PQs per row */ - struct rb_root wait_pq[TC_HTB_MAXDEPTH]; + /* filters for qdisc itself */ + struct tcf_proto *filter_list; - /* time of nearest event per level (row) */ - s64 near_ev_cache[TC_HTB_MAXDEPTH]; +#define HTB_WARN_TOOMANYEVENTS 0x1 + unsigned int warned; /* only one warning */ + int direct_qlen; + struct work_struct work; - int defcls; /* class where unclassified flows go to */ + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + long direct_pkts; - /* filters for qdisc itself */ - struct tcf_proto *filter_list; + struct qdisc_watchdog watchdog; - int rate2quantum; /* quant = rate / rate2quantum */ - s64 now; /* cached dequeue time */ - struct qdisc_watchdog watchdog; + s64 now; /* cached dequeue time */ + struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ - /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; - int direct_qlen; /* max qlen of above */ + /* time of nearest event per level (row) */ + s64 near_ev_cache[TC_HTB_MAXDEPTH]; - long direct_pkts; + int row_mask[TC_HTB_MAXDEPTH]; -#define HTB_WARN_TOOMANYEVENTS 0x1 - unsigned int warned; /* only one warning */ - struct work_struct work; + struct htb_level hlevel[TC_HTB_MAXDEPTH]; }; /* find class in global hash table using given handle */ @@ -276,7 +290,7 @@ static void htb_add_to_id_tree(struct rb_root *root, static void htb_add_to_wait_tree(struct htb_sched *q, struct htb_class *cl, s64 delay) { - struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; + struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL; cl->pq_key = q->now + delay; if (cl->pq_key == q->now) @@ -296,7 +310,7 @@ static void htb_add_to_wait_tree(struct htb_sched *q, p = &parent->rb_left; } rb_link_node(&cl->pq_node, parent, p); - rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); + rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq); } /** @@ -323,7 +337,7 @@ static inline void htb_add_class_to_row(struct htb_sched *q, while (mask) { int prio = ffz(~mask); mask &= ~(1 << prio); - htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio); + htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio); } } @@ -349,16 +363,18 @@ static inline void htb_remove_class_from_row(struct htb_sched *q, struct htb_class *cl, int mask) { int m = 0; + struct htb_level *hlevel = &q->hlevel[cl->level]; while (mask) { int prio = ffz(~mask); + struct htb_prio *hprio = &hlevel->hprio[prio]; mask &= ~(1 << prio); - if (q->ptr[cl->level][prio] == cl->node + prio) - htb_next_rb_node(q->ptr[cl->level] + prio); + if (hprio->ptr == cl->node + prio) + htb_next_rb_node(&hprio->ptr); - htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio); - if (!q->row[cl->level][prio].rb_node) + htb_safe_rb_erase(cl->node + prio, &hprio->row); + if (!hprio->row.rb_node) m |= 1 << prio; } q->row_mask[cl->level] &= ~m; @@ -382,13 +398,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.feed[prio].rb_node) + if (p->un.inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that * reset bit in mask as parent is already ok */ mask &= ~(1 << prio); - htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio); + htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); } p->prio_activity |= mask; cl = p; @@ -418,18 +434,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.ptr[prio] == cl->node + prio) { + if (p->un.inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from * parent feed - forget the pointer but remember * classid */ - p->un.inner.last_ptr_id[prio] = cl->common.classid; - p->un.inner.ptr[prio] = NULL; + p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; + p->un.inner.clprio[prio].ptr = NULL; } - htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio); + htb_safe_rb_erase(cl->node + prio, + &p->un.inner.clprio[prio].feed); - if (!p->un.inner.feed[prio].rb_node) + if (!p->un.inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } @@ -644,7 +661,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, htb_change_class_mode(q, cl, &diff); if (old_mode != cl->cmode) { if (old_mode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq); if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); } @@ -664,7 +681,7 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, * next pending event (0 for no event in pq, q->now for too many events). * Note: Applied are events whose have cl->pq_key <= q->now. */ -static s64 htb_do_events(struct htb_sched *q, int level, +static s64 htb_do_events(struct htb_sched *q, const int level, unsigned long start) { /* don't run for longer than 2 jiffies; 2 is used instead of @@ -672,10 +689,12 @@ static s64 htb_do_events(struct htb_sched *q, int level, * too soon */ unsigned long stop_at = start + 2; + struct rb_root *wait_pq = &q->hlevel[level].wait_pq; + while (time_before(jiffies, stop_at)) { struct htb_class *cl; s64 diff; - struct rb_node *p = rb_first(&q->wait_pq[level]); + struct rb_node *p = rb_first(wait_pq); if (!p) return 0; @@ -684,7 +703,7 @@ static s64 htb_do_events(struct htb_sched *q, int level, if (cl->pq_key > q->now) return cl->pq_key; - htb_safe_rb_erase(p, q->wait_pq + level); + htb_safe_rb_erase(p, wait_pq); diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) @@ -728,8 +747,7 @@ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, * * Find leaf where current feed pointers points to. */ -static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, - struct rb_node **pptr, u32 * pid) +static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) { int i; struct { @@ -738,10 +756,10 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, u32 *pid; } stk[TC_HTB_MAXDEPTH], *sp = stk; - BUG_ON(!tree->rb_node); - sp->root = tree->rb_node; - sp->pptr = pptr; - sp->pid = pid; + BUG_ON(!hprio->row.rb_node); + sp->root = hprio->row.rb_node; + sp->pptr = &hprio->ptr; + sp->pid = &hprio->last_ptr_id; for (i = 0; i < 65535; i++) { if (!*sp->pptr && *sp->pid) { @@ -768,12 +786,15 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, } } else { struct htb_class *cl; + struct htb_prio *clp; + cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); if (!cl->level) return cl; - (++sp)->root = cl->un.inner.feed[prio].rb_node; - sp->pptr = cl->un.inner.ptr + prio; - sp->pid = cl->un.inner.last_ptr_id + prio; + clp = &cl->un.inner.clprio[prio]; + (++sp)->root = clp->feed.rb_node; + sp->pptr = &clp->ptr; + sp->pid = &clp->last_ptr_id; } } WARN_ON(1); @@ -783,15 +804,16 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, /* dequeues packet at given priority and level; call only if * you are sure that there is active class at prio/level */ -static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio, - int level) +static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, + const int level) { struct sk_buff *skb = NULL; struct htb_class *cl, *start; + struct htb_level *hlevel = &q->hlevel[level]; + struct htb_prio *hprio = &hlevel->hprio[prio]; + /* look initial class up in the row */ - start = cl = htb_lookup_leaf(q->row[level] + prio, prio, - q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + start = cl = htb_lookup_leaf(hprio, prio); do { next: @@ -811,9 +833,7 @@ next: if ((q->row_mask[level] & (1 << prio)) == 0) return NULL; - next = htb_lookup_leaf(q->row[level] + prio, - prio, q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + next = htb_lookup_leaf(hprio, prio); if (cl == start) /* fix start if we just deleted it */ start = next; @@ -826,11 +846,9 @@ next: break; qdisc_warn_nonwc("htb", cl->un.leaf.q); - htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> - ptr[0]) + prio); - cl = htb_lookup_leaf(q->row[level] + prio, prio, - q->ptr[level] + prio, - q->last_ptr_id[level] + prio); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr: + &q->hlevel[0].hprio[prio].ptr); + cl = htb_lookup_leaf(hprio, prio); } while (cl != start); @@ -839,8 +857,8 @@ next: cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); if (cl->un.leaf.deficit[level] < 0) { cl->un.leaf.deficit[level] += cl->quantum; - htb_next_rb_node((level ? cl->parent->un.inner.ptr : q-> - ptr[0]) + prio); + htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr : + &q->hlevel[0].hprio[prio].ptr); } /* this used to be after charge_class but this constelation * gives us slightly better performance @@ -880,15 +898,14 @@ ok: for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; - s64 event; + s64 event = q->near_ev_cache[level]; - if (q->now >= q->near_ev_cache[level]) { + if (q->now >= event) { event = htb_do_events(q, level, start_at); if (!event) event = q->now + NSEC_PER_SEC; q->near_ev_cache[level] = event; - } else - event = q->near_ev_cache[level]; + } if (next_event > event) next_event = event; @@ -968,10 +985,8 @@ static void htb_reset(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); __skb_queue_purge(&q->direct_queue); sch->q.qlen = 0; - memset(q->row, 0, sizeof(q->row)); + memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); - memset(q->wait_pq, 0, sizeof(q->wait_pq)); - memset(q->ptr, 0, sizeof(q->ptr)); for (i = 0; i < TC_HTB_NUMPRIO; i++) INIT_LIST_HEAD(q->drops + i); } @@ -1192,7 +1207,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity); if (parent->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level); + htb_safe_rb_erase(&parent->pq_node, + &q->hlevel[parent->level].wait_pq); parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); @@ -1281,7 +1297,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) - htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level); + htb_safe_rb_erase(&cl->pq_node, + &q->hlevel[cl->level].wait_pq); if (last_child) htb_parent_to_leaf(q, cl, new_q); @@ -1366,12 +1383,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = gen_new_estimator(&cl->bstats, &cl->rate_est, - qdisc_root_sleeping_lock(sch), - tca[TCA_RATE] ? : &est.nla); - if (err) { - kfree(cl); - goto failure; + if (htb_rate_est || tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE] ? : &est.nla); + if (err) { + kfree(cl); + goto failure; + } } cl->refcnt = 1; @@ -1401,7 +1420,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { - htb_safe_rb_erase(&parent->pq_node, q->wait_pq); + htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq); parent->cmode = HTB_CAN_SEND; } parent->level = (parent->parent ? parent->parent->level diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 3d2acc7a9c80..82f6016d89ab 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -23,6 +23,7 @@ #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <linux/reciprocal_div.h> +#include <linux/rbtree.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -68,7 +69,8 @@ */ struct netem_sched_data { - /* internal t(ime)fifo qdisc uses sch->q and sch->limit */ + /* internal t(ime)fifo qdisc uses t_root and sch->limit */ + struct rb_root t_root; /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; @@ -128,10 +130,35 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; + ktime_t tstamp_save; }; +/* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp + * to hold a rb_node structure. + * + * If struct sk_buff layout is changed, the following checks will complain. + */ +static struct rb_node *netem_rb_node(struct sk_buff *skb) +{ + BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0); + BUILD_BUG_ON(offsetof(struct sk_buff, prev) != + offsetof(struct sk_buff, next) + sizeof(skb->next)); + BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) != + offsetof(struct sk_buff, prev) + sizeof(skb->prev)); + BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) + + sizeof(skb->prev) + + sizeof(skb->tstamp)); + return (struct rb_node *)&skb->next; +} + +static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) +{ + return (struct sk_buff *)rb; +} + static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { + /* we assume we can use skb next/prev/tstamp as storage for rb_node */ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } @@ -333,20 +360,23 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { - struct sk_buff_head *list = &sch->q; + struct netem_sched_data *q = qdisc_priv(sch); psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; - struct sk_buff *skb = skb_peek_tail(list); + struct rb_node **p = &q->t_root.rb_node, *parent = NULL; - /* Optimize for add at tail */ - if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send)) - return __skb_queue_tail(list, nskb); + while (*p) { + struct sk_buff *skb; - skb_queue_reverse_walk(list, skb) { + parent = *p; + skb = netem_rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) - break; + p = &parent->rb_right; + else + p = &parent->rb_left; } - - __skb_queue_after(list, skb, nskb); + rb_link_node(netem_rb_node(nskb), parent, p); + rb_insert_color(netem_rb_node(nskb), &q->t_root); + sch->q.qlen++; } /* @@ -436,23 +466,28 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) now = psched_get_time(); if (q->rate) { - struct sk_buff_head *list = &sch->q; + struct sk_buff *last; - if (!skb_queue_empty(list)) { + if (!skb_queue_empty(&sch->q)) + last = skb_peek_tail(&sch->q); + else + last = netem_rb_to_skb(rb_last(&q->t_root)); + if (last) { /* * Last packet in queue is reference point (now), * calculate this time bonus and subtract * from delay. */ - delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now; + delay -= netem_skb_cb(last)->time_to_send - now; delay = max_t(psched_tdiff_t, 0, delay); - now = netem_skb_cb(skb_peek_tail(list))->time_to_send; + now = netem_skb_cb(last)->time_to_send; } delay += packet_len_2_sched_time(skb->len, q); } cb->time_to_send = now + delay; + cb->tstamp_save = skb->tstamp; ++q->counter; tfifo_enqueue(skb, sch); } else { @@ -476,6 +511,21 @@ static unsigned int netem_drop(struct Qdisc *sch) unsigned int len; len = qdisc_queue_drop(sch); + + if (!len) { + struct rb_node *p = rb_first(&q->t_root); + + if (p) { + struct sk_buff *skb = netem_rb_to_skb(p); + + rb_erase(p, &q->t_root); + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + len = qdisc_pkt_len(skb); + kfree_skb(skb); + } + } if (!len && q->qdisc && q->qdisc->ops->drop) len = q->qdisc->ops->drop(q->qdisc); if (len) @@ -488,19 +538,35 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; + struct rb_node *p; if (qdisc_is_throttled(sch)) return NULL; tfifo_dequeue: - skb = qdisc_peek_head(sch); + skb = __skb_dequeue(&sch->q); if (skb) { - const struct netem_skb_cb *cb = netem_skb_cb(skb); +deliver: + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); + return skb; + } + p = rb_first(&q->t_root); + if (p) { + psched_time_t time_to_send; + + skb = netem_rb_to_skb(p); /* if more time remaining? */ - if (cb->time_to_send <= psched_get_time()) { - __skb_unlink(skb, &sch->q); - sch->qstats.backlog -= qdisc_pkt_len(skb); + time_to_send = netem_skb_cb(skb)->time_to_send; + if (time_to_send <= psched_get_time()) { + rb_erase(p, &q->t_root); + + sch->q.qlen--; + skb->next = NULL; + skb->prev = NULL; + skb->tstamp = netem_skb_cb(skb)->tstamp_save; #ifdef CONFIG_NET_CLS_ACT /* @@ -522,10 +588,7 @@ tfifo_dequeue: } goto tfifo_dequeue; } -deliver: - qdisc_unthrottled(sch); - qdisc_bstats_update(sch, skb); - return skb; + goto deliver; } if (q->qdisc) { @@ -533,7 +596,7 @@ deliver: if (skb) goto deliver; } - qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); + qdisc_watchdog_schedule(&q->watchdog, time_to_send); } if (q->qdisc) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d51852bba01c..7c195d972bf0 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -138,7 +138,7 @@ struct qfq_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. */ diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index e478d316602b..1aaf1b6e51a2 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -116,14 +116,57 @@ struct tbf_sched_data { struct qdisc_watchdog watchdog; /* Watchdog timer */ }; + +/* GSO packet is too big, segment it so that tbf can transmit + * each segment in time + */ +static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + int ret, nb; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) + return qdisc_reshape_fail(skb, sch); + + nb = 0; + while (segs) { + nskb = segs->next; + segs->next = NULL; + if (likely(segs->len <= q->max_size)) { + qdisc_skb_cb(segs)->pkt_len = segs->len; + ret = qdisc_enqueue(segs, q->qdisc); + } else { + ret = qdisc_reshape_fail(skb, sch); + } + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + } else { + nb++; + } + segs = nskb; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_decrease_qlen(sch, 1 - nb); + consume_skb(skb); + return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; - if (qdisc_pkt_len(skb) > q->max_size) + if (qdisc_pkt_len(skb) > q->max_size) { + if (skb_is_gso(skb)) + return tbf_segment(skb, sch); return qdisc_reshape_fail(skb, sch); - + } ret = qdisc_enqueue(skb, q->qdisc); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) |