From 432c856fcf45c468fffe2e5029cb3f95c7dc9475 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Mon, 27 Oct 2014 10:30:51 -0700 Subject: net: skb_segment() should preserve backpressure This patch generalizes commit d6a4a1041176 ("tcp: GSO should be TSQ friendly") to protocols using skb_set_owner_w() TCP uses its own destructor (tcp_wfree) and needs a more complex scheme as explained in commit 6ff50cd55545 ("tcp: gso: do not generate out of order packets") This allows UDP sockets using UFO to get proper backpressure, thus avoiding qdisc drops and excessive cpu usage. Here are performance test results (macvlan on vlan): - Before # netperf -t UDP_STREAM ... Socket Message Elapsed Messages Size Size Time Okay Errors Throughput bytes bytes secs # # 10^6bits/sec 212992 65507 60.00 144096 1224195 1258.56 212992 60.00 51 0.45 Average: CPU %user %nice %system %iowait %steal %idle Average: all 0.23 0.00 25.26 0.08 0.00 74.43 - After # netperf -t UDP_STREAM ... Socket Message Elapsed Messages Size Size Time Okay Errors Throughput bytes bytes secs # # 10^6bits/sec 212992 65507 60.00 109593 0 957.20 212992 60.00 109593 957.20 Average: CPU %user %nice %system %iowait %steal %idle Average: all 0.18 0.00 8.38 0.02 0.00 91.43 [edumazet] Rewrote patch and changelog. Signed-off-by: Toshiaki Makita Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c16615bfb61e..e48e5c02e877 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3099,6 +3099,16 @@ perform_csum_check: * (see validate_xmit_skb_list() for example) */ segs->prev = tail; + + /* Following permits correct backpressure, for protocols + * using skb_set_owner_w(). + * Idea is to tranfert ownership from head_skb to last segment. + */ + if (head_skb->destructor == sock_wfree) { + swap(tail->truesize, head_skb->truesize); + swap(tail->destructor, head_skb->destructor); + swap(tail->sk, head_skb->sk); + } return segs; err: -- cgit v1.2.3 From bc9ad166e38ae1cdcb5323a8aa45dff834d68bfa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Oct 2014 18:05:13 -0700 Subject: net: introduce napi_schedule_irqoff() napi_schedule() can be called from any context and has to mask hard irqs. Add a variant that can only be called from hard interrupts handlers or when irqs are already masked. Many NIC drivers can use it from their hard IRQ handler instead of generic variant. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b793e3521a36..759940cdf896 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4372,7 +4372,8 @@ static int process_backlog(struct napi_struct *napi, int quota) * __napi_schedule - schedule for receive * @n: entry to schedule * - * The entry's receive function will be scheduled to run + * The entry's receive function will be scheduled to run. + * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { @@ -4384,6 +4385,18 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +/** + * __napi_schedule_irqoff - schedule for receive + * @n: entry to schedule + * + * Variant of __napi_schedule() assuming hard irqs are masked + */ +void __napi_schedule_irqoff(struct napi_struct *n) +{ + ____napi_schedule(this_cpu_ptr(&softnet_data), n); +} +EXPORT_SYMBOL(__napi_schedule_irqoff); + void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); -- cgit v1.2.3 From 75fbfd33234a71556bec34b099d98f970190905d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 29 Oct 2014 19:29:31 +0100 Subject: neigh: optimize neigh_parms_release() In neigh_parms_release() we loop over all entries to find the entry given in argument and being able to remove it from the list. By using a double linked list, we can avoid this loop. Here are some numbers with 30 000 dummy interfaces configured: Before the patch: $ time rmmod dummy real 2m0.118s user 0m0.000s sys 1m50.048s After the patch: $ time rmmod dummy real 1m9.970s user 0m0.000s sys 0m47.976s Suggested-by: Thierry Herbelot Signed-off-by: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef31fef25e5a..edd04116ecb7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -773,7 +773,7 @@ static void neigh_periodic_work(struct work_struct *work) if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; tbl->last_rand = jiffies; - for (p = &tbl->parms; p; p = p->next) + list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } @@ -1446,7 +1446,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, { struct neigh_parms *p; - for (p = &tbl->parms; p; p = p->next) { + list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; @@ -1481,8 +1481,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, } write_lock_bh(&tbl->lock); - p->next = tbl->parms.next; - tbl->parms.next = p; + list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); @@ -1501,24 +1500,15 @@ static void neigh_rcu_free_parms(struct rcu_head *head) void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { - struct neigh_parms **p; - if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); - for (p = &tbl->parms.next; *p; p = &(*p)->next) { - if (*p == parms) { - *p = parms->next; - parms->dead = 1; - write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); - call_rcu(&parms->rcu_head, neigh_rcu_free_parms); - return; - } - } + list_del(&parms->list); + parms->dead = 1; write_unlock_bh(&tbl->lock); - neigh_dbg(1, "%s: not found\n", __func__); + if (parms->dev) + dev_put(parms->dev); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -1535,6 +1525,8 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) unsigned long now = jiffies; unsigned long phsize; + INIT_LIST_HEAD(&tbl->parms_list); + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); atomic_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = @@ -2154,7 +2146,9 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) NLM_F_MULTI) <= 0) break; - for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + nidx = 0; + p = list_next_entry(&tbl->parms, list); + list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; -- cgit v1.2.3 From d75b1ade567ffab085e8adbbdacf0092d10cd09c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Nov 2014 06:19:33 -0800 Subject: net: less interrupt masking in NAPI net_rx_action() can mask irqs a single time to transfert sd->poll_list into a private list, for a very short duration. Then, napi_complete() can avoid masking irqs again, and net_rx_action() only needs to mask irq again in slow path. This patch removes 2 couples of irq mask/unmask per typical NAPI run, more if multiple napi were triggered. Note this also allows to give control back to caller (do_softirq()) more often, so that other softirq handlers can be called a bit earlier, or ksoftirqd can be wakeup earlier under pressure. This was developed while testing an alternative to RX interrupt mitigation to reduce latencies while keeping or improving GRO aggregation on fast NIC. Idea is to test napi->gro_list at the end of a napi->poll() and reschedule one NAPI poll, but after servicing a full round of softirqs (timers, TX, rcu, ...). This will be allowed only if softirq is currently serviced by idle task or ksoftirqd, and resched not needed. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 68 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ebf778df58cd..40be481268de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4316,20 +4316,28 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) local_irq_enable(); } +static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return sd->rps_ipi_list != NULL; +#else + return false; +#endif +} + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); -#ifdef CONFIG_RPS /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ - if (sd->rps_ipi_list) { + if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } -#endif + napi->weight = weight_p; local_irq_disable(); while (1) { @@ -4356,7 +4364,6 @@ static int process_backlog(struct napi_struct *napi, int quota) * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ - list_del(&napi->poll_list); napi->state = 0; rps_unlock(sd); @@ -4406,7 +4413,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); BUG_ON(n->gro_list); - list_del(&n->poll_list); + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } @@ -4424,9 +4431,15 @@ void napi_complete(struct napi_struct *n) return; napi_gro_flush(n, false); - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); + + if (likely(list_empty(&n->poll_list))) { + WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); + } else { + /* If n->poll_list is not empty, we need to mask irqs */ + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); + } } EXPORT_SYMBOL(napi_complete); @@ -4520,29 +4533,28 @@ static void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; int budget = netdev_budget; + LIST_HEAD(list); + LIST_HEAD(repoll); void *have; local_irq_disable(); + list_splice_init(&sd->poll_list, &list); + local_irq_enable(); - while (!list_empty(&sd->poll_list)) { + while (!list_empty(&list)) { struct napi_struct *n; int work, weight; - /* If softirq window is exhuasted then punt. + /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) goto softnet_break; - local_irq_enable(); - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + n = list_first_entry(&list, struct napi_struct, poll_list); + list_del_init(&n->poll_list); have = netpoll_poll_lock(n); @@ -4564,8 +4576,6 @@ static void net_rx_action(struct softirq_action *h) budget -= work; - local_irq_disable(); - /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can @@ -4573,32 +4583,40 @@ static void net_rx_action(struct softirq_action *h) */ if (unlikely(work == weight)) { if (unlikely(napi_disable_pending(n))) { - local_irq_enable(); napi_complete(n); - local_irq_disable(); } else { if (n->gro_list) { /* flush too old packets * If HZ < 1000, flush all packets. */ - local_irq_enable(); napi_gro_flush(n, HZ >= 1000); - local_irq_disable(); } - list_move_tail(&n->poll_list, &sd->poll_list); + list_add_tail(&n->poll_list, &repoll); } } netpoll_poll_unlock(have); } + + if (!sd_has_rps_ipi_waiting(sd) && + list_empty(&list) && + list_empty(&repoll)) + return; out: + local_irq_disable(); + + list_splice_tail_init(&sd->poll_list, &list); + list_splice_tail(&repoll, &list); + list_splice(&list, &sd->poll_list); + if (!list_empty(&sd->poll_list)) + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + net_rps_action_and_irq_enable(sd); return; softnet_break: sd->time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } -- cgit v1.2.3 From e585f23636370320bc2071ca5ba2744ae37c3e51 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:54 -0800 Subject: udp: Changes to udp_offload to support remote checksum offload Add a new GSO type, SKB_GSO_TUNNEL_REMCSUM, which indicates remote checksum offload being done (in this case inner checksum must not be offloaded to the NIC). Added logic in __skb_udp_tunnel_segment to handle remote checksum offload case. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e48e5c02e877..700189604f3d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3013,7 +3013,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (nskb->len == len + doffset) goto perform_csum_check; - if (!sg) { + if (!sg && !nskb->remcsum_offload) { nskb->ip_summed = CHECKSUM_NONE; nskb->csum = skb_copy_and_csum_bits(head_skb, offset, skb_put(nskb, len), @@ -3085,7 +3085,7 @@ skip_fraglist: nskb->truesize += nskb->data_len; perform_csum_check: - if (!csum) { + if (!csum && !nskb->remcsum_offload) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); nskb->ip_summed = CHECKSUM_NONE; -- cgit v1.2.3 From 51f3d02b980a338cd291d2bc7629cdfb2568424b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 5 Nov 2014 16:46:40 -0500 Subject: net: Add and use skb_copy_datagram_msg() helper. This encapsulates all of the skb_copy_datagram_iovec() callers with call argument signature "skb, offset, msghdr->msg_iov, length". When we move to iov_iters in the networking, the iov_iter object will sit in the msghdr. Having a helper like this means there will be less places to touch during that transformation. Based upon descriptions and patch from Al Viro. Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 15e0c67b1069..ac56dd06c306 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; -- cgit v1.2.3 From 59b93b41e7fa71138734a911b11b044340dd16bd Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 5 Nov 2014 15:27:48 -0800 Subject: net: Remove MPLS GSO feature. Device can export MPLS GSO support in dev->mpls_features same way it export vlan features in dev->vlan_features. So it is safe to remove NETIF_F_GSO_MPLS redundant flag. Signed-off-by: Pravin B Shelar --- net/core/ethtool.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 06dfb293e5aa..b0f84f5ddda8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -84,7 +84,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", - [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", -- cgit v1.2.3 From 25cd9ba0abc0749e5cb78e6493c6f6b3311ec6c5 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 6 Oct 2014 05:05:13 -0700 Subject: openvswitch: Add basic MPLS support to kernel Allow datapath to recognize and extract MPLS labels into flow keys and execute actions which push, pop, and set labels on packets. Based heavily on work by Leo Alterman, Ravi K, Isaku Yamahata and Joe Stringer. Cc: Ravi K Cc: Leo Alterman Cc: Isaku Yamahata Cc: Joe Stringer Signed-off-by: Simon Horman Signed-off-by: Jesse Gross Signed-off-by: Pravin B Shelar --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 40be481268de..70bb609c283d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include #include #include +#include #include #include #include @@ -2530,7 +2531,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; -- cgit v1.2.3 From a8f820aa4066d2c97e75ecd1bbca8a7920b66f2c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 7 Nov 2014 21:22:22 +0800 Subject: inet: Add skb_copy_datagram_iter This patch adds skb_copy_datagram_iter, which is identical to skb_copy_datagram_iovec except that it operates on iov_iter instead of iovec. Eventually all users of skb_copy_datagram_iovec should switch over to iov_iter and then we can remove skb_copy_datagram_iovec. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/datagram.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index fdbc9a81d4c2..84d90d087a30 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -481,6 +482,92 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_const_iovec); +/** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + trace_skb_copy_datagram_iovec(skb, len); + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (copy_to_iter(skb->data + offset, copy, to) != copy) + goto short_copy; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (copy_page_to_iter(skb_frag_page(frag), + frag->page_offset + offset - + start, copy, to) != copy) + goto short_copy; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iter(frag_iter, offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + + /* This is not really a user copy fault, but rather someone + * gave us a bogus length on the skb. We should probably + * print a warning here as it may indicate a kernel bug. + */ + +fault: + return -EFAULT; + +short_copy: + if (iov_iter_count(to)) + goto fault; + + return 0; +} +EXPORT_SYMBOL(skb_copy_datagram_iter); + /** * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. * @skb: buffer to copy -- cgit v1.2.3 From bfe1be38fcee0e13ad53175d0b530707c20f93ec Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 7 Nov 2014 21:22:26 +0800 Subject: net: Kill skb_copy_datagram_const_iovec Now that both macvtap and tun are using skb_copy_datagram_iter, we can kill the abomination that is skb_copy_datagram_const_iovec. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/datagram.c | 89 ----------------------------------------------------- 1 file changed, 89 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 84d90d087a30..26391a3fe3e5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -393,95 +393,6 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_iovec); -/** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to - * @len: amount of data to copy from buffer to iovec - * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. - */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); - /** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy -- cgit v1.2.3 From 3b47d30396bae4f0bd1ff0dbcd7c4f5077e7df4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Nov 2014 21:09:44 -0800 Subject: net: gro: add a per device gro flush timer Tuning coalescing parameters on NIC can be really hard. Servers can handle both bulk and RPC like traffic, with conflicting goals : bulk flows want as big GRO packets as possible, RPC want minimal latencies. To reach big GRO packets on 10Gbe NIC, one can use : ethtool -C eth0 rx-usecs 4 rx-frames 44 But this penalizes rpc sessions, with an increase of latencies, up to 50% in some cases, as NICs generally do not force an interrupt when a packet with TCP Push flag is received. Some NICs do not have an absolute timer, only a timer rearmed for every incoming packet. This patch uses a different strategy : Let GRO stack decides what do do, based on traffic pattern. Packets with Push flag wont be delayed. Packets without Push flag might be held in GRO engine, if we keep receiving data. This new mechanism is off by default, and shall be enabled by setting /sys/class/net/ethX/gro_flush_timeout to a value in nanosecond. To fully enable this mechanism, drivers should use napi_complete_done() instead of napi_complete(). Tested: Ran 200 netperf TCP_STREAM from A to B (10Gbe mlx4 link, 8 RX queues) Without this feature, we send back about 305,000 ACK per second. GRO aggregation ratio is low (811/305 = 2.65 segments per GRO packet) Setting a timer of 2000 nsec is enough to increase GRO packet sizes and reduce number of ACK packets. (811/19.2 = 42) Receiver performs less calls to upper stacks, less wakes up. This also reduces cpu usage on the sender, as it receives less ACK packets. Note that reducing number of wakes up increases cpu efficiency, but can decrease QPS, as applications wont have the chance to warmup cpu caches doing a partial read of RPC requests/answers if they fit in one skb. B:~# sar -n DEV 1 10 | grep eth0 | tail -1 Average: eth0 811269.80 305732.30 1199462.57 19705.72 0.00 0.00 0.50 B:~# echo 2000 >/sys/class/net/eth0/gro_flush_timeout B:~# sar -n DEV 1 10 | grep eth0 | tail -1 Average: eth0 811577.30 19230.80 1199916.51 1239.80 0.00 0.00 0.50 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++---- net/core/net-sysfs.c | 18 ++++++++++++++++++ 2 files changed, 59 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 70bb609c283d..bb09b0364619 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,6 +134,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -4412,7 +4413,6 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); void __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - BUG_ON(n->gro_list); list_del_init(&n->poll_list); smp_mb__before_atomic(); @@ -4420,7 +4420,7 @@ void __napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(__napi_complete); -void napi_complete(struct napi_struct *n) +void napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; @@ -4431,8 +4431,18 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n, false); + if (n->gro_list) { + unsigned long timeout = 0; + if (work_done) + timeout = n->dev->gro_flush_timeout; + + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + else + napi_gro_flush(n, false); + } if (likely(list_empty(&n->poll_list))) { WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); } else { @@ -4442,7 +4452,7 @@ void napi_complete(struct napi_struct *n) local_irq_restore(flags); } } -EXPORT_SYMBOL(napi_complete); +EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ struct napi_struct *napi_by_id(unsigned int napi_id) @@ -4496,10 +4506,23 @@ void napi_hash_del(struct napi_struct *napi) } EXPORT_SYMBOL_GPL(napi_hash_del); +static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) +{ + struct napi_struct *napi; + + napi = container_of(timer, struct napi_struct, timer); + if (napi->gro_list) + napi_schedule(napi); + + return HRTIMER_NORESTART; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); + hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + napi->timer.function = napi_watchdog; napi->gro_count = 0; napi->gro_list = NULL; napi->skb = NULL; @@ -4518,6 +4541,20 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, } EXPORT_SYMBOL(netif_napi_add); +void napi_disable(struct napi_struct *n) +{ + might_sleep(); + set_bit(NAPI_STATE_DISABLE, &n->state); + + while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) + msleep(1); + + hrtimer_cancel(&n->timer); + + clear_bit(NAPI_STATE_DISABLE, &n->state); +} +EXPORT_SYMBOL(napi_disable); + void netif_napi_del(struct napi_struct *napi) { list_del_init(&napi->dev_list); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9dd06699b09c..1a24602cd54e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -325,6 +325,23 @@ static ssize_t tx_queue_len_store(struct device *dev, } NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) +{ + dev->gro_flush_timeout = val; + return 0; +} + +static ssize_t gro_flush_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); +} +NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -422,6 +439,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, + &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, NULL, }; -- cgit v1.2.3 From 2c8c56e15df3d4c2af3d656e44feb18789f75837 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 Nov 2014 05:54:28 -0800 Subject: net: introduce SO_INCOMING_CPU Alternative to RPS/RFS is to use hardware support for multiple queues. Then split a set of million of sockets into worker threads, each one using epoll() to manage events on its own socket pool. Ideally, we want one thread per RX/TX queue/cpu, but we have no way to know after accept() or connect() on which queue/cpu a socket is managed. We normally use one cpu per RX queue (IRQ smp_affinity being properly set), so remembering on socket structure which cpu delivered last packet is enough to solve the problem. After accept(), connect(), or even file descriptor passing around processes, applications can use : int cpu; socklen_t len = sizeof(cpu); getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len); And use this information to put the socket into the right silo for optimal performance, as all networking stack should run on the appropriate cpu, without need to send IPI (RPS/RFS). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index ac56dd06c306..0725cf0cb685 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_max_pacing_rate; break; + case SO_INCOMING_CPU: + v.val = sk->sk_incoming_cpu; + break; + default: return -ENOPROTOOPT; } @@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) -- cgit v1.2.3 From ba7a46f16dd29f93303daeb1fee8af316c5a07f4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Nov 2014 10:59:17 -0800 Subject: net: Convert LIMIT_NETDEBUG to net_dbg_ratelimited Use the more common dynamic_debug capable net_dbg_ratelimited and remove the LIMIT_NETDEBUG macro. All messages are still ratelimited. Some KERN_ uses are changed to KERN_DEBUG. This may have some negative impact on messages that were emitted at KERN_INFO that are not not enabled at all unless DEBUG is defined or dynamic_debug is enabled. Even so, these messages are now _not_ emitted by default. This also eliminates the use of the net_msg_warn sysctl "/proc/sys/net/core/warnings". For backward compatibility, the sysctl is not removed, but it has no function. The extern declaration of net_msg_warn is removed from sock.h and made static in net/core/sysctl_net_core.c Miscellanea: o Update the sysctl documentation o Remove the embedded uses of pr_fmt o Coalesce format fragments o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 2 ++ net/core/utils.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cf9cd13509a7..f93f092fe226 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int net_msg_warn; /* Unused, but still a sysctl */ + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/net/core/utils.c b/net/core/utils.c index efc76dd9dcd1..7b803884c162 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -33,9 +33,6 @@ #include #include -int net_msg_warn __read_mostly = 1; -EXPORT_SYMBOL(net_msg_warn); - DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); /* * All net warning printk()s should be guarded by this function. -- cgit v1.2.3 From d7480fd3b1738a8eae6a76098b17af318cf9b9cc Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 10 Nov 2014 15:59:36 -0800 Subject: neigh: remove dynamic neigh table registration support Currently there are only three neigh tables in the whole kernel: arp table, ndisc table and decnet neigh table. What's more, we don't support registering multiple tables per family. Therefore we can just make these tables statically built-in. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/neighbour.c | 247 +++++++++++++++++++++++---------------------------- 1 file changed, 112 insertions(+), 135 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index edd04116ecb7..8e38f17288d3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -56,7 +56,6 @@ static void __neigh_notify(struct neighbour *n, int type, int flags); static void neigh_update_notify(struct neighbour *neigh); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); -static struct neigh_table *neigh_tables; #ifdef CONFIG_PROC_FS static const struct file_operations neigh_stat_seq_fops; #endif @@ -87,13 +86,8 @@ static const struct file_operations neigh_stat_seq_fops; the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. - - The last lock is neigh_tbl_lock. It is pure SMP lock, protecting - list of neighbour tables. This list is used only in process context, */ -static DEFINE_RWLOCK(neigh_tbl_lock); - static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); @@ -1520,7 +1514,9 @@ static void neigh_parms_destroy(struct neigh_parms *parms) static struct lock_class_key neigh_table_proxy_queue_class; -static void neigh_table_init_no_netlink(struct neigh_table *tbl) +static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; + +void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; @@ -1566,34 +1562,14 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl) tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; -} - -void neigh_table_init(struct neigh_table *tbl) -{ - struct neigh_table *tmp; - neigh_table_init_no_netlink(tbl); - write_lock(&neigh_tbl_lock); - for (tmp = neigh_tables; tmp; tmp = tmp->next) { - if (tmp->family == tbl->family) - break; - } - tbl->next = neigh_tables; - neigh_tables = tbl; - write_unlock(&neigh_tbl_lock); - - if (unlikely(tmp)) { - pr_err("Registering multiple tables for family %d\n", - tbl->family); - dump_stack(); - } + neigh_tables[index] = tbl; } EXPORT_SYMBOL(neigh_table_init); -int neigh_table_clear(struct neigh_table *tbl) +int neigh_table_clear(int index, struct neigh_table *tbl) { - struct neigh_table **tp; - + neigh_tables[index] = NULL; /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); @@ -1601,14 +1577,6 @@ int neigh_table_clear(struct neigh_table *tbl) neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); - write_lock(&neigh_tbl_lock); - for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { - if (*tp == tbl) { - *tp = tbl->next; - break; - } - } - write_unlock(&neigh_tbl_lock); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); @@ -1626,12 +1594,32 @@ int neigh_table_clear(struct neigh_table *tbl) } EXPORT_SYMBOL(neigh_table_clear); +static struct neigh_table *neigh_find_table(int family) +{ + struct neigh_table *tbl = NULL; + + switch (family) { + case AF_INET: + tbl = neigh_tables[NEIGH_ARP_TABLE]; + break; + case AF_INET6: + tbl = neigh_tables[NEIGH_ND_TABLE]; + break; + case AF_DECnet: + tbl = neigh_tables[NEIGH_DN_TABLE]; + break; + } + + return tbl; +} + static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; + struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; @@ -1652,39 +1640,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) } } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - struct neighbour *neigh; - - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); - - if (nla_len(dst_attr) < tbl->key_len) - goto out; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (ndm->ndm_flags & NTF_PROXY) { - err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out; - } + if (nla_len(dst_attr) < tbl->key_len) + goto out; - if (dev == NULL) - goto out; + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } - neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); - if (neigh == NULL) { - err = -ENOENT; - goto out; - } + if (dev == NULL) + goto out; - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); - neigh_release(neigh); + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; goto out; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); out: return err; @@ -1692,11 +1672,14 @@ out: static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) { + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; + struct neighbour *neigh; + void *dst, *lladdr; int err; ASSERT_RTNL(); @@ -1720,70 +1703,60 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { - int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; - struct neighbour *neigh; - void *dst, *lladdr; + tbl = neigh_find_table(ndm->ndm_family); + if (tbl == NULL) + return -EAFNOSUPPORT; - if (tbl->family != ndm->ndm_family) - continue; - read_unlock(&neigh_tbl_lock); + if (nla_len(tb[NDA_DST]) < tbl->key_len) + goto out; + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; - if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out; - dst = nla_data(tb[NDA_DST]); - lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + err = -ENOBUFS; + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm->ndm_flags; + err = 0; + } + goto out; + } - if (ndm->ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; + if (dev == NULL) + goto out; - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm->ndm_flags; - err = 0; - } + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; goto out; } - if (dev == NULL) + neigh = __neigh_lookup_errno(tbl, dst, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); goto out; - - neigh = neigh_lookup(tbl, dst, dev); - if (neigh == NULL) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - err = -ENOENT; - goto out; - } - - neigh = __neigh_lookup_errno(tbl, dst, dev); - if (IS_ERR(neigh)) { - err = PTR_ERR(neigh); - goto out; - } - } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { - err = -EEXIST; - neigh_release(neigh); - goto out; - } - - if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) - flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - if (ndm->ndm_flags & NTF_USE) { - neigh_event_send(neigh, NULL); - err = 0; - } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); - neigh_release(neigh); - goto out; + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~NEIGH_UPDATE_F_OVERRIDE; } - read_unlock(&neigh_tbl_lock); - err = -EAFNOSUPPORT; + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + neigh_release(neigh); + out: return err; } @@ -1982,7 +1955,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; - int err; + bool found = false; + int err, tidx; err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy); @@ -1995,19 +1969,21 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) } ndtmsg = nlmsg_data(nlh); - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables; tbl; tbl = tbl->next) { + + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { + tbl = neigh_tables[tidx]; + if (!tbl) + continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; - - if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { + found = true; break; + } } - if (tbl == NULL) { - err = -ENOENT; - goto errout_locked; - } + if (!found) + return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and @@ -2118,8 +2094,6 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) errout_tbl_lock: write_unlock_bh(&tbl->lock); -errout_locked: - read_unlock(&neigh_tbl_lock); errout: return err; } @@ -2134,10 +2108,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; - read_lock(&neigh_tbl_lock); - for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; + tbl = neigh_tables[tidx]; + if (!tbl) + continue; + if (tidx < tbl_skip || (family && tbl->family != family)) continue; @@ -2168,7 +2145,6 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: - read_unlock(&neigh_tbl_lock); cb->args[0] = tidx; cb->args[1] = nidx; @@ -2351,7 +2327,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) int proxy = 0; int err; - read_lock(&neigh_tbl_lock); family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is @@ -2363,8 +2338,11 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tbl = neigh_tables, t = 0; tbl; - tbl = tbl->next, t++) { + for (t = 0; t < NEIGH_NR_TABLES; t++) { + tbl = neigh_tables[t]; + + if (!tbl) + continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) @@ -2377,7 +2355,6 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) break; } - read_unlock(&neigh_tbl_lock); cb->args[0] = t; return skb->len; -- cgit v1.2.3 From fbe168ba91f7c327856f205699404284c2f09e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Thu, 13 Nov 2014 07:54:50 +0100 Subject: net: generic dev_disable_lro() stacked device handling Large receive offloading is known to cause problems if received packets are passed to other host. Therefore the kernel disables it by calling dev_disable_lro() whenever a network device is enslaved in a bridge or forwarding is enabled for it (or globally). For virtual devices we need to disable LRO on the underlying physical device (which is actually receiving the packets). Current dev_disable_lro() code handles this propagation for a vlan (including 802.1ad nested vlan), macvlan or a vlan on top of a macvlan. It doesn't handle other stacked devices and their combinations, in particular propagation from a bond to its slaves which often causes problems in virtualization setups. As we now have generic data structures describing the upper-lower device relationship, dev_disable_lro() can be generalized to disable LRO also for all lower devices (if any) once it is disabled for the device itself. For bonding and teaming devices, it is necessary to disable LRO not only on current slaves at the moment when dev_disable_lro() is called but also on any slave (port) added later. v2: use lower device links for all devices (including vlan and macvlan) Signed-off-by: Michal Kubecek Acked-by: Veaceslav Falico Signed-off-by: David S. Miller --- net/core/dev.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bb09b0364619..1ab168e0fdf7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1437,22 +1437,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - /* - * If we're trying to disable lro on a vlan device - * use the underlying physical device instead - */ - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - - /* the same for macvlan devices */ - if (netif_is_macvlan(dev)) - dev = macvlan_dev_real_dev(dev); + struct net_device *lower_dev; + struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); + + netdev_for_each_lower_dev(dev, lower_dev, iter) + dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); -- cgit v1.2.3 From 960fb622f85180f36d3aff82af53e2be3db2f888 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Nov 2014 06:23:05 -0800 Subject: net: provide a per host RSS key generic infrastructure RSS (Receive Side Scaling) typically uses Toeplitz hash and a 40 or 52 bytes RSS key. Some drivers use a constant (and well known key), some drivers use a random key per port, making bonding setups hard to tune. Well known keys increase attack surface, considering that number of queues is usually a power of two. This patch provides infrastructure to help drivers doing the right thing. netdev_rss_key_fill() should be used by drivers to initialize their RSS key, even if they provide ethtool -X support to let user redefine the key later. A new /proc/sys/net/core/netdev_rss_key file can be used to get the host RSS key even for drivers not providing ethtool -x support, in case some applications want to precisely setup flows to match some RX queues. Tested: myhost:~# cat /proc/sys/net/core/netdev_rss_key 11:63:99:bb:79:fb:a5:a7:07:45:b2:20:bf:02:42:2d:08:1a:dd:19:2b:6b:23:ac:56:28:9d:70:c3:ac:e8:16:4b:b7:c1:10:53:a4:78:41:36:40:74:b6:15:ca:27:44:aa:b3:4d:72 myhost:~# ethtool -x eth0 RX flow hash indirection table for eth0 with 8 RX ring(s): 0: 0 1 2 3 4 5 6 7 RSS hash key: 11:63:99:bb:79:fb:a5:a7:07:45:b2:20:bf:02:42:2d:08:1a:dd:19:2b:6b:23:ac:56:28:9d:70:c3:ac:e8:16:4b:b7:c1:10:53:a4:78:41 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/ethtool.c | 11 +++++++++++ net/core/sysctl_net_core.c | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b0f84f5ddda8..715f51f321e9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * Some useful ethtool_ops methods that're device independent. @@ -573,6 +574,16 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, return 0; } +u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; + +void netdev_rss_key_fill(void *buffer, size_t len) +{ + BUG_ON(len > sizeof(netdev_rss_key)); + net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); + memcpy(buffer, netdev_rss_key, len); +} +EXPORT_SYMBOL(netdev_rss_key_fill); + static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f93f092fe226..31baba2a71ce 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -217,6 +217,18 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif +static int proc_do_rss_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table fake_table; + char buf[NETDEV_RSS_KEY_LEN * 3]; + + snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); + fake_table.data = buf; + fake_table.maxlen = sizeof(buf); + return proc_dostring(&fake_table, write, buffer, lenp, ppos); +} + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -265,6 +277,13 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "netdev_rss_key", + .data = &netdev_rss_key, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_do_rss_key, + }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", -- cgit v1.2.3 From 1d2398dc7c78f32c50ee23a21ad9141e5e08a2ed Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 17 Nov 2014 22:04:03 +0100 Subject: net: fix spelling for synchronized Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index b6b230600b97..c0548d268e1a 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,8 +278,8 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** - * __hw_addr_unsync_dev - Remove synchonized addresses from device - * @list: address list to remove syncronized addresses from + * __hw_addr_unsync_dev - Remove synchronized addresses from device + * @list: address list to remove synchronized addresses from * @dev: device to sync * @unsync: function to call if address should be removed * -- cgit v1.2.3 From e56f735913c8d3c417c65c7e7fcdd65011f8d96f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 17 Nov 2014 22:08:22 +0100 Subject: net/core: include linux/types.h instead of asm/types.h Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/core/link_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index bd0767e6b2b3..49a9e3e06c08 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include enum lw_bits { -- cgit v1.2.3 From 54aeba7f06323e04d59a6053ee3c6023079667b2 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 17 Nov 2014 22:23:17 +0100 Subject: dev_ioctl: use sizeof(x) instead of sizeof x Also remove spaces after cast. Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 72e899a3efda..b94b1d293506 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -142,10 +142,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm case SIOCGIFHWADDR: if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + memset(ifr->ifr_hwaddr.sa_data, 0, + sizeof(ifr->ifr_hwaddr.sa_data)); else memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); ifr->ifr_hwaddr.sa_family = dev->type; return 0; @@ -265,7 +267,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + min(sizeof(ifr->ifr_hwaddr.sa_data), + (size_t)dev->addr_len)); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return 0; -- cgit v1.2.3 From ef87c5d6a11af45c31910728e46642e77aa8db61 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 18 Nov 2014 20:10:34 +0100 Subject: net: pktgen: Deletion of an unnecessary check before the function call "proc_remove" The proc_remove() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: David S. Miller --- net/core/pktgen.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 443256bdcddc..da934fc3faa8 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3728,8 +3728,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race * with proc_create_data() */ - if (pkt_dev->entry) - proc_remove(pkt_dev->entry); + proc_remove(pkt_dev->entry); /* And update the thread if_list */ _rem_dev_from_if_list(t, pkt_dev); -- cgit v1.2.3 From 0844932009e1656726c6e9c369e694017b129378 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Nov 2014 22:33:45 -0500 Subject: {compat_,}verify_iovec(): switch to generic copying of iovecs use {compat_,}rw_copy_check_uvector(). As the result, we are guaranteed that all iovecs seen in ->msg_iov by ->sendmsg() and ->recvmsg() will pass access_ok(). Signed-off-by: Al Viro --- net/core/iovec.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/iovec.c b/net/core/iovec.c index e1ec45ab1e63..86beeea61d72 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -37,13 +37,13 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode) { - int size, ct, err; + struct iovec *res; + int err; if (m->msg_name && m->msg_namelen) { - if (mode == VERIFY_READ) { - void __user *namep; - namep = (void __user __force *) m->msg_name; - err = move_addr_to_kernel(namep, m->msg_namelen, + if (mode == WRITE) { + void __user *namep = (void __user __force *)m->msg_name; + int err = move_addr_to_kernel(namep, m->msg_namelen, address); if (err < 0) return err; @@ -53,24 +53,15 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a m->msg_name = NULL; m->msg_namelen = 0; } - - size = m->msg_iovlen * sizeof(struct iovec); - if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) - return -EFAULT; - - m->msg_iov = iov; - err = 0; - - for (ct = 0; ct < m->msg_iovlen; ct++) { - size_t len = iov[ct].iov_len; - - if (len > INT_MAX - err) { - len = INT_MAX - err; - iov[ct].iov_len = len; - } - err += len; - } - + if (m->msg_iovlen > UIO_MAXIOV) + return -EMSGSIZE; + + err = rw_copy_check_uvector(mode, (void __user __force *)m->msg_iov, + m->msg_iovlen, UIO_FASTIOV, iov, &res); + if (err >= 0) + m->msg_iov = res; + else if (res != iov) + kfree(res); return err; } -- cgit v1.2.3 From 08adb7dabd4874cc5666b4490653b26534702ce0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 10 Nov 2014 20:23:13 -0500 Subject: fold verify_iovec() into copy_msghdr_from_user() ... and do the same on the compat side of things. Signed-off-by: Al Viro --- net/core/iovec.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'net/core') diff --git a/net/core/iovec.c b/net/core/iovec.c index 86beeea61d72..dcbe98b3726a 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -27,44 +27,6 @@ #include #include -/* - * Verify iovec. The caller must ensure that the iovec is big enough - * to hold the message iovec. - * - * Save time not doing access_ok. copy_*_user will make this work - * in any case. - */ - -int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode) -{ - struct iovec *res; - int err; - - if (m->msg_name && m->msg_namelen) { - if (mode == WRITE) { - void __user *namep = (void __user __force *)m->msg_name; - int err = move_addr_to_kernel(namep, m->msg_namelen, - address); - if (err < 0) - return err; - } - m->msg_name = address; - } else { - m->msg_name = NULL; - m->msg_namelen = 0; - } - if (m->msg_iovlen > UIO_MAXIOV) - return -EMSGSIZE; - - err = rw_copy_check_uvector(mode, (void __user __force *)m->msg_iov, - m->msg_iovlen, UIO_FASTIOV, iov, &res); - if (err >= 0) - m->msg_iov = res; - else if (res != iov) - kfree(res); - return err; -} - /* * And now for the all-in-one: copy and checksum from a user iovec * directly to a datagram -- cgit v1.2.3 From 62749e2cb3c4a7da3eaa5c01a7e787aebeff8536 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:04:58 +0100 Subject: vlan: rename __vlan_put_tag to vlan_insert_tag_set_proto Name fits better. Plus there's going to be introduced __vlan_insert_tag later on. Signed-off-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/netpoll.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1ab168e0fdf7..3611e60df407 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,8 +2645,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, { if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (skb) skb->vlan_tci = 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e6645b4f330a..65d372384a3f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -79,8 +79,8 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = __vlan_put_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this -- cgit v1.2.3 From 5968250c868ceee680aa77395b24e6ddcae17d36 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:04:59 +0100 Subject: vlan: introduce *vlan_hwaccel_push_inside helpers Use them to push skb->vlan_tci into the payload and avoid code duplication. Signed-off-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++------ net/core/netpoll.c | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3611e60df407..ac4836241a96 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2644,12 +2644,8 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (vlan_tx_tag_present(skb) && - !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (skb) - skb->vlan_tci = 0; - } + !vlan_hw_offload_capable(features, skb->vlan_proto)) + skb = __vlan_hwaccel_push_inside(skb); return skb; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 65d372384a3f..e0ad5d16c9c5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -79,8 +79,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, if (vlan_tx_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this @@ -88,7 +87,6 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, */ goto out; } - skb->vlan_tci = 0; } status = netdev_start_xmit(skb, dev, txq, false); -- cgit v1.2.3 From e21951212f03b8d805795d8f71206853b2ab344d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:05:01 +0100 Subject: net: move make_writable helper into common code note that skb_make_writable already exists in net/netfilter/core.c but does something slightly different. Suggested-by: Eric Dumazet Signed-off-by: Jiri Pirko Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/core/skbuff.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 700189604f3d..d11bbe0da355 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4151,6 +4151,18 @@ err_free: } EXPORT_SYMBOL(skb_vlan_untag); +int skb_ensure_writable(struct sk_buff *skb, int write_len) +{ + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_ensure_writable); + /** * alloc_skb_with_frags - allocate skb with page frags * -- cgit v1.2.3 From 93515d53b133d66f01aec7b231fa3e40e3d2fd9a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:05:02 +0100 Subject: net: move vlan pop/push functions into common code So it can be used from out of openvswitch code. Did couple of cosmetic changes on the way, namely variable naming and adding support for 8021AD proto. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/skbuff.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d11bbe0da355..c906c5f4bf69 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4163,6 +4163,101 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); +/* remove VLAN header from packet and update csum accordingly. */ +static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +{ + struct vlan_hdr *vhdr; + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + __skb_push(skb, offset); + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + goto pull; + + skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *vlan_tci = ntohs(vhdr->h_vlan_TCI); + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); + + skb_reset_mac_len(skb); +pull: + __skb_pull(skb, offset); + + return err; +} + +int skb_vlan_pop(struct sk_buff *skb) +{ + u16 vlan_tci; + __be16 vlan_proto; + int err; + + if (likely(vlan_tx_tag_present(skb))) { + skb->vlan_tci = 0; + } else { + if (unlikely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __skb_vlan_pop(skb, &vlan_tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely((skb->protocol != htons(ETH_P_8021Q) && + skb->protocol != htons(ETH_P_8021AD)) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + vlan_proto = skb->protocol; + err = __skb_vlan_pop(skb, &vlan_tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_pop); + +int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +{ + if (vlan_tx_tag_present(skb)) { + unsigned int offset = skb->data - skb_mac_header(skb); + int err; + + /* __vlan_insert_tag expect skb->data pointing to mac header. + * So change skb->data before calling it and change back to + * original position later + */ + __skb_push(skb, offset); + err = __vlan_insert_tag(skb, skb->vlan_proto, + vlan_tx_tag_get(skb)); + if (err) + return err; + skb->protocol = skb->vlan_proto; + skb->mac_len += VLAN_HLEN; + __skb_pull(skb, offset); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(skb->data + + (2 * ETH_ALEN), VLAN_HLEN, 0)); + } + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + return 0; +} +EXPORT_SYMBOL(skb_vlan_push); + /** * alloc_skb_with_frags - allocate skb with page frags * -- cgit v1.2.3 From 3a654f975bf99165016fe257a3d2b4e6716e4931 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 19 Jun 2014 14:15:22 -0400 Subject: new helpers: skb_copy_datagram_from_iter() and zerocopy_sg_from_iter() Signed-off-by: Al Viro --- net/core/datagram.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 26391a3fe3e5..3ea038431ceb 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -572,6 +572,78 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iovec); +int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (copy_from_iter(skb->data + offset, copy, from) != copy) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + size_t copied; + + if (copy > len) + copy = len; + copied = copy_page_from_iter(skb_frag_page(frag), + frag->page_offset + offset - start, + copy, from); + if (copied != copy) + goto fault; + + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_from_iter(frag_iter, + offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iter); + /** * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec * @skb: buffer to copy @@ -643,6 +715,50 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, } EXPORT_SYMBOL(zerocopy_sg_from_iovec); +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +{ + int len = iov_iter_count(from); + int copy = min_t(int, skb_headlen(skb), len); + int frag = 0; + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) + return -EFAULT; + + while (iov_iter_count(from)) { + struct page *pages[MAX_SKB_FRAGS]; + size_t start; + ssize_t copied; + unsigned long truesize; + int n = 0; + + if (frag == MAX_SKB_FRAGS) + return -EMSGSIZE; + + copied = iov_iter_get_pages(from, pages, ~0U, + MAX_SKB_FRAGS - frag, &start); + if (copied < 0) + return -EFAULT; + + iov_iter_advance(from, copied); + + truesize = PAGE_ALIGN(copied + start); + skb->data_len += copied; + skb->len += copied; + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); + while (copied) { + int size = min_t(int, copied, PAGE_SIZE - start); + skb_fill_page_desc(skb, frag++, pages[n], start, size); + start = 0; + copied -= size; + n++; + } + } + return 0; +} +EXPORT_SYMBOL(zerocopy_sg_from_iter); + static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 __user *to, int len, __wsum *csump) -- cgit v1.2.3 From 195e952d03a797aa953f62ffe24ec58693e17ed8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 6 Nov 2014 00:56:48 -0500 Subject: kill zerocopy_sg_from_iovec() no users left Signed-off-by: Al Viro --- net/core/datagram.c | 65 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 63 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 3ea038431ceb..c4d832efebb8 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -645,76 +645,15 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_from_iter); /** - * zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter * @skb: buffer to copy - * @from: io vector to copy from - * @offset: offset in the io vector to start copying from - * @count: amount of vectors to copy to buffer from + * @from: the source to copy from * * The function will first copy up to headlen, and then pin the userspace * pages and build frags through them. * * Returns 0, -EFAULT or -EMSGSIZE. - * Note: the iovec is not modified during the copy */ -int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, - int offset, size_t count) -{ - int len = iov_length(from, count) - offset; - int copy = min_t(int, skb_headlen(skb), len); - int size; - int i = 0; - - /* copy up to skb headlen */ - if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy)) - return -EFAULT; - - if (len == copy) - return 0; - - offset += copy; - while (count--) { - struct page *page[MAX_SKB_FRAGS]; - int num_pages; - unsigned long base; - unsigned long truesize; - - /* Skip over from offset and copied */ - if (offset >= from->iov_len) { - offset -= from->iov_len; - ++from; - continue; - } - len = from->iov_len - offset; - base = (unsigned long)from->iov_base + offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - if (i + size > MAX_SKB_FRAGS) - return -EMSGSIZE; - num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if (num_pages != size) { - release_pages(&page[i], num_pages, 0); - return -EFAULT; - } - truesize = size * PAGE_SIZE; - skb->data_len += len; - skb->len += len; - skb->truesize += truesize; - atomic_add(truesize, &skb->sk->sk_wmem_alloc); - while (len) { - int off = base & ~PAGE_MASK; - int size = min_t(int, len, PAGE_SIZE - off); - skb_fill_page_desc(skb, i, page[i], off, size); - base += size; - len -= size; - i++; - } - offset = 0; - ++from; - } - return 0; -} -EXPORT_SYMBOL(zerocopy_sg_from_iovec); - int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) { int len = iov_iter_count(from); -- cgit v1.2.3 From 8feb2fb2bb986c533e18037d3c45a5f779421992 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 6 Nov 2014 01:10:59 -0500 Subject: switch AF_PACKET and AF_UNIX to skb_copy_datagram_from_iter() ... and kill skb_copy_datagram_iovec() Signed-off-by: Al Viro --- net/core/datagram.c | 88 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 86 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index c4d832efebb8..b6e303b0f01f 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -480,98 +480,14 @@ short_copy: EXPORT_SYMBOL(skb_copy_datagram_iter); /** - * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. + * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter. * @skb: buffer to copy * @offset: offset in the buffer to start copying to - * @from: io vector to copy to - * @from_offset: offset in the io vector to start copying from + * @from: the copy source * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. */ -int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - const struct iovec *from, int from_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_fromiovecend(skb->data + offset, from, from_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - from_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_fromiovecend(vaddr + frag->page_offset + - offset - start, - from, from_offset, copy); - kunmap(page); - if (err) - goto fault; - - if (!(len -= copy)) - return 0; - offset += copy; - from_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_from_iovec(frag_iter, - offset - start, - from, - from_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - from_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_from_iovec); - int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len) -- cgit v1.2.3 From f6f6424ba773da6221ecaaa70973eb4dacfa03b2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 28 Nov 2014 14:34:15 +0100 Subject: net: make vid as a parameter for ndo_fdb_add/ndo_fdb_del Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple u16 vid to drivers from there. Signed-off-by: Jiri Pirko Acked-by: Andy Gospodarek Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b9b7dfaf202b..1a233c1c8ab4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -2312,7 +2313,7 @@ errout: int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, + const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; @@ -2338,6 +2339,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_add); +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) +{ + u16 vid = 0; + + if (vlan_attr) { + if (nla_len(vlan_attr) != sizeof(u16)) { + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n"); + return -EINVAL; + } + + vid = nla_get_u16(vlan_attr); + + if (!vid || vid >= VLAN_VID_MASK) { + pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n", + vid); + return -EINVAL; + } + } + *p_vid = vid; + return 0; +} + static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -2345,6 +2368,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; + u16 vid; int err; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); @@ -2370,6 +2394,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) addr = nla_data(tb[NDA_LLADDR]); + err = fdb_vid_parse(tb[NDA_VLAN], &vid); + if (err) + return err; + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ @@ -2378,7 +2406,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; - err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags); + err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, + nlh->nlmsg_flags); if (err) goto out; else @@ -2389,9 +2418,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) if ((ndm->ndm_flags & NTF_SELF)) { if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, + vid, nlh->nlmsg_flags); else - err = ndo_dflt_fdb_add(ndm, tb, dev, addr, + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err) { @@ -2409,7 +2439,7 @@ out: int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr) + const unsigned char *addr, u16 vid) { int err = -EINVAL; @@ -2438,6 +2468,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) struct net_device *dev; int err = -EINVAL; __u8 *addr; + u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; @@ -2465,6 +2496,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) addr = nla_data(tb[NDA_LLADDR]); + err = fdb_vid_parse(tb[NDA_VLAN], &vid); + if (err) + return err; + err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ @@ -2474,7 +2509,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) const struct net_device_ops *ops = br_dev->netdev_ops; if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); if (err) goto out; @@ -2485,9 +2520,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { if (dev->netdev_ops->ndo_fdb_del) - err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); + err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr, + vid); else - err = ndo_dflt_fdb_del(ndm, tb, dev, addr); + err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); -- cgit v1.2.3 From 02637fce3e0103ba086b9c33b6d529e69460e4b6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 28 Nov 2014 14:34:16 +0100 Subject: net: rename netdev_phys_port_id to more generic name So this can be reused for identification of other "items" as well. Signed-off-by: Jiri Pirko Reviewed-by: Thomas Graf Acked-by: John Fastabend Acked-by: Andy Gospodarek Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/rtnetlink.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ac4836241a96..0814a560e5f3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5846,7 +5846,7 @@ EXPORT_SYMBOL(dev_change_carrier); * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_port_id *ppid) + struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1a24602cd54e..26c46f4726c5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -404,7 +404,7 @@ static ssize_t phys_port_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1a233c1c8ab4..0640f1fbe9dd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -869,7 +869,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ - + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_PORT_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -953,7 +953,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct netdev_phys_port_id ppid; + struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { @@ -1197,7 +1197,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, - [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN }, + [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ }; -- cgit v1.2.3 From 82f2841291cfaf4d225aa1766424280254d3e3b2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 28 Nov 2014 14:34:18 +0100 Subject: rtnl: expose physical switch id for particular device The netdevice represents a port in a switch, it will expose IFLA_PHYS_SWITCH_ID value via rtnl. Two netdevices with the same value belong to one physical switch. Signed-off-by: Jiri Pirko Reviewed-by: Thomas Graf Acked-by: John Fastabend Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0640f1fbe9dd..aa5cd2c53a56 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -869,7 +870,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ - + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -968,6 +970,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_item_id psid; + + err = netdev_switch_parent_id_get(dev, &psid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1040,6 +1060,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1199,6 +1222,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ + [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From aecbe01e7410ad2de022796472f531ae6941f15e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 28 Nov 2014 14:34:19 +0100 Subject: net-sysfs: expose physical switch id for particular device Signed-off-by: Jiri Pirko Reviewed-by: Thomas Graf Acked-by: John Fastabend Acked-by: Andy Gospodarek Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 26c46f4726c5..999341244434 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -416,6 +417,28 @@ static ssize_t phys_port_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_port_id); +static ssize_t phys_switch_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct netdev_phys_item_id ppid; + + ret = netdev_switch_parent_id_get(netdev, &ppid); + if (!ret) + ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + } + rtnl_unlock(); + + return ret; +} +static DEVICE_ATTR_RO(phys_switch_id); + static struct attribute *net_class_attrs[] = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -441,6 +464,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_phys_port_id.attr, + &dev_attr_phys_switch_id.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); -- cgit v1.2.3 From 2c3c031c8f8930861815fa1685d7c5e8ccec047c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Fri, 28 Nov 2014 14:34:25 +0100 Subject: bridge: add brport flags to dflt bridge_getlink To allow brport device to return current brport flags set on port. Add returned flags to nested IFLA_PROTINFO netlink msg built in dflt getlink. With this change, netlink msg returned for bridge_getlink contains the port's offloaded flag settings (the port's SELF settings). Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Acked-by: Andy Gospodarek Acked-by: Thomas Graf Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index aa5cd2c53a56..61cb7e7cc3c7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2687,12 +2687,22 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, + unsigned int attrnum, unsigned int flag) +{ + if (mask & flag) + return nla_put_u8(skb, attrnum, !!(flags & flag)); + return 0; +} + int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u16 mode) + struct net_device *dev, u16 mode, + u32 flags, u32 mask) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; + struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); @@ -2731,6 +2741,33 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, } nla_nest_end(skb, br_afspec); + protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); + if (!protinfo) + goto nla_put_failure; + + if (brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_FAST_LEAVE, + BR_MULTICAST_FAST_LEAVE) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING, BR_LEARNING) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || + brport_nla_put_flag(skb, flags, mask, + IFLA_BRPORT_PROXYARP, BR_PROXYARP)) { + nla_nest_cancel(skb, protinfo); + goto nla_put_failure; + } + + nla_nest_end(skb, protinfo); + return nlmsg_end(skb, nlh); nla_put_failure: nlmsg_cancel(skb, nlh); -- cgit v1.2.3 From 89aa075832b0da4402acebd698d0411dcc82d03e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Dec 2014 15:06:35 -0800 Subject: net: sock: allow eBPF programs to be attached to sockets introduce new setsockopt() command: setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) where prog_fd was received from syscall bpf(BPF_PROG_LOAD, attr, ...) and attr->prog_type == BPF_PROG_TYPE_SOCKET_FILTER setsockopt() calls bpf_prog_get() which increments refcnt of the program, so it doesn't get unloaded while socket is using the program. The same eBPF program can be attached to multiple sockets. User task exit automatically closes socket which calls sk_filter_uncharge() which decrements refcnt of eBPF program Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- net/core/sock.c | 13 ++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 647b12265e18..8cc3c03078b3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -44,6 +44,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - bpf_release_orig_filter(prog); - bpf_prog_free(prog); + if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + } else { + bpf_release_orig_filter(prog); + bpf_prog_free(prog); + } } static void __sk_filter_release(struct sk_filter *fp) @@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); +#ifdef CONFIG_BPF_SYSCALL +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + struct bpf_prog *prog; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get(ufd); + if (!prog) + return -EINVAL; + + if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + /* valid fd, but invalid program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + bpf_prog_put(prog); + return -ENOMEM; + } + fp->prog = prog; + + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + +/* allow socket filters to call + * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() + */ +static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: + return NULL; + } +} + +static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* skb fields cannot be accessed yet */ + return false; +} + +static struct bpf_verifier_ops sock_filter_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, +}; + +static struct bpf_prog_type_list tl = { + .ops = &sock_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, +}; + +static int __init register_sock_filter_ops(void) +{ + bpf_register_prog_type(&tl); + return 0; +} +late_initcall(register_sock_filter_ops); +#else +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + return -EOPNOTSUPP; +} +#endif int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/sock.c b/net/core/sock.c index 0725cf0cb685..9a56b2000c3f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -888,6 +888,19 @@ set_rcvbuf: } break; + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; -- cgit v1.2.3 From 892311f66f2411b813ca631009356891a0c2b0a1 Mon Sep 17 00:00:00 2001 From: Eyal Perry Date: Tue, 2 Dec 2014 18:12:10 +0200 Subject: ethtool: Support for configurable RSS hash function This patch extends the set/get_rxfh ethtool-options for getting or setting the RSS hash function. It modifies drivers implementation of set/get_rxfh accordingly. This change also delegates the responsibility of checking whether a modification to a certain RX flow hash parameter is supported to the driver implementation of set_rxfh. User-kernel API is done through the new hfunc bitmask field in the ethtool_rxfh struct. A bit set in the hfunc field is corresponding to an index in the new string-set ETH_SS_RSS_HASH_FUNCS. Got approval from most of the relevant driver maintainers that their driver is using Toeplitz, and for the few that didn't answered, also assumed it is Toeplitz. Cc: Tom Lendacky Cc: Ariel Elior Cc: Prashant Sreedharan Cc: Michael Chan Cc: Hariprasad S Cc: Sathya Perla Cc: Subbu Seetharaman Cc: Ajit Khaparde Cc: Jeff Kirsher Cc: Jesse Brandeburg Cc: Bruce Allan Cc: Carolyn Wyborny Cc: Don Skidmore Cc: Greg Rose Cc: Matthew Vick Cc: John Ronciak Cc: Mitch Williams Cc: Amir Vadai Cc: Solarflare linux maintainers Cc: Shradha Shah Cc: Shreyas Bhatewara Cc: "VMware, Inc." Cc: Ben Hutchings Signed-off-by: Eyal Perry Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- net/core/ethtool.c | 69 +++++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 715f51f321e9..550892cd6b3f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; +static const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); + if (sset == ETH_SS_RSS_HASH_FUNCS) + return ARRAY_SIZE(rss_hash_func_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev, if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); + else if (stringset == ETH_SS_RSS_HASH_FUNCS) + memcpy(data, rss_hash_func_strings, + sizeof(rss_hash_func_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -618,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); if (ret) goto out; @@ -679,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, goto out; } - ret = ops->set_rxfh(dev, indir, NULL); + ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); out: kfree(indir); @@ -697,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, u32 total_size; u32 indir_bytes; u32 *indir = NULL; + u8 dev_hfunc = 0; u8 *hkey = NULL; u8 *rss_config; - if (!(dev->ethtool_ops->get_rxfh_indir_size || - dev->ethtool_ops->get_rxfh_key_size) || - !dev->ethtool_ops->get_rxfh) + if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -710,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; - if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; rxfh.indir_size = dev_indir_size; @@ -727,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; - /* If the user buffer size is 0, this is just a query for the - * device table size and key size. Otherwise, if the User size is - * not equal to device table size or key size it's an error. - */ - if (!user_indir_size && !user_key_size) - return 0; - if ((user_indir_size && (user_indir_size != dev_indir_size)) || (user_key_size && (user_key_size != dev_key_size))) return -EINVAL; @@ -750,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); - if (!ret) { - if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh, rss_config[0]), - rss_config, total_size)) - ret = -EFAULT; - } + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (ret) + goto out; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), + &dev_hfunc, sizeof(rxfh.hfunc))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) { + ret = -EFAULT; + } +out: kfree(rss_config); return ret; @@ -776,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); - if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || - !ops->get_rxnfc || !ops->set_rxfh) + if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; - /* If either indir or hash key is valid, proceed further. - * It is not valid to request that both be unchanged. + /* If either indir, hash key or function is valid, proceed further. + * Must request at least one change: indir size, hash key or function. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && - rxfh.key_size == 0)) + rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) @@ -845,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey); + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); out: kfree(rss_config); -- cgit v1.2.3 From 395eea6ccf2b253f81b4718ffbcae67d36fe2e69 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 3 Dec 2014 13:46:24 -0800 Subject: rtnetlink: delay RTM_DELLINK notification until after ndo_uninit() The commit 56bfa7ee7c ("unregister_netdevice : move RTM_DELLINK to until after ndo_uninit") tried to do this ealier but while doing so it created a problem. Unfortunately the delayed rtmsg_ifinfo() also delayed call to fill_info(). So this translated into asking driver to remove private state and then query it's private state. This could have catastropic consequences. This change breaks the rtmsg_ifinfo() into two parts - one takes the precise snapshot of the device by called fill_info() before calling the ndo_uninit() and the second part sends the notification using collected snapshot. It was brought to notice when last link is deleted from an ipvlan device when it has free-ed the port and the subsequent .fill_info() call is trying to get the info from the port. kernel: [ 255.139429] ------------[ cut here ]------------ kernel: [ 255.139439] WARNING: CPU: 12 PID: 11173 at net/core/rtnetlink.c:2238 rtmsg_ifinfo+0x100/0x110() kernel: [ 255.139493] Modules linked in: ipvlan bonding w1_therm ds2482 wire cdc_acm ehci_pci ehci_hcd i2c_dev i2c_i801 i2c_core msr cpuid bnx2x ptp pps_core mdio libcrc32c kernel: [ 255.139513] CPU: 12 PID: 11173 Comm: ip Not tainted 3.18.0-smp-DEV #167 kernel: [ 255.139514] Hardware name: Intel RML,PCH/Ibis_QC_18, BIOS 1.0.10 05/15/2012 kernel: [ 255.139515] 0000000000000009 ffff880851b6b828 ffffffff815d87f4 00000000000000e0 kernel: [ 255.139516] 0000000000000000 ffff880851b6b868 ffffffff8109c29c 0000000000000000 kernel: [ 255.139518] 00000000ffffffa6 00000000000000d0 ffffffff81aaf580 0000000000000011 kernel: [ 255.139520] Call Trace: kernel: [ 255.139527] [] dump_stack+0x46/0x58 kernel: [ 255.139531] [] warn_slowpath_common+0x8c/0xc0 kernel: [ 255.139540] [] warn_slowpath_null+0x1a/0x20 kernel: [ 255.139544] [] rtmsg_ifinfo+0x100/0x110 kernel: [ 255.139547] [] rollback_registered_many+0x1d5/0x2d0 kernel: [ 255.139549] [] unregister_netdevice_many+0x1f/0xb0 kernel: [ 255.139551] [] rtnl_dellink+0xbb/0x110 kernel: [ 255.139553] [] rtnetlink_rcv_msg+0xa0/0x240 kernel: [ 255.139557] [] ? rhashtable_lookup_compare+0x43/0x80 kernel: [ 255.139558] [] ? __rtnl_unlock+0x20/0x20 kernel: [ 255.139562] [] netlink_rcv_skb+0xb1/0xc0 kernel: [ 255.139563] [] rtnetlink_rcv+0x25/0x40 kernel: [ 255.139565] [] netlink_unicast+0x178/0x230 kernel: [ 255.139567] [] netlink_sendmsg+0x30f/0x420 kernel: [ 255.139571] [] sock_sendmsg+0x9c/0xd0 kernel: [ 255.139575] [] ? rw_copy_check_uvector+0x6f/0x130 kernel: [ 255.139577] [] ? copy_msghdr_from_user+0x139/0x1b0 kernel: [ 255.139578] [] ___sys_sendmsg+0x304/0x310 kernel: [ 255.139581] [] ? handle_mm_fault+0xca3/0xde0 kernel: [ 255.139585] [] ? destroy_inode+0x3c/0x70 kernel: [ 255.139589] [] ? __do_page_fault+0x20c/0x500 kernel: [ 255.139597] [] ? dput+0xb6/0x190 kernel: [ 255.139606] [] ? mntput+0x26/0x40 kernel: [ 255.139611] [] ? __fput+0x174/0x1e0 kernel: [ 255.139613] [] __sys_sendmsg+0x49/0x90 kernel: [ 255.139615] [] SyS_sendmsg+0x12/0x20 kernel: [ 255.139617] [] system_call_fastpath+0x12/0x17 kernel: [ 255.139619] ---[ end trace 5e6703e87d984f6b ]--- Signed-off-by: Mahesh Bandewar Reported-by: Toshiaki Makita Cc: Eric Dumazet Cc: Roopa Prabhu Cc: David S. Miller Acked-by: Eric Dumazet Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/dev.c | 12 +++++++++--- net/core/rtnetlink.c | 25 +++++++++++++++++++++---- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0814a560e5f3..dd3bf582e6f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5925,6 +5925,8 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + /* Shutdown queueing discipline. */ dev_shutdown(dev); @@ -5934,6 +5936,11 @@ static void rollback_registered_many(struct list_head *head) */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + GFP_KERNEL); + /* * Flush the unicast and multicast chains */ @@ -5943,9 +5950,8 @@ static void rollback_registered_many(struct list_head *head) if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 61cb7e7cc3c7..a9be2c161702 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2245,8 +2245,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, + unsigned int change, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2264,11 +2264,28 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, kfree_skb(skb); goto errout; } - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); - return; + return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return NULL; +} + +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +{ + struct net *net = dev_net(dev); + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + struct sk_buff *skb; + + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + if (skb) + rtmsg_ifinfo_send(skb, dev, flags); } EXPORT_SYMBOL(rtmsg_ifinfo); -- cgit v1.2.3 From 6ffe75eb53564953e75c051e1c28676e1e56f385 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Dec 2014 17:04:39 -0800 Subject: net: avoid two atomic operations in fast clones Commit ce1a4ea3f125 ("net: avoid one atomic operation in skb_clone()") took the wrong way to save one atomic operation. It is actually possible to avoid two atomic operations, if we do not change skb->fclone values, and only rely on clone_ref content to signal if the clone is available or not. skb_clone() can simply use the fast clone if clone_ref is 1. kfree_skbmem() can avoid the atomic_dec_and_test() if clone_ref is 1. Note that because we usually free the clone before the original skb, this particular attempt is only done for the original skb to have better branch prediction. SKB_FCLONE_FREE is removed. Signed-off-by: Eric Dumazet Cc: Chris Mason Cc: Sabrina Dubroca Cc: Vijay Subramanian Signed-off-by: David S. Miller --- net/core/skbuff.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 92116dfe827c..7a338fb55cc4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -265,7 +265,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; atomic_set(&fclones->fclone_ref, 1); - fclones->skb2.fclone = SKB_FCLONE_FREE; + fclones->skb2.fclone = SKB_FCLONE_CLONE; fclones->skb2.pfmemalloc = pfmemalloc; } out: @@ -541,26 +541,27 @@ static void kfree_skbmem(struct sk_buff *skb) switch (skb->fclone) { case SKB_FCLONE_UNAVAILABLE: kmem_cache_free(skbuff_head_cache, skb); - break; + return; case SKB_FCLONE_ORIG: fclones = container_of(skb, struct sk_buff_fclones, skb1); - if (atomic_dec_and_test(&fclones->fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, fclones); - break; - - case SKB_FCLONE_CLONE: - fclones = container_of(skb, struct sk_buff_fclones, skb2); - /* The clone portion is available for - * fast-cloning again. + /* We usually free the clone (TX completion) before original skb + * This test would have no chance to be true for the clone, + * while here, branch prediction will be good. */ - skb->fclone = SKB_FCLONE_FREE; + if (atomic_read(&fclones->fclone_ref) == 1) + goto fastpath; + break; - if (atomic_dec_and_test(&fclones->fclone_ref)) - kmem_cache_free(skbuff_fclone_cache, fclones); + default: /* SKB_FCLONE_CLONE */ + fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } + if (!atomic_dec_and_test(&fclones->fclone_ref)) + return; +fastpath: + kmem_cache_free(skbuff_fclone_cache, fclones); } static void skb_release_head_state(struct sk_buff *skb) @@ -872,15 +873,15 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct sk_buff_fclones *fclones = container_of(skb, struct sk_buff_fclones, skb1); - struct sk_buff *n = &fclones->skb2; + struct sk_buff *n; if (skb_orphan_frags(skb, gfp_mask)) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && - n->fclone == SKB_FCLONE_FREE) { - n->fclone = SKB_FCLONE_CLONE; - atomic_inc(&fclones->fclone_ref); + atomic_read(&fclones->fclone_ref) == 1) { + n = &fclones->skb2; + atomic_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; -- cgit v1.2.3 From dbfc4fb7d578d4f224faa6b60deb40804dfdc1b1 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Sat, 6 Dec 2014 19:19:42 +0100 Subject: dst: no need to take reference on DST_NOCACHE dsts Since commit f8864972126899 ("ipv4: fix dst race in sk_dst_get()") DST_NOCACHE dst_entries get freed by RCU. So there is no need to get a reference on them when we are in rcu protected sections. Cc: Eric Dumazet Cc: Julian Anastasov Signed-off-by: Hannes Frederic Sowa Reviewed-by: Julian Anastasov Signed-off-by: David S. Miller --- net/core/dst.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index a028409ee438..e956ce6d1378 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -327,30 +327,6 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); -/** - * __skb_dst_set_noref - sets skb dst, without a reference - * @skb: buffer - * @dst: dst entry - * @force: if force is set, use noref version even for DST_NOCACHE entries - * - * Sets skb dst, assuming a reference was not taken on dst - * skb_dst_drop() should not dst_release() this dst - */ -void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) -{ - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); - /* If dst not in cache, we must take a reference, because - * dst_release() will destroy dst as soon as its refcount becomes zero - */ - if (unlikely((dst->flags & DST_NOCACHE) && !force)) { - dst_hold(dst); - skb_dst_set(skb, dst); - } else { - skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; - } -} -EXPORT_SYMBOL(__skb_dst_set_noref); - /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice -- cgit v1.2.3 From e5a4b0bb803b39a36478451eae53a880d2663d5b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Nov 2014 18:17:55 -0500 Subject: switch memcpy_to_msg() and skb_copy{,_and_csum}_datagram_msg() to primitives ... making both non-draining. That means that tcp_recvmsg() becomes non-draining. And _that_ would break iscsit_do_rx_data() unless we a) make sure tcp_recvmsg() is uniformly non-draining (it is) b) make sure it copes with arbitrary (including shifted) iov_iter (it does, all it uses is iov_iter primitives) c) make iscsit_do_rx_data() initialize ->msg_iter only once. Fortunately, (c) is doable with minimal work and we are rid of one the two places where kernel send/recvmsg users would be unhappy with non-draining behaviour. Actually, that makes all but one of ->recvmsg() instances iov_iter-clean. The exception is skcipher_recvmsg() and it also isn't hard to convert to primitives (iov_iter_get_pages() is needed there). That'll wait a bit - there's some interplay with ->sendmsg() path for that one. Signed-off-by: Al Viro --- net/core/datagram.c | 54 +++++++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index b6e303b0f01f..41075ed6bb52 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -615,27 +615,25 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) EXPORT_SYMBOL(zerocopy_sg_from_iter); static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, - u8 __user *to, int len, + struct iov_iter *to, int len, __wsum *csump) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int pos = 0; + int n; /* Copy header. */ if (copy > 0) { - int err = 0; if (copy > len) copy = len; - *csump = csum_and_copy_to_user(skb->data + offset, to, copy, - *csump, &err); - if (err) + n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); + if (n != copy) goto fault; if ((len -= copy) == 0) return 0; offset += copy; - to += copy; pos = copy; } @@ -647,26 +645,22 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { - __wsum csum2; - int err = 0; - u8 *vaddr; + __wsum csum2 = 0; struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); if (copy > len) copy = len; - vaddr = kmap(page); - csum2 = csum_and_copy_to_user(vaddr + - frag->page_offset + - offset - start, - to, copy, 0, &err); + n = csum_and_copy_to_iter(vaddr + frag->page_offset + + offset - start, copy, + &csum2, to); kunmap(page); - if (err) + if (n != copy) goto fault; *csump = csum_block_add(*csump, csum2, pos); if (!(len -= copy)) return 0; offset += copy; - to += copy; pos += copy; } start = end; @@ -691,7 +685,6 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, if ((len -= copy) == 0) return 0; offset += copy; - to += copy; pos += copy; } start = end; @@ -744,20 +737,19 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) EXPORT_SYMBOL(__skb_checksum_complete); /** - * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec. + * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff * @hlen: hardware length - * @iov: io vector + * @msg: destination * * Caller _must_ check that skb will fit to this iovec. * * Returns: 0 - success. * -EINVAL - checksum failure. - * -EFAULT - fault during copy. Beware, in this case iovec - * can be modified! + * -EFAULT - fault during copy. */ -int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, - int hlen, struct iovec *iov) +int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, + int hlen, struct msghdr *msg) { __wsum csum; int chunk = skb->len - hlen; @@ -765,28 +757,20 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, if (!chunk) return 0; - /* Skip filled elements. - * Pretty silly, look at memcpy_toiovec, though 8) - */ - while (!iov->iov_len) - iov++; - - if (iov->iov_len < chunk) { + if (iov_iter_count(&msg->msg_iter) < chunk) { if (__skb_checksum_complete(skb)) goto csum_error; - if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { csum = csum_partial(skb->data, hlen, skb->csum); - if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; if (csum_fold(csum)) goto csum_error; if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); - iov->iov_len -= chunk; - iov->iov_base += chunk; } return 0; csum_error: @@ -794,7 +778,7 @@ csum_error: fault: return -EFAULT; } -EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); +EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** * datagram_poll - generic datagram poll -- cgit v1.2.3 From d3a9632f09153bc46a8077844e05e179f1c10c3f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Nov 2014 18:29:54 -0500 Subject: skb_copy_datagram_iovec() can die no callers other than itself. Signed-off-by: Al Viro --- net/core/datagram.c | 84 ----------------------------------------------------- 1 file changed, 84 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 41075ed6bb52..df493d68330c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -309,90 +309,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); -/** - * skb_copy_datagram_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @len: amount of data to copy from buffer to iovec - * - * Note: the iovec is modified during the copy. - */ -int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, - struct iovec *to, int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - trace_skb_copy_datagram_iovec(skb, len); - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovec(to, skb->data + offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovec(to, vaddr + frag->page_offset + - offset - start, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_iovec(frag_iter, - offset - start, - to, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_iovec); - /** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy -- cgit v1.2.3 From e008f3f07ffba2be471ffd79bd1922af0042f936 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 8 Dec 2014 09:42:55 +0800 Subject: net: avoid to call skb_queue_len again the queue length of sd->input_pkt_queue has been put into qlen, and impossible to change, since hold the lock Signed-off-by: Li RongQing Acked-by: Eric Dumazet Cc: Sergei Shtylyov Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index dd3bf582e6f0..3f191da383f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3297,7 +3297,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, rps_lock(sd); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { - if (skb_queue_len(&sd->input_pkt_queue)) { + if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); -- cgit v1.2.3 From 1d460b988d97c5283e85c5bd00a7aafd5f1304b7 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 8 Dec 2014 14:04:20 -0800 Subject: rocker: remove swdev mode Remove use of 'swdev' mode in rocker. rocker dev offloads can use the BRIDGE_FLAGS_SELF to indicate offload to hardware. Signed-off-by: Roopa Prabhu Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a9be2c161702..eaa057f14bcd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2751,11 +2751,17 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, if (!br_afspec) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || - nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } + + if (mode != BRIDGE_MODE_UNDEF) { + if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); -- cgit v1.2.3 From ffde7328a36d16e626bae8468571858d71cd010b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 9 Dec 2014 19:40:42 -0800 Subject: net: Split netdev_alloc_frag into __alloc_page_frag and add __napi_alloc_frag This patch splits the netdev_alloc_frag function up so that it can be used on one of two page frag pools instead of being fixed on the netdev_alloc_cache. By doing this we can add a NAPI specific function __napi_alloc_frag that accesses a pool that is only used from softirq context. The advantage to this is that we do not need to call local_irq_save/restore which can be a significant savings. I also took the opportunity to refactor the core bits that were placed in __alloc_page_frag. First I updated the allocation to do either a 32K allocation or an order 0 page. This is based on the changes in commmit d9b2938aa where it was found that latencies could be reduced in case of failures. Then I also rewrote the logic to work from the end of the page to the start. By doing this the size value doesn't have to be used unless we have run out of space for page fragments. Finally I cleaned up the atomic bits so that we just do an atomic_sub_and_test and if that returns true then we set the page->_count via an atomic_set. This way we can remove the extra conditional for the atomic_read since it would have led to an atomic_inc in the case of success anyway. Signed-off-by: Alexander Duyck Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/skbuff.c | 117 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 40 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7a338fb55cc4..56ed17cd2151 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -336,59 +336,85 @@ struct netdev_alloc_cache { unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); -static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, + gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; - void *data = NULL; - int order; - unsigned long flags; + const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER; + struct page *page = NULL; + gfp_t gfp = gfp_mask; + + if (order) { + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); + nc->frag.size = PAGE_SIZE << (page ? order : 0); + } - local_irq_save(flags); - nc = this_cpu_ptr(&netdev_alloc_cache); - if (unlikely(!nc->frag.page)) { + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->frag.page = page; + + return page; +} + +static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache, + unsigned int fragsz, gfp_t gfp_mask) +{ + struct netdev_alloc_cache *nc = this_cpu_ptr(cache); + struct page *page = nc->frag.page; + unsigned int size; + int offset; + + if (unlikely(!page)) { refill: - for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { - gfp_t gfp = gfp_mask; + page = __page_frag_refill(nc, gfp_mask); + if (!page) + return NULL; + + /* if size can vary use frag.size else just use PAGE_SIZE */ + size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - if (order) - gfp |= __GFP_COMP | __GFP_NOWARN; - nc->frag.page = alloc_pages(gfp, order); - if (likely(nc->frag.page)) - break; - if (--order < 0) - goto end; - } - nc->frag.size = PAGE_SIZE << order; /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1, - &nc->frag.page->_count); - nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; - nc->frag.offset = 0; + atomic_add(size - 1, &page->_count); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + nc->frag.offset = size; } - if (nc->frag.offset + fragsz > nc->frag.size) { - if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) { - if (!atomic_sub_and_test(nc->pagecnt_bias, - &nc->frag.page->_count)) - goto refill; - /* OK, page count is 0, we can safely set it */ - atomic_set(&nc->frag.page->_count, - NETDEV_PAGECNT_MAX_BIAS); - } else { - atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias, - &nc->frag.page->_count); - } - nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; - nc->frag.offset = 0; + offset = nc->frag.offset - fragsz; + if (unlikely(offset < 0)) { + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + goto refill; + + /* if size can vary use frag.size else just use PAGE_SIZE */ + size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; + + /* OK, page count is 0, we can safely set it */ + atomic_set(&page->_count, size); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + offset = size - fragsz; } - data = page_address(nc->frag.page) + nc->frag.offset; - nc->frag.offset += fragsz; nc->pagecnt_bias--; -end: + nc->frag.offset = offset; + + return page_address(page) + offset; +} + +static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ + unsigned long flags; + void *data; + + local_irq_save(flags); + data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -406,6 +432,17 @@ void *netdev_alloc_frag(unsigned int fragsz) } EXPORT_SYMBOL(netdev_alloc_frag); +static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) +{ + return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask); +} + +void *napi_alloc_frag(unsigned int fragsz) +{ + return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +} +EXPORT_SYMBOL(napi_alloc_frag); + /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on -- cgit v1.2.3 From fd11a83dd3630ec6a60f8a702446532c5c7e1991 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 9 Dec 2014 19:40:49 -0800 Subject: net: Pull out core bits of __netdev_alloc_skb and add __napi_alloc_skb This change pulls the core functionality out of __netdev_alloc_skb and places them in a new function named __alloc_rx_skb. The reason for doing this is to make these bits accessible to a new function __napi_alloc_skb. In addition __alloc_rx_skb now has a new flags value that is used to determine which page frag pool to allocate from. If the SKB_ALLOC_NAPI flag is set then the NAPI pool is used. The advantage of this is that we do not have to use local_irq_save/restore when accessing the NAPI pool from NAPI context. In my test setup I saw at least 11ns of savings using the napi_alloc_skb function versus the netdev_alloc_skb function, most of this being due to the fact that we didn't have to call local_irq_save/restore. The main use case for napi_alloc_skb would be for things such as copybreak or page fragment based receive paths where an skb is allocated after the data has been received instead of before. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/skbuff.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3f191da383f6..80f798da3d9f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4172,7 +4172,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; } return skb; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 56ed17cd2151..ae13ef6b3ea7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -444,10 +444,13 @@ void *napi_alloc_frag(unsigned int fragsz) EXPORT_SYMBOL(napi_alloc_frag); /** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on + * __alloc_rx_skb - allocate an skbuff for rx * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb + * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case we have to fallback to __alloc_skb() + * If SKB_ALLOC_NAPI is set, page fragment will be allocated + * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate @@ -456,11 +459,11 @@ EXPORT_SYMBOL(napi_alloc_frag); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) +static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, + int flags) { struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + + unsigned int fragsz = SKB_DATA_ALIGN(length) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { @@ -469,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __netdev_alloc_frag(fragsz, gfp_mask); + data = (flags & SKB_ALLOC_NAPI) ? + __napi_alloc_frag(fragsz, gfp_mask) : + __netdev_alloc_frag(fragsz, gfp_mask); if (likely(data)) { skb = build_skb(data, fragsz); @@ -477,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, put_page(virt_to_head_page(data)); } } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, + skb = __alloc_skb(length, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); } + return skb; +} + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has NET_SKB_PAD headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD; + skb = __alloc_rx_skb(length, gfp_mask, 0); + if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; } + return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); +/** + * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * @napi: napi instance this buffer was allocated for + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages + * + * Allocate a new sk_buff for use in NAPI receive. This buffer will + * attempt to allocate the head from a special reserved region used + * only for NAPI Rx allocation. By doing this we can save several + * CPU cycles by avoiding having to disable and re-enable IRQs. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + length += NET_SKB_PAD + NET_IP_ALIGN; + skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + } + + return skb; +} +EXPORT_SYMBOL(__napi_alloc_skb); + void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize) { -- cgit v1.2.3 From f95b414edb18de59940dcebbefb49cf25c6d505c Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 11 Dec 2014 11:22:04 +0800 Subject: net: introduce helper macro for_each_cmsghdr Introduce helper macro for_each_cmsghdr as a wrapper of the enumerating cmsghdr from msghdr, just cleanup. Signed-off-by: Gu Zheng Signed-off-by: David S. Miller --- net/core/scm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index b442e7e25e60..3b6899b7d810 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -129,8 +129,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) struct cmsghdr *cmsg; int err; - for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) - { + for_each_cmsghdr(cmsg, msg) { err = -EINVAL; /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ -- cgit v1.2.3 From 198bf1b046e370a7d3987b195cff5f1efebec3ac Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 10 Dec 2014 20:14:55 -0800 Subject: net: sock: fix access via invalid file descriptor 0day robot reported the following crash: [ 21.233581] BUG: unable to handle kernel NULL pointer dereference at 0000000000000007 [ 21.234709] IP: [] sk_attach_bpf+0x39/0xc2 It's due to bpf_prog_get() returning ERR_PTR. Check it properly. Reported-by: Fengguang Wu Fixes: 89aa075832b0 ("net: sock: allow eBPF programs to be attached to sockets") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 8cc3c03078b3..ec9baea10c16 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1103,8 +1103,8 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return -EPERM; prog = bpf_prog_get(ufd); - if (!prog) - return -EINVAL; + if (IS_ERR(prog)) + return PTR_ERR(prog); if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { /* valid fd, but invalid program type */ -- cgit v1.2.3