Diffstat (limited to 'net/core')
32 files changed, 1637 insertions, 1159 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 4268846f2f47..a8e4f737692b 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,9 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o xdp.o flow_offload.o + fib_notifier.o xdp.o flow_offload.o gro.o + +obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 68d2cbf8331a..d9c37fd10809 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -13,6 +13,7 @@ #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(sk_cache); @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; - sk_storage = rcu_dereference(sk->sk_bpf_storage); + sk_storage = + rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held()); if (!sk_storage) return NULL; @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; @@ -288,6 +291,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk)) return -EINVAL; @@ -416,6 +420,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return (unsigned long)NULL; @@ -425,6 +430,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return -EPERM; @@ -929,7 +935,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/dev.c b/net/core/dev.c index c4708e2487fb..84a0d9542fe9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -153,16 +153,10 @@ #include "net-sysfs.h" -#define MAX_GRO_SKBS 8 - -/* This should be increased if a protocol with a bigger head is added. 
*/ -#define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(ptype_lock); -static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ -static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, @@ -371,12 +365,12 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(net); } @@ -389,11 +383,11 @@ static void unlist_netdevice(struct net_device *dev) ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -604,84 +598,6 @@ void dev_remove_pack(struct packet_type *pt) EXPORT_SYMBOL(dev_remove_pack); -/** - * dev_add_offload - register offload handlers - * @po: protocol offload declaration - * - * Add protocol offload handlers to the networking stack. The passed - * &proto_offload is linked into kernel lists and may not be freed until - * it has been removed from the kernel lists. - * - * This call does not sleep therefore it can not - * guarantee all CPU's that are in middle of receiving packets - * will see the new offload handlers (until the next received packet). - */ -void dev_add_offload(struct packet_offload *po) -{ - struct packet_offload *elem; - - spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { - if (po->priority < elem->priority) - break; - } - list_add_rcu(&po->list, elem->list.prev); - spin_unlock(&offload_lock); -} -EXPORT_SYMBOL(dev_add_offload); - -/** - * __dev_remove_offload - remove offload handler - * @po: packet offload declaration - * - * Remove a protocol offload handler that was previously added to the - * kernel offload handlers by dev_add_offload(). The passed &offload_type - * is removed from the kernel lists and can be freed or reused once this - * function returns. - * - * The packet type might still be in use by receivers - * and must not be freed until after all the CPU's have gone - * through a quiescent state. - */ -static void __dev_remove_offload(struct packet_offload *po) -{ - struct list_head *head = &offload_base; - struct packet_offload *po1; - - spin_lock(&offload_lock); - - list_for_each_entry(po1, head, list) { - if (po == po1) { - list_del_rcu(&po->list); - goto out; - } - } - - pr_warn("dev_remove_offload: %p not found\n", po); -out: - spin_unlock(&offload_lock); -} - -/** - * dev_remove_offload - remove packet offload handler - * @po: packet offload declaration - * - * Remove a packet offload handler that was previously added to the kernel - * offload handlers by dev_add_offload(). The passed &offload_type is - * removed from the kernel lists and can be freed or reused once this - * function returns. - * - * This call sleeps to guarantee that no CPU is looking at the packet - * type after return. 
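For context — the offload registration API removed from dev.c here is moved, not deleted: the Makefile hunk above adds net/core/gro.o, which takes over offload_base and these helpers (dev_remove_offload() follows just below). A minimal sketch of how a protocol family uses this API, modeled on the pattern in net/ipv4/af_inet.c; all hypo_* names are illustrative, and the stub callbacks stand in for real GRO logic:

#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>

static struct sk_buff *hypo_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
{
        return NULL;            /* no merge candidate found */
}

static int hypo_gro_complete(struct sk_buff *skb, int nhoff)
{
        return 0;
}

static struct packet_offload hypo_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .callbacks = {
                .gro_receive    = hypo_gro_receive,
                .gro_complete   = hypo_gro_complete,
        },
};

static int __init hypo_offload_init(void)
{
        /* Does not sleep; RCU readers see the handler from the next packet. */
        dev_add_offload(&hypo_offload);
        return 0;
}

static void __exit hypo_offload_exit(void)
{
        /* Sleeps in synchronize_net() so no CPU still runs the callbacks. */
        dev_remove_offload(&hypo_offload);
}
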
- */ -void dev_remove_offload(struct packet_offload *po) -{ - __dev_remove_offload(po); - - synchronize_net(); -} -EXPORT_SYMBOL(dev_remove_offload); - /******************************************************************************* * * Device Interface Subroutines @@ -1272,15 +1188,15 @@ rollback: netdev_adjacent_rename_links(dev, oldname); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); synchronize_rcu(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1461,6 +1377,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) int ret; ASSERT_RTNL(); + dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ @@ -3315,40 +3232,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) return __vlan_get_protocol(skb, type, depth); } -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - int vlan_depth = skb->mac_len; - __be16 type = skb_network_protocol(skb, &vlan_depth); - - if (unlikely(!type)) - return ERR_PTR(-EINVAL); - - __skb_pull(skb, vlan_depth); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; -} -EXPORT_SYMBOL(skb_mac_gso_segment); - - /* openvswitch calls this on rx path, so we need a different check. */ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) @@ -3513,7 +3396,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, { u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (gso_segs > dev->gso_max_segs) + if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { @@ -3836,8 +3719,12 @@ no_lock_out: * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. + * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit + * and then other tasks will only enqueue packets. The packets will be + * sent after the qdisc owner is scheduled again. To prevent this + * scenario the task always serialize on the lock. 
*/ - contended = qdisc_is_running(q); + contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended)) spin_lock(&q->busylock); @@ -4323,8 +4210,6 @@ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; int dev_tx_weight __read_mostly = 64; -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -4827,7 +4712,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, xdp_prog, act); @@ -5014,7 +4899,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) trace_consume_skb(skb); else - trace_kfree_skb(skb, net_tx_action); + trace_kfree_skb(skb, net_tx_action, + SKB_DROP_REASON_NOT_SPECIFIED); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); @@ -5667,7 +5553,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } -static void netif_receive_skb_list_internal(struct list_head *head) +void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; struct list_head sublist; @@ -5845,550 +5731,6 @@ static void flush_all_backlogs(void) cpus_read_unlock(); } -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. 
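gro_normal_one(), defined just below, is what the net.core.gro_normal_batch sysctl feeds into: GRO_NORMAL skbs are parked on napi->rx_list and handed to the stack in a single netif_receive_skb_list_internal() call once rx_count reaches the batch size. Drivers never call it directly; a minimal sketch of the NAPI poll loop that reaches it, assuming hypothetical hypo_* device helpers implemented elsewhere by the driver:

#include <linux/netdevice.h>

struct hypo_adapter {
        struct napi_struct napi;
        /* rx ring state elided */
};

static bool hypo_rx_ready(struct hypo_adapter *ad);
static struct sk_buff *hypo_rx_pull(struct hypo_adapter *ad);

static int hypo_poll(struct napi_struct *napi, int budget)
{
        struct hypo_adapter *ad = container_of(napi, struct hypo_adapter, napi);
        int work = 0;

        while (work < budget && hypo_rx_ready(ad)) {
                struct sk_buff *skb = hypo_rx_pull(ad);

                /* May merge, hold, or batch via gro_normal_one(). */
                napi_gro_receive(napi, skb);
                work++;
        }

        /* Completing NAPI flushes any partially filled rx_list batch. */
        if (work < budget)
                napi_complete_done(napi, work);

        return work;
}
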
- */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) -{ - list_add_tail(&skb->list, &napi->rx_list); - napi->rx_count += segs; - if (napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) -{ - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int err = -ENOENT; - - BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); - - if (NAPI_GRO_CB(skb)->count == 1) { - skb_shinfo(skb)->gso_size = 0; - goto out; - } - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; - - err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, - ipv6_gro_complete, inet_gro_complete, - skb, 0); - break; - } - rcu_read_unlock(); - - if (err) { - WARN_ON(&ptype->list == head); - kfree_skb(skb); - return; - } - -out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); -} - -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) -{ - struct list_head *head = &napi->gro_hash[index].list; - struct sk_buff *skb, *p; - - list_for_each_entry_safe_reverse(skb, p, head, list) { - if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) - return; - skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; - } - - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); -} - -/* napi->gro_hash[].list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) -{ - unsigned long bitmask = napi->gro_bitmask; - unsigned int i, base = ~0U; - - while ((i = ffs(bitmask)) != 0) { - bitmask >>= i; - base += i; - __napi_gro_flush_chain(napi, base, flush_old); - } -} -EXPORT_SYMBOL(napi_gro_flush); - -static void gro_list_prepare(const struct list_head *head, - const struct sk_buff *skb) -{ - unsigned int maclen = skb->dev->hard_header_len; - u32 hash = skb_get_hash_raw(skb); - struct sk_buff *p; - - list_for_each_entry(p, head, list) { - unsigned long diffs; - - NAPI_GRO_CB(p)->flush = 0; - - if (hash != skb_get_hash_raw(p)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); - if (skb_vlan_tag_present(p)) - diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); - diffs |= skb_metadata_differs(p, skb); - if (maclen == ETH_HLEN) - diffs |= compare_ether_header(skb_mac_header(p), - skb_mac_header(skb)); - else if (!diffs) - diffs = memcmp(skb_mac_header(p), - skb_mac_header(skb), - maclen); - - /* in most common scenarions 'slow_gro' is 0 - * otherwise we are already on some slower paths - * either skip all the infrequent tests altogether or - * avoid trying too hard to skip each of them individually - */ - if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - struct tc_skb_ext *skb_ext; - struct tc_skb_ext *p_ext; -#endif - - diffs |= p->sk != skb->sk; - diffs |= skb_metadata_dst_cmp(p, skb); - diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); - -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext = skb_ext_find(skb, TC_SKB_EXT); - p_ext = skb_ext_find(p, TC_SKB_EXT); - - diffs |= (!!p_ext) ^ (!!skb_ext); - if (!diffs && 
unlikely(skb_ext)) - diffs |= p_ext->chain ^ skb_ext->chain; -#endif - } - - NAPI_GRO_CB(p)->same_flow = !diffs; - } -} - -static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) -{ - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; - - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - - if (!skb_headlen(skb) && pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, - skb_frag_size(frag0), - skb->end - skb->tail); - } -} - -static void gro_pull_from_frag0(struct sk_buff *skb, int grow) -{ - struct skb_shared_info *pinfo = skb_shinfo(skb); - - BUG_ON(skb->end - skb->tail < grow); - - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - - skb->data_len -= grow; - skb->tail += grow; - - skb_frag_off_add(&pinfo->frags[0], grow); - skb_frag_size_sub(&pinfo->frags[0], grow); - - if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(pinfo->frags, pinfo->frags + 1, - --pinfo->nr_frags * sizeof(pinfo->frags[0])); - } -} - -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) -{ - struct sk_buff *oldest; - - oldest = list_last_entry(head, struct sk_buff, list); - - /* We are called with head length >= MAX_GRO_SKBS, so this is - * impossible. - */ - if (WARN_ON_ONCE(!oldest)) - return; - - /* Do not adjust napi->gro_hash[].count, caller is adding a new - * SKB to the chain. - */ - skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); -} - -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct sk_buff *pp = NULL; - enum gro_result ret; - int same_flow; - int grow; - - if (netif_elide_gro(skb->dev)) - goto normal; - - gro_list_prepare(&gro_list->list, skb); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; - - skb_set_network_header(skb, skb_gro_offset(skb)); - skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->encap_mark = 0; - NAPI_GRO_CB(skb)->recursion_counter = 0; - NAPI_GRO_CB(skb)->is_fou = 0; - NAPI_GRO_CB(skb)->is_atomic = 1; - NAPI_GRO_CB(skb)->gro_remcsum_start = 0; - - /* Setup for GRO checksum validation */ - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - NAPI_GRO_CB(skb)->csum_cnt = 0; - break; - case CHECKSUM_UNNECESSARY: - NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; - NAPI_GRO_CB(skb)->csum_valid = 0; - break; - default: - NAPI_GRO_CB(skb)->csum_cnt = 0; - NAPI_GRO_CB(skb)->csum_valid = 0; - } - - pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, - ipv6_gro_receive, inet_gro_receive, - &gro_list->list, skb); - break; - } - rcu_read_unlock(); - - if (&ptype->list == head) - goto normal; - - if (PTR_ERR(pp) == -EINPROGRESS) { - ret = GRO_CONSUMED; - goto ok; - } - - same_flow = NAPI_GRO_CB(skb)->same_flow; - ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; - - if (pp) { - skb_list_del_init(pp); - napi_gro_complete(napi, pp); - gro_list->count--; - } - - if (same_flow) - goto ok; - - if (NAPI_GRO_CB(skb)->flush) - goto normal; - - if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); - else - gro_list->count++; - - NAPI_GRO_CB(skb)->count = 1; - NAPI_GRO_CB(skb)->age = jiffies; - NAPI_GRO_CB(skb)->last = skb; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, &gro_list->list); - ret = GRO_HELD; - -pull: - grow = skb_gro_offset(skb) - skb_headlen(skb); - if (grow > 0) - gro_pull_from_frag0(skb, grow); -ok: - if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); - } - - return ret; - -normal: - ret = GRO_NORMAL; - goto pull; -} - -struct packet_offload *gro_find_receive_by_type(__be16 type) -{ - struct list_head *offload_head = &offload_base; - struct packet_offload *ptype; - - list_for_each_entry_rcu(ptype, offload_head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; - return ptype; - } - return NULL; -} -EXPORT_SYMBOL(gro_find_receive_by_type); - -struct packet_offload *gro_find_complete_by_type(__be16 type) -{ - struct list_head *offload_head = &offload_base; - struct packet_offload *ptype; - - list_for_each_entry_rcu(ptype, offload_head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; - return ptype; - } - return NULL; -} -EXPORT_SYMBOL(gro_find_complete_by_type); - -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) -{ - switch (ret) { - case GRO_NORMAL: - gro_normal_one(napi, skb, 1); - break; - - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - napi_skb_free_stolen_head(skb); - else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) - __kfree_skb(skb); - else - __kfree_skb_defer(skb); - break; - - case GRO_HELD: - case GRO_MERGED: - case GRO_CONSUMED: - break; - } - - return ret; -} - -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - gro_result_t ret; - - skb_mark_napi_id(skb, napi); - trace_napi_gro_receive_entry(skb); - - skb_gro_reset_offset(skb, 0); - - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); - trace_napi_gro_receive_exit(ret); - - return ret; -} -EXPORT_SYMBOL(napi_gro_receive); - -static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) -{ - if (unlikely(skb->pfmemalloc)) { - consume_skb(skb); - return; - } - __skb_pull(skb, skb_headlen(skb)); - /* restore the reserve we had after netdev_alloc_skb_ip_align() */ - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - __vlan_hwaccel_clear_tag(skb); - skb->dev = napi->dev; - skb->skb_iif = 0; - - /* eth_type_trans() assumes pkt_type is PACKET_HOST */ - skb->pkt_type = PACKET_HOST; - - skb->encapsulation = 0; - skb_shinfo(skb)->gso_type = 0; - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - if (unlikely(skb->slow_gro)) { - skb_orphan(skb); - skb_ext_reset(skb); - nf_reset_ct(skb); - skb->slow_gro = 0; - } - - napi->skb = skb; -} - -struct sk_buff *napi_get_frags(struct napi_struct *napi) -{ - struct sk_buff *skb = napi->skb; - - if (!skb) { - skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - if (skb) { - napi->skb = skb; - skb_mark_napi_id(skb, napi); - } - } - return skb; -} -EXPORT_SYMBOL(napi_get_frags); - -static 
gro_result_t napi_frags_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) -{ - switch (ret) { - case GRO_NORMAL: - case GRO_HELD: - __skb_push(skb, ETH_HLEN); - skb->protocol = eth_type_trans(skb, skb->dev); - if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); - break; - - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - napi_skb_free_stolen_head(skb); - else - napi_reuse_skb(napi, skb); - break; - - case GRO_MERGED: - case GRO_CONSUMED: - break; - } - - return ret; -} - -/* Upper GRO stack assumes network header starts at gro_offset=0 - * Drivers could call both napi_gro_frags() and napi_gro_receive() - * We copy ethernet header into skb->data to have a common layout. - */ -static struct sk_buff *napi_frags_skb(struct napi_struct *napi) -{ - struct sk_buff *skb = napi->skb; - const struct ethhdr *eth; - unsigned int hlen = sizeof(*eth); - - napi->skb = NULL; - - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb, hlen); - - if (unlikely(skb_gro_header_hard(skb, hlen))) { - eth = skb_gro_header_slow(skb, hlen, 0); - if (unlikely(!eth)) { - net_warn_ratelimited("%s: dropping impossible skb from %s\n", - __func__, napi->dev->name); - napi_reuse_skb(napi, skb); - return NULL; - } - } else { - eth = (const struct ethhdr *)skb->data; - gro_pull_from_frag0(skb, hlen); - NAPI_GRO_CB(skb)->frag0 += hlen; - NAPI_GRO_CB(skb)->frag0_len -= hlen; - } - __skb_pull(skb, hlen); - - /* - * This works because the only protocols we care about don't require - * special handling. - * We'll fix it up properly in napi_frags_finish() - */ - skb->protocol = eth->h_proto; - - return skb; -} - -gro_result_t napi_gro_frags(struct napi_struct *napi) -{ - gro_result_t ret; - struct sk_buff *skb = napi_frags_skb(napi); - - trace_napi_gro_frags_entry(skb); - - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); - trace_napi_gro_frags_exit(ret); - - return ret; -} -EXPORT_SYMBOL(napi_gro_frags); - -/* Compute the checksum from gro_offset and return the folded value - * after adding in any pseudo checksum. - */ -__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) -{ - __wsum wsum; - __sum16 sum; - - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); - - /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ - sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); - /* See comments in __skb_checksum_complete(). 
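/* For reference (illustrative annotation, not part of the patch): the
 * csum_fold() call above reduces the 32-bit one's-complement sum to the
 * final 16-bit checksum. A minimal stand-alone equivalent, assuming
 * standard Internet-checksum folding per RFC 1071:
 *
 *	static inline u16 hypo_csum_fold(u32 sum)
 *	{
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold high half once
 *		sum = (sum & 0xffff) + (sum >> 16);	// absorb the carry
 *		return (u16)~sum;			// one's complement
 *	}
 */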
*/ - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev, skb); - } - - NAPI_GRO_CB(skb)->csum = wsum; - NAPI_GRO_CB(skb)->csum_valid = 1; - - return sum; -} -EXPORT_SYMBOL(__skb_gro_checksum_complete); - static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS @@ -7200,6 +6542,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) struct netdev_adjacent { struct net_device *dev; + netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; @@ -7964,7 +7307,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->ref_nr = 1; adj->private = private; adj->ignore = false; - dev_hold(adj_dev); + dev_hold_track(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); @@ -7993,8 +7336,8 @@ remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: + dev_put_track(adj_dev, &adj->dev_tracker); kfree(adj); - dev_put(adj_dev); return ret; } @@ -8035,7 +7378,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); - dev_put(adj_dev); + dev_put_track(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } @@ -9224,35 +8567,17 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) EXPORT_SYMBOL(netdev_port_same_parent_id); /** - * dev_change_proto_down - update protocol port state information + * dev_change_proto_down - set carrier according to proto_down. + * * @dev: device * @proto_down: new value - * - * This info can be used by switch drivers to set the phys state of the - * port. */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (!ops->ndo_change_proto_down) + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; - return ops->ndo_change_proto_down(dev, proto_down); -} -EXPORT_SYMBOL(dev_change_proto_down); - -/** - * dev_change_proto_down_generic - generic implementation for - * ndo_change_proto_down that sets carrier according to - * proto_down. - * - * @dev: device - * @proto_down: new value - */ -int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) -{ if (proto_down) netif_carrier_off(dev); else @@ -9260,7 +8585,7 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) dev->proto_down = proto_down; return 0; } -EXPORT_SYMBOL(dev_change_proto_down_generic); +EXPORT_SYMBOL(dev_change_proto_down); /** * dev_change_proto_down_reason - proto down reason @@ -10545,6 +9870,7 @@ static void netdev_wait_allrefs(struct net_device *dev) netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", dev->name, refcnt); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); warning_time = jiffies; } } @@ -10835,6 +10161,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; + ref_tracker_dir_init(&dev->refcnt_tracker, 128); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) @@ -10854,6 +10181,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gro_max_size = GRO_MAX_SIZE; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP @@ -10951,6 +10279,7 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; @@ -11643,8 +10972,6 @@ static int __init net_dev_init(void) for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); - INIT_LIST_HEAD(&offload_base); - if (register_pernet_subsys(&netdev_net_ops)) goto out; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index f0cb38344126..bead38ca50bd 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -16,6 +16,35 @@ * General list handling functions */ +static int __hw_addr_insert(struct netdev_hw_addr_list *list, + struct netdev_hw_addr *new, int addr_len) +{ + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; + struct netdev_hw_addr *ha; + + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(new->addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&new->type, &ha->type, sizeof(new->type)); + + parent = *ins_point; + if (diff < 0) + ins_point = &parent->rb_left; + else if (diff > 0) + ins_point = &parent->rb_right; + else + return -EEXIST; + } + + rb_link_node_rcu(&new->node, parent, ins_point); + rb_insert_color(&new->node, &list->tree); + + return 0; +} + static struct netdev_hw_addr* __hw_addr_create(const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) @@ -50,11 +79,6 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, if (addr_len > MAX_ADDR_LEN) return -EINVAL; - ha = list_first_entry(&list->list, struct netdev_hw_addr, list); - if (ha && !memcmp(addr, ha->addr, addr_len) && - (!addr_type || addr_type == ha->type)) - goto found_it; - while (*ins_point) { int diff; @@ -69,7 +93,6 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } else if (diff > 0) { ins_point = &parent->rb_right; } else { -found_it: if (exclusive) return -EEXIST; if (global) { @@ -94,16 +117,8 @@ found_it: if (!ha) return -ENOMEM; - /* The first address in dev->dev_addrs is pointed to by dev->dev_addr - * and mutated freely by device drivers and netdev ops, so if we insert - * it into the tree we'll end up with an invalid rbtree. 
- */ - if (list->count > 0) { - rb_link_node(&ha->node, parent, ins_point); - rb_insert_color(&ha->node, &list->tree); - } else { - RB_CLEAR_NODE(&ha->node); - } + rb_link_node(&ha->node, parent, ins_point); + rb_insert_color(&ha->node, &list->tree); list_add_tail_rcu(&ha->list, &list->list); list->count++; @@ -138,8 +153,7 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, if (--ha->refcount) return 0; - if (!RB_EMPTY_NODE(&ha->node)) - rb_erase(&ha->node, &list->tree); + rb_erase(&ha->node, &list->tree); list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); @@ -151,18 +165,8 @@ static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - struct netdev_hw_addr *ha; struct rb_node *node; - /* The first address isn't inserted into the tree because in the dev->dev_addrs - * list it's the address pointed to by dev->dev_addr which is freely mutated - * in place, so we need to check it separately. - */ - ha = list_first_entry(&list->list, struct netdev_hw_addr, list); - if (ha && !memcmp(addr, ha->addr, addr_len) && - (!addr_type || addr_type == ha->type)) - return ha; - node = list->tree.rb_node; while (node) { @@ -498,6 +502,21 @@ EXPORT_SYMBOL(__hw_addr_init); * Device addresses handling functions */ +/* Check that netdev->dev_addr is not written to directly as this would + * break the rbtree layout. All changes should go thru dev_addr_set() and co. + * Remove this check in mid-2024. + */ +void dev_addr_check(struct net_device *dev) +{ + if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) + return; + + netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); + netdev_warn(dev, "Expected addr: %*ph\n", + MAX_ADDR_LEN, dev->dev_addr_shadow); + netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); +} + /** * dev_addr_flush - Flush device address list * @dev: device @@ -509,11 +528,11 @@ EXPORT_SYMBOL(__hw_addr_init); void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ + dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } -EXPORT_SYMBOL(dev_addr_flush); /** * dev_addr_init - Init device address list @@ -547,7 +566,21 @@ int dev_addr_init(struct net_device *dev) } return err; } -EXPORT_SYMBOL(dev_addr_init); + +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len) +{ + struct netdev_hw_addr *ha; + + dev_addr_check(dev); + + ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); + rb_erase(&ha->node, &dev->dev_addrs.tree); + memcpy(&ha->addr[offset], addr, len); + memcpy(&dev->dev_addr_shadow[offset], addr, len); + WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); +} +EXPORT_SYMBOL(dev_addr_mod); /** * dev_addr_add - Add a device address diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c new file mode 100644 index 000000000000..049cfbc58aa9 --- /dev/null +++ b/net/core/dev_addr_lists_test.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <kunit/test.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +static const struct net_device_ops dummy_netdev_ops = { +}; + +struct dev_addr_test_priv { + u32 addr_seen; +}; + +static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen |= 1 << a[0]; + return 0; +} + 
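The dev_addr_lists.c hunks above outlaw direct writes to netdev->dev_addr: every address, including the main one, now sits in the rbtree, and dev_addr_check() warns whenever dev->dev_addr drifts from dev_addr_shadow. A minimal sketch of compliant driver code — hypo_read_mac_from_hw() is hypothetical — with changes routed through the setters so dev_addr_mod() keeps the tree and shadow copy in sync:

#include <linux/etherdevice.h>

static int hypo_read_mac_from_hw(u8 *addr);

static int hypo_set_mac(struct net_device *dev)
{
        u8 addr[ETH_ALEN];

        if (hypo_read_mac_from_hw(addr) || !is_valid_ether_addr(addr)) {
                eth_hw_addr_random(dev);        /* goes through the setter */
                return 0;
        }

        /* memcpy(dev->dev_addr, addr, ETH_ALEN) would now trigger the WARN */
        eth_hw_addr_set(dev, addr);
        return 0;
}
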
+static int dev_addr_test_unsync(struct net_device *netdev, + const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen &= ~(1 << a[0]); + return 0; +} + +static int dev_addr_test_init(struct kunit *test) +{ + struct dev_addr_test_priv *datp; + struct net_device *netdev; + int err; + + netdev = alloc_etherdev(sizeof(*datp)); + KUNIT_ASSERT_TRUE(test, !!netdev); + + test->priv = netdev; + netdev->netdev_ops = &dummy_netdev_ops; + + err = register_netdev(netdev); + if (err) { + free_netdev(netdev); + KUNIT_FAIL(test, "Can't register netdev %d", err); + } + + rtnl_lock(); + return 0; +} + +static void dev_addr_test_exit(struct kunit *test) +{ + struct net_device *netdev = test->priv; + + rtnl_unlock(); + unregister_netdev(netdev); + free_netdev(netdev); +} + +static void dev_addr_test_basic(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); + + memset(addr, 3, sizeof(addr)); + dev_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); +} + +static void dev_addr_test_sync_one(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + + datp = netdev_priv(netdev); + + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 2, datp->addr_seen); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + datp->addr_seen = 0; + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + /* It's not going to sync anything because the main address is + * considered synced and we overwrite in place. 
+ */ + KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); +} + +static void dev_addr_test_add_del(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + /* Add 3 again */ + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); +} + +static void dev_addr_test_del_main(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); +} + +static void dev_addr_test_add_set(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + /* There is no external API like dev_addr_add_excl(), + * so shuffle the tree a little bit and exploit aliasing. 
+ */ + for (i = 1; i < 16; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + memset(addr, i, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + memset(addr, 0, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); +} + +static void dev_addr_test_add_excl(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + int i; + + for (i = 0; i < 10; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); + } + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + + for (i = 0; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr)); + } + for (i = 1; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + } +} + +static struct kunit_case dev_addr_test_cases[] = { + KUNIT_CASE(dev_addr_test_basic), + KUNIT_CASE(dev_addr_test_sync_one), + KUNIT_CASE(dev_addr_test_add_del), + KUNIT_CASE(dev_addr_test_del_main), + KUNIT_CASE(dev_addr_test_add_set), + KUNIT_CASE(dev_addr_test_add_excl), + {} +}; + +static struct kunit_suite dev_addr_test_suite = { + .name = "dev-addr-list-test", + .test_cases = dev_addr_test_cases, + .init = dev_addr_test_init, + .exit = dev_addr_test_exit, +}; +kunit_test_suite(dev_addr_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index cbab5fec64b1..1b807d119da5 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -192,7 +192,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; - if (cfg.flags) /* reserved for future extensions */ + if (cfg.flags & ~HWTSTAMP_FLAG_MASK) return -EINVAL; tx_type = cfg.tx_type; @@ -313,6 +313,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; + netdevice_tracker dev_tracker; if (!dev) return -ENODEV; @@ -381,10 +382,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -ENODEV; if (!netif_is_bridge_master(dev)) return -EOPNOTSUPP; - dev_hold(dev); + dev_hold_track(dev, &dev_tracker, GFP_KERNEL); rtnl_unlock(); err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); - dev_put(dev); + dev_put_track(dev, &dev_tracker); rtnl_lock(); return err; diff --git a/net/core/devlink.c b/net/core/devlink.c index c06c9ba6e8c5..fcd9f6d85cf1 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7,6 +7,7 @@ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ +#include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> @@ -69,6 +70,35 @@ struct devlink { char priv[] __aligned(NETDEV_ALIGN); }; +/** + * struct devlink_resource - devlink resource + * @name: name of the resource + * @id: id, per devlink instance + * @size: size of the resource + * @size_new: updated size of the resource, reload is needed + * @size_valid: valid in case the total size of the resource is valid + * including its children + * @parent: parent resource + * @size_params: size parameters + * @list: parent list + * @resource_list: list of child resources + 
* @occ_get: occupancy getter callback + * @occ_get_priv: occupancy getter callback priv + */ +struct devlink_resource { + const char *name; + u64 id; + u64 size; + u64 size_new; + bool size_valid; + struct devlink_resource *parent; + struct devlink_resource_size_params size_params; + struct list_head list; + struct list_head resource_list; + devlink_resource_occ_get_t *occ_get; + void *occ_get_priv; +}; + void *devlink_priv(struct devlink *devlink) { return &devlink->priv; @@ -4432,6 +4462,21 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, + .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, + .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME, + .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, + .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -8840,8 +8885,6 @@ static const struct genl_small_ops devlink_nl_ops[] = { GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, @@ -9905,34 +9948,38 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_register); +static void devlink_resource_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ + struct devlink_resource *tmp, *child_resource; + + list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, + list) { + devlink_resource_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } +} + /** * devlink_resources_unregister - free all resources * * @devlink: devlink - * @resource: resource */ -void devlink_resources_unregister(struct devlink *devlink, - struct devlink_resource *resource) +void devlink_resources_unregister(struct devlink *devlink) { struct devlink_resource *tmp, *child_resource; - struct list_head *resource_list; - if (resource) - resource_list = &resource->resource_list; - else - resource_list = &devlink->resource_list; - - if (!resource) - mutex_lock(&devlink->lock); + mutex_lock(&devlink->lock); - list_for_each_entry_safe(child_resource, tmp, resource_list, list) { - devlink_resources_unregister(devlink, child_resource); + list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, + list) { + devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } - if (!resource) - mutex_unlock(&devlink->lock); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_resources_unregister); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 49442cae6f69..7b288a121a41 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -110,7 +110,8 @@ static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, - void *location); + void *location, + enum skb_drop_reason reason); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); @@ -262,7 +263,9 @@ out: spin_unlock_irqrestore(&data->lock, 
flags); } -static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, + void *location, + enum skb_drop_reason reason) { trace_drop_common(skb, location); } @@ -490,7 +493,8 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = { static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, - void *location) + void *location, + enum skb_drop_reason reason) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; @@ -850,7 +854,7 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) } hw_metadata->input_dev = metadata->input_dev; - dev_hold(hw_metadata->input_dev); + dev_hold_track(hw_metadata->input_dev, &hw_metadata->dev_tracker, GFP_ATOMIC); return hw_metadata; @@ -864,9 +868,9 @@ free_hw_metadata: } static void -net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata) +net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata) { - dev_put(hw_metadata->input_dev); + dev_put_track(hw_metadata->input_dev, &hw_metadata->dev_tracker); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); diff --git a/net/core/dst.c b/net/core/dst.c index 497ef9b3fc6a..d16c2c9bfebd 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -49,7 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, unsigned short flags) { dst->dev = dev; - dev_hold(dev); + dev_hold_track(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; @@ -117,7 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) if (dst->ops->destroy) dst->ops->destroy(dst); - dev_put(dst->dev); + dev_put_track(dst->dev, &dst->dev_tracker); lwtstate_put(dst->lwtstate); @@ -159,8 +159,8 @@ void dst_dev_put(struct dst_entry *dst) dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; - dev_hold(dst->dev); - dev_put(dev); + dev_replace_track(dev, blackhole_netdev, &dst->dev_tracker, + GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); diff --git a/net/core/failover.c b/net/core/failover.c index b5cd3c727285..dcaa92a85ea2 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -252,7 +252,7 @@ struct failover *failover_register(struct net_device *dev, return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); - dev_hold(dev); + dev_hold_track(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); @@ -285,7 +285,7 @@ void failover_unregister(struct failover *failover) failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; - dev_put(failover_dev); + dev_put_track(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 1bb567a3b329..75282222e0b4 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -750,6 +750,27 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, return 0; } +static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { + [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, + [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_PRIORITY] = { .type = NLA_U32 }, + [FRA_FWMARK] = { .type = NLA_U32 }, + [FRA_FLOW] = { .type = NLA_U32 }, + [FRA_TUN_ID] = { .type = NLA_U64 }, + [FRA_FWMASK] = { .type = 
NLA_U32 }, + [FRA_TABLE] = { .type = NLA_U32 }, + [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, + [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, + [FRA_GOTO] = { .type = NLA_U32 }, + [FRA_L3MDEV] = { .type = NLA_U8 }, + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, + [FRA_PROTOCOL] = { .type = NLA_U8 }, + [FRA_IP_PROTO] = { .type = NLA_U8 }, + [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } +}; + int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -774,7 +795,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, - ops->policy, extack); + fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; @@ -882,7 +903,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, - ops->policy, extack); + fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; diff --git a/net/core/filter.c b/net/core/filter.c index 6102f093d59a..4603b7cd3cd1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -301,7 +301,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); @@ -323,7 +323,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: - *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET); if (PKT_VLAN_PRESENT_BIT) *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); if (PKT_VLAN_PRESENT_BIT < 7) @@ -1242,10 +1242,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) int err, new_len, old_len = fp->len; bool seen_ld_abs = false; - /* We are free to overwrite insns et al right here as it - * won't be used at this point in time anymore internally - * after the migration to the internal BPF instruction - * representation. + /* We are free to overwrite insns et al right here as it won't be used at + * this point in time anymore internally after the migration to the eBPF + * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); @@ -1336,8 +1335,8 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, */ bpf_jit_compile(fp); - /* JIT compiler couldn't process this filter, so do the - * internal BPF translation for the optimized interpreter. + /* JIT compiler couldn't process this filter, so do the eBPF translation + * for the optimized interpreter. 
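The reworded comments above describe the classic-BPF path: filters the JIT does not take are migrated to eBPF instructions for the interpreter. That path is normally entered from user space via SO_ATTACH_FILTER; a minimal user-space sketch (the attach helper name is illustrative):

#include <linux/filter.h>
#include <sys/socket.h>

/* Classic-BPF program: accept every packet. On attach, the kernel runs
 * bpf_prepare_filter(); if the JIT did not take the filter, it is
 * migrated to eBPF insns for the interpreter (bpf_migrate_filter()).
 */
static struct sock_filter insns[] = {
        BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
};

static int hypo_attach_filter(int fd)
{
        struct sock_fprog prog = {
                .len    = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
                          &prog, sizeof(prog));
}
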
*/ if (!fp->jited) fp = bpf_migrate_filter(fp); @@ -1713,7 +1712,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -2018,9 +2017,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2541,7 +2540,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -3958,10 +3957,35 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_master_redirect); -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + err = __xsk_map_redirect(fwd, xdp); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_frame *xdpf, + struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; @@ -3971,6 +3995,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; + if (unlikely(!xdpf)) { + err = -EOVERFLOW; + goto err; + } + switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; @@ -3978,17 +4007,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, map = READ_ONCE(ri->map); if (unlikely(map)) { WRITE_ONCE(ri->map, NULL); - err = dev_map_enqueue_multi(xdp, dev, map, + err = dev_map_enqueue_multi(xdpf, dev, map, ri->flags & BPF_F_EXCLUDE_INGRESS); } else { - err = dev_map_enqueue(fwd, xdp, dev); + err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: - err = cpu_map_enqueue(fwd, xdp, dev); - break; - case BPF_MAP_TYPE_XSKMAP: - err = __xsk_map_redirect(fwd, xdp); + err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { @@ -3997,7 +4023,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EINVAL; break; } - err = dev_xdp_enqueue(fwd, xdp, dev); + err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; @@ -4014,8 +4040,34 @@ err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, 
ri->tgt_index, err); return err; } + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), + xdp_prog); +} EXPORT_SYMBOL_GPL(xdp_do_redirect); +int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); + static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, @@ -4174,7 +4226,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4188,7 +4240,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4371,7 +4423,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -4397,7 +4449,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4567,7 +4619,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4581,7 +4633,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4742,12 +4794,14 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, max_t(int, val * 2, SOCK_MIN_SNDBUF)); @@ -4968,6 +5022,12 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, goto err_clear; switch (optname) { + case SO_RCVBUF: + *((int *)optval) = sk->sk_rcvbuf; + break; + case SO_SNDBUF: + *((int *)optval) = sk->sk_sndbuf; + break; case SO_MARK: *((int *)optval) 
= sk->sk_mark; break; @@ -5067,7 +5127,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = { .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5101,7 +5161,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5135,7 +5195,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5310,7 +5370,7 @@ static const struct bpf_func_proto bpf_bind_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -5898,7 +5958,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5908,7 +5968,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5951,7 +6011,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6039,7 +6099,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6264,7 +6324,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6283,7 +6343,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6302,7 +6362,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6339,7 +6399,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = 
ARG_ANYTHING, @@ -6362,7 +6422,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6385,7 +6445,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6404,7 +6464,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6423,7 +6483,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6442,7 +6502,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6755,9 +6815,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6824,9 +6884,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -7055,7 +7115,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -8029,7 +8089,7 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) 
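Nearly every hunk in this file makes the same one-line change: helper arguments that the kernel only reads are re-tagged from ARG_PTR_TO_MEM to ARG_PTR_TO_MEM | MEM_RDONLY, so the verifier can also accept pointers to read-only memory (for example, values in frozen maps) for those arguments. A minimal sketch of the pattern — the helper below is hypothetical, only the annotation comes from this patch:

	#include <linux/jhash.h>

	/* Hypothetical helper that only reads the buffer it is given; the
	 * MEM_RDONLY tag on arg1 is the same annotation the hunks above add
	 * to bpf_sk_lookup_tcp(), bpf_skb_event_output() and friends.
	 */
	BPF_CALL_2(bpf_example_hash, const void *, data, u32, size)
	{
		return jhash(data, size, 0);
	}

	static const struct bpf_func_proto bpf_example_hash_proto = {
		.func		= bpf_example_hash,
		.gpl_only	= false,
		.ret_type	= RET_INTEGER,
		.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
		.arg2_type	= ARG_CONST_SIZE,
	};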
*/ - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); @@ -8181,13 +8241,13 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(u32 act) +void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; - WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", - act > act_max ? "Illegal" : "Driver unsupported", - act); + pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); @@ -8617,7 +8677,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, - PKT_TYPE_OFFSET()); + PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); @@ -8642,7 +8702,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, vlan_present): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, - PKT_VLAN_PRESENT_OFFSET()); + PKT_VLAN_PRESENT_OFFSET); if (PKT_VLAN_PRESENT_BIT) *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); if (PKT_VLAN_PRESENT_BIT < 7) @@ -10543,6 +10603,7 @@ static bool sk_lookup_is_valid_access(int off, int size, case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct bpf_sk_lookup, remote_port): case bpf_ctx_range(struct bpf_sk_lookup, local_port): + case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); @@ -10632,6 +10693,12 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct bpf_sk_lookup_kern, dport, 2, target_size)); break; + + case offsetof(struct bpf_sk_lookup, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + ingress_ifindex, 4, target_size)); + break; } return insn - insn_buf; @@ -10656,14 +10723,10 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } -#ifdef CONFIG_DEBUG_INFO_BTF -BTF_ID_LIST_GLOBAL(btf_sock_ids) +BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE -#else -u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; -#endif BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1b094c481f1d..15833e1d6ea1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> +#include <linux/filter.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include <net/ip.h> @@ -1461,7 +1462,7 @@ out_bad: } EXPORT_SYMBOL(__skb_flow_dissect); -static siphash_key_t hashrnd __read_mostly; +static siphash_aligned_key_t hashrnd; static __always_inline void 
__flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 6beaea13564a..73f68d4625f3 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/kernel.h> #include <linux/slab.h> +#include <net/act_api.h> #include <net/flow_offload.h> #include <linux/rtnetlink.h> #include <linux/mutex.h> @@ -27,6 +28,26 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) } EXPORT_SYMBOL(flow_rule_alloc); +struct flow_offload_action *offload_action_alloc(unsigned int num_actions) +{ + struct flow_offload_action *fl_action; + int i; + + fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions), + GFP_KERNEL); + if (!fl_action) + return NULL; + + fl_action->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; + + return fl_action; +} + #define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ const struct flow_match *__m = &(__rule)->match; \ struct flow_dissector *__d = (__m)->dissector; \ @@ -397,6 +418,8 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) existing_qdiscs_register(cb, cb_priv); mutex_unlock(&flow_indr_block_lock); + tcf_action_reoffload_cb(cb, cb_priv, true); + return 0; } EXPORT_SYMBOL(flow_indr_dev_register); @@ -449,6 +472,7 @@ void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, __flow_block_indr_cleanup(release, cb_priv, &cleanup_list); mutex_unlock(&flow_indr_block_lock); + tcf_action_reoffload_cb(cb, cb_priv, false); flow_block_indr_notify(&cleanup_list); kfree(indr_dev); } @@ -549,19 +573,25 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, void (*cleanup)(struct flow_block_cb *block_cb)) { struct flow_indr_dev *this; + u32 count = 0; + int err; mutex_lock(&flow_indr_block_lock); + if (bo) { + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + } - if (bo->command == FLOW_BLOCK_BIND) - indir_dev_add(data, dev, sch, type, cleanup, bo); - else if (bo->command == FLOW_BLOCK_UNBIND) - indir_dev_remove(data); - - list_for_each_entry(this, &flow_block_indr_dev_list, list) - this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + list_for_each_entry(this, &flow_block_indr_dev_list, list) { + err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + if (!err) + count++; + } mutex_unlock(&flow_indr_block_lock); - return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; + return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); diff --git a/net/core/gro.c b/net/core/gro.c new file mode 100644 index 000000000000..a11b286d1495 --- /dev/null +++ b/net/core/gro.c @@ -0,0 +1,770 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <net/gro.h> +#include <net/dst_metadata.h> +#include <net/busy_poll.h> +#include <trace/events/net.h> + +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. 
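*/ +#define GRO_MAX_HEAD (MAX_HEADER + 128)

Back in the flow_offload.c hunks above, the new offload_action_alloc() pre-fills every entry's hw_stats with FLOW_ACTION_HW_STATS_DONT_CARE so that drivers unable to count per-action statistics still accept the rule; callers override the field where they do want stats. A hypothetical caller, error handling trimmed:

	struct flow_offload_action *fl_act;

	fl_act = offload_action_alloc(1);	/* one action entry */
	if (!fl_act)
		return -ENOMEM;

	/* Override the DONT_CARE default for the entry we care about. */
	fl_act->action.entries[0].id = FLOW_ACTION_DROP;
	fl_act->action.entries[0].hw_stats = FLOW_ACTION_HW_STATS_ANY;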
+ +static DEFINE_SPINLOCK(offload_lock); +static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); +/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ +int gro_normal_batch __read_mostly = 8; + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep, therefore it cannot + * guarantee that all CPUs in the middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct packet_offload *elem; + + spin_lock(&offload_lock); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPUs have gone + * through a quiescent state. + */ +static void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); +
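These handlers moved to gro.c verbatim from dev.c, so the registration interface is unchanged: a protocol supplies a static struct packet_offload at init time and must not free it while registered. A minimal registration site might look like this (sketch; the callback names are placeholders modeled on the IPv4 offload code):

	static struct packet_offload example_offload __read_mostly = {
		.type = cpu_to_be16(ETH_P_IP),	/* wire protocol handled */
		.callbacks = {
			.gso_segment	= example_gso_segment,
			.gro_receive	= example_gro_receive,
			.gro_complete	= example_gro_complete,
		},
	};

	static int __init example_offload_init(void)
	{
		/* inserted in priority order; see dev_add_offload() above */
		dev_add_offload(&example_offload);
		return 0;
	}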
+/** + * skb_mac_gso_segment - MAC layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); + +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) +{ + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); + unsigned int len = skb_gro_len(skb); + unsigned int delta_truesize; + unsigned int gro_max_size; + unsigned int new_truesize; + struct sk_buff *lp; + + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ + gro_max_size = READ_ONCE(p->dev->gro_max_size); + + if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) + return -E2BIG; + + lp = NAPI_GRO_CB(p)->last; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + if (nr_frags > MAX_SKB_FRAGS) + goto merge; + + offset -= headlen; + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; + + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); + + skb_frag_off_add(frag, offset); + skb_frag_size_sub(frag, offset); + + /* all fragments truesize: remove (head size + sk_buff) */ + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; + + skb->truesize = new_truesize; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; + goto done; + } else if (skb->head_frag) { + int nr_frags = pinfo->nr_frags; + skb_frag_t *frag = pinfo->frags + nr_frags; + struct page *page = virt_to_head_page(skb->head); + unsigned int first_size = headlen - offset; + unsigned int first_offset; + + if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) + goto merge; + + first_offset = skb->data - + (unsigned char *)page_address(page) + + offset; + + pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; + + __skb_frag_set_page(frag, page); + skb_frag_off_set(frag, first_offset); + skb_frag_size_set(frag, first_size); + + memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); + /* We don't need to clear skbinfo->nr_frags here */ + + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; + goto done; + } + +merge: + /* sk ownership - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + delta_truesize = skb->truesize; + if (offset > headlen) { + unsigned int eat = offset - headlen; + + skb_frag_off_add(&skbinfo->frags[0], eat); + skb_frag_size_sub(&skbinfo->frags[0], eat); + skb->data_len -= eat; + skb->len -= eat; + offset = headlen; + } + + __skb_pull(skb, offset); + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else +
NAPI_GRO_CB(p)->last->next = skb; + NAPI_GRO_CB(p)->last = skb; + __skb_header_release(skb); + lp = p; + +done: + NAPI_GRO_CB(p)->count++; + p->data_len += len; + p->truesize += delta_truesize; + p->len += len; + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} + + +static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +{ + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct list_head *head = &offload_base; + int err = -ENOENT; + + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return; + } + +out: + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); +} + +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, + bool flush_old) +{ + struct list_head *head = &napi->gro_hash[index].list; + struct sk_buff *skb, *p; + + list_for_each_entry_safe_reverse(skb, p, head, list) { + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + skb_list_del_init(skb); + napi_gro_complete(napi, skb); + napi->gro_hash[index].count--; + } + + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); +} + +/* napi->gro_hash[].list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. 
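+ */

napi_gro_flush() below walks only the occupied hash buckets: ffs() returns the 1-based index of the lowest set bit, and since base starts at ~0U the first base += i wraps around to the absolute bucket index. A stand-alone userspace model of the same walk (assuming POSIX ffs()):

	#include <stdio.h>
	#include <strings.h>

	int main(void)
	{
		unsigned int bitmask = 0x29;	/* buckets 0, 3 and 5 occupied */
		unsigned int i, base = ~0U;

		while ((i = ffs(bitmask)) != 0) {
			bitmask >>= i;
			base += i;	/* first pass: ~0U + 1 wraps to 0 */
			printf("flush bucket %u\n", base);	/* 0, then 3, then 5 */
		}
		return 0;
	}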
+void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; + + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); + } +} +EXPORT_SYMBOL(napi_gro_flush); + +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) +{ + unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; + + list_for_each_entry(p, head, list) { + unsigned long diffs; + + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); + diffs |= skb_metadata_differs(p, skb); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_mac_header(skb), + maclen); + + /* in the most common scenarios 'slow_gro' is 0; + * otherwise we are already on some slower path, so + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + } + + NAPI_GRO_CB(p)->same_flow = !diffs; + } +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (!skb_headlen(skb) && pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0)) && + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + skb_frag_off_add(&pinfo->frags[0], grow); + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); + } +} + +static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +{ + struct sk_buff *oldest; + + oldest = list_last_entry(head, struct sk_buff, list); + + /* We are called with head length >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain.
+ */ + skb_list_del_init(oldest); + napi_gro_complete(napi, oldest); +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; + struct list_head *head = &offload_base; + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct sk_buff *pp = NULL; + enum gro_result ret; + int same_flow; + int grow; + + if (netif_elide_gro(skb->dev)) + goto normal; + + gro_list_prepare(&gro_list->list, skb); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + + skb_set_network_header(skb, skb_gro_offset(skb)); + skb_reset_mac_len(skb); + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->recursion_counter = 0; + NAPI_GRO_CB(skb)->is_fou = 0; + NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->gro_remcsum_start = 0; + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } + + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + &gro_list->list, skb); + break; + } + rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + if (PTR_ERR(pp) == -EINPROGRESS) { + ret = GRO_CONSUMED; + goto ok; + } + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + skb_list_del_init(pp); + napi_gro_complete(napi, pp); + gro_list->count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush) + goto normal; + + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + + NAPI_GRO_CB(skb)->count = 1; + NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + list_add(&skb->list, &gro_list->list); + ret = GRO_HELD; + +pull: + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); +ok: + if (gro_list->count) { + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); + } + + return ret; + +normal: + ret = GRO_NORMAL; + goto pull; +} + +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); + +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); + break; + + case GRO_HELD: + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + gro_result_t ret; + + skb_mark_napi_id(skb, napi); + trace_napi_gro_receive_entry(skb); + + skb_gro_reset_offset(skb, 0); + + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_receive_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_receive); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + __vlan_hwaccel_clear_tag(skb); + skb->dev = napi->dev; + skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +static 
gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + if (ret == GRO_NORMAL) + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ +static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb, hlen); + + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); + if (unlikely(!eth)) { + net_warn_ratelimited("%s: dropping impossible skb from %s\n", + __func__, napi->dev->name); + napi_reuse_skb(napi, skb); + return NULL; + } + } else { + eth = (const struct ethhdr *)skb->data; + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; + } + __skb_pull(skb, hlen); + + /* + * This works because the only protocols we care about don't require + * special handling. + * We'll fix it up properly in napi_frags_finish() + */ + skb->protocol = eth->h_proto; + + return skb; +} + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + gro_result_t ret; + struct sk_buff *skb = napi_frags_skb(napi); + + trace_napi_gro_frags_entry(skb); + + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_frags); + +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). 
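To make the folding arithmetic concrete: csum_fold() collapses the 32-bit one's-complement sum into 16 bits and complements it, so a receiver that checksums the data together with the checksum field ends up with 0xffff, which folds to 0 — the likely(!sum) test below relies on exactly that. A userspace model (the kernel's csum_fold() is per-architecture, often asm):

	#include <stdint.h>

	static uint16_t csum_fold_model(uint32_t sum)
	{
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the high half in */
		sum += sum >> 16;			/* absorb a possible carry */
		return (uint16_t)~sum;	/* e.g. 0x1a0b3c4d -> ~0x5658 = 0xa9a7 */
	}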
*/ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1a455847da54..b0f5344d1185 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -55,7 +55,7 @@ static void rfc2863_policy(struct net_device *dev) if (operstate == dev->operstate) return; - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: @@ -74,7 +74,7 @@ static void rfc2863_policy(struct net_device *dev) dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } @@ -109,7 +109,7 @@ static void linkwatch_add_event(struct net_device *dev) spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); - dev_hold(dev); + dev_hold_track(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } @@ -166,6 +166,9 @@ static void linkwatch_do_dev(struct net_device *dev) netdev_state_change(dev); } + /* Note: our callers are responsible for + * calling netdev_tracker_free(). + */ dev_put(dev); } @@ -209,6 +212,10 @@ static void __linkwatch_run_queue(int urgent_only) list_add_tail(&dev->link_watch_list, &lweventlist); continue; } + /* We must free netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); linkwatch_do_dev(dev); do_dev--; @@ -232,6 +239,10 @@ void linkwatch_forget_dev(struct net_device *dev) if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); clean = 1; + /* We must release netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); if (clean) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 2f7940bcf715..349480ef68a5 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -2,6 +2,7 @@ /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> */ +#include <linux/filter.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> diff --git a/net/core/neighbour.c b/net/core/neighbour.c index dda12fbd177b..213cb7b26b7a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -624,7 +624,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, memcpy(n->primary_key, pkey, key_len); n->dev = dev; - dev_hold(dev); + dev_hold_track(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. 
*/ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { @@ -770,10 +770,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; - dev_hold(dev); + dev_hold_track(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { - dev_put(dev); + dev_put_track(dev, &n->dev_tracker); kfree(n); n = NULL; goto out; @@ -805,7 +805,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); - dev_put(n->dev); + dev_put_track(n->dev, &n->dev_tracker); kfree(n); return 0; } @@ -838,7 +838,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); - dev_put(n->dev); + dev_put_track(n->dev, &n->dev_tracker); kfree(n); } return -ENOENT; @@ -879,7 +879,7 @@ void neigh_destroy(struct neighbour *neigh) if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); - dev_put(dev); + dev_put_track(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); @@ -1665,13 +1665,13 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); - dev_hold(dev); + dev_hold_track(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { - dev_put(dev); + dev_put_track(dev, &p->dev_tracker); kfree(p); return NULL; } @@ -1702,7 +1702,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); - dev_put(parms->dev); + dev_put_track(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -3770,10 +3770,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, neigh_proc_base_reachable_time; } - /* Don't export sysctls to unprivileged users */ - if (neigh_parms_net(p)->user_ns != &init_user_ns) - t->neigh_vars[0].procname = NULL; - switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9c01c642cf9e..53ea262ecafd 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -488,14 +488,6 @@ static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct net_device *netdev = to_net_dev(dev); - - /* The check is also done in change_proto_down; this helps returning - * early without hitting the trylock/restart in netdev_store. 
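The dev_hold() → dev_hold_track() conversions in this file and its neighbours all follow one recipe: an object holding a long-lived net_device reference gains a netdevice_tracker field, and the hold/put pair names it, so that with reference tracking enabled a leaked or double-released reference can be attributed to its owner. The shape of the conversion, on a made-up consumer struct:

	struct example_binding {
		struct net_device *dev;
		netdevice_tracker dev_tracker;	/* ties our hold to our put */
	};

	static void example_bind(struct example_binding *b, struct net_device *dev)
	{
		b->dev = dev;
		/* like dev_hold(), but registers this reference for tracking */
		dev_hold_track(dev, &b->dev_tracker, GFP_KERNEL);
	}

	static void example_unbind(struct example_binding *b)
	{
		dev_put_track(b->dev, &b->dev_tracker);
		b->dev = NULL;
	}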
- */ - if (!netdev->netdev_ops->ndo_change_proto_down) - return -EOPNOTSUPP; - return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); @@ -1012,7 +1004,7 @@ static void rx_queue_release(struct kobject *kobj) #endif memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + dev_put_track(queue->dev, &queue->dev_tracker); } static const void *rx_queue_namespace(struct kobject *kobj) @@ -1052,7 +1044,7 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ - dev_hold(queue->dev); + dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, @@ -1201,11 +1193,7 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) { - unsigned long trans_timeout; - - spin_lock_irq(&queue->_xmit_lock); - trans_timeout = queue->trans_timeout; - spin_unlock_irq(&queue->_xmit_lock); + unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); return sprintf(buf, fmt_ulong, trans_timeout); } @@ -1452,7 +1440,7 @@ static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, for (i = map->len; i--;) { if (map->queues[i] == index) { - set_bit(j, mask); + __set_bit(j, mask); break; } } @@ -1619,7 +1607,7 @@ static void netdev_queue_release(struct kobject *kobj) struct netdev_queue *queue = to_netdev_queue(kobj); memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + dev_put_track(queue->dev, &queue->dev_tracker); } static const void *netdev_queue_namespace(struct kobject *kobj) @@ -1659,7 +1647,7 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ - dev_hold(queue->dev); + dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, @@ -1706,6 +1694,13 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) int i; int error = 0; + /* Tx queue kobjects are allowed to be updated when a device is being + * unregistered, but solely to remove queues from qdiscs. Any path + * adding queues should be fixed. 
+ */ + WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num, + "New queues can't be registered after device unregistration."); + for (i = old_num; i < new_num; i++) { error = netdev_queue_add_kobject(dev, i); if (error) { @@ -1820,6 +1815,9 @@ static void remove_queue_kobjects(struct net_device *dev) net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); + + dev->real_num_rx_queues = 0; + dev->real_num_tx_queues = 0; #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 202fa5eacd0f..9b7171c40434 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -311,6 +311,8 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) LIST_HEAD(net_exit_list); refcount_set(&net->ns.count, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128); + refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); preempt_disable(); @@ -635,6 +637,7 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { + ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ if (llist_add(&net->cleanup_list, &cleanup_list)) queue_work(netns_wq, &net_cleanup_work); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index edfc0f8011f8..db724463e7cd 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -776,7 +776,7 @@ put_noaddr: err = __netpoll_setup(np, ndev); if (err) goto put; - + netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL); rtnl_unlock(); return 0; @@ -853,7 +853,7 @@ void netpoll_cleanup(struct netpoll *np) if (!np->dev) goto out; __netpoll_cleanup(np); - dev_put(np->dev); + dev_put_track(np->dev, &np->dev_tracker); np->dev = NULL; out: rtnl_unlock(); diff --git a/net/core/of_net.c b/net/core/of_net.c index f1a9bf7578e7..95a64c813ae5 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -61,7 +61,7 @@ static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) { struct platform_device *pdev = of_find_device_by_node(np); struct nvmem_cell *cell; - const void *mac; + const void *buf; size_t len; int ret; @@ -78,21 +78,32 @@ static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) if (IS_ERR(cell)) return PTR_ERR(cell); - mac = nvmem_cell_read(cell, &len); + buf = nvmem_cell_read(cell, &len); nvmem_cell_put(cell); - if (IS_ERR(mac)) - return PTR_ERR(mac); - - if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { - kfree(mac); - return -EINVAL; + if (IS_ERR(buf)) + return PTR_ERR(buf); + + ret = 0; + if (len == ETH_ALEN) { + if (is_valid_ether_addr(buf)) + memcpy(addr, buf, ETH_ALEN); + else + ret = -EINVAL; + } else if (len == 3 * ETH_ALEN - 1) { + u8 mac[ETH_ALEN]; + + if (mac_pton(buf, mac)) + memcpy(addr, mac, ETH_ALEN); + else + ret = -EINVAL; + } else { + ret = -EINVAL; } - memcpy(addr, mac, ETH_ALEN); - kfree(mac); + kfree(buf); - return 0; + return ret; } /** diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a6978427d6c..bd62c01a2ec3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -130,9 +130,6 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ #endif - /* Slower-path: Get pages from locked ring queue */ - spin_lock(&r->consumer_lock); - /* Refill alloc array, but only if NUMA match */ do { page = __ptr_ring_consume(r); @@ -157,7 +154,6 @@ static struct page *page_pool_refill_alloc_cache(struct 
page_pool *pool) if (likely(pool->alloc.count > 0)) page = pool->alloc.cache[--pool->alloc.count]; - spin_unlock(&r->consumer_lock); return page; } @@ -217,6 +213,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, { page->pp = pool; page->pp_magic |= PP_SIGNATURE; + if (pool->p.init_callback) + pool->p.init_callback(page, pool->p.init_arg); } static void page_pool_clear_pp_info(struct page *page) @@ -691,10 +689,12 @@ static void page_pool_release_retry(struct work_struct *wq) schedule_delayed_work(&pool->release_dw, DEFER_TIME); } -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; + pool->xdp_mem_id = mem->id; } void page_pool_destroy(struct page_pool *pool) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a3d74e2704c4..560a5e712dc3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -410,6 +410,7 @@ struct pktgen_dev { * device name (not when the inject is * started as it used to do.) */ + netdevice_tracker dev_tracker; char odevname[32]; struct flow_state *flows; unsigned int cflows; /* Concurrent flows (config) */ @@ -2099,7 +2100,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn, /* Clean old setups */ if (pkt_dev->odev) { - dev_put(pkt_dev->odev); + dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker); pkt_dev->odev = NULL; } @@ -2117,6 +2118,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn, err = -ENETDOWN; } else { pkt_dev->odev = odev; + netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL); return 0; } @@ -3805,7 +3807,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) return add_dev_to_thread(t, pkt_dev); out2: - dev_put(pkt_dev->odev); + dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker); out1: #ifdef CONFIG_XFRM free_SAs(pkt_dev); @@ -3899,7 +3901,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Dis-associate from the interface */ if (pkt_dev->odev) { - dev_put(pkt_dev->odev); + dev_put_track(pkt_dev->odev, &pkt_dev->dev_tracker); pkt_dev->odev = NULL; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2af8aeeadadf..e476403231f0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -842,9 +842,9 @@ static void set_operstate(struct net_device *dev, unsigned char transition) } if (dev->operstate != operstate) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } } @@ -1026,6 +1026,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1728,6 +1729,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1880,6 +1882,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { 
[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, + [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2299,6 +2302,14 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], } } + if (tb[IFLA_GRO_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); + + if (gro_max_size > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_max_size"); + return -EINVAL; + } + } return 0; } @@ -2539,13 +2550,12 @@ static int do_set_proto_down(struct net_device *dev, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; - const struct net_device_ops *ops = dev->netdev_ops; unsigned long mask = 0; u32 value; bool proto_down; int err; - if (!ops->ndo_change_proto_down) { + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } @@ -2768,7 +2778,16 @@ static int do_setlink(const struct sk_buff *skb, } if (dev->gso_max_segs ^ max_segs) { - dev->gso_max_segs = max_segs; + netif_set_gso_max_segs(dev, max_segs); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); + + if (dev->gro_max_size ^ gro_max_size) { + netif_set_gro_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } @@ -2779,11 +2798,11 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = value; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { @@ -3222,7 +3241,9 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) - dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); + if (tb[IFLA_GRO_MAX_SIZE]) + netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); return dev; } diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b5bc680d4755..9b8443774449 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -19,8 +19,8 @@ #include <linux/in6.h> #include <net/tcp.h> -static siphash_key_t net_secret __read_mostly; -static siphash_key_t ts_secret __read_mostly; +static siphash_aligned_key_t net_secret; +static siphash_aligned_key_t ts_secret; static __always_inline void net_secret_init(void) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 909db87d7383..0118f0afaa4f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -759,21 +759,23 @@ void __kfree_skb(struct sk_buff *skb) EXPORT_SYMBOL(__kfree_skb); /** - * kfree_skb - free an sk_buff + * kfree_skb_reason - free an sk_buff with special reason * @skb: buffer to free + * @reason: reason why this skb is dropped * * Drop a reference to the buffer and free it if the usage count has - * hit zero. + * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' + * tracepoint. 
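The hunk here converts kfree_skb() into the tracepoint-aware kfree_skb_reason(); plain kfree_skb() survives as a wrapper passing a catch-all "not specified" reason, while interesting drop sites get converted one by one. A sketch of a converted call site — the SKB_DROP_REASON_* names are defined elsewhere in this series, so treat the one below as an assumption:

	/* Drop because no socket matched; the reason reaches the kfree_skb
	 * tracepoint, where perf/bpftrace can aggregate drops by cause.
	 */
	if (!sk) {
		kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
		return;
	}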
*/ -void kfree_skb(struct sk_buff *skb) +void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { if (!skb_unref(skb)) return; - trace_kfree_skb(skb, __builtin_return_address(0)); + trace_kfree_skb(skb, __builtin_return_address(0), reason); __kfree_skb(skb); } -EXPORT_SYMBOL(kfree_skb); +EXPORT_SYMBOL(kfree_skb_reason); void kfree_skb_list(struct sk_buff *segs) { @@ -992,12 +994,10 @@ void napi_consume_skb(struct sk_buff *skb, int budget) } EXPORT_SYMBOL(napi_consume_skb); -/* Make sure a field is enclosed inside headers_start/headers_end section */ +/* Make sure a field is contained by headers group */ #define CHECK_SKB_FIELD(field) \ - BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ - offsetof(struct sk_buff, headers_start)); \ - BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ - offsetof(struct sk_buff, headers_end)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ + offsetof(struct sk_buff, headers.field)); \ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { @@ -1009,14 +1009,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) __skb_ext_copy(new, old); __nf_copy(new, old, false); - /* Note : this field could be in headers_start/headers_end section + /* Note : this field could be in the headers group. * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; - memcpy(&new->headers_start, &old->headers_start, - offsetof(struct sk_buff, headers_end) - - offsetof(struct sk_buff, headers_start)); + memcpy(&new->headers, &old->headers, sizeof(new->headers)); CHECK_SKB_FIELD(protocol); CHECK_SKB_FIELD(csum); CHECK_SKB_FIELD(hash); @@ -2028,6 +2026,30 @@ void *skb_pull(struct sk_buff *skb, unsigned int len) EXPORT_SYMBOL(skb_pull); /** + * skb_pull_data - remove data from the start of a buffer returning its + * original position. + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the original data in the buffer + * is returned after checking if there is enough data to pull. Once the + * data has been pulled future pushes will overwrite the old data. + */ +void *skb_pull_data(struct sk_buff *skb, size_t len) +{ + void *data = skb->data; + + if (skb->len < len) + return NULL; + + skb_pull(skb, len); + + return data; +} +EXPORT_SYMBOL(skb_pull_data); + +/** * skb_trim - remove end from a buffer * @skb: buffer to alter * @len: new length @@ -3919,32 +3941,6 @@ err_linearize: } EXPORT_SYMBOL_GPL(skb_segment_list); -int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) -{ - if (unlikely(p->len + skb->len >= 65536)) - return -E2BIG; - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - - skb_pull(skb, skb_gro_offset(skb)); - - NAPI_GRO_CB(p)->last = skb; - NAPI_GRO_CB(p)->count++; - p->data_len += skb->len; - - /* sk owenrship - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - p->truesize += skb->truesize; - p->len += skb->len; - - NAPI_GRO_CB(skb)->same_flow = 1; - - return 0; -} - /** * skb_segment - Perform protocol segmentation on skb. 
* @head_skb: buffer to segment @@ -4297,122 +4293,6 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) -{ - struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); - unsigned int offset = skb_gro_offset(skb); - unsigned int headlen = skb_headlen(skb); - unsigned int len = skb_gro_len(skb); - unsigned int delta_truesize; - unsigned int new_truesize; - struct sk_buff *lp; - - if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) - return -E2BIG; - - lp = NAPI_GRO_CB(p)->last; - pinfo = skb_shinfo(lp); - - if (headlen <= offset) { - skb_frag_t *frag; - skb_frag_t *frag2; - int i = skbinfo->nr_frags; - int nr_frags = pinfo->nr_frags + i; - - if (nr_frags > MAX_SKB_FRAGS) - goto merge; - - offset -= headlen; - pinfo->nr_frags = nr_frags; - skbinfo->nr_frags = 0; - - frag = pinfo->frags + nr_frags; - frag2 = skbinfo->frags + i; - do { - *--frag = *--frag2; - } while (--i); - - skb_frag_off_add(frag, offset); - skb_frag_size_sub(frag, offset); - - /* all fragments truesize : remove (head size + sk_buff) */ - new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); - delta_truesize = skb->truesize - new_truesize; - - skb->truesize = new_truesize; - skb->len -= skb->data_len; - skb->data_len = 0; - - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; - goto done; - } else if (skb->head_frag) { - int nr_frags = pinfo->nr_frags; - skb_frag_t *frag = pinfo->frags + nr_frags; - struct page *page = virt_to_head_page(skb->head); - unsigned int first_size = headlen - offset; - unsigned int first_offset; - - if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - goto merge; - - first_offset = skb->data - - (unsigned char *)page_address(page) + - offset; - - pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - - __skb_frag_set_page(frag, page); - skb_frag_off_set(frag, first_offset); - skb_frag_size_set(frag, first_size); - - memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); - /* We dont need to clear skbinfo->nr_frags here */ - - new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); - delta_truesize = skb->truesize - new_truesize; - skb->truesize = new_truesize; - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; - goto done; - } - -merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - delta_truesize = skb->truesize; - if (offset > headlen) { - unsigned int eat = offset - headlen; - - skb_frag_off_add(&skbinfo->frags[0], eat); - skb_frag_size_sub(&skbinfo->frags[0], eat); - skb->data_len -= eat; - skb->len -= eat; - offset = headlen; - } - - __skb_pull(skb, offset); - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - NAPI_GRO_CB(p)->last = skb; - __skb_header_release(skb); - lp = p; - -done: - NAPI_GRO_CB(p)->count++; - p->data_len += len; - p->truesize += delta_truesize; - p->len += len; - if (lp != p) { - lp->data_len += len; - lp->truesize += delta_truesize; - lp->len += len; - } - NAPI_GRO_CB(skb)->same_flow = 1; - return 0; -} - #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) @@ -4849,8 +4729,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->header.h4.iif = skb->dev ? 
 	serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
 	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
 		serr->ee.ee_data = skb_shinfo(skb)->tskey;
-		if (sk->sk_protocol == IPPROTO_TCP &&
-		    sk->sk_type == SOCK_STREAM)
+		if (sk_is_tcp(sk))
 			serr->ee.ee_data -= sk->sk_tskey;
 	}
 
@@ -4919,8 +4798,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (tsonly) {
 #ifdef CONFIG_INET
 		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
-		    sk->sk_protocol == IPPROTO_TCP &&
-		    sk->sk_type == SOCK_STREAM) {
+		    sk_is_tcp(sk)) {
 			skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
 							     ack_skb);
 			opt_stats = true;
diff --git a/net/core/sock.c b/net/core/sock.c
index 41e91d0f7061..e21485ab285d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -144,8 +144,6 @@ static DEFINE_MUTEX(proto_list_mutex);
 
 static LIST_HEAD(proto_list);
 
-static void sock_inuse_add(struct net *net, int val);
-
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -327,7 +325,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 
 	noreclaim_flag = memalloc_noreclaim_save();
-	ret = sk->sk_backlog_rcv(sk, skb);
+	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+				 tcp_v6_do_rcv,
+				 tcp_v4_do_rcv,
+				 sk, skb);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	return ret;
@@ -872,8 +873,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
 
 	if (val & SOF_TIMESTAMPING_OPT_ID &&
 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
-		if (sk->sk_protocol == IPPROTO_TCP &&
-		    sk->sk_type == SOCK_STREAM) {
+		if (sk_is_tcp(sk)) {
 			if ((1 << sk->sk_state) &
 			    (TCPF_CLOSE | TCPF_LISTEN))
 				return -EINVAL;
@@ -1135,6 +1135,7 @@ set_sndbuf:
 
 	case SO_PRIORITY:
 		if ((val >= 0 && val <= 6) ||
+		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 			sk->sk_priority = val;
 		else
@@ -1280,7 +1281,8 @@ set_sndbuf:
 			clear_bit(SOCK_PASSSEC, &sock->flags);
 		break;
 	case SO_MARK:
-		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
 			ret = -EPERM;
 			break;
 		}
@@ -1370,8 +1372,7 @@ set_sndbuf:
 
 	case SO_ZEROCOPY:
 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
-			if (!((sk->sk_type == SOCK_STREAM &&
-			       sk->sk_protocol == IPPROTO_TCP) ||
+			if (!(sk_is_tcp(sk) ||
 			      (sk->sk_type == SOCK_DGRAM &&
 			       sk->sk_protocol == IPPROTO_UDP)))
 				ret = -ENOTSUPP;
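The two setsockopt() hunks above relax SO_PRIORITY (for values outside 0..6) and SO_MARK from requiring CAP_NET_ADMIN to accepting CAP_NET_RAW as well, checked against the socket's user namespace. Nothing changes at the API surface; a userspace sketch of what a CAP_NET_RAW-only process can now do (the mark value is arbitrary):

#include <stdio.h>
#include <sys/socket.h>

static int set_fwmark(int fd, unsigned int mark)
{
	/* Fails with EPERM unless the caller holds CAP_NET_RAW or
	 * CAP_NET_ADMIN over the socket's network namespace.
	 */
	if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0) {
		perror("setsockopt(SO_MARK)");
		return -1;
	}
	return 0;
}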
@@ -1982,7 +1983,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		sock_lock_init(sk);
 		sk->sk_net_refcnt = kern ? 0 : 1;
 		if (likely(sk->sk_net_refcnt)) {
-			get_net(net);
+			get_net_track(net, &sk->ns_tracker, priority);
 			sock_inuse_add(net, 1);
 		}
 
@@ -2038,7 +2039,7 @@ static void __sk_destruct(struct rcu_head *head)
 	put_pid(sk->sk_peer_pid);
 
 	if (likely(sk->sk_net_refcnt))
-		put_net(sock_net(sk));
+		put_net_track(sock_net(sk), &sk->ns_tracker);
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
@@ -2125,7 +2126,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 	/* SANITY */
 	if (likely(newsk->sk_net_refcnt)) {
-		get_net(sock_net(newsk));
+		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
 		sock_inuse_add(sock_net(newsk), 1);
 	}
 	sk_node_init(&newsk->sk_node);
@@ -2246,17 +2247,22 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 	u32 max_segs = 1;
 
 	sk_dst_set(sk, dst);
-	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
+	sk->sk_route_caps = dst->dev->features;
+	if (sk_is_tcp(sk))
+		sk->sk_route_caps |= NETIF_F_GSO;
 	if (sk->sk_route_caps & NETIF_F_GSO)
 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
-	sk->sk_route_caps &= ~sk->sk_route_nocaps;
+	if (unlikely(sk->sk_gso_disabled))
+		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 	if (sk_can_gso(sk)) {
 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 		} else {
 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
-			sk->sk_gso_max_size = dst->dev->gso_max_size;
-			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
+			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
+			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
+			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
 		}
 	}
 	sk->sk_gso_max_segs = max_segs;
@@ -3286,7 +3292,7 @@ void lock_sock_nested(struct sock *sk, int subclass)
 
 	might_sleep();
 	spin_lock_bh(&sk->sk_lock.slock);
-	if (sk->sk_lock.owned)
+	if (sock_owned_by_user_nocheck(sk))
 		__lock_sock(sk);
 	sk->sk_lock.owned = 1;
 	spin_unlock_bh(&sk->sk_lock.slock);
@@ -3317,7 +3323,7 @@ bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
 	might_sleep();
 	spin_lock_bh(&sk->sk_lock.slock);
 
-	if (!sk->sk_lock.owned) {
+	if (!sock_owned_by_user_nocheck(sk)) {
 		/*
 		 * Fast path return with bottom halves disabled and
 		 * sock::sk_lock.slock held.
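A few recurring patterns run through this file. The open-coded `sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM` test, here and in the skbuff.c hunks earlier, collapses into sk_is_tcp(). The plain get_net()/put_net() pairs become the _track variants, which appear to enroll the socket's new ns_tracker cookie so a leaked netns reference can later be attributed to its taker. And the gso_max_size/gso_max_segs loads are wrapped in READ_ONCE() because the device values may now be rewritten concurrently. sk_is_tcp() itself is not part of this file's diff; assuming it lives in include/net/sock.h, it presumably reads roughly:

/* Assumed shape of the helper, shown only to make the conversions
 * above readable; it is not taken from this diff.
 */
static inline bool sk_is_tcp(const struct sock *sk)
{
	return sk->sk_type == SOCK_STREAM &&
	       sk->sk_protocol == IPPROTO_TCP;
}

The lock_sock_nested()/__lock_sock_fast() hunks look like a pure conversion to sock_owned_by_user_nocheck(), making it explicit that this read of the owned flag intentionally skips the lockdep assertions done by sock_owned_by_user().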
@@ -3532,19 +3538,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
 }
 
 #ifdef CONFIG_PROC_FS
-#define PROTO_INUSE_NR	64	/* should be enough for the first time */
-struct prot_inuse {
-	int val[PROTO_INUSE_NR];
-};
-
 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
 
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
-{
-	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-
 int sock_prot_inuse_get(struct net *net, struct proto *prot)
 {
 	int cpu, idx = prot->inuse_idx;
@@ -3557,17 +3552,12 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
-static void sock_inuse_add(struct net *net, int val)
-{
-	this_cpu_add(*net->core.sock_inuse, val);
-}
-
 int sock_inuse_get(struct net *net)
 {
 	int cpu, res = 0;
 
 	for_each_possible_cpu(cpu)
-		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
 
 	return res;
 }
@@ -3579,22 +3569,12 @@ static int __net_init sock_inuse_init_net(struct net *net)
 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
 	if (net->core.prot_inuse == NULL)
 		return -ENOMEM;
-
-	net->core.sock_inuse = alloc_percpu(int);
-	if (net->core.sock_inuse == NULL)
-		goto out;
-
 	return 0;
-
-out:
-	free_percpu(net->core.prot_inuse);
-	return -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
 	free_percpu(net->core.prot_inuse);
-	free_percpu(net->core.sock_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
@@ -3640,9 +3620,6 @@ static inline void release_proto_idx(struct proto *prot)
 {
 }
 
-static void sock_inuse_add(struct net *net, int val)
-{
-}
 #endif
 
 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c9c45b935f99..f7cf74cdd3db 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -1,5 +1,6 @@
 /* License: GPL */
 
+#include <linux/filter.h>
 #include <linux/mutex.h>
 #include <linux/socket.h>
 #include <linux/skbuff.h>
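The proc accounting rework drops the file-local struct prot_inuse and the out-of-line sock_prot_inuse_add()/sock_inuse_add() in favor of a shared per-cpu structure that also carries an `all` counter, as the converted sock_inuse_get() shows. The definitions evidently move into a header; their assumed shape, reconstructed only from how the survivors in this file use them:

/* Assumed shape of the relocated definitions. PROTO_INUSE_NR and the
 * field names are inferred from the code removed above and from the
 * new per_cpu_ptr(...)->all read in sock_inuse_get().
 */
struct prot_inuse {
	int all;
	int val[PROTO_INUSE_NR];
};

static inline void sock_prot_inuse_add(const struct net *net,
				       const struct proto *prot, int val)
{
	this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}

static inline void sock_inuse_add(const struct net *net, int val)
{
	this_cpu_add(net->core.prot_inuse->all, val);
}

One per-cpu allocation now serves both counters, which is why sock_inuse_init_net() loses its error unwinding. The <linux/filter.h> includes added to sock_diag.c above and sysctl_net_core.c below look like fallout from a header cleanup: both files previously picked it up indirectly.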
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 4ca4b11f4e5f..1827669eedd6 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -292,15 +292,23 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	if (skb_verdict)
 		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
 
+	/* msg_* and stream_* programs references tracked in psock after this
+	 * point. Reference dec and cleanup will occur through psock destructor
+	 */
 	ret = sock_map_init_proto(sk, psock);
-	if (ret < 0)
-		goto out_drop;
+	if (ret < 0) {
+		sk_psock_put(sk, psock);
+		goto out;
+	}
 
 	write_lock_bh(&sk->sk_callback_lock);
 	if (stream_parser && stream_verdict && !psock->saved_data_ready) {
 		ret = sk_psock_init_strp(sk, psock);
-		if (ret)
-			goto out_unlock_drop;
+		if (ret) {
+			write_unlock_bh(&sk->sk_callback_lock);
+			sk_psock_put(sk, psock);
+			goto out;
+		}
 		sk_psock_start_strp(sk, psock);
 	} else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
 		sk_psock_start_verdict(sk,psock);
@@ -309,10 +317,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
 	return 0;
-out_unlock_drop:
-	write_unlock_bh(&sk->sk_callback_lock);
-out_drop:
-	sk_psock_put(sk, psock);
 out_progs:
 	if (skb_verdict)
 		bpf_prog_put(skb_verdict);
@@ -325,6 +329,7 @@ out_put_stream_parser:
 out_put_stream_verdict:
 	if (stream_verdict)
 		bpf_prog_put(stream_verdict);
+out:
 	return ret;
 }
 
@@ -1564,7 +1569,7 @@ static struct bpf_iter_reg sock_map_iter_reg = {
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__sockmap, key),
-		  PTR_TO_RDONLY_BUF_OR_NULL },
+		  PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
 		{ offsetof(struct bpf_iter__sockmap, sk),
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 5f88526ad61c..7b4d485aac7a 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -6,6 +6,7 @@
  * Added /proc/sys/net/core directory entry (empty =) ). [MS]
  */
 
+#include <linux/filter.h>
 #include <linux/mm.h>
 #include <linux/sysctl.h>
 #include <linux/module.h>
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 5ddc29f29bad..7aba35504986 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -110,20 +110,15 @@ static void mem_allocator_disconnect(void *allocator)
 	mutex_unlock(&mem_id_lock);
 }
 
-void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+void xdp_unreg_mem_model(struct xdp_mem_info *mem)
 {
 	struct xdp_mem_allocator *xa;
-	int type = xdp_rxq->mem.type;
-	int id = xdp_rxq->mem.id;
+	int type = mem->type;
+	int id = mem->id;
 
 	/* Reset mem info to defaults */
-	xdp_rxq->mem.id = 0;
-	xdp_rxq->mem.type = 0;
-
-	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
-		WARN(1, "Missing register, driver bug");
-		return;
-	}
+	mem->id = 0;
+	mem->type = 0;
 
 	if (id == 0)
 		return;
@@ -135,6 +130,17 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 		rcu_read_unlock();
 	}
 }
+EXPORT_SYMBOL_GPL(xdp_unreg_mem_model);
+
+void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+		WARN(1, "Missing register, driver bug");
+		return;
+	}
+
+	xdp_unreg_mem_model(&xdp_rxq->mem);
+}
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
@@ -159,6 +165,11 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev,
 		     u32 queue_index, unsigned int napi_id)
 {
+	if (!dev) {
+		WARN(1, "Missing net_device from driver");
+		return -ENODEV;
+	}
+
 	if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
 		WARN(1, "Driver promised not to register this");
 		return -EINVAL;
@@ -169,11 +180,6 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		xdp_rxq_info_unreg(xdp_rxq);
 	}
 
-	if (!dev) {
-		WARN(1, "Missing net_device from driver");
-		return -ENODEV;
-	}
-
 	/* State either UNREGISTERED or NEW */
 	xdp_rxq_info_init(xdp_rxq);
 	xdp_rxq->dev = dev;
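On the registration side, the NULL-dev check moves ahead of any state handling, so a buggy caller can no longer make xdp_rxq_info_reg() unregister or reinitialize an rxq before failing with -ENODEV, and the unregister path is split so the mem-model half works on a bare struct xdp_mem_info. A hedged sketch of the usual driver sequence these entry points serve; the my_rxq layout and example_ names are hypothetical:

struct my_rxq {
	struct xdp_rxq_info xdp_rxq;
	struct page_pool *page_pool;
};

static int example_open_rxq(struct net_device *dev, struct my_rxq *rq,
			    u32 queue_index, unsigned int napi_id)
{
	int err;

	/* Now fails up front with -ENODEV if dev is NULL. */
	err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, queue_index, napi_id);
	if (err)
		return err;

	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_POOL,
					 rq->page_pool);
	if (err)
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	return err;
}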
@@ -259,28 +265,24 @@ static bool __is_supported_mem_type(enum xdp_mem_type type)
 	return true;
 }
 
-int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
-			       enum xdp_mem_type type, void *allocator)
+static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
+						     enum xdp_mem_type type,
+						     void *allocator)
 {
 	struct xdp_mem_allocator *xdp_alloc;
 	gfp_t gfp = GFP_KERNEL;
 	int id, errno, ret;
 	void *ptr;
 
-	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
-		WARN(1, "Missing register, driver bug");
-		return -EFAULT;
-	}
-
 	if (!__is_supported_mem_type(type))
-		return -EOPNOTSUPP;
+		return ERR_PTR(-EOPNOTSUPP);
 
-	xdp_rxq->mem.type = type;
+	mem->type = type;
 
 	if (!allocator) {
 		if (type == MEM_TYPE_PAGE_POOL)
-			return -EINVAL; /* Setup time check page_pool req */
-		return 0;
+			return ERR_PTR(-EINVAL); /* Setup time check page_pool req */
+		return NULL;
 	}
 
 	/* Delay init of rhashtable to save memory if feature isn't used */
@@ -290,13 +292,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 		mutex_unlock(&mem_id_lock);
 		if (ret < 0) {
 			WARN_ON(1);
-			return ret;
+			return ERR_PTR(ret);
 		}
 	}
 
 	xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
 	if (!xdp_alloc)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	mutex_lock(&mem_id_lock);
 	id = __mem_id_cyclic_get(gfp);
@@ -304,31 +306,61 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 		errno = id;
 		goto err;
 	}
-	xdp_rxq->mem.id = id;
-	xdp_alloc->mem = xdp_rxq->mem;
+	mem->id = id;
+	xdp_alloc->mem = *mem;
 	xdp_alloc->allocator = allocator;
 
 	/* Insert allocator into ID lookup table */
 	ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
 	if (IS_ERR(ptr)) {
-		ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
-		xdp_rxq->mem.id = 0;
+		ida_simple_remove(&mem_id_pool, mem->id);
+		mem->id = 0;
 		errno = PTR_ERR(ptr);
 		goto err;
 	}
 
 	if (type == MEM_TYPE_PAGE_POOL)
-		page_pool_use_xdp_mem(allocator, mem_allocator_disconnect);
+		page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem);
 
 	mutex_unlock(&mem_id_lock);
 
-	trace_mem_connect(xdp_alloc, xdp_rxq);
-	return 0;
+	return xdp_alloc;
 err:
 	mutex_unlock(&mem_id_lock);
 	kfree(xdp_alloc);
-	return errno;
+	return ERR_PTR(errno);
 }
+
+int xdp_reg_mem_model(struct xdp_mem_info *mem,
+		      enum xdp_mem_type type, void *allocator)
+{
+	struct xdp_mem_allocator *xdp_alloc;
+
+	xdp_alloc = __xdp_reg_mem_model(mem, type, allocator);
+	if (IS_ERR(xdp_alloc))
+		return PTR_ERR(xdp_alloc);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_reg_mem_model);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator)
+{
+	struct xdp_mem_allocator *xdp_alloc;
+
+	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+		WARN(1, "Missing register, driver bug");
+		return -EFAULT;
+	}
+
+	xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator);
+	if (IS_ERR(xdp_alloc))
+		return PTR_ERR(xdp_alloc);
+
+	trace_mem_connect(xdp_alloc, xdp_rxq);
+	return 0;
+}
+
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
 /* XDP RX runs under NAPI protection, and in different delivery error
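With __xdp_reg_mem_model() factored out and returning the allocator entry (or an ERR_PTR), the rxq-bound wrapper keeps its reg_state sanity check and the trace_mem_connect() call, while the new xdp_reg_mem_model()/xdp_unreg_mem_model() pair serves callers that own only a struct xdp_mem_info and no rxq at all. A minimal sketch of that second usage, assuming pool is a previously created page_pool:

static int example_attach_pool(struct xdp_mem_info *mem,
			       struct page_pool *pool)
{
	return xdp_reg_mem_model(mem, MEM_TYPE_PAGE_POOL, pool);
}

static void example_detach_pool(struct xdp_mem_info *mem)
{
	/* Also resets mem->type and mem->id to their defaults. */
	xdp_unreg_mem_model(mem);
}

Note the asymmetry that remains visible in the hunk above: a NULL allocator with MEM_TYPE_PAGE_POOL is still rejected at setup time with -EINVAL, while NULL with the page-order types registers nothing and reports success.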