diff options
Diffstat (limited to 'net')
188 files changed, 12170 insertions, 2181 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8ccee3d01822..5e9950453955 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -647,13 +647,14 @@ out: return err; } -static struct sk_buff **vlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct vlan_hdr *vhdr; - unsigned int hlen, off_vlan; const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; __be16 type; int flush = 1; @@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct vlan_hdr *vhdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index de8034d80623..361116f77cb9 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -24,7 +24,6 @@ config BATMAN_ADV depends on NET select CRC16 select LIBCRC32C - default n help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is a routing protocol for multi-hop ad-hoc mesh networks. The @@ -33,7 +32,7 @@ config BATMAN_ADV tools. config BATMAN_ADV_BATMAN_V - bool "B.A.T.M.A.N. V protocol (experimental)" + bool "B.A.T.M.A.N. V protocol" depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y) default y help @@ -60,7 +59,7 @@ config BATMAN_ADV_BLA config BATMAN_ADV_DAT bool "Distributed ARP Table" depends on BATMAN_ADV && INET - default n + default y help This option enables DAT (Distributed ARP Table), a DHT based mechanism that increases ARP reliability on sparse wireless @@ -70,7 +69,6 @@ config BATMAN_ADV_DAT config BATMAN_ADV_NC bool "Network Coding" depends on BATMAN_ADV - default n help This option enables network coding, a mechanism that aims to increase the overall network throughput by fusing multiple @@ -84,7 +82,6 @@ config BATMAN_ADV_NC config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) - default n help This option enables the multicast optimisation which aims to reduce the air overhead while improving the reliability of @@ -94,7 +91,6 @@ config BATMAN_ADV_DEBUGFS bool "batman-adv debugfs entries" depends on BATMAN_ADV depends on DEBUG_FS - default n help Enable this to export routing related debug tables via debugfs. The information for each soft-interface and used hard-interface can be diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 317cafd302cf..3dc6a7a43eb7 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -16,11 +16,11 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_ -#define _BATMAN_ADV_BATADV_IV_OGM_H_ +#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_ +#define _NET_BATMAN_ADV_BAT_IV_OGM_H_ #include "main.h" int batadv_iv_init(void); -#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */ +#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */ diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index ed36c5e79fde..e5be14c908c6 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -16,8 +16,8 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef _BATMAN_ADV_BATADV_V_OGM_H_ -#define _BATMAN_ADV_BATADV_V_OGM_H_ +#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_ +#define _NET_BATMAN_ADV_BAT_V_OGM_H_ #include "main.h" @@ -34,4 +34,4 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */ +#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index a2de5a44bd41..ff9659af6b91 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1449,7 +1449,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) * detection frames. Set the locally administered bit to avoid * collisions with users mac addresses. */ - random_ether_addr(bat_priv->bla.loopdetect_addr); + eth_random_addr(bat_priv->bla.loopdetect_addr); bat_priv->bla.loopdetect_addr[0] = 0xba; bat_priv->bla.loopdetect_addr[1] = 0xbe; bat_priv->bla.loopdetect_lasttime = jiffies; diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 87479c60670e..3cb82378300b 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -118,7 +118,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode, #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache + * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache * @inode: inode which was opened * @file: file handle to be initialized * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 716e5b43acfa..1d295da3e342 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1339,7 +1339,11 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, return false; } -static void _batadv_purge_orig(struct batadv_priv *bat_priv) +/** + * batadv_purge_orig_ref() - Purge all outdated originators + * @bat_priv: the bat priv with all the soft interface information + */ +void batadv_purge_orig_ref(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_node *node_tmp; @@ -1385,21 +1389,12 @@ static void batadv_purge_orig(struct work_struct *work) delayed_work = to_delayed_work(work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); - _batadv_purge_orig(bat_priv); + batadv_purge_orig_ref(bat_priv); queue_delayed_work(batadv_event_workqueue, &bat_priv->orig_work, msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD)); } -/** - * batadv_purge_orig_ref() - Purge all outdated originators - * @bat_priv: the bat priv with all the soft interface information - */ -void batadv_purge_orig_ref(struct batadv_priv *bat_priv) -{ - _batadv_purge_orig(bat_priv); -} - #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 360357f83f20..343d304851a5 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -43,12 +43,13 @@ struct seq_file; #ifdef CONFIG_BATMAN_ADV_DAT /** - * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is - * changed, BATADV_DAT_ADDR_MAX is changed as well. + * typedef batadv_dat_addr_t - type used for all DHT addresses + * + * If it is changed, BATADV_DAT_ADDR_MAX is changed as well. * * *Please be careful: batadv_dat_addr_t must be UNSIGNED* */ -#define batadv_dat_addr_t u16 +typedef u16 batadv_dat_addr_t; #endif /* CONFIG_BATMAN_ADV_DAT */ diff --git a/net/core/dev.c b/net/core/dev.c index a5aa1c7444e6..4f8b92d81d10 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -149,7 +149,6 @@ #include "net-sysfs.h" -/* Instead of increasing this, you should create a hash table. */ #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added. */ @@ -2068,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; + /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } + /* didn't find it, just return -1 to indicate no match */ return -1; } @@ -2081,6 +2082,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS +struct static_key xps_needed __read_mostly; +EXPORT_SYMBOL(xps_needed); +struct static_key xps_rxqs_needed __read_mostly; +EXPORT_SYMBOL(xps_rxqs_needed); static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) @@ -2092,7 +2097,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, int pos; if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -2105,7 +2110,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } - RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } @@ -2135,33 +2140,68 @@ static bool remove_xps_queue_cpu(struct net_device *dev, return active; } +static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, + struct xps_dev_maps *dev_maps, unsigned int nr_ids, + u16 offset, u16 count, bool is_rxqs_map) +{ + bool active = false; + int i, j; + + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), + j < nr_ids;) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, + count); + if (!active) { + if (is_rxqs_map) { + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + } else { + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); + + for (i = offset + (count - 1); count--; i--) + netdev_queue_numa_node_write( + netdev_get_tx_queue(dev, i), + NUMA_NO_NODE); + } + kfree_rcu(dev_maps, rcu); + } +} + static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { + const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - int cpu, i; - bool active = false; + unsigned int nr_ids; + + if (!static_key_false(&xps_needed)) + return; mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); + if (static_key_false(&xps_rxqs_needed)) { + dev_maps = xmap_dereference(dev->xps_rxqs_map); + if (dev_maps) { + nr_ids = dev->num_rx_queues; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, + offset, count, true); + } + } + + dev_maps = xmap_dereference(dev->xps_cpus_map); if (!dev_maps) goto out_no_maps; - for_each_possible_cpu(cpu) - active |= remove_xps_queue_cpu(dev, dev_maps, cpu, - offset, count); - - if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } - - for (i = offset + (count - 1); count--; i--) - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); + if (num_possible_cpus() > 1) + possible_mask = cpumask_bits(cpu_possible_mask); + nr_ids = nr_cpu_ids; + clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, + false); out_no_maps: + if (static_key_enabled(&xps_rxqs_needed)) + static_key_slow_dec(&xps_rxqs_needed); + + static_key_slow_dec(&xps_needed); mutex_unlock(&xps_map_mutex); } @@ -2170,8 +2210,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } -static struct xps_map *expand_xps_map(struct xps_map *map, - int cpu, u16 index) +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, + u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; @@ -2183,7 +2223,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return map; } - /* Need to add queue to this CPU's existing map */ + /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; @@ -2191,9 +2231,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map, alloc_len = map->alloc_len * 2; } - /* Need to allocate new map to store queue on this CPU's map */ - new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, - cpu_to_node(cpu)); + /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's + * map + */ + if (is_rxqs_map) + new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); + else + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, + cpu_to_node(attr_index)); if (!new_map) return NULL; @@ -2205,32 +2250,52 @@ static struct xps_map *expand_xps_map(struct xps_map *map, return new_map; } -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, - u16 index) +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, + u16 index, bool is_rxqs_map) { + const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; - int i, cpu, tci, numa_node_id = -2; + int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; bool active = false; + unsigned int nr_ids; if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } - maps_sz = XPS_DEV_MAPS_SIZE(num_tc); - if (maps_sz < L1_CACHE_BYTES) - maps_sz = L1_CACHE_BYTES; - mutex_lock(&xps_map_mutex); + if (is_rxqs_map) { + maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); + dev_maps = xmap_dereference(dev->xps_rxqs_map); + nr_ids = dev->num_rx_queues; + } else { + maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); + if (num_possible_cpus() > 1) { + online_mask = cpumask_bits(cpu_online_mask); + possible_mask = cpumask_bits(cpu_possible_mask); + } + dev_maps = xmap_dereference(dev->xps_cpus_map); + nr_ids = nr_cpu_ids; + } - dev_maps = xmap_dereference(dev->xps_maps); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; /* allocate memory for queue storage */ - for_each_cpu_and(cpu, cpu_online_mask, mask) { + for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), + j < nr_ids;) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2238,73 +2303,85 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - tci = cpu * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) : + tci = j * num_tc + tc; + map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, cpu, index); + map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; - for_each_possible_cpu(cpu) { + static_key_slow_inc(&xps_needed); + if (is_rxqs_map) + static_key_slow_inc(&xps_rxqs_needed); + + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* We need to explicitly update tci as prevous loop * could break out early if dev_maps is NULL. */ - tci = cpu * num_tc + tc; + tci = j * num_tc + tc; - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { - /* add queue to CPU maps */ + if (netif_attr_test_mask(j, mask, nr_ids) && + netif_attr_test_online(j, online_mask, nr_ids)) { + /* add tx-queue to CPU/rx-queue maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (numa_node_id == -2) - numa_node_id = cpu_to_node(cpu); - else if (numa_node_id != cpu_to_node(cpu)) - numa_node_id = -1; + if (!is_rxqs_map) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(j); + else if (numa_node_id != cpu_to_node(j)) + numa_node_id = -1; + } #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[tci]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } - rcu_assign_pointer(dev->xps_maps, new_dev_maps); + if (is_rxqs_map) + rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); + else + rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); - map = xmap_dereference(dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } @@ -2317,19 +2394,23 @@ out_no_old_maps: active = true; out_no_new_maps: - /* update Tx queue numa node */ - netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), - (numa_node_id >= 0) ? numa_node_id : - NUMA_NO_NODE); + if (!is_rxqs_map) { + /* update Tx queue numa node */ + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), + (numa_node_id >= 0) ? + numa_node_id : NUMA_NO_NODE); + } if (!dev_maps) goto out_no_maps; - /* removes queue from unused CPUs */ - for_each_possible_cpu(cpu) { - for (i = tc, tci = cpu * num_tc; i--; tci++) + /* removes tx-queue from unused CPUs/rx-queues */ + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = tc, tci = j * num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + if (!netif_attr_test_mask(j, mask, nr_ids) || + !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -2337,7 +2418,10 @@ out_no_new_maps: /* free map if not active */ if (!active) { - RCU_INIT_POINTER(dev->xps_maps, NULL); + if (is_rxqs_map) + RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); + else + RCU_INIT_POINTER(dev->xps_cpus_map, NULL); kfree_rcu(dev_maps, rcu); } @@ -2347,11 +2431,12 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for_each_possible_cpu(cpu) { - for (i = num_tc, tci = cpu * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), + j < nr_ids;) { + for (i = num_tc, tci = j * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = dev_maps ? - xmap_dereference(dev_maps->cpu_map[tci]) : + xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); @@ -2363,14 +2448,34 @@ error: kfree(new_dev_maps); return -ENOMEM; } + +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, + u16 index) +{ + return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); +} EXPORT_SYMBOL(netif_set_xps_queue); #endif +static void netdev_unbind_all_sb_channels(struct net_device *dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + + /* Unbind any subordinate channels */ + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev) + netdev_unbind_sb_channel(dev, txq->sb_dev); + } +} + void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + + /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); @@ -2399,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc) #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif + netdev_unbind_all_sb_channels(dev); + dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); +void netdev_unbind_sb_channel(struct net_device *dev, + struct net_device *sb_dev) +{ + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(sb_dev, 0); +#endif + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); + + while (txq-- != &dev->_tx[0]) { + if (txq->sb_dev == sb_dev) + txq->sb_dev = NULL; + } +} +EXPORT_SYMBOL(netdev_unbind_sb_channel); + +int netdev_bind_sb_channel_queue(struct net_device *dev, + struct net_device *sb_dev, + u8 tc, u16 count, u16 offset) +{ + /* Make certain the sb_dev and dev are already configured */ + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) + return -EINVAL; + + /* We cannot hand out queues we don't have */ + if ((offset + count) > dev->real_num_tx_queues) + return -EINVAL; + + /* Record the mapping */ + sb_dev->tc_to_txq[tc].count = count; + sb_dev->tc_to_txq[tc].offset = offset; + + /* Provide a way for Tx queue to find the tc_to_txq map or + * XPS map for itself. + */ + while (count--) + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; + + return 0; +} +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); + +int netdev_set_sb_channel(struct net_device *dev, u16 channel) +{ + /* Do not use a multiqueue device to represent a subordinate channel */ + if (netif_is_multiqueue(dev)) + return -ENODEV; + + /* We allow channels 1 - 32767 to be used for subordinate channels. + * Channel 0 is meant to be "native" mode and used only to represent + * the main root device. We allow writing 0 to reset the device back + * to normal mode after being used as a subordinate channel. + */ + if (channel > S16_MAX) + return -EINVAL; + + dev->num_tc = -channel; + + return 0; +} +EXPORT_SYMBOL(netdev_set_sb_channel); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -2615,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ -static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb) +static u16 skb_tx_hash(const struct net_device *dev, + const struct net_device *sb_dev, + struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + + qoffset = sb_dev->tc_to_txq[tc].offset; + qcount = sb_dev->tc_to_txq[tc].count; + } + if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); while (unlikely(hash >= qcount)) hash -= qcount; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; + return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; @@ -3376,32 +3549,64 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) } #endif /* CONFIG_NET_EGRESS */ -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + struct xps_map *map; + int queue_index = -1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, + struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; - struct xps_map *map; + struct sock *sk = skb->sk; int queue_index = -1; + if (!static_key_false(&xps_needed)) + return -1; + rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + if (!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); if (dev_maps) { - unsigned int tci = skb->sender_cpu - 1; + int tci = sk_rx_queue_get(sk); - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tci >= 0 && tci < dev->num_rx_queues) + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } - map = rcu_dereference(dev_maps->cpu_map[tci]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(sb_dev->xps_cpus_map); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); } } rcu_read_unlock(); @@ -3412,17 +3617,36 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) +{ + return 0; +} +EXPORT_SYMBOL(dev_pick_tx_zero); + +u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) +{ + return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; +} +EXPORT_SYMBOL(dev_pick_tx_cpu_id); + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); + sb_dev = sb_dev ? : dev; + if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); + int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) - new_index = skb_tx_hash(dev, skb); + new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && @@ -3437,7 +3661,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + struct net_device *sb_dev) { int queue_index = 0; @@ -3452,10 +3676,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + queue_index = ops->ndo_select_queue(dev, skb, sb_dev, __netdev_pick_tx); else - queue_index = __netdev_pick_tx(dev, skb); + queue_index = __netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } @@ -3467,7 +3691,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit - * @accel_priv: private data used for L2 forwarding offload + * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling @@ -3490,7 +3714,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ -static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) +static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3529,7 +3753,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); - txq = netdev_pick_tx(dev, skb, accel_priv); + txq = netdev_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); @@ -3603,9 +3827,9 @@ int dev_queue_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(dev_queue_xmit); -int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) +int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { - return __dev_queue_xmit(skb, accel_priv); + return __dev_queue_xmit(skb, sb_dev); } EXPORT_SYMBOL(dev_queue_xmit_accel); @@ -4494,7 +4718,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, return 0; } -static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) +static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc, + struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; @@ -4624,8 +4849,7 @@ skip_classify: if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; - else - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) @@ -4643,6 +4867,18 @@ out: return ret; } +static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) +{ + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + int ret; + + ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (pt_prev) + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + return ret; +} + /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process @@ -4663,13 +4899,72 @@ int netif_receive_skb_core(struct sk_buff *skb) int ret; rcu_read_lock(); - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); +static inline void __netif_receive_skb_list_ptype(struct list_head *head, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + struct sk_buff *skb, *next; + + if (!pt_prev) + return; + if (list_empty(head)) + return; + if (pt_prev->list_func != NULL) + pt_prev->list_func(head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) +{ + /* Fast-path assumptions: + * - There is no RX handler. + * - Only one packet_type matches. + * If either of these fails, we will end up doing some per-packet + * processing in-line, then handling the 'last ptype' for the whole + * sublist. This can't cause out-of-order delivery to any single ptype, + * because the 'last ptype' must be constant across the sublist, and all + * other ptypes are handled per-packet. + */ + /* Current (common) ptype of sublist */ + struct packet_type *pt_curr = NULL; + /* Current (common) orig_dev of sublist */ + struct net_device *od_curr = NULL; + struct list_head sublist; + struct sk_buff *skb, *next; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *orig_dev = skb->dev; + struct packet_type *pt_prev = NULL; + + list_del(&skb->list); + __netif_receive_skb_core(skb, pfmemalloc, &pt_prev); + if (!pt_prev) + continue; + if (pt_curr != pt_prev || od_curr != orig_dev) { + /* dispatch old sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + pt_curr = pt_prev; + od_curr = orig_dev; + } + list_add_tail(&skb->list, &sublist); + } + + /* dispatch final sublist */ + __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); +} + static int __netif_receive_skb(struct sk_buff *skb) { int ret; @@ -4687,14 +4982,44 @@ static int __netif_receive_skb(struct sk_buff *skb) * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); - ret = __netif_receive_skb_core(skb, true); + ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else - ret = __netif_receive_skb_core(skb, false); + ret = __netif_receive_skb_one_core(skb, false); return ret; } +static void __netif_receive_skb_list(struct list_head *head) +{ + unsigned long noreclaim_flag = 0; + struct sk_buff *skb, *next; + bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ + + list_for_each_entry_safe(skb, next, head, list) { + if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { + struct list_head sublist; + + /* Handle the previous sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + __netif_receive_skb_list_core(&sublist, pfmemalloc); + pfmemalloc = !pfmemalloc; + /* See comments in __netif_receive_skb */ + if (pfmemalloc) + noreclaim_flag = memalloc_noreclaim_save(); + else + memalloc_noreclaim_restore(noreclaim_flag); + } + } + /* Handle the remaining sublist */ + if (!list_empty(head)) + __netif_receive_skb_list_core(head, pfmemalloc); + /* Restore pflags */ + if (pfmemalloc) + memalloc_noreclaim_restore(noreclaim_flag); +} + static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4717,7 +5042,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) break; case XDP_QUERY_PROG: - xdp->prog_attached = !!old; xdp->prog_id = old ? old->aux->id : 0; break; @@ -4769,6 +5093,55 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } +static void netif_receive_skb_list_internal(struct list_head *head) +{ + struct bpf_prog *xdp_prog = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + net_timestamp_check(netdev_tstamp_prequeue, skb); + list_del(&skb->list); + if (!skb_defer_rx_timestamp(skb)) + list_add_tail(&skb->list, &sublist); + } + list_splice_init(&sublist, head); + + if (static_branch_unlikely(&generic_xdp_needed_key)) { + preempt_disable(); + rcu_read_lock(); + list_for_each_entry_safe(skb, next, head, list) { + xdp_prog = rcu_dereference(skb->dev->xdp_prog); + list_del(&skb->list); + if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) + list_add_tail(&skb->list, &sublist); + } + rcu_read_unlock(); + preempt_enable(); + /* Put passed packets back on main list */ + list_splice_init(&sublist, head); + } + + rcu_read_lock(); +#ifdef CONFIG_RPS + if (static_key_false(&rps_needed)) { + list_for_each_entry_safe(skb, next, head, list) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + /* Will be handled, remove from list */ + list_del(&skb->list); + enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + } + } + } +#endif + __netif_receive_skb_list(head); + rcu_read_unlock(); +} + /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process @@ -4792,6 +5165,28 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); +/** + * netif_receive_skb_list - process many receive buffers from network + * @head: list of skbs to process. + * + * Since return value of netif_receive_skb() is normally ignored, and + * wouldn't be meaningful for a list, this function returns void. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + */ +void netif_receive_skb_list(struct list_head *head) +{ + struct sk_buff *skb; + + if (list_empty(head)) + return; + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + netif_receive_skb_list_internal(head); +} +EXPORT_SYMBOL(netif_receive_skb_list); + DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ @@ -4875,42 +5270,50 @@ out: return netif_receive_skb_internal(skb); } -/* napi->gro_list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, + bool flush_old) { - struct sk_buff *skb, *prev = NULL; - - /* scan list and build reverse chain */ - for (skb = napi->gro_list; skb != NULL; skb = skb->next) { - skb->prev = prev; - prev = skb; - } - - for (skb = prev; skb; skb = prev) { - skb->next = NULL; + struct list_head *head = &napi->gro_hash[index].list; + struct sk_buff *skb, *p; + list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - - prev = skb->prev; + list_del(&skb->list); + skb->next = NULL; napi_gro_complete(skb); - napi->gro_count--; + napi->gro_hash[index].count--; } - napi->gro_list = NULL; + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); +} + +/* napi->gro_hash[].list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + u32 i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + if (test_bit(i, &napi->gro_bitmask)) + __napi_gro_flush_chain(napi, i, flush_old); + } } EXPORT_SYMBOL(napi_gro_flush); -static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +static struct list_head *gro_list_prepare(struct napi_struct *napi, + struct sk_buff *skb) { - struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); + struct list_head *head; + struct sk_buff *p; - for (p = napi->gro_list; p; p = p->next) { + head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; + list_for_each_entry(p, head, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; @@ -4933,6 +5336,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } + + return head; } static void skb_gro_reset_offset(struct sk_buff *skb) @@ -4975,20 +5380,41 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) } } +static void gro_flush_oldest(struct list_head *head) +{ + struct sk_buff *oldest; + + oldest = list_last_entry(head, struct sk_buff, list); + + /* We are called with head length >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. + */ + list_del(&oldest->list); + napi_gro_complete(oldest); +} + static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int same_flow; + struct list_head *gro_head; + struct sk_buff *pp = NULL; enum gro_result ret; + int same_flow; int grow; if (netif_elide_gro(skb->dev)) goto normal; - gro_list_prepare(napi, skb); + gro_head = gro_list_prepare(napi, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -5022,7 +5448,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->csum_valid = 0; } - pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); + pp = ptype->callbacks.gro_receive(gro_head, skb); break; } rcu_read_unlock(); @@ -5039,12 +5465,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { - struct sk_buff *nskb = *pp; - - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); - napi->gro_count--; + list_del(&pp->list); + pp->next = NULL; + napi_gro_complete(pp); + napi->gro_hash[hash].count--; } if (same_flow) @@ -5053,26 +5477,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - struct sk_buff *nskb = napi->gro_list; - - /* locate the end of the list to select the 'oldest' flow */ - while (nskb->next) { - pp = &nskb->next; - nskb = *pp; - } - *pp = NULL; - nskb->next = NULL; - napi_gro_complete(nskb); + if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { + gro_flush_oldest(gro_head); } else { - napi->gro_count++; + napi->gro_hash[hash].count++; } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - skb->next = napi->gro_list; - napi->gro_list = skb; + list_add(&skb->list, gro_head); ret = GRO_HELD; pull: @@ -5080,6 +5494,13 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: + if (napi->gro_hash[hash].count) { + if (!test_bit(hash, &napi->gro_bitmask)) + __set_bit(hash, &napi->gro_bitmask); + } else if (test_bit(hash, &napi->gro_bitmask)) { + __clear_bit(hash, &napi->gro_bitmask); + } + return ret; normal: @@ -5478,7 +5899,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_list) { + if (n->gro_bitmask) { unsigned long timeout = 0; if (work_done) @@ -5687,7 +6108,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_list && !napi_disable_pending(napi) && + if (napi->gro_bitmask && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -5697,11 +6118,16 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { + int i; + INIT_LIST_HEAD(&napi->poll_list); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - napi->gro_count = 0; - napi->gro_list = NULL; + napi->gro_bitmask = 0; + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&napi->gro_hash[i].list); + napi->gro_hash[i].count = 0; + } napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -5734,6 +6160,19 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +static void flush_gro_hash(struct napi_struct *napi) +{ + int i; + + for (i = 0; i < GRO_HASH_BUCKETS; i++) { + struct sk_buff *skb, *n; + + list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) + kfree_skb(skb); + napi->gro_hash[i].count = 0; + } +} + /* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { @@ -5743,9 +6182,8 @@ void netif_napi_del(struct napi_struct *napi) list_del_init(&napi->dev_list); napi_free_frags(napi); - kfree_skb_list(napi->gro_list); - napi->gro_list = NULL; - napi->gro_count = 0; + flush_gro_hash(napi); + napi->gro_bitmask = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5787,7 +6225,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (n->gro_list) { + if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ @@ -7276,23 +7714,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - struct netdev_bpf *xdp) +u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + enum bpf_netdev_command cmd) { - memset(xdp, 0, sizeof(*xdp)); - xdp->command = XDP_QUERY_PROG; + struct netdev_bpf xdp; - /* Query must always succeed. */ - WARN_ON(bpf_op(dev, xdp) < 0); -} + if (!bpf_op) + return 0; -static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) -{ - struct netdev_bpf xdp; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = cmd; - __dev_xdp_query(dev, bpf_op, &xdp); + /* Query must always succeed. */ + WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); - return xdp.prog_attached; + return xdp.prog_id; } static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, @@ -7326,12 +7762,19 @@ static void dev_xdp_uninstall(struct net_device *dev) if (!ndo_bpf) return; - __dev_xdp_query(dev, ndo_bpf, &xdp); - if (xdp.prog_attached == XDP_ATTACHED_NONE) - return; + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + WARN_ON(ndo_bpf(dev, &xdp)); + if (xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); - /* Program removal should always succeed */ - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); + /* Remove HW offload */ + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG_HW; + if (!ndo_bpf(dev, &xdp) && xdp.prog_id) + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, + NULL)); } /** @@ -7347,12 +7790,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; + enum bpf_netdev_command query; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); + query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; + bpf_op = bpf_chk = ops->ndo_bpf; if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; @@ -7362,10 +7808,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) + if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || + __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op)) + __dev_xdp_query(dev, bpf_op, query)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, @@ -8834,6 +9281,9 @@ static struct hlist_head * __net_init netdev_create_hash(void) /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { + BUILD_BUG_ON(GRO_HASH_BUCKETS > + 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask)); + if (net != &init_net) INIT_LIST_HEAD(&net->dev_base_head); diff --git a/net/core/devlink.c b/net/core/devlink.c index 22099705cc41..65fc366a78a4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -326,6 +326,57 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, pool_type, p_tc_index); } +struct devlink_region { + struct devlink *devlink; + struct list_head list; + const char *name; + struct list_head snapshot_list; + u32 max_snapshots; + u32 cur_snapshots; + u64 size; +}; + +struct devlink_snapshot { + struct list_head list; + struct devlink_region *region; + devlink_snapshot_data_dest_t *data_destructor; + u64 data_len; + u8 *data; + u32 id; +}; + +static struct devlink_region * +devlink_region_get_by_name(struct devlink *devlink, const char *region_name) +{ + struct devlink_region *region; + + list_for_each_entry(region, &devlink->region_list, list) + if (!strcmp(region->name, region_name)) + return region; + + return NULL; +} + +static struct devlink_snapshot * +devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) +{ + struct devlink_snapshot *snapshot; + + list_for_each_entry(snapshot, ®ion->snapshot_list, list) + if (snapshot->id == id) + return snapshot; + + return NULL; +} + +static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot) +{ + snapshot->region->cur_snapshots--; + list_del(&snapshot->list); + (*snapshot->data_destructor)(snapshot->data); + kfree(snapshot); +} + #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) @@ -2604,6 +2655,919 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static const struct devlink_param devlink_param_generic[] = { + { + .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, + .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, + .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, + .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, + .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, + }, +}; + +static int devlink_param_generic_verify(const struct devlink_param *param) +{ + /* verify it match generic parameter by id and name */ + if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + if (strcmp(param->name, devlink_param_generic[param->id].name)) + return -ENOENT; + + WARN_ON(param->type != devlink_param_generic[param->id].type); + + return 0; +} + +static int devlink_param_driver_verify(const struct devlink_param *param) +{ + int i; + + if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + /* verify no such name in generic params */ + for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) + if (!strcmp(param->name, devlink_param_generic[i].name)) + return -EEXIST; + + return 0; +} + +static struct devlink_param_item * +devlink_param_find_by_name(struct list_head *param_list, + const char *param_name) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (!strcmp(param_item->param->name, param_name)) + return param_item; + return NULL; +} + +static struct devlink_param_item * +devlink_param_find_by_id(struct list_head *param_list, u32 param_id) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (param_item->param->id == param_id) + return param_item; + return NULL; +} + +static bool +devlink_param_cmode_is_supported(const struct devlink_param *param, + enum devlink_param_cmode cmode) +{ + return test_bit(cmode, ¶m->supported_cmodes); +} + +static int devlink_param_get(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->get) + return -EOPNOTSUPP; + return param->get(devlink, param->id, ctx); +} + +static int devlink_param_set(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->set) + return -EOPNOTSUPP; + return param->set(devlink, param->id, ctx); +} + +static int +devlink_param_type_to_nla_type(enum devlink_param_type param_type) +{ + switch (param_type) { + case DEVLINK_PARAM_TYPE_U8: + return NLA_U8; + case DEVLINK_PARAM_TYPE_U16: + return NLA_U16; + case DEVLINK_PARAM_TYPE_U32: + return NLA_U32; + case DEVLINK_PARAM_TYPE_STRING: + return NLA_STRING; + case DEVLINK_PARAM_TYPE_BOOL: + return NLA_FLAG; + default: + return -EINVAL; + } +} + +static int +devlink_nl_param_value_fill_one(struct sk_buff *msg, + enum devlink_param_type type, + enum devlink_param_cmode cmode, + union devlink_param_value val) +{ + struct nlattr *param_value_attr; + + param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE); + if (!param_value_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) + goto value_nest_cancel; + + switch (type) { + case DEVLINK_PARAM_TYPE_U8: + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U16: + if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_U32: + if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, + val.vstr)) + goto value_nest_cancel; + break; + case DEVLINK_PARAM_TYPE_BOOL: + if (val.vbool && + nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) + goto value_nest_cancel; + break; + } + + nla_nest_end(msg, param_value_attr); + return 0; + +value_nest_cancel: + nla_nest_cancel(msg, param_value_attr); +nla_put_failure: + return -EMSGSIZE; +} + +static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_param_item *param_item, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; + const struct devlink_param *param = param_item->param; + struct devlink_param_gset_ctx ctx; + struct nlattr *param_values_list; + struct nlattr *param_attr; + int nla_type; + void *hdr; + int err; + int i; + + /* Get value from driver part to driverinit configuration mode */ + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if (!devlink_param_cmode_is_supported(param, i)) + continue; + if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { + if (!param_item->driverinit_value_valid) + return -EOPNOTSUPP; + param_value[i] = param_item->driverinit_value; + } else { + ctx.cmode = i; + err = devlink_param_get(devlink, param, &ctx); + if (err) + return err; + param_value[i] = ctx.val; + } + } + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM); + if (!param_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) + goto param_nest_cancel; + if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) + goto param_nest_cancel; + + nla_type = devlink_param_type_to_nla_type(param->type); + if (nla_type < 0) + goto param_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) + goto param_nest_cancel; + + param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST); + if (!param_values_list) + goto param_nest_cancel; + + for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { + if (!devlink_param_cmode_is_supported(param, i)) + continue; + err = devlink_nl_param_value_fill_one(msg, param->type, + i, param_value[i]); + if (err) + goto values_list_nest_cancel; + } + + nla_nest_end(msg, param_values_list); + nla_nest_end(msg, param_attr); + genlmsg_end(msg, hdr); + return 0; + +values_list_nest_cancel: + nla_nest_end(msg, param_values_list); +param_nest_cancel: + nla_nest_cancel(msg, param_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_param_notify(struct devlink *devlink, + struct devlink_param_item *param_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_param_item *param_item; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(param_item, &devlink->param_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_param_fill(msg, devlink, param_item, + DEVLINK_CMD_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int +devlink_param_type_get_from_info(struct genl_info *info, + enum devlink_param_type *param_type) +{ + if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE]) + return -EINVAL; + + switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { + case NLA_U8: + *param_type = DEVLINK_PARAM_TYPE_U8; + break; + case NLA_U16: + *param_type = DEVLINK_PARAM_TYPE_U16; + break; + case NLA_U32: + *param_type = DEVLINK_PARAM_TYPE_U32; + break; + case NLA_STRING: + *param_type = DEVLINK_PARAM_TYPE_STRING; + break; + case NLA_FLAG: + *param_type = DEVLINK_PARAM_TYPE_BOOL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +devlink_param_value_get_from_info(const struct devlink_param *param, + struct genl_info *info, + union devlink_param_value *value) +{ + if (param->type != DEVLINK_PARAM_TYPE_BOOL && + !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) + return -EINVAL; + + switch (param->type) { + case DEVLINK_PARAM_TYPE_U8: + value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U16: + value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U32: + value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) > + DEVLINK_PARAM_MAX_STRING_VALUE) + return -EINVAL; + value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_BOOL: + value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? + true : false; + break; + } + return 0; +} + +static struct devlink_param_item * +devlink_param_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + char *param_name; + + if (!info->attrs[DEVLINK_ATTR_PARAM_NAME]) + return NULL; + + param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); + return devlink_param_find_by_name(&devlink->param_list, param_name); +} + +static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_param_item *param_item; + struct sk_buff *msg; + int err; + + param_item = devlink_param_get_from_info(devlink, info); + if (!param_item) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_param_fill(msg, devlink, param_item, + DEVLINK_CMD_PARAM_GET, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + enum devlink_param_type param_type; + struct devlink_param_gset_ctx ctx; + enum devlink_param_cmode cmode; + struct devlink_param_item *param_item; + const struct devlink_param *param; + union devlink_param_value value; + int err = 0; + + param_item = devlink_param_get_from_info(devlink, info); + if (!param_item) + return -EINVAL; + param = param_item->param; + err = devlink_param_type_get_from_info(info, ¶m_type); + if (err) + return err; + if (param_type != param->type) + return -EINVAL; + err = devlink_param_value_get_from_info(param, info, &value); + if (err) + return err; + if (param->validate) { + err = param->validate(devlink, param->id, value, info->extack); + if (err) + return err; + } + + if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]) + return -EINVAL; + cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); + if (!devlink_param_cmode_is_supported(param, cmode)) + return -EOPNOTSUPP; + + if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { + param_item->driverinit_value = value; + param_item->driverinit_value_valid = true; + } else { + if (!param->set) + return -EOPNOTSUPP; + ctx.val = value; + ctx.cmode = cmode; + err = devlink_param_set(devlink, param, &ctx); + if (err) + return err; + } + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; +} + +static int devlink_param_register_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + if (devlink_param_find_by_name(&devlink->param_list, + param->name)) + return -EEXIST; + + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); + + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + if (!param_item) + return -ENOMEM; + param_item->param = param; + + list_add_tail(¶m_item->list, &devlink->param_list); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; +} + +static void devlink_param_unregister_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_name(&devlink->param_list, + param->name); + WARN_ON(!param_item); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL); + list_del(¶m_item->list); + kfree(param_item); +} + +static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_snapshot *snapshot) +{ + struct nlattr *snap_attr; + int err; + + snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT); + if (!snap_attr) + return -EINVAL; + + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, snap_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, snap_attr); + return err; +} + +static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_region *region) +{ + struct devlink_snapshot *snapshot; + struct nlattr *snapshots_attr; + int err; + + snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); + if (!snapshots_attr) + return -EINVAL; + + list_for_each_entry(snapshot, ®ion->snapshot_list, list) { + err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); + if (err) + goto nla_put_failure; + } + + nla_nest_end(msg, snapshots_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, snapshots_attr); + return err; +} + +static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct devlink_region *region) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto nla_put_failure; + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name); + if (err) + goto nla_put_failure; + + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + err = devlink_nl_region_snapshots_id_put(msg, devlink, region); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return err; +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + void *hdr; + int err; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto out_cancel_msg; + + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, + region->name); + if (err) + goto out_cancel_msg; + + if (snapshot) { + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, + snapshot->id); + if (err) + goto out_cancel_msg; + } else { + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, + region->size, DEVLINK_ATTR_PAD); + if (err) + goto out_cancel_msg; + } + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + + return; + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); +} + +static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_region *region; + const char *region_name; + struct sk_buff *msg; + int err; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) + return -EINVAL; + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, + info->snd_portid, info->snd_seq, 0, + region); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_region *region; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + + mutex_lock(&devlink->lock); + list_for_each_entry(region, &devlink->region_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_region_fill(msg, devlink, + DEVLINK_CMD_REGION_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, region); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_region_del(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct devlink_region *region; + const char *region_name; + u32 snapshot_id; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME] || + !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + return -EINVAL; + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + return -EINVAL; + + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) + return -EINVAL; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); + devlink_region_snapshot_del(snapshot); + return 0; +} + +static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, + struct devlink *devlink, + u8 *chunk, u32 chunk_size, + u64 addr) +{ + struct nlattr *chunk_attr; + int err; + + chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK); + if (!chunk_attr) + return -EINVAL; + + err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); + if (err) + goto nla_put_failure; + + err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr, + DEVLINK_ATTR_PAD); + if (err) + goto nla_put_failure; + + nla_nest_end(msg, chunk_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, chunk_attr); + return err; +} + +#define DEVLINK_REGION_READ_CHUNK_SIZE 256 + +static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, + struct devlink *devlink, + struct devlink_region *region, + struct nlattr **attrs, + u64 start_offset, + u64 end_offset, + bool dump, + u64 *new_offset) +{ + struct devlink_snapshot *snapshot; + u64 curr_offset = start_offset; + u32 snapshot_id; + int err = 0; + + *new_offset = start_offset; + + snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) + return -EINVAL; + + if (end_offset > snapshot->data_len || dump) + end_offset = snapshot->data_len; + + while (curr_offset < end_offset) { + u32 data_size; + u8 *data; + + if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE) + data_size = end_offset - curr_offset; + else + data_size = DEVLINK_REGION_READ_CHUNK_SIZE; + + data = &snapshot->data[curr_offset]; + err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink, + data, data_size, + curr_offset); + if (err) + break; + + curr_offset += data_size; + } + *new_offset = curr_offset; + + return err; +} + +static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + u64 ret_offset, start_offset, end_offset = 0; + struct nlattr *attrs[DEVLINK_ATTR_MAX + 1]; + const struct genl_ops *ops = cb->data; + struct devlink_region *region; + struct nlattr *chunks_attr; + const char *region_name; + struct devlink *devlink; + bool dump = true; + void *hdr; + int err; + + start_offset = *((u64 *)&cb->args[0]); + + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, + attrs, DEVLINK_ATTR_MAX, ops->policy, NULL); + if (err) + goto out; + + devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); + if (IS_ERR(devlink)) + goto out; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); + + if (!attrs[DEVLINK_ATTR_REGION_NAME] || + !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + goto out_unlock; + + region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) + goto out_unlock; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, + DEVLINK_CMD_REGION_READ); + if (!hdr) + goto out_unlock; + + err = devlink_nl_put_handle(skb, devlink); + if (err) + goto nla_put_failure; + + err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); + if (err) + goto nla_put_failure; + + chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS); + if (!chunks_attr) + goto nla_put_failure; + + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + dump = false; + } + + err = devlink_nl_region_read_snapshot_fill(skb, devlink, + region, attrs, + start_offset, + end_offset, dump, + &ret_offset); + + if (err && err != -EMSGSIZE) + goto nla_put_failure; + + /* Check if there was any progress done to prevent infinite loop */ + if (ret_offset == start_offset) + goto nla_put_failure; + + *((u64 *)&cb->args[0]) = ret_offset; + + nla_nest_end(skb, chunks_attr); + genlmsg_end(skb, hdr); + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); + + return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); +out_unlock: + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); +out: + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2624,6 +3588,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -2807,6 +3776,43 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_PARAM_GET, + .doit = devlink_nl_cmd_param_get_doit, + .dumpit = devlink_nl_cmd_param_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_PARAM_SET, + .doit = devlink_nl_cmd_param_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_REGION_GET, + .doit = devlink_nl_cmd_region_get_doit, + .dumpit = devlink_nl_cmd_region_get_dumpit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_REGION_DEL, + .doit = devlink_nl_cmd_region_del, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_REGION_READ, + .dumpit = devlink_nl_cmd_region_read_dumpit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -2845,6 +3851,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); + INIT_LIST_HEAD(&devlink->param_list); + INIT_LIST_HEAD(&devlink->region_list); mutex_init(&devlink->lock); return devlink; } @@ -3434,6 +4442,320 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); +/** + * devlink_params_register - register configuration parameters + * + * @devlink: devlink + * @params: configuration parameters array + * @params_count: number of parameters provided + * + * Register the configuration parameters supported by the driver. + */ +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + int err; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) { + if (!param || !param->name || !param->supported_cmodes) { + err = -EINVAL; + goto rollback; + } + if (param->generic) { + err = devlink_param_generic_verify(param); + if (err) + goto rollback; + } else { + err = devlink_param_driver_verify(param); + if (err) + goto rollback; + } + err = devlink_param_register_one(devlink, param); + if (err) + goto rollback; + } + + mutex_unlock(&devlink->lock); + return 0; + +rollback: + if (!i) + goto unlock; + for (param--; i > 0; i--, param--) + devlink_param_unregister_one(devlink, param); +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_params_register); + +/** + * devlink_params_unregister - unregister configuration parameters + * @devlink: devlink + * @params: configuration parameters to unregister + * @params_count: number of parameters provided + */ +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) + devlink_param_unregister_one(devlink, param); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_params_unregister); + +/** + * devlink_param_driverinit_value_get - get configuration parameter + * value for driver initializing + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter in driverinit configuration mode + * + * This function should be used by the driver to get driverinit + * configuration for initialization after reload command. + */ +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) +{ + struct devlink_param_item *param_item; + + if (!devlink->ops || !devlink->ops->reload) + return -EOPNOTSUPP; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!param_item->driverinit_value_valid || + !devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + *init_val = param_item->driverinit_value; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); + +/** + * devlink_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter to set for driverinit configuration mode + * + * This function should be used by the driver to set driverinit + * configuration mode default value. + */ +int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + param_item->driverinit_value = init_val; + param_item->driverinit_value_valid = true; + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); + +/** + * devlink_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. + * + * @devlink: devlink + * @param_id: parameter ID + * + * This function should be used by the driver to notify devlink on value + * change, excluding driverinit configuration mode. + * For driverinit configuration mode driver should use the function + * devlink_param_driverinit_value_set() instead. + */ +void devlink_param_value_changed(struct devlink *devlink, u32 param_id) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + WARN_ON(!param_item); + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_param_value_changed); + +/** + * devlink_region_create - create a new address region + * + * @devlink: devlink + * @region_name: region name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + */ +struct devlink_region *devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size) +{ + struct devlink_region *region; + int err = 0; + + mutex_lock(&devlink->lock); + + if (devlink_region_get_by_name(devlink, region_name)) { + err = -EEXIST; + goto unlock; + } + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + err = -ENOMEM; + goto unlock; + } + + region->devlink = devlink; + region->max_snapshots = region_max_snapshots; + region->name = region_name; + region->size = region_size; + INIT_LIST_HEAD(®ion->snapshot_list); + list_add_tail(®ion->list, &devlink->region_list); + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); + + mutex_unlock(&devlink->lock); + return region; + +unlock: + mutex_unlock(&devlink->lock); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(devlink_region_create); + +/** + * devlink_region_destroy - destroy address region + * + * @region: devlink region to destroy + */ +void devlink_region_destroy(struct devlink_region *region) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot, *ts; + + mutex_lock(&devlink->lock); + + /* Free all snapshots of region */ + list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) + devlink_region_snapshot_del(snapshot); + + list_del(®ion->list); + + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); + mutex_unlock(&devlink->lock); + kfree(region); +} +EXPORT_SYMBOL_GPL(devlink_region_destroy); + +/** + * devlink_region_shapshot_id_get - get snapshot ID + * + * This callback should be called when adding a new snapshot, + * Driver should use the same id for multiple snapshots taken + * on multiple regions at the same time/by the same trigger. + * + * @devlink: devlink + */ +u32 devlink_region_shapshot_id_get(struct devlink *devlink) +{ + u32 id; + + mutex_lock(&devlink->lock); + id = ++devlink->snapshot_id; + mutex_unlock(&devlink->lock); + + return id; +} +EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); + +/** + * devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function. + * + * @devlink_region: devlink region of the snapshot + * @data_len: size of snapshot data + * @data: snapshot data + * @snapshot_id: snapshot id to be created + * @data_destructor: pointer to destructor function to free data + */ +int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot; + int err; + + mutex_lock(&devlink->lock); + + /* check if region can hold one more snapshot */ + if (region->cur_snapshots == region->max_snapshots) { + err = -ENOMEM; + goto unlock; + } + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + err = -EEXIST; + goto unlock; + } + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) { + err = -ENOMEM; + goto unlock; + } + + snapshot->id = snapshot_id; + snapshot->region = region; + snapshot->data = data; + snapshot->data_len = data_len; + snapshot->data_destructor = data_destructor; + + list_add_tail(&snapshot->list, ®ion->snapshot_list); + + region->cur_snapshots++; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); + mutex_unlock(&devlink->lock); + return 0; + +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e677a20180cf..c9993c6c2fd4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", + [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", }; static const char diff --git a/net/core/filter.c b/net/core/filter.c index 06da770f543f..104d560946da 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3679,7 +3679,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size); + ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); return 0; } @@ -4751,6 +4751,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* else: fall through */ default: return NULL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 53f96e4f7bf5..08a5184f4b34 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -152,7 +152,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ENC_PORTS)) + FLOW_DISSECTOR_KEY_ENC_PORTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP)) return; info = skb_tunnel_info(skb); @@ -212,6 +214,16 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->src = key->tp_src; tp->dst = key->tp_dst; } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_dissector_key_ip *ip; + + ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP, + target_container); + ip->tos = key->tos; + ip->ttl = key->ttl; + } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); @@ -589,7 +601,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; - bool skip_vlan = false; + enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -748,14 +760,14 @@ proto_again: } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { - const struct vlan_hdr *vlan; + const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; - bool vlan_tag_present = skb && skb_vlan_tag_present(skb); + __be16 saved_vlan_tpid = proto; - if (vlan_tag_present) + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && + skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; - - if (!vlan_tag_present || eth_type_vlan(skb->protocol)) { + } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { @@ -765,20 +777,23 @@ proto_again: proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); - if (skip_vlan) { - fdret = FLOW_DISSECT_RET_PROTO_AGAIN; - break; - } } - skip_vlan = true; - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN)) { + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { + dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; + } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { + dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; + } else { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + + if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN, + dissector_vlan, target_container); - if (vlan_tag_present) { + if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); @@ -789,6 +804,7 @@ proto_again: (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } + key_vlan->vlan_tpid = saved_vlan_tpid; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8e3fda9e725c..cbe85d8d4cc2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->nud_state = new; err = 0; notify = old & NUD_VALID; - if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + if (((old & (NUD_INCOMPLETE | NUD_PROBE)) || + (flags & NEIGH_UPDATE_F_ADMIN)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb7e80f4ced3..ffa1d18f2c2c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1047,13 +1047,30 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int index = get_netdev_queue_index(queue); - int tc = netdev_txq_to_tc(dev, index); + int index; + int tc; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + index = get_netdev_queue_index(queue); + + /* If queue belongs to subordinate dev use its TC mapping */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; - return sprintf(buf, "%u\n", tc); + /* We can report the traffic class one of two ways: + * Subordinate device traffic classes are reported with the traffic + * class first, and then the subordinate class so for example TC0 on + * subordinate device 2 will be reported as "0-2". If the queue + * belongs to the root device it will be reported with just the + * traffic class, so just "0" for TC 0 for example. + */ + return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) : + sprintf(buf, "%u\n", tc); } #ifdef CONFIG_XPS @@ -1214,10 +1231,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, cpumask_var_t mask; unsigned long index; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + index = get_netdev_queue_index(queue); if (dev->num_tc) { + /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; + if (num_tc < 0) + return -EINVAL; + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; @@ -1227,13 +1254,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, return -ENOMEM; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); + dev_maps = rcu_dereference(dev->xps_cpus_map); if (dev_maps) { for_each_possible_cpu(cpu) { int i, tci = cpu * num_tc + tc; struct xps_map *map; - map = rcu_dereference(dev_maps->cpu_map[tci]); + map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; @@ -1260,6 +1287,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, cpumask_var_t mask; int err; + if (!netif_is_multiqueue(dev)) + return -ENOENT; + if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1283,6 +1313,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); + +static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + unsigned long *mask, index; + int j, len, num_tc = 1, tc = 0; + + index = get_netdev_queue_index(queue); + + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + if (!dev_maps) + goto out_no_maps; + + for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), + j < dev->num_rx_queues;) { + int i, tci = j * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; + } + } + } +out_no_maps: + rcu_read_unlock(); + + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + kfree(mask); + + return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, + size_t len) +{ + struct net_device *dev = queue->dev; + struct net *net = dev_net(dev); + unsigned long *mask, index; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), + GFP_KERNEL); + if (!mask) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, mask, dev->num_rx_queues); + if (err) { + kfree(mask); + return err; + } + + err = __netif_set_xps_queue(dev, mask, index, true); + kfree(mask); + return err ? : len; +} + +static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init + = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { @@ -1290,6 +1402,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, + &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 49368e21d228..308ed04984de 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1265,7 +1265,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strncpy(pkt_dev->dst_min, buf, len); + strcpy(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } @@ -1280,14 +1280,12 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) return len; - if (copy_from_user(buf, &user_buffer[i], len)) return -EFAULT; - buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strncpy(pkt_dev->dst_max, buf, len); + strcpy(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } @@ -1396,7 +1394,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strncpy(pkt_dev->src_min, buf, len); + strcpy(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } @@ -1416,7 +1414,7 @@ static ssize_t pktgen_if_write(struct file *file, buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strncpy(pkt_dev->src_max, buf, len); + strcpy(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5ef61222fdef..92b6fa5d5f6e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -964,7 +964,8 @@ static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4); /* XDP_PROG_ID */ + nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ + nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } @@ -1353,27 +1354,51 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } -static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) +static u32 rtnl_xdp_prog_skb(struct net_device *dev) { - const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; - struct netdev_bpf xdp; ASSERT_RTNL(); - *prog_id = 0; generic_xdp_prog = rtnl_dereference(dev->xdp_prog); - if (generic_xdp_prog) { - *prog_id = generic_xdp_prog->aux->id; - return XDP_ATTACHED_SKB; - } - if (!ops->ndo_bpf) - return XDP_ATTACHED_NONE; + if (!generic_xdp_prog) + return 0; + return generic_xdp_prog->aux->id; +} + +static u32 rtnl_xdp_prog_drv(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); +} + +static u32 rtnl_xdp_prog_hw(struct net_device *dev) +{ + return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, + XDP_QUERY_PROG_HW); +} + +static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, + u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, + u32 (*get_prog_id)(struct net_device *dev)) +{ + u32 curr_id; + int err; + + curr_id = get_prog_id(dev); + if (!curr_id) + return 0; - __dev_xdp_query(dev, ops->ndo_bpf, &xdp); - *prog_id = xdp.prog_id; + *prog_id = curr_id; + err = nla_put_u32(skb, attr, curr_id); + if (err) + return err; + + if (*mode != XDP_ATTACHED_NONE) + *mode = XDP_ATTACHED_MULTI; + else + *mode = tgt_mode; - return xdp.prog_attached; + return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) @@ -1381,17 +1406,32 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) struct nlattr *xdp; u32 prog_id; int err; + u8 mode; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, - rtnl_xdp_attached_mode(dev, &prog_id)); + prog_id = 0; + mode = XDP_ATTACHED_NONE; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, + IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, + IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); + if (err) + goto err_cancel; + err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, + IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); + if (err) + goto err_cancel; + + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; - if (prog_id) { + if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; @@ -2759,9 +2799,12 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) return err; } - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - - __dev_notify_flags(dev, old_flags, ~0U); + if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { + __dev_notify_flags(dev, old_flags, 0U); + } else { + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + __dev_notify_flags(dev, old_flags, ~0U); + } return 0; } EXPORT_SYMBOL(rtnl_configure_link); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8e51f8555e11..0c1a00672ba9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3816,14 +3816,14 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); - struct sk_buff *lp, *p = *head; unsigned int delta_truesize; + struct sk_buff *lp; if (unlikely(p->len + len >= 65536)) return -E2BIG; @@ -4899,7 +4899,6 @@ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; @@ -4912,8 +4911,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) return; ipvs_reset(skb); - skb_orphan(skb); skb->mark = 0; + skb->tstamp = 0; } EXPORT_SYMBOL_GPL(skb_scrub_packet); diff --git a/net/core/sock.c b/net/core/sock.c index 9e8f65585b81..03fdea5b0f57 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -91,6 +91,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> @@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop); int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { + struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; int valbool; @@ -1070,6 +1072,26 @@ set_rcvbuf: } break; + case SO_TXTIME: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + } else if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + } else if (copy_from_user(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + } else { + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + } + break; + default: ret = -ENOPROTOOPT; break; @@ -1115,6 +1137,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, u64 val64; struct linger ling; struct timeval tm; + struct sock_txtime txtime; } v; int lv = sizeof(int); @@ -1403,6 +1426,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_ZEROCOPY); break; + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -2137,6 +2169,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: @@ -2401,9 +2440,10 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; long allocated = sk_memory_allocated_add(sk, amt); + bool charged = true; if (mem_cgroup_sockets_enabled && sk->sk_memcg && - !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) + !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) goto suppress_allocation; /* Under limit. */ @@ -2461,7 +2501,8 @@ suppress_allocation: return 1; } - trace_sock_exceed_buf_limit(sk, prot, allocated); + if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); @@ -2818,6 +2859,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0U; sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; + + sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/core/xdp.c b/net/core/xdp.c index 9d1f22072d5d..57285383ed00 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -3,8 +3,11 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. * Released under terms in GPL version 2. See COPYING. */ +#include <linux/bpf.h> +#include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> +#include <linux/netdevice.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> @@ -45,8 +48,8 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) != sizeof(u32)); - /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ - return key << RHT_HASH_RESERVED_SPACE; + /* Use cyclic increasing ID as direct hash key */ + return key; } static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, @@ -370,3 +373,34 @@ void xdp_return_buff(struct xdp_buff *xdp) __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); } EXPORT_SYMBOL_GPL(xdp_return_buff); + +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + bpf->prog_id = info->prog ? info->prog->aux->id : 0; + bpf->prog_flags = info->prog ? info->flags : 0; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_attachment_query); + +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { + NL_SET_ERR_MSG(bpf->extack, + "program loaded with different flags"); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); + +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog) + bpf_prog_put(info->prog); + info->prog = bpf->prog; + info->flags = bpf->flags; +} +EXPORT_SYMBOL_GPL(xdp_attachment_setup); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 1b2120645730..34aba55ed573 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -491,6 +491,7 @@ static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb) break; case DN_RUN: sk->sk_shutdown |= SHUTDOWN_MASK; + /* fall through */ case DN_CC: scp->state = DN_CN; } diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dc5d9af3dc80..a1917025e155 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) if (!ds) return NULL; + /* We avoid allocating memory outside dsa_switch + * if it is not needed. + */ + if (n <= sizeof(ds->_bitmap) * 8) { + ds->bitmap = &ds->_bitmap; + } else { + ds->bitmap = devm_kcalloc(dev, + BITS_TO_LONGS(n), + sizeof(unsigned long), + GFP_KERNEL); + if (unlikely(!ds->bitmap)) + return NULL; + } + ds->dev = dev; ds->num_ports = n; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1e3b6a6d8a40..71536c435132 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -900,7 +900,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: - return tcf_block_cb_register(f->block, cb, dev, dev); + return tcf_block_cb_register(f->block, cb, dev, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cb, dev); return 0; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index b93511726069..142b294d3446 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, { const struct switchdev_obj_port_mdb *mdb = info->mdb; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(group, ds->num_ports); int port; /* Build a mask of Multicast group members */ - bitmap_zero(group, ds->num_ports); + bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, group); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_dsa_port(ds, port)) - set_bit(port, group); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_mdb_prepare_bitmap(ds, mdb, group); + return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap); - dsa_switch_mdb_add_bitmap(ds, mdb, group); + dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap); return 0; } @@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, { const struct switchdev_obj_port_vlan *vlan = info->vlan; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(members, ds->num_ports); int port; /* Build a mask of VLAN members */ - bitmap_zero(members, ds->num_ports); + bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, members); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) - set_bit(port, members); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_vlan_prepare_bitmap(ds, vlan, members); + return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap); - dsa_switch_vlan_add_bitmap(ds, vlan, members); + dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap); return 0; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index ee28440f57c5..fd8faa0dfa61 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) } EXPORT_SYMBOL(sysfs_format_mac); -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_eth; const struct packet_offload *ptype; + unsigned int hlen, off_eth; + struct sk_buff *pp = NULL; + struct ethhdr *eh, *eh2; + struct sk_buff *p; __be16 type; int flush = 1; @@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b403499fdabe..f2a0a3bab6b5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog) err = inet_csk_listen_start(sk, backlog); if (err) goto out; + tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } sk->sk_max_ack_backlog = backlog; err = 0; @@ -1384,12 +1385,12 @@ out: } EXPORT_SYMBOL(inet_gso_segment); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; + struct sk_buff *pp = NULL; const struct iphdr *iph; + struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; @@ -1425,7 +1426,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; @@ -1505,8 +1506,8 @@ out: } EXPORT_SYMBOL(inet_gro_receive); -static struct sk_buff **ipip_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipip_gro_receive(struct list_head *head, + struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; @@ -1882,6 +1883,7 @@ fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 7cf755ef9efb..bbeecd13e534 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -28,8 +28,8 @@ #include <linux/spinlock.h> #include <net/udp.h> -static struct sk_buff **esp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index c9ec1603666b..500a59906b87 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -224,14 +224,14 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *fou_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - const struct net_offload *ops; - struct sk_buff **pp = NULL; u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *pp = NULL; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gue_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; @@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, skb_gro_pull(skb, hdrlen); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 6a7d980105f6..6c63524f598a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -108,10 +108,10 @@ out: return segs; } -static struct sk_buff **gre_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gre_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; @@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..695979b7ef6d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -429,14 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; + ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -710,11 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; + ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 1e4cf3ab560f..d3162baca9f1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -20,6 +20,7 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2d8efeecf619..c8ca5d8f0f75 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto err_free_rt; md = ip_tunnel_info_opts(tun_info); if (!md) goto err_free_rt; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..3196cf58f418 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,7 +307,8 @@ drop: return true; } -static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static int ip_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); @@ -315,13 +316,6 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) struct rtable *rt; int err; - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -393,7 +387,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - return dst_input(skb); + return NET_RX_SUCCESS; drop: kfree_skb(skb); @@ -405,13 +399,29 @@ drop_error: goto drop; } +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + + ret = ip_rcv_finish_core(net, sk, skb); + if (ret != NET_RX_DROP) + ret = dst_input(skb); + return ret; +} + /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +431,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +498,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +507,113 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + list_del(&skb->list); + /* Handle ip{6}_forward case, as sch_direct_xmit have + * another kind of SKB-list usage (see validate_xmit_skb_list) + */ + skb->next = NULL; + dst_input(skb); + } +} + +static void ip_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + continue; + if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + continue; + + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv_finish(&sublist); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + ip_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv(&sublist, curr_dev, curr_net); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b3308e9d9762..e2b6bd478afb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) } /* Note: skb->sk can be different from sk, in case of tunnels */ -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), + RT_CONN_FLAGS_TOS(sk, tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -478,7 +479,7 @@ packet_routed: skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); - *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else @@ -511,7 +512,7 @@ no_route: kfree_skb(skb); return -EHOSTUNREACH; } -EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(__ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { @@ -1145,14 +1146,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->fragsize = ip_sk_use_pmtu(sk) ? dst_mtu(&rt->dst) : rt->dst.dev->mtu; - cork->gso_size = sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0; + cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->priority = ipc->priority; - cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; + cork->tx_flags = 0; + sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); return 0; } @@ -1413,6 +1415,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1545,11 +1548,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; + ipcm_init(&ipc); ipc.addr = daddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9f79b9803a16..5660adcf7a04 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -60,6 +60,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> +#include <linux/rhashtable.h> #include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> @@ -1051,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *skb; int ret; - if (assert == IGMPMSG_WHOLEPKT) + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); else skb = alloc_skb(128, GFP_ATOMIC); @@ -1059,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt, if (!skb) return -ENOBUFS; - if (assert == IGMPMSG_WHOLEPKT) { + if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) { /* Ugly, but we have no choice with this interface. * Duplicate old header, fix ihl, length etc. * And all this only to mangle msg->im_msgtype and @@ -1070,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt, skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); - msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_msgtype = assert; msg->im_mbz = 0; - msg->im_vif = mrt->mroute_reg_vif_num; + if (assert == IGMPMSG_WRVIFWHOLE) + msg->im_vif = vifi; + else + msg->im_vif = mrt->mroute_reg_vif_num; ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); @@ -1371,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; + bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ @@ -1501,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, break; } + do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; + mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: @@ -1982,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); + if (mrt->mroute_do_wrvifwhole) + ipmr_cache_report(mrt, skb, true_vifi, + IGMPMSG_WRVIFWHOLE); } goto dont_forward; } @@ -2658,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) mrt->mroute_reg_vif_num) || nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) || - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, + mrt->mroute_do_wrvifwhole)) return false; return true; diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index cafb0506c8c9..1ad9aa62a97b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -2,6 +2,7 @@ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation */ +#include <linux/rhashtable.h> #include <linux/mroute_base.h> /* Sets everything common except 'dev', since that is done under locking */ diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 4388de0e5380..1e6f28c97d3a 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct nf_log_buf *m, +static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ nf_log_buf_add(m, "["); - dump_ipv4_packet(m, info, skb, + dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); nf_log_buf_add(m, "] "); } @@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv4_mac_header(m, loginfo, skb); - dump_ipv4_packet(m, loginfo, skb, 0); + dump_ipv4_packet(net, m, loginfo, skb, 0); nf_log_buf_close(m); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2ed64bca54e3..b54c964ad925 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -739,13 +739,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.oif = sk->sk_bound_dev_if; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -769,8 +763,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 77350c1256ce..b46e4cf9a55a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -287,6 +287,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), + SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP), + SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index abb3c9490c55..33df4d76db2d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -561,13 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -670,8 +665,6 @@ back_from_confirm: &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4491faf83f4f..bce53b1728a6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -817,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, * This occurs when user tries to read * from never connected socket. */ - if (!sock_flag(sk, SOCK_DONE)) - ret = -ENOTCONN; + ret = -ENOTCONN; break; } if (!timeo) { @@ -1241,7 +1240,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { @@ -1275,9 +1274,6 @@ restart: int linear; new_segment: - /* Allocate new segment. If the interface is SG, - * allocate skb fitting to single page. - */ if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -2042,13 +2038,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; if (sk->sk_state == TCP_CLOSE) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - copied = -ENOTCONN; - break; - } + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; break; } @@ -2576,6 +2569,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; + tp->rcv_rtt_last_tsecr = 0; tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) tp->write_seq = 1; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 58e2f479ffb4..3b5f45b9e81e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -205,7 +205,11 @@ static u32 bbr_bw(const struct sock *sk) */ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { - rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + unsigned int mss = tcp_sk(sk)->mss_cache; + + if (!tcp_needs_internal_pacing(sk)) + mss = tcp_mss_to_mtu(sk, mss); + rate *= mss; rate *= gain; rate >>= BBR_SCALE; rate *= USEC_PER_SEC; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8e5522c6833a..91dbb9afb950 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/static_key.h> +#include <net/busy_poll.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -582,9 +583,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->rx_opt.rcv_tsecr && - (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { + if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) + return; + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; + + if (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; @@ -3458,7 +3462,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -4339,6 +4343,11 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; +#ifdef CONFIG_TLS_DEVICE + if (from->decrypted != to->decrypted) + return false; +#endif + if (!skb_try_coalesce(to, from, fragstolen, &delta)) return false; @@ -4617,8 +4626,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) skb->data_len = data_len; skb->len = size; - if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto err_free; + } err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); if (err) @@ -4674,18 +4685,21 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (tcp_receive_window(tp) == 0) + if (tcp_receive_window(tp) == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } /* Ok. In sequence. In window. */ queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); if (skb->len) tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -4741,8 +4755,10 @@ drop: /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. */ - if (!tcp_receive_window(tp)) + if (!tcp_receive_window(tp)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; + } goto queue_and_out; } @@ -4860,6 +4876,9 @@ restart: break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); +#ifdef CONFIG_TLS_DEVICE + nskb->decrypted = skb->decrypted; +#endif TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); @@ -4887,6 +4906,10 @@ restart: skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; +#ifdef CONFIG_TLS_DEVICE + if (skb->decrypted != nskb->decrypted) + goto end; +#endif } } } @@ -5484,6 +5507,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_ack(sk, skb, 0); __kfree_skb(skb); tcp_data_snd_check(sk); + /* When receiving pure ack in fast path, update + * last ts ecr directly instead of calling + * tcp_rcv_rtt_measure_ts() + */ + tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); @@ -5585,6 +5613,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); @@ -6413,6 +6442,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); + sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3b2711e33e4c..9e041fa5c545 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (!twp || (reuse && time_after32(ktime_get_seconds(), + tcptw->tw_ts_recent_stamp)))) { /* In case of repair and re-using TIME-WAIT sockets we still * want to be sure that it is safe as above but honor the * sequence numbers and time stamps set as part of the repair diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dda1341a223..75ef332a7caf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -189,7 +189,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); @@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk; + struct tcp_sock *oldtp, *newtp; - if (newsk) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct tcp_request_sock *treq = tcp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - - smc_check_reset_syn_req(oldtp, req, newtp); - - /* Now setup tcp_sock */ - newtp->pred_flags = 0; - - newtp->rcv_wup = newtp->copied_seq = - newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 1; - - newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - - INIT_LIST_HEAD(&newtp->tsq_node); - INIT_LIST_HEAD(&newtp->tsorted_sent_queue); - - tcp_init_wl(newtp, treq->rcv_isn); - - newtp->srtt_us = 0; - newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); - newicsk->icsk_rto = TCP_TIMEOUT_INIT; - newicsk->icsk_ack.lrcvtime = tcp_jiffies32; - - newtp->packets_out = 0; - newtp->retrans_out = 0; - newtp->sacked_out = 0; - newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - newtp->tlp_high_seq = 0; - newtp->lsndtime = tcp_jiffies32; - newsk->sk_txhash = treq->txhash; - newtp->last_oow_ack_time = 0; - newtp->total_retrans = req->num_retrans; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. -DaveM - */ - newtp->snd_cwnd = TCP_INIT_CWND; - newtp->snd_cwnd_cnt = 0; - - /* There's a bubble in the pipe until at least the first ACK. */ - newtp->app_limited = ~0U; - - tcp_init_xmit_timers(newsk); - newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; - - newtp->rx_opt.saw_tstamp = 0; - - newtp->rx_opt.dsack = 0; - newtp->rx_opt.num_sacks = 0; - - newtp->urg_data = 0; - - if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); - - newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; - newtp->rx_opt.sack_ok = ireq->sack_ok; - newtp->window_clamp = req->rsk_window_clamp; - newtp->rcv_ssthresh = req->rsk_rcv_wnd; - newtp->rcv_wnd = req->rsk_rcv_wnd; - newtp->rx_opt.wscale_ok = ireq->wscale_ok; - if (newtp->rx_opt.wscale_ok) { - newtp->rx_opt.snd_wscale = ireq->snd_wscale; - newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; - } else { - newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp, 65535U); - } - newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << - newtp->rx_opt.snd_wscale); - newtp->max_window = newtp->snd_wnd; - - if (newtp->rx_opt.tstamp_ok) { - newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else { - newtp->rx_opt.ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); - } - newtp->tsoffset = treq->ts_off; + if (!newsk) + return NULL; + + newicsk = inet_csk(newsk); + newtp = tcp_sk(newsk); + oldtp = tcp_sk(sk); + + smc_check_reset_syn_req(oldtp, req, newtp); + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; + + INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); + + tcp_init_wl(newtp, treq->rcv_isn); + + newtp->srtt_us = 0; + newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + newtp->packets_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + newtp->tlp_high_seq = 0; + newtp->lsndtime = tcp_jiffies32; + newsk->sk_txhash = treq->txhash; + newtp->last_oow_ack_time = 0; + newtp->total_retrans = req->num_retrans; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = TCP_INIT_CWND; + newtp->snd_cwnd_cnt = 0; + + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + + tcp_init_xmit_timers(newsk); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->rx_opt.sack_ok = ireq->sack_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG - newtp->md5sig_info = NULL; /*XXX*/ - if (newtp->af_specific->md5_lookup(sk, newsk)) - newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; + newtp->md5sig_info = NULL; /*XXX*/ + if (newtp->af_specific->md5_lookup(sk, newsk)) + newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) - newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; - newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); - newtp->fastopen_req = NULL; - newtp->fastopen_rsk = NULL; - newtp->syn_data_acked = 0; - newtp->rack.mstamp = 0; - newtp->rack.advanced = 0; - newtp->rack.reo_wnd_steps = 1; - newtp->rack.last_delivered = 0; - newtp->rack.reo_wnd_persist = 0; - newtp->rack.dsack_seen = 0; - - __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); - } + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; + newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + newtp->rack.reo_wnd_steps = 1; + newtp->rack.last_delivered = 0; + newtp->rack.reo_wnd_persist = 0; + newtp->rack.dsack_seen = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); @@ -600,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8cc7c3487330..870b0a335061 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -180,9 +180,9 @@ out: return segs; } -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct tcphdr *th; struct tcphdr *th2; @@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - for (; (p = *head); head = &p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto found; } - + p = NULL; goto out_check_final; found: @@ -262,8 +262,11 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); +#ifdef CONFIG_TLS_DEVICE + flush |= p->decrypted ^ skb->decrypted; +#endif - if (flush || skb_gro_receive(head, skb)) { + if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } @@ -277,7 +280,7 @@ out_check_final: TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; + pp = p; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); @@ -302,7 +305,7 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 00e5a300ddb9..6cbab56e7407 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -973,17 +973,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -/* BBR congestion control needs pacing. - * Same remark for SO_MAX_PACING_RATE. - * sch_fq packet scheduler is efficiently handling pacing, - * but is not always installed/used. - * Return true if TCP stack should pace packets itself. - */ -static bool tcp_needs_internal_pacing(const struct sock *sk) -{ - return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; -} - static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) { u64 len_ns; @@ -995,9 +984,6 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) if (!rate || rate == ~0U) return; - /* Should account for header sizes as sch_fq does, - * but lets make things simple. - */ len_ns = (u64)skb->len * NSEC_PER_SEC; do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c61240e43923..4dff40dad4dc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + /* Normally we expect interval_us >= min-rtt. * Note that rate may still be over-estimated when a spuriously * retransmistted skb was first (s)acked because "interval_us" diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24e116ddae79..060e841dde40 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -926,11 +926,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; @@ -977,9 +972,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) connected = 1; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); ipc.gso_size = up->gso_size; if (msg->msg_controllen) { @@ -1027,8 +1020,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 69c54540d5b4..0c0522b79b43 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,10 +343,11 @@ out: return segs; } -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; @@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unflush: flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -399,8 +400,8 @@ out: } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff **udp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 91580c62bb86..1659a6b3cf42 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -385,8 +385,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (ndev->cnf.stable_secret.initialized) ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; - else - ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode; ndev->cnf.mtu6 = dev->mtu; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); @@ -5210,7 +5208,9 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ - + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */ + + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */ + + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */ + + 0; } static inline size_t inet6_if_nlmsg_size(void) @@ -5892,32 +5892,31 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, loff_t *ppos) { int ret = 0; - int new_val; + u32 new_val; struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; struct net *net = (struct net *)ctl->extra2; + struct ctl_table tmp = { + .data = &new_val, + .maxlen = sizeof(new_val), + .mode = ctl->mode, + }; if (!rtnl_trylock()) return restart_syscall(); - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + new_val = *((u32 *)ctl->data); - if (write) { - new_val = *((int *)ctl->data); + ret = proc_douintvec(&tmp, write, buffer, lenp, ppos); + if (ret != 0) + goto out; + if (write) { if (check_addr_gen_mode(new_val) < 0) { ret = -EINVAL; goto out; } - /* request for default */ - if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) { - ipv6_devconf_dflt.addr_gen_mode = new_val; - - /* request for individual net device */ - } else { - if (!idev) - goto out; - + if (idev) { if (check_stable_privacy(idev, net, new_val) < 0) { ret = -EINVAL; goto out; @@ -5927,7 +5926,21 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, idev->cnf.addr_gen_mode = new_val; addrconf_dev_config(idev->dev); } + } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { + struct net_device *dev; + + net->ipv6.devconf_dflt->addr_gen_mode = new_val; + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev && + idev->cnf.addr_gen_mode != new_val) { + idev->cnf.addr_gen_mode = new_val; + addrconf_dev_config(idev->dev); + } + } } + + *((u32 *)ctl->data) = new_val; } out: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9ed0eae91758..c9535354149f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, + .list_func = ipv6_list_rcv, }; static int __init ipv6_packet_init(void) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2ee08b6a86a4..201306b9b5ea 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -736,7 +736,7 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, - struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc) + struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -755,7 +755,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, sockc); + err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); if (err) return err; continue; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 27f59b61f70f..ddfa533a84e5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) return 0; } -static struct sk_buff **esp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index be491bf6ab6e..24611c8b0562 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -430,7 +430,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; @@ -545,7 +544,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); @@ -553,8 +552,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -575,7 +572,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused)) { + MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { @@ -679,7 +676,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); - struct sockcm_cookie sockc_unused = {0}; saddr = &ipv6_hdr(skb)->daddr; @@ -726,16 +722,14 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; + ipcm6_init_sk(&ipc6, np); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused)) { + (struct rt6_info *)dst, MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile index 4b32e5921e5c..b7739aba6e68 100644 --- a/net/ipv6/ila/Makefile +++ b/net/ipv6/ila/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_IPV6_ILA) += ila.o -ila-objs := ila_common.o ila_lwt.o ila_xlat.o +ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index 3c7a11b62334..1f747bcbec29 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -19,6 +19,7 @@ #include <linux/skbuff.h> #include <linux/types.h> #include <net/checksum.h> +#include <net/genetlink.h> #include <net/ip.h> #include <net/protocol.h> #include <uapi/linux/ila.h> @@ -104,9 +105,31 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, void ila_init_saved_csum(struct ila_params *p); +struct ila_net { + struct { + struct rhashtable rhash_table; + spinlock_t *locks; /* Bucket locks for entry manipulation */ + unsigned int locks_mask; + bool hooks_registered; + } xlat; +}; + int ila_lwt_init(void); void ila_lwt_fini(void); -int ila_xlat_init(void); -void ila_xlat_fini(void); + +int ila_xlat_init_net(struct net *net); +void ila_xlat_exit_net(struct net *net); + +int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info); +int ila_xlat_nl_dump_start(struct netlink_callback *cb); +int ila_xlat_nl_dump_done(struct netlink_callback *cb); +int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb); + +extern unsigned int ila_net_id; + +extern struct genl_family ila_nl_family; #endif /* __ILA_H */ diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 8c88ecf29b93..579310466eac 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -154,33 +154,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, iaddr->loc = p->locator; } -static int __init ila_init(void) -{ - int ret; - - ret = ila_lwt_init(); - - if (ret) - goto fail_lwt; - - ret = ila_xlat_init(); - if (ret) - goto fail_xlat; - - return 0; -fail_xlat: - ila_lwt_fini(); -fail_lwt: - return ret; -} - -static void __exit ila_fini(void) -{ - ila_xlat_fini(); - ila_lwt_fini(); -} - -module_init(ila_init); -module_exit(ila_fini); -MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); -MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c new file mode 100644 index 000000000000..18fac76b9520 --- /dev/null +++ b/net/ipv6/ila/ila_main.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <net/genetlink.h> +#include <net/ila.h> +#include <net/netns/generic.h> +#include <uapi/linux/genetlink.h> +#include "ila.h" + +static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, + [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, + [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, + [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, + [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, +}; + +static const struct genl_ops ila_nl_ops[] = { + { + .cmd = ILA_CMD_ADD, + .doit = ila_xlat_nl_cmd_add_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_DEL, + .doit = ila_xlat_nl_cmd_del_mapping, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_FLUSH, + .doit = ila_xlat_nl_cmd_flush, + .policy = ila_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = ILA_CMD_GET, + .doit = ila_xlat_nl_cmd_get_mapping, + .start = ila_xlat_nl_dump_start, + .dumpit = ila_xlat_nl_dump, + .done = ila_xlat_nl_dump_done, + .policy = ila_nl_policy, + }, +}; + +unsigned int ila_net_id; + +struct genl_family ila_nl_family __ro_after_init = { + .hdrsize = 0, + .name = ILA_GENL_NAME, + .version = ILA_GENL_VERSION, + .maxattr = ILA_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .ops = ila_nl_ops, + .n_ops = ARRAY_SIZE(ila_nl_ops), +}; + +static __net_init int ila_init_net(struct net *net) +{ + int err; + + err = ila_xlat_init_net(net); + if (err) + goto ila_xlat_init_fail; + + return 0; + +ila_xlat_init_fail: + return err; +} + +static __net_exit void ila_exit_net(struct net *net) +{ + ila_xlat_exit_net(net); +} + +static struct pernet_operations ila_net_ops = { + .init = ila_init_net, + .exit = ila_exit_net, + .id = &ila_net_id, + .size = sizeof(struct ila_net), +}; + +static int __init ila_init(void) +{ + int ret; + + ret = register_pernet_device(&ila_net_ops); + if (ret) + goto register_device_fail; + + ret = genl_register_family(&ila_nl_family); + if (ret) + goto register_family_fail; + + ret = ila_lwt_init(); + if (ret) + goto fail_lwt; + + return 0; + +fail_lwt: + genl_unregister_family(&ila_nl_family); +register_family_fail: + unregister_pernet_device(&ila_net_ops); +register_device_fail: + return ret; +} + +static void __exit ila_fini(void) +{ + ila_lwt_fini(); + genl_unregister_family(&ila_nl_family); + unregister_pernet_device(&ila_net_ops); +} + +module_init(ila_init); +module_exit(ila_fini); +MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 10ae13560b40..51a15ce50a64 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -22,36 +22,14 @@ struct ila_map { struct rcu_head rcu; }; -static unsigned int ila_net_id; - -struct ila_net { - struct rhashtable rhash_table; - spinlock_t *locks; /* Bucket locks for entry manipulation */ - unsigned int locks_mask; - bool hooks_registered; -}; - +#define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { - unsigned int i, size; - unsigned int nr_pcpus = num_possible_cpus(); - - nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); - size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); - - if (sizeof(spinlock_t) != 0) { - ilan->locks = kvmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); - if (!ilan->locks) - return -ENOMEM; - for (i = 0; i < size; i++) - spin_lock_init(&ilan->locks[i]); - } - ilan->locks_mask = size - 1; - - return 0; + return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, + MAX_LOCKS, LOCKS_PER_CPU, + GFP_KERNEL); } static u32 hashrnd __read_mostly; @@ -71,7 +49,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc) static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { - return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask]; + return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, @@ -115,16 +93,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = ila_cmpfn, }; -static struct genl_family ila_nl_family; - -static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { - [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, - [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, - [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, - [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, - [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, -}; - static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { @@ -162,7 +130,7 @@ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc, + ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) @@ -179,7 +147,7 @@ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, + ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { @@ -196,9 +164,9 @@ static inline void ila_release(struct ila_map *ila) kfree_rcu(ila, rcu); } -static void ila_free_cb(void *ptr, void *arg) +static void ila_free_node(struct ila_map *ila) { - struct ila_map *ila = (struct ila_map *)ptr, *next; + struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { @@ -208,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg) } } +static void ila_free_cb(void *ptr, void *arg) +{ + ila_free_node((struct ila_map *)ptr); +} + static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int @@ -235,7 +208,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->hooks_registered) { + if (!ilan->xlat.hooks_registered) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ @@ -244,7 +217,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) if (err) return err; - ilan->hooks_registered = true; + ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); @@ -259,12 +232,12 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, + head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ - err = rhashtable_lookup_insert_fast(&ilan->rhash_table, + err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; @@ -290,7 +263,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); - err = rhashtable_replace_fast(&ilan->rhash_table, + err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) @@ -316,7 +289,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, + head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; @@ -346,15 +319,15 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) * table */ err = rhashtable_replace_fast( - &ilan->rhash_table, &ila->node, + &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ - err = rhashtable_remove_fast(&ilan->rhash_table, - &ila->node, - rht_params); + err = rhashtable_remove_fast( + &ilan->xlat.rhash_table, + &ila->node, rht_params); } } @@ -369,7 +342,7 @@ out: return err; } -static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params p; @@ -382,7 +355,7 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) return ila_add_mapping(net, &p); } -static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; @@ -397,6 +370,59 @@ static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) return 0; } +static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, + struct ila_map *ila) +{ + return ila_get_lock(ilan, ila->xp.ip.locator_match); +} + +int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct ila_net *ilan = net_generic(net, ila_net_id); + struct rhashtable_iter iter; + struct ila_map *ila; + spinlock_t *lock; + int ret; + + ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL); + if (ret) + goto done; + + rhashtable_walk_start(&iter); + + for (;;) { + ila = rhashtable_walk_next(&iter); + + if (IS_ERR(ila)) { + if (PTR_ERR(ila) == -EAGAIN) + continue; + ret = PTR_ERR(ila); + goto done; + } else if (!ila) { + break; + } + + lock = lock_from_ila_map(ilan, ila); + + spin_lock(lock); + + ret = rhashtable_remove_fast(&ilan->xlat.rhash_table, + &ila->node, rht_params); + if (!ret) + ila_free_node(ila); + + spin_unlock(lock); + + if (ret) + break; + } + +done: + rhashtable_walk_stop(&iter); + return ret; +} + static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, @@ -434,7 +460,7 @@ nla_put_failure: return -EMSGSIZE; } -static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) +int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); @@ -475,27 +501,34 @@ out_free: struct ila_dump_iter { struct rhashtable_iter rhiter; + int skip; }; -static int ila_nl_dump_start(struct netlink_callback *cb) +int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); - struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; + struct ila_dump_iter *iter; + int ret; - if (!iter) { - iter = kmalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; - cb->args[0] = (long)iter; + ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter, + GFP_KERNEL); + if (ret) { + kfree(iter); + return ret; } - return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, - GFP_KERNEL); + iter->skip = 0; + cb->args[0] = (long)iter; + + return ret; } -static int ila_nl_dump_done(struct netlink_callback *cb) +int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; @@ -506,24 +539,49 @@ static int ila_nl_dump_done(struct netlink_callback *cb) return 0; } -static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) +int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; + int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); - for (;;) { - ila = rhashtable_walk_next(rhiter); + /* Get first entry */ + ila = rhashtable_walk_peek(rhiter); + + if (ila && !IS_ERR(ila) && skip) { + /* Skip over visited entries */ + while (ila && skip) { + /* Skip over any ila entries in this list that we + * have already dumped. + */ + ila = rcu_access_pointer(ila->next); + skip--; + } + } + + skip = 0; + + for (;;) { if (IS_ERR(ila)) { - if (PTR_ERR(ila) == -EAGAIN) - continue; ret = PTR_ERR(ila); - goto done; + if (ret == -EAGAIN) { + /* Table has changed and iter has reset. Return + * -EAGAIN to the application even if we have + * written data to the skb. The application + * needs to deal with this. + */ + + goto out_ret; + } else { + break; + } } else if (!ila) { + ret = 0; break; } @@ -532,90 +590,54 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) - goto done; + goto out; + skip++; ila = rcu_access_pointer(ila->next); } + + skip = 0; + ila = rhashtable_walk_next(rhiter); } - ret = skb->len; +out: + iter->skip = skip; + ret = (skb->len ? : ret); -done: +out_ret: rhashtable_walk_stop(rhiter); return ret; } -static const struct genl_ops ila_nl_ops[] = { - { - .cmd = ILA_CMD_ADD, - .doit = ila_nl_cmd_add_mapping, - .policy = ila_nl_policy, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = ILA_CMD_DEL, - .doit = ila_nl_cmd_del_mapping, - .policy = ila_nl_policy, - .flags = GENL_ADMIN_PERM, - }, - { - .cmd = ILA_CMD_GET, - .doit = ila_nl_cmd_get_mapping, - .start = ila_nl_dump_start, - .dumpit = ila_nl_dump, - .done = ila_nl_dump_done, - .policy = ila_nl_policy, - }, -}; - -static struct genl_family ila_nl_family __ro_after_init = { - .hdrsize = 0, - .name = ILA_GENL_NAME, - .version = ILA_GENL_VERSION, - .maxattr = ILA_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, - .module = THIS_MODULE, - .ops = ila_nl_ops, - .n_ops = ARRAY_SIZE(ila_nl_ops), -}; - #define ILA_HASH_TABLE_SIZE 1024 -static __net_init int ila_init_net(struct net *net) +int ila_xlat_init_net(struct net *net) { - int err; struct ila_net *ilan = net_generic(net, ila_net_id); + int err; err = alloc_ila_locks(ilan); if (err) return err; - rhashtable_init(&ilan->rhash_table, &rht_params); + rhashtable_init(&ilan->xlat.rhash_table, &rht_params); return 0; } -static __net_exit void ila_exit_net(struct net *net) +void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); - rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL); + rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); - kvfree(ilan->locks); + free_bucket_spinlocks(ilan->xlat.locks); - if (ilan->hooks_registered) + if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } -static struct pernet_operations ila_net_ops = { - .init = ila_init_net, - .exit = ila_exit_net, - .id = &ila_net_id, - .size = sizeof(struct ila_net), -}; - static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; @@ -642,28 +664,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) return 0; } -int __init ila_xlat_init(void) -{ - int ret; - - ret = register_pernet_device(&ila_net_ops); - if (ret) - goto exit; - - ret = genl_register_family(&ila_nl_family); - if (ret < 0) - goto unregister; - - return 0; - -unregister: - unregister_pernet_device(&ila_net_ops); -exit: - return ret; -} - -void ila_xlat_fini(void) -{ - genl_unregister_family(&ila_nl_family); - unregister_pernet_device(&ila_net_ops); -} diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 3eee7637bdfe..cb54a8a3c273 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -373,7 +373,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; err = -ENOMEM; @@ -392,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; - err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk); + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index cd2cfb04e5d8..fc7dd3a04360 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -989,6 +989,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto tx_err; md = ip_tunnel_info_opts(tun_info); if (!md) goto tx_err; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index f08d34491ece..6242682be876 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -47,17 +47,11 @@ #include <net/inet_ecn.h> #include <net/dst_metadata.h> -int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static void ip6_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { void (*edemux)(struct sk_buff *skb); - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip6_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) } if (!skb_valid_dst(skb)) ip6_route_input(skb); +} + +int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + ip6_rcv_finish_core(net, sk, skb); return dst_input(skb); } -int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static void ip6_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + dst_input(skb); +} + +static void ip6_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + continue; + ip6_rcv_finish_core(net, sk, skb); + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv_finish(&sublist); +} + +static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, + struct net *net) { const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; - struct net *net = dev_net(skb->dev); if (skb->pkt_type == PACKET_OTHERHOST) { kfree_skb(skb); - return NET_RX_DROP; + return NULL; } rcu_read_lock(); @@ -196,7 +243,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (ipv6_parse_hopopts(skb) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); - return NET_RX_DROP; + return NULL; } } @@ -205,15 +252,67 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip6_rcv_finish); + return skb; err: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: rcu_read_unlock(); kfree_skb(skb); - return NET_RX_DROP; + return NULL; +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct net *net = dev_net(skb->dev); + + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip6_rcv_finish); +} + +static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip6_rcv_finish); + ip6_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IPv6 packets */ +void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } /* diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 5b3f2f89ef41..37ff4805b20c 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; @@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, flush--; nlen = skb_network_header_len(skb); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ @@ -263,8 +263,8 @@ out: return pp; } -static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ @@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, return ipv6_gro_receive(head, skb); } -static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a14fb4fcdf18..8047fd41ba88 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1219,13 +1219,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (mtu < IPV6_MIN_MTU) return -EINVAL; cork->base.fragsize = mtu; - cork->base.gso_size = sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; + cork->base.gso_size = ipc6->gso_size; + cork->base.tx_flags = 0; + sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; + cork->base.transmit_time = ipc6->sockc.transmit_time; + return 0; } @@ -1238,8 +1241,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6, - const struct sockcm_cookie *sockc) + unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; @@ -1249,7 +1251,6 @@ static int __ip6_append_data(struct sock *sk, int copy; int err; int offset = 0; - __u8 tx_flags = 0; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; struct ipv6_txoptions *opt = v6_cork->opt; @@ -1268,6 +1269,10 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + @@ -1317,13 +1322,6 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; - if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, sockc->tsflags, &tx_flags); - if (tx_flags & SKBTX_ANY_SW_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; - } - /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1442,8 +1440,8 @@ alloc_new_skb: dst_exthdrlen); /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = tx_flags; - tx_flags = 0; + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; @@ -1560,8 +1558,7 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, - const struct sockcm_cookie *sockc) + struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -1589,7 +1586,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6, sockc); + from, length, transhdrlen, flags, ipc6); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1673,6 +1670,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->base.transmit_time; + skb_dst_set(skb, dst_clone(&rt->dst)); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { @@ -1747,8 +1746,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - struct inet_cork_full *cork, - const struct sockcm_cookie *sockc) + struct inet_cork_full *cork) { struct inet6_cork v6_cork; struct sk_buff_head queue; @@ -1776,7 +1774,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6, sockc); + flags, ipc6); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0d0f0053bb11..d0b7e0249c13 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -32,6 +32,7 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compat.h> +#include <linux/rhashtable.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/raw.h> diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 568ca4187cd1..c0cac9cc3a28 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -500,7 +500,6 @@ sticky_done: struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); @@ -533,7 +532,7 @@ sticky_done: msg.msg_control = (void *)(opt+1); ipc6.opt = opt; - retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index b397a8fe88b9..c6bf580d0f33 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -36,7 +36,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv6_packet(struct nf_log_buf *m, +static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) @@ -258,7 +258,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (recurse) { nf_log_buf_add(m, "["); - dump_ipv6_packet(m, info, skb, + dump_ipv6_packet(net, m, info, skb, ptr + sizeof(_icmp6h), 0); nf_log_buf_add(m, "] "); } @@ -278,7 +278,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && recurse) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (recurse && skb->mark) @@ -365,7 +365,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv6_mac_header(m, loginfo, skb); - dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); nf_log_buf_close(m); } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 96f56bf49a30..4c04bccc7417 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -62,7 +62,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; - struct sockcm_cookie junk = {0}; struct ipcm6_cookie ipc6; pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -119,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); @@ -142,13 +141,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.family = AF_INET6; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, 0, &ipc6, &fl6, rt, - MSG_DONTWAIT, &junk); + MSG_DONTWAIT); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index afc307c89d1a..413d98bf24f4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -620,7 +620,7 @@ out: static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, - unsigned int flags) + unsigned int flags, const struct sockcm_cookie *sockc) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -650,6 +650,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *dstp = NULL; @@ -766,7 +767,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; - struct sockcm_cookie sockc; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; u16 proto; @@ -790,10 +790,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; - ipc6.opt = NULL; + ipcm6_init(&ipc6); + ipc6.sockc.tsflags = sk->sk_tsflags; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -847,14 +845,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; - sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -921,13 +918,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: if (inet->hdrincl) - err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); + err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, + msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 0fdf2a55e746..8d0ba757a46c 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -17,6 +17,7 @@ #include <linux/net.h> #include <linux/in6.h> #include <linux/slab.h> +#include <linux/rhashtable.h> #include <net/ipv6.h> #include <net/protocol.h> diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 558fe8cc6d43..8546f94f30d4 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -22,6 +22,7 @@ #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/slab.h> +#include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 278e49cd67d4..e72947c99454 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,8 +15,8 @@ #include <net/ip6_checksum.h> #include "ip6_offload.h" -static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *tcp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e6645cae403e..f6b96956a8ed 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1141,13 +1141,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - struct sockcm_cookie sockc; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; - sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = sk->sk_tsflags; /* destination address check */ if (sin6) { @@ -1282,7 +1279,7 @@ do_udp_sendmsg: err = udp_cmsg_send(sk, msg, &ipc6.gso_size); if (err > 0) err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, - &ipc6, &sockc); + &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1376,7 +1373,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &cork, &sockc); + msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6, &cork.base); @@ -1402,7 +1399,7 @@ do_append_data: up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc); + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 03a2ff3fe1e6..95dee9ca8d22 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -114,8 +114,8 @@ out: return segs; } -static struct sk_buff **udp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 40261cb68e83..1ea285bad84b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -322,8 +322,7 @@ int l2tp_session_register(struct l2tp_session *session, if (tunnel->version == L2TP_HDR_VER_3) { pn = l2tp_pernet(tunnel->l2tp_net); - g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net), - session->session_id); + g_head = l2tp_session_id_hash_2(pn, session->session_id); spin_lock_bh(&pn->l2tp_session_hlist_lock); @@ -783,7 +782,7 @@ EXPORT_SYMBOL(l2tp_recv_common); /* Drop skbs from the session's reorder_q */ -int l2tp_session_queue_purge(struct l2tp_session *session) +static int l2tp_session_queue_purge(struct l2tp_session *session) { struct sk_buff *skb = NULL; BUG_ON(!session); @@ -794,7 +793,6 @@ int l2tp_session_queue_purge(struct l2tp_session *session) } return 0; } -EXPORT_SYMBOL_GPL(l2tp_session_queue_purge); /* Internal UDP receive frame. Do the real work of receiving an L2TP data frame * here. The skb is not on a list when we get here. @@ -1009,8 +1007,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf) return bufp - optr; } -static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, - struct flowi *fl, size_t data_len) +static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, + struct flowi *fl, size_t data_len) { struct l2tp_tunnel *tunnel = session->tunnel; unsigned int len = skb->len; @@ -1052,8 +1050,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, atomic_long_inc(&tunnel->stats.tx_errors); atomic_long_inc(&session->stats.tx_errors); } - - return 0; } /* If caller requires the skb to have a ppp header, the header must be @@ -1193,7 +1189,7 @@ end: /* When the tunnel is closed, all the attached sessions need to go too. */ -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) +static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) { int hash; struct hlist_node *walk; @@ -1242,7 +1238,6 @@ again: } write_unlock_bh(&tunnel->hlist_lock); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index c199020f8a8a..a5c09d3a5698 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -180,9 +180,6 @@ struct l2tp_tunnel { struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; -#ifdef CONFIG_DEBUG_FS - void (*show)(struct seq_file *m, void *arg); -#endif int (*recv_payload_hook)(struct sk_buff *skb); void (*old_sk_destruct)(struct sock *); struct sock *sock; /* Parent socket */ @@ -190,8 +187,6 @@ struct l2tp_tunnel { * was created by userspace */ struct work_struct del_work; - - uint8_t priv[0]; /* private data */ }; struct l2tp_nl_cmd_ops { @@ -201,11 +196,6 @@ struct l2tp_nl_cmd_ops { int (*session_delete)(struct l2tp_session *session); }; -static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel) -{ - return &tunnel->priv[0]; -} - static inline void *l2tp_session_priv(struct l2tp_session *session) { return &session->priv[0]; @@ -229,7 +219,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, struct l2tp_tunnel_cfg *cfg); -void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, @@ -244,7 +233,6 @@ void l2tp_session_free(struct l2tp_session *session); void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb)); -int l2tp_session_queue_purge(struct l2tp_session *session); int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); void l2tp_session_set_header_len(struct l2tp_session *session, int version); diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index e87686f7d63c..b5d7dde003ef 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -177,9 +177,6 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) atomic_long_read(&tunnel->stats.rx_packets), atomic_long_read(&tunnel->stats.rx_bytes), atomic_long_read(&tunnel->stats.rx_errors)); - - if (tunnel->show != NULL) - tunnel->show(m, tunnel); } static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 957369192ca1..672e5b753738 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -500,7 +500,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ @@ -525,9 +524,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) @@ -575,8 +572,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, - &sockc_unused); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -641,7 +637,7 @@ back_from_confirm: err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc_unused); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index e398797878a9..9ac02c93df98 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -424,12 +424,6 @@ static void pppol2tp_put_sk(struct rcu_head *head) sock_put(ps->__sk); } -/* Called by l2tp_core when a session socket is being closed. - */ -static void pppol2tp_session_close(struct l2tp_session *session) -{ -} - /* Really kill the session socket. (Called from sock_put() if * refcnt == 0.) */ @@ -573,7 +567,6 @@ static void pppol2tp_session_init(struct l2tp_session *session) struct dst_entry *dst; session->recv_skb = pppol2tp_recv; - session->session_close = pppol2tp_session_close; #if IS_ENABLED(CONFIG_L2TP_DEBUGFS) session->show = pppol2tp_show; #endif @@ -595,40 +588,113 @@ static void pppol2tp_session_init(struct l2tp_session *session) } } +struct l2tp_connect_info { + u8 version; + int fd; + u32 tunnel_id; + u32 peer_tunnel_id; + u32 session_id; + u32 peer_session_id; +}; + +static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len, + struct l2tp_connect_info *info) +{ + switch (sa_len) { + case sizeof(struct sockaddr_pppol2tp): + { + const struct sockaddr_pppol2tp *sa_v2in4 = sa; + + if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 2; + info->fd = sa_v2in4->pppol2tp.fd; + info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel; + info->session_id = sa_v2in4->pppol2tp.s_session; + info->peer_session_id = sa_v2in4->pppol2tp.d_session; + + break; + } + case sizeof(struct sockaddr_pppol2tpv3): + { + const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa; + + if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 3; + info->fd = sa_v3in4->pppol2tp.fd; + info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel; + info->session_id = sa_v3in4->pppol2tp.s_session; + info->peer_session_id = sa_v3in4->pppol2tp.d_session; + + break; + } + case sizeof(struct sockaddr_pppol2tpin6): + { + const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa; + + if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 2; + info->fd = sa_v2in6->pppol2tp.fd; + info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel; + info->session_id = sa_v2in6->pppol2tp.s_session; + info->peer_session_id = sa_v2in6->pppol2tp.d_session; + + break; + } + case sizeof(struct sockaddr_pppol2tpv3in6): + { + const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa; + + if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP) + return -EINVAL; + + info->version = 3; + info->fd = sa_v3in6->pppol2tp.fd; + info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel; + info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel; + info->session_id = sa_v3in6->pppol2tp.s_session; + info->peer_session_id = sa_v3in6->pppol2tp.d_session; + + break; + } + default: + return -EINVAL; + } + + return 0; +} + /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; - struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr; struct pppox_sock *po = pppox_sk(sk); struct l2tp_session *session = NULL; + struct l2tp_connect_info info; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; struct l2tp_session_cfg cfg = { 0, }; - int error = 0; - u32 tunnel_id, peer_tunnel_id; - u32 session_id, peer_session_id; bool drop_refcnt = false; bool drop_tunnel = false; bool new_session = false; bool new_tunnel = false; - int ver = 2; - int fd; - - lock_sock(sk); - - error = -EINVAL; + int error; - if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) && - sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6)) - goto end; + error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info); + if (error < 0) + return error; - if (sp->sa_protocol != PX_PROTO_OL2TP) - goto end; + lock_sock(sk); /* Check for already bound sockets */ error = -EBUSY; @@ -640,56 +706,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (sk->sk_user_data) goto end; /* socket is already attached */ - /* Get params from socket address. Handle L2TPv2 and L2TPv3. - * This is nasty because there are different sockaddr_pppol2tp - * structs for L2TPv2, L2TPv3, over IPv4 and IPv6. We use - * the sockaddr size to determine which structure the caller - * is using. - */ - peer_tunnel_id = 0; - if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) { - fd = sp->pppol2tp.fd; - tunnel_id = sp->pppol2tp.s_tunnel; - peer_tunnel_id = sp->pppol2tp.d_tunnel; - session_id = sp->pppol2tp.s_session; - peer_session_id = sp->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) { - struct sockaddr_pppol2tpv3 *sp3 = - (struct sockaddr_pppol2tpv3 *) sp; - ver = 3; - fd = sp3->pppol2tp.fd; - tunnel_id = sp3->pppol2tp.s_tunnel; - peer_tunnel_id = sp3->pppol2tp.d_tunnel; - session_id = sp3->pppol2tp.s_session; - peer_session_id = sp3->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpin6)) { - struct sockaddr_pppol2tpin6 *sp6 = - (struct sockaddr_pppol2tpin6 *) sp; - fd = sp6->pppol2tp.fd; - tunnel_id = sp6->pppol2tp.s_tunnel; - peer_tunnel_id = sp6->pppol2tp.d_tunnel; - session_id = sp6->pppol2tp.s_session; - peer_session_id = sp6->pppol2tp.d_session; - } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3in6)) { - struct sockaddr_pppol2tpv3in6 *sp6 = - (struct sockaddr_pppol2tpv3in6 *) sp; - ver = 3; - fd = sp6->pppol2tp.fd; - tunnel_id = sp6->pppol2tp.s_tunnel; - peer_tunnel_id = sp6->pppol2tp.d_tunnel; - session_id = sp6->pppol2tp.s_session; - peer_session_id = sp6->pppol2tp.d_session; - } else { - error = -EINVAL; - goto end; /* bad socket address */ - } - /* Don't bind if tunnel_id is 0 */ error = -EINVAL; - if (tunnel_id == 0) + if (!info.tunnel_id) goto end; - tunnel = l2tp_tunnel_get(sock_net(sk), tunnel_id); + tunnel = l2tp_tunnel_get(sock_net(sk), info.tunnel_id); if (tunnel) drop_tunnel = true; @@ -697,7 +719,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, * peer_session_id is 0. Otherwise look up tunnel using supplied * tunnel id. */ - if ((session_id == 0) && (peer_session_id == 0)) { + if (!info.session_id && !info.peer_session_id) { if (tunnel == NULL) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, @@ -707,12 +729,16 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* Prevent l2tp_tunnel_register() from trying to set up * a kernel socket. */ - if (fd < 0) { + if (info.fd < 0) { error = -EBADF; goto end; } - error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); + error = l2tp_tunnel_create(sock_net(sk), info.fd, + info.version, + info.tunnel_id, + info.peer_tunnel_id, &tcfg, + &tunnel); if (error < 0) goto end; @@ -741,9 +767,9 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, tunnel->recv_payload_hook = pppol2tp_recv_payload_hook; if (tunnel->peer_tunnel_id == 0) - tunnel->peer_tunnel_id = peer_tunnel_id; + tunnel->peer_tunnel_id = info.peer_tunnel_id; - session = l2tp_session_get(sock_net(sk), tunnel, session_id); + session = l2tp_session_get(sock_net(sk), tunnel, info.session_id); if (session) { drop_refcnt = true; @@ -772,8 +798,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), - tunnel, session_id, - peer_session_id, &cfg); + tunnel, info.session_id, + info.peer_session_id, &cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto end; diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index e3589ade62e0..bb707789ef2b 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -12,6 +12,7 @@ mac80211-y := \ scan.o offchannel.o \ ht.o agg-tx.o agg-rx.o \ vht.o \ + he.o \ ibss.o \ iface.o \ rate.o \ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index e83c19d4c292..6a4f154c99f6 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,6 +245,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, }; int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + u16 max_buf_size; if (tid >= IEEE80211_FIRST_TSPEC_TSID) { ht_dbg(sta->sdata, @@ -268,13 +269,18 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } + if (sta->sta.he_cap.has_he) + max_buf_size = IEEE80211_MAX_AMPDU_BUF; + else + max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + /* sanity check for incoming parameters: * check if configuration can support the BA policy * and if buffer size does not exceeds max value */ /* XXX: check own ht delayed BA capability?? */ if (((ba_policy != 1) && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || - (buf_size > IEEE80211_MAX_AMPDU_BUF)) { + (buf_size > max_buf_size)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", @@ -283,7 +289,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } /* determine default buffer size */ if (buf_size == 0) - buf_size = IEEE80211_MAX_AMPDU_BUF; + buf_size = max_buf_size; /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > sta->sta.max_rx_aggregation_subframes) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index ac4295296514..69e831bc317b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -463,6 +463,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; + u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -511,11 +512,22 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) sta->ampdu_mlme.addba_req_num[tid]++; spin_unlock_bh(&sta->lock); + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. + */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, params.ssn, - IEEE80211_MAX_AMPDU_BUF, - tid_tx->timeout); + buf_size, tid_tx->timeout); } /* @@ -905,8 +917,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, { struct tid_ampdu_tx *tid_tx; struct ieee80211_txq *txq; - u16 capab, tid; - u8 buf_size; + u16 capab, tid, buf_size; bool amsdu; capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index bdf6fa78d0d2..02f3672e7b5e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1412,6 +1412,11 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, sta); + if (params->he_capa) + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + (void *)params->he_capa, + params->he_capa_len, sta); + if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it @@ -3486,7 +3491,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, } local_bh_disable(); - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); local_bh_enable(); ret = 0; diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 690c142a7a44..5ac743816b59 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -116,16 +116,16 @@ static void ieee80211_get_stats(struct net_device *dev, data[i++] = sta->sta_state; - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; - if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { diff --git a/net/mac80211/he.c b/net/mac80211/he.c new file mode 100644 index 000000000000..769078ed5a12 --- /dev/null +++ b/net/mac80211/he.c @@ -0,0 +1,55 @@ +/* + * HE handling + * + * Copyright(c) 2017 Intel Deutschland GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "ieee80211_i.h" + +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta) +{ + struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; + u8 he_ppe_size; + u8 mcs_nss_size; + u8 he_total_size; + + memset(he_cap, 0, sizeof(*he_cap)); + + if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband)) + return; + + /* Make sure size is OK */ + mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem); + he_ppe_size = + ieee80211_he_ppe_size(he_cap_ie[sizeof(he_cap->he_cap_elem) + + mcs_nss_size], + he_cap_ie_elem->phy_cap_info); + he_total_size = sizeof(he_cap->he_cap_elem) + mcs_nss_size + + he_ppe_size; + if (he_cap_len < he_total_size) + return; + + memcpy(&he_cap->he_cap_elem, he_cap_ie, sizeof(he_cap->he_cap_elem)); + + /* HE Tx/Rx HE MCS NSS Support Field */ + memcpy(&he_cap->he_mcs_nss_supp, + &he_cap_ie[sizeof(he_cap->he_cap_elem)], mcs_nss_size); + + /* Check if there are (optional) PPE Thresholds */ + if (he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) + memcpy(he_cap->ppe_thres, + &he_cap_ie[sizeof(he_cap->he_cap_elem) + mcs_nss_size], + he_ppe_size); + + he_cap->has_he = true; +} diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 26a7ba3b698f..f849ea814993 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work) test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, + IEEE80211_MAX_AMPDU_BUF_HT, false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d1978aa1c15d..172aeae21ae9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -165,6 +165,7 @@ typedef unsigned __bitwise ieee80211_tx_result; #define TX_DROP ((__force ieee80211_tx_result) 1u) #define TX_QUEUED ((__force ieee80211_tx_result) 2u) +#define IEEE80211_TX_NO_SEQNO BIT(0) #define IEEE80211_TX_UNICAST BIT(1) #define IEEE80211_TX_PS_BUFFERED BIT(2) @@ -364,6 +365,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_160MHZ = BIT(13), IEEE80211_STA_DISABLE_WMM = BIT(14), IEEE80211_STA_ENABLE_RRM = BIT(15), + IEEE80211_STA_DISABLE_HE = BIT(16), }; struct ieee80211_mgd_auth_data { @@ -1453,6 +1455,10 @@ struct ieee802_11_elems { const struct ieee80211_vht_cap *vht_cap_elem; const struct ieee80211_vht_operation *vht_operation; const struct ieee80211_meshconf_ie *mesh_config; + const u8 *he_cap; + const struct ieee80211_he_operation *he_operation; + const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const u8 *uora_element; const u8 *mesh_id; const u8 *peering; const __le16 *awake_window; @@ -1482,6 +1488,7 @@ struct ieee802_11_elems { u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; + u8 he_cap_len; u8 mesh_id_len; u8 peering_len; u8 preq_len; @@ -1824,6 +1831,13 @@ void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); +/* HE */ +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta); + /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, @@ -1880,19 +1894,20 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb); + struct sta_info *sta, struct sk_buff *skb, + u32 txdata_flags); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band); + enum nl80211_band band, u32 txdata_flags); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band) + enum nl80211_band band, u32 txdata_flags) { rcu_read_lock(); - __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); + __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags); rcu_read_unlock(); } @@ -1910,7 +1925,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, } __ieee80211_tx_skb_tid_band(sdata, skb, tid, - chanctx_conf->def.chan->band); + chanctx_conf->def.chan->band, 0); rcu_read_unlock(); } @@ -2031,26 +2046,27 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, const u8 *bssid, u16 stype, u16 reason, bool send_frame, u8 *frame_buf); + +enum { + IEEE80211_PROBE_FLAG_DIRECTED = BIT(0), + IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1), + IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), +}; + int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, - struct cfg80211_chan_def *chandef); + struct cfg80211_chan_def *chandef, + u32 flags); struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, u32 ratemask, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - bool directed); -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, - const u8 *src, const u8 *dst, - const u8 *ssid, size_t ssid_len, - const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, u32 tx_flags, - struct ieee80211_channel *channel, bool scan); - + u32 flags); u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates); @@ -2073,6 +2089,9 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 555e389b7dfa..5e6cf2cee965 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); @@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv, + struct net_device *sb_dev, select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fb73451ed85e..4fb2709cb527 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -557,10 +558,19 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); - if (!ops->hw_scan) + if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | NL80211_FEATURE_AP_SCAN; - + /* + * if the driver behaves correctly using the probe request + * (template) from mac80211, then both of these should be + * supported even with hw scan - but let drivers opt in. + */ + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT); + } if (!ops->set_key) wiphy->flags |= WIPHY_FLAG_IBSS_RSN; @@ -588,8 +598,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, local->hw.queues = 1; local->hw.max_rates = 1; local->hw.max_report_rates = 0; - local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; - local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; + local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT; local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; @@ -816,7 +826,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht; + bool supp_ht, supp_vht, supp_he; netdev_features_t feature_whitelist; struct cfg80211_chan_def dflt_chandef = {}; @@ -896,6 +906,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = 0; supp_ht = false; supp_vht = false; + supp_he = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; @@ -922,6 +933,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; + if (!supp_he) + supp_he = !!ieee80211_get_he_sta_cap(sband); + if (!sband->ht_cap.ht_supported) continue; @@ -1011,6 +1025,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap); + /* HE cap element is variable in size - set len to allow max size */ + /* + * TODO: 1 is added at the end of the calculation to accommodate for + * the temporary placing of the HE capabilities IE under EXT. + * Remove it once it is placed in the final place. + */ + if (supp_he) + local->scan_ies_len += + 2 + sizeof(struct ieee80211_he_cap_elem) + + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + 1; + if (!local->ops->hw_scan) { /* For hw_scan, driver needs to set these up. */ local->hw.wiphy->max_scan_ssids = 4; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a59187c016e0..7fb9957359a3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, struct cfg80211_chan_def *chandef, bool tracking) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -207,7 +208,27 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, } vht_chandef = *chandef; - if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && he_oper && + (le32_to_cpu(he_oper->he_oper_params) & + IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { + struct ieee80211_vht_operation he_oper_vht_cap; + + /* + * Set only first 3 bytes (other 2 aren't used in + * ieee80211_chandef_vht_oper() anyway) + */ + memcpy(&he_oper_vht_cap, he_oper->optional, 3); + he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); + + if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap, + &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + sdata_info(sdata, + "HE AP VHT information is invalid, disable HE\n"); + ret = IEEE80211_STA_DISABLE_HE; + goto out; + } + } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -300,12 +321,14 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, const u8 *bssid, u32 *changed) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_supported_band *sband; - struct ieee80211_channel *chan; + struct ieee80211_channel *chan = sdata->vif.bss_conf.chandef.chan; + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[chan->band]; struct cfg80211_chan_def chandef; u16 ht_opmode; u32 flags; @@ -320,6 +343,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) vht_oper = NULL; + /* don't check HE if we associated as non-HE station */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE || + !ieee80211_get_he_sta_cap(sband)) + he_oper = NULL; + if (WARN_ON_ONCE(!sta)) return -EINVAL; @@ -333,12 +361,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } - chan = sdata->vif.bss_conf.chandef.chan; - sband = local->hw.wiphy->bands[chan->band]; - - /* calculate new channel (type) based on HT/VHT operation IEs */ + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ flags = ieee80211_determine_chantype(sdata, sband, chan, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, true); /* @@ -582,6 +607,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); } +/* This function determines HE capability flags for the association + * and builds the IE. + */ +static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_supported_band *sband) +{ + u8 *pos; + const struct ieee80211_sta_he_cap *he_cap = NULL; + u8 he_cap_size; + + he_cap = ieee80211_get_he_sta_cap(sband); + if (!he_cap) + return; + + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + * IE. Get rid of it when it moves. + */ + he_cap_size = + 2 + 1 + sizeof(he_cap->he_cap_elem) + + ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + pos = skb_put(skb, he_cap_size); + ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); +} + static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; @@ -643,6 +696,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 2 * sband->n_channels + /* supported channels */ 2 + sizeof(struct ieee80211_ht_cap) + /* HT */ 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */ + 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9, /* WMM */ @@ -827,11 +883,41 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } + /* if present, add any custom IEs that go before HE */ + if (assoc_data->ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_OPMODE_NOTIF, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE, + /* 11ai elements */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + + /* RIC already taken above, so no need to handle here anymore */ + noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len, + before_he, ARRAY_SIZE(before_he), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, assoc_data->ie + offset, noffset - offset); + offset = noffset; + } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); - /* if present, add any custom non-vendor IEs that go after HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + ieee80211_add_he_ie(sdata, skb, sband); + + /* if present, add any custom non-vendor IEs that go after HE */ if (assoc_data->ie_len) { noffset = ieee80211_ie_split_vendor(assoc_data->ie, assoc_data->ie_len, @@ -898,6 +984,11 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) @@ -929,6 +1020,10 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; + /* Don't send NDPs when connected HE */ + if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; @@ -1700,9 +1795,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work) } /* MLME */ -static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - const u8 *wmm_param, size_t wmm_param_len) +static bool +ieee80211_sta_wmm_params(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const u8 *wmm_param, size_t wmm_param_len, + const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -1749,6 +1846,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_bk; break; case 2: /* AC_VI */ ac = IEEE80211_AC_VI; @@ -1756,6 +1856,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vi; break; case 3: /* AC_VO */ ac = IEEE80211_AC_VO; @@ -1763,6 +1866,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vo; break; case 0: /* AC_BE */ default: @@ -1771,6 +1877,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_be; break; } @@ -2219,6 +2328,20 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, ieee80211_sta_reset_conn_monitor(sdata); } +static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, + const u8 *src, const u8 *dst, + const u8 *ssid, size_t ssid_len, + struct ieee80211_channel *channel) +{ + struct sk_buff *skb; + + skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel, + ssid, ssid_len, NULL, 0, + IEEE80211_PROBE_FLAG_DIRECTED); + if (skb) + ieee80211_tx_skb(sdata, skb); +} + static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -2265,10 +2388,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) else ssid_len = ssid[1]; - ieee80211_send_probe_req(sdata, sdata->vif.addr, dst, - ssid + 2, ssid_len, NULL, - 0, (u32) -1, true, 0, - ifmgd->associated->channel, false); + ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, + ssid + 2, ssid_len, + ifmgd->associated->channel); rcu_read_unlock(); } @@ -2370,7 +2492,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, (u32) -1, cbss->channel, ssid + 2, ssid_len, - NULL, 0, true); + NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); rcu_read_unlock(); return skb; @@ -3008,6 +3130,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + /* + * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark + * HE as disabled. If on the 5GHz band, make sure it supports VHT. + */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || + (sband->band == NL80211_BAND_5GHZ && + ifmgd->flags & IEEE80211_STA_DISABLE_VHT) || + (!elems.he_cap && !elems.he_operation)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + (!elems.he_cap || !elems.he_operation)) { + mutex_unlock(&sdata->local->sta_mtx); + sdata_info(sdata, + "HE AP is missing HE capability/operation\n"); + ret = false; + goto out; + } + /* Set up internal HT/VHT capabilities */ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, @@ -3017,6 +3158,48 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems.vht_cap_elem, sta); + if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems.he_cap) { + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + elems.he_cap, + elems.he_cap_len, + sta); + + bss_conf->he_support = sta->sta.he_cap.has_he; + } else { + bss_conf->he_support = false; + } + + if (bss_conf->he_support) { + u32 he_oper_params = + le32_to_cpu(elems.he_operation->he_oper_params); + + bss_conf->bss_color = he_oper_params & + IEEE80211_HE_OPERATION_BSS_COLOR_MASK; + bss_conf->htc_trig_based_pkt_ext = + (he_oper_params & + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) << + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET; + bss_conf->frame_time_rts_th = + (he_oper_params & + IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) << + IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET; + + bss_conf->multi_sta_back_32bit = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP; + + bss_conf->ack_enabled = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_ACK_EN; + + bss_conf->uora_exists = !!elems.uora_element; + if (elems.uora_element) + bss_conf->uora_ocw_range = elems.uora_element[0]; + + /* TODO: OPEN: what happens if BSS color disable is set? */ + } + /* * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data * in their association response, so ignore that data for our own @@ -3076,7 +3259,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) { + elems.wmm_param_len, + elems.mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3590,7 +3774,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) + elems.wmm_param_len, + elems.mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* @@ -3629,7 +3814,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, elems.ht_operation, - elems.vht_operation, bssid, &changed)) { + elems.vht_operation, elems.he_operation, + bssid, &changed)) { mutex_unlock(&local->sta_mtx); sdata_info(sdata, "failed to follow AP %pM bandwidth change, disconnect\n", @@ -4266,6 +4452,68 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, return chains; } +static bool +ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband, + const struct ieee80211_he_operation *he_op) +{ + const struct ieee80211_sta_he_cap *sta_he_cap = + ieee80211_get_he_sta_cap(sband); + u16 ap_min_req_set; + int i; + + if (!sta_he_cap || !he_op) + return false; + + ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + + /* Need to go over for 80MHz, 160MHz and for 80+80 */ + for (i = 0; i < 3; i++) { + const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = + &sta_he_cap->he_mcs_nss_supp; + u16 sta_mcs_map_rx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); + u16 sta_mcs_map_tx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); + u8 nss; + bool verified = true; + + /* + * For each band there is a maximum of 8 spatial streams + * possible. Each of the sta_mcs_map_* is a 16-bit struct built + * of 2 bits per NSS (1-8), with the values defined in enum + * ieee80211_he_mcs_support. Need to make sure STA TX and RX + * capabilities aren't less than the AP's minimum requirements + * for this HE BSS per SS. + * It is enough to find one such band that meets the reqs. + */ + for (nss = 8; nss > 0; nss--) { + u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; + u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; + u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + + if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) + continue; + + /* + * Make sure the HE AP doesn't require MCSs that aren't + * supported by the client + */ + if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { + verified = false; + break; + } + } + + if (verified) + return true; + } + + /* If here, STA doesn't meet AP's HE min requirements */ + return false; +} + static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss) { @@ -4274,6 +4522,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap = NULL; const struct ieee80211_ht_operation *ht_oper = NULL; const struct ieee80211_vht_operation *vht_oper = NULL; + const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; @@ -4329,6 +4578,24 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + ieee80211_get_he_sta_cap(sband)) { + const struct cfg80211_bss_ies *ies; + const u8 *he_oper_ie; + + ies = rcu_dereference(cbss->ies); + he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, + ies->data, ies->len); + if (he_oper_ie && + he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3])) + he_oper = (void *)(he_oper_ie + 3); + else + he_oper = NULL; + + if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { @@ -4345,7 +4612,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), @@ -4751,8 +5018,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; netdev_info(sdata->dev, - "disabling HT/VHT due to WEP/TKIP use\n"); + "disabling HE/HT/VHT due to WEP/TKIP use\n"); } } diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f1d40b6645ff..8ef4153cd299 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -262,7 +262,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, - roc->chan->band); + roc->chan->band, 0); roc->frame = NULL; } } else { diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 0a38cc1cbebc..a16ba568e2a3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -175,6 +175,20 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12); + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -263,6 +277,19 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, int mpdulen, chain; unsigned long chains = status->chains; struct ieee80211_vendor_radiotap rtap = {}; + struct ieee80211_radiotap_he he = {}; + struct ieee80211_radiotap_he_mu he_mu = {}; + + if (status->flag & RX_FLAG_RADIOTAP_HE) { + he = *(struct ieee80211_radiotap_he *)skb->data; + skb_pull(skb, sizeof(he)); + WARN_ON_ONCE(status->encoding != RX_ENC_HE); + } + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) { + he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data; + skb_pull(skb, sizeof(he_mu)); + } if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { rtap = *(struct ieee80211_vendor_radiotap *)skb->data; @@ -520,6 +547,89 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos++ = flags; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { +#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val)) + + if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) { + he.data6 |= HE_PREP(DATA6_NSTS, + FIELD_GET(RX_ENC_FLAG_STBC_MASK, + status->enc_flags)); + he.data3 |= HE_PREP(DATA3_STBC, 1); + } else { + he.data6 |= HE_PREP(DATA6_NSTS, status->nss); + } + +#define CHECK_GI(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ + (int)NL80211_RATE_INFO_HE_GI_##s) + + CHECK_GI(0_8); + CHECK_GI(1_6); + CHECK_GI(3_2); + + he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx); + he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm); + he.data3 |= HE_PREP(DATA3_CODING, + !!(status->enc_flags & RX_ENC_FLAG_LDPC)); + + he.data5 |= HE_PREP(DATA5_GI, status->he_gi); + + switch (status->bw) { + case RATE_INFO_BW_20: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); + break; + case RATE_INFO_BW_40: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); + break; + case RATE_INFO_BW_80: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); + break; + case RATE_INFO_BW_160: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); + break; + case RATE_INFO_BW_HE_RU: +#define CHECK_RU_ALLOC(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ + NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) + + CHECK_RU_ALLOC(26); + CHECK_RU_ALLOC(52); + CHECK_RU_ALLOC(106); + CHECK_RU_ALLOC(242); + CHECK_RU_ALLOC(484); + CHECK_RU_ALLOC(996); + CHECK_RU_ALLOC(2x996); + + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + status->he_ru + 4); + break; + default: + WARN_ONCE(1, "Invalid SU BW %d\n", status->bw); + } + + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + memcpy(pos, &he, sizeof(he)); + pos += sizeof(he); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU); + memcpy(pos, &he_mu, sizeof(he_mu)); + pos += sizeof(he_mu); + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; @@ -613,6 +723,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, rcu_dereference(local->monitor_sdata); bool only_monitor = false; + if (status->flag & RX_FLAG_RADIOTAP_HE) + rtap_space += sizeof(struct ieee80211_radiotap_he); + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) + rtap_space += sizeof(struct ieee80211_radiotap_he_mu); + if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; @@ -3241,7 +3357,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) } __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7, - status->band); + status->band, 0); } dev_kfree_skb(rx->skb); return RX_QUEUED; @@ -3386,8 +3502,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, status = IEEE80211_SKB_RXCB((rx->skb)); sband = rx->local->hw.wiphy->bands[status->band]; - if (!(status->encoding == RX_ENC_HT) && - !(status->encoding == RX_ENC_VHT)) + if (status->encoding == RX_ENC_LEGACY) rate = &sband->bitrates[status->rate_idx]; ieee80211_rx_cooked_monitor(rx, rate); @@ -4386,6 +4501,14 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss)) goto drop; break; + case RX_ENC_HE: + if (WARN_ONCE(status->rate_idx > 11 || + !status->nss || + status->nss > 8, + "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n", + status->rate_idx, status->nss)) + goto drop; + break; default: WARN_ON_ONCE(1); /* fall through */ diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 2e917a6d239d..5d2a11777718 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -20,6 +20,7 @@ #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" @@ -293,6 +294,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen, n_chans; + u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); @@ -331,12 +333,16 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef, req->scan_width); + if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, - bands_used, req->rates, &chandef); + bands_used, req->rates, &chandef, + flags); local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); @@ -528,6 +534,35 @@ void ieee80211_run_deferred_scan(struct ieee80211_local *local) round_jiffies_relative(0)); } +static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, + const u8 *src, const u8 *dst, + const u8 *ssid, size_t ssid_len, + const u8 *ie, size_t ie_len, + u32 ratemask, u32 flags, u32 tx_flags, + struct ieee80211_channel *channel) +{ + struct sk_buff *skb; + u32 txdata_flags = 0; + + skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, + ssid, ssid_len, + ie, ie_len, flags); + + if (skb) { + if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { + struct ieee80211_hdr *hdr = (void *)skb->data; + u16 sn = get_random_u32(); + + txdata_flags |= IEEE80211_TX_NO_SEQNO; + hdr->seq_ctrl = + cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); + } + IEEE80211_SKB_CB(skb)->flags |= tx_flags; + ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band, + txdata_flags); + } +} + static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { @@ -535,7 +570,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; - u32 tx_flags; + u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); @@ -543,17 +578,21 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; + if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) + flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->mtx)); for (i = 0; i < scan_req->n_ssids; i++) - ieee80211_send_probe_req( + ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, - scan_req->rates[band], false, - tx_flags, local->hw.conf.chandef.chan, true); + scan_req->rates[band], flags, + tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses @@ -1141,6 +1180,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; + u32 flags = 0; iebufsz = local->scan_ies_len + req->ie_len; @@ -1157,6 +1197,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, } } + if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) + flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; + ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; @@ -1167,7 +1210,8 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, - req->ie_len, bands_used, rate_masks, &chandef); + req->ie_len, bands_used, rate_masks, &chandef, + flags); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 6428f1ac37b6..f34202242d24 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1323,6 +1323,11 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | @@ -1391,7 +1396,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, } info->band = chanctx_conf->def.chan->band; - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); rcu_read_unlock(); } @@ -1968,7 +1973,7 @@ sta_get_last_rx_stats(struct sta_info *sta) return stats; } -static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, +static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); @@ -2005,6 +2010,14 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } + case STA_STATS_RATE_TYPE_HE: + rinfo->flags = RATE_INFO_FLAGS_HE_MCS; + rinfo->mcs = STA_STATS_GET(HE_MCS, rate); + rinfo->nss = STA_STATS_GET(HE_NSS, rate); + rinfo->he_gi = STA_STATS_GET(HE_GI, rate); + rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); + rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); + break; } } @@ -2101,38 +2114,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, drv_sta_statistics(local, sdata, &sta->sta, sinfo); - sinfo->filled |= BIT(NL80211_STA_INFO_INACTIVE_TIME) | - BIT(NL80211_STA_INFO_STA_FLAGS) | - BIT(NL80211_STA_INFO_BSS_PARAM) | - BIT(NL80211_STA_INFO_CONNECTED_TIME) | - BIT(NL80211_STA_INFO_RX_DROP_MISC); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | + BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | + BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | + BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count; - sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); - if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | - BIT(NL80211_STA_INFO_TX_BYTES)))) { + if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->tx_stats.bytes[ac]; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->tx_stats.packets[ac]; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } - if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) | - BIT(NL80211_STA_INFO_RX_BYTES)))) { + if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); if (sta->pcpu_rx_stats) { @@ -2144,10 +2157,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } } - sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->rx_stats.packets; if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { @@ -2157,17 +2170,17 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->rx_packets += cpurxs->packets; } } - sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->status_stats.retry_count; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->status_stats.retry_failed; - sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } sinfo->rx_dropped_misc = sta->rx_stats.dropped; @@ -2182,23 +2195,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { - sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_RX) | - BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | + BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { - if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; - sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->pcpu_rx_stats && - !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { + !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->rx_stats_avg.signal); - sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } @@ -2207,11 +2220,11 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, * pcpu statistics */ if (last_rxstats->chains && - !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | - BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { - sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL); + !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | + BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->pcpu_rx_stats) - sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; @@ -2223,15 +2236,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, } } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &sinfo->txrate); - sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) - sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { @@ -2244,18 +2257,18 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH - sinfo->filled |= BIT(NL80211_STA_INFO_LLID) | - BIT(NL80211_STA_INFO_PLID) | - BIT(NL80211_STA_INFO_PLINK_STATE) | - BIT(NL80211_STA_INFO_LOCAL_PM) | - BIT(NL80211_STA_INFO_PEER_PM) | - BIT(NL80211_STA_INFO_NONPEER_PM); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | + BIT_ULL(NL80211_STA_INFO_PLID) | + BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | + BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | + BIT_ULL(NL80211_STA_INFO_PEER_PM) | + BIT_ULL(NL80211_STA_INFO_NONPEER_PM); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; @@ -2300,7 +2313,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, thr = sta_get_expected_throughput(sta); if (thr != 0) { - sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 81b35f623792..9a04327d71d1 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -170,7 +170,7 @@ struct tid_ampdu_tx { u8 dialog_token; u8 stop_initiator; bool tx_stop; - u8 buf_size; + u16 buf_size; u16 failed_bar_ssn; bool bar_pending; @@ -405,7 +405,7 @@ struct ieee80211_sta_rx_stats { int last_signal; u8 chains; s8 chain_signal_last[IEEE80211_MAX_CHAINS]; - u16 last_rate; + u32 last_rate; struct u64_stats_sync syncp; u64 bytes; u64 msdu[IEEE80211_NUM_TIDS + 1]; @@ -764,6 +764,7 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_LEGACY, STA_STATS_RATE_TYPE_HT, STA_STATS_RATE_TYPE_VHT, + STA_STATS_RATE_TYPE_HE, }; #define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) @@ -771,9 +772,14 @@ enum sta_stats_type { #define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) #define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) +#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) +#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_BW GENMASK(11, 8) #define STA_STATS_FIELD_SGI GENMASK(12, 12) #define STA_STATS_FIELD_TYPE GENMASK(15, 13) +#define STA_STATS_FIELD_HE_RU GENMASK(18, 16) +#define STA_STATS_FIELD_HE_GI GENMASK(20, 19) +#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -782,7 +788,7 @@ enum sta_stats_type { static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) { - u16 r; + u32 r; r = STA_STATS_FIELD(BW, s->bw); @@ -804,6 +810,14 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(LEGACY_BAND, s->band); r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); break; + case RX_ENC_HE: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE); + r |= STA_STATS_FIELD(HE_NSS, s->nss); + r |= STA_STATS_FIELD(HE_MCS, s->rate_idx); + r |= STA_STATS_FIELD(HE_GI, s->he_gi); + r |= STA_STATS_FIELD(HE_RU, s->he_ru); + r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); + break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 80a7edf8d314..0ab69a1964f8 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -92,7 +92,7 @@ STA_ENTRY \ __field(u16, tid) \ __field(u16, ssn) \ - __field(u8, buf_size) \ + __field(u16, buf_size) \ __field(bool, amsdu) \ __field(u16, timeout) \ __field(u16, action) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index fa1f1e63a264..6a79d564de35 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -825,6 +825,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) */ if (!ieee80211_is_data_qos(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) { + if (tx->flags & IEEE80211_TX_NO_SEQNO) + return TX_CONTINUE; /* driver should assign sequence number */ info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; /* for pure STA mode without beacons, we can do it */ @@ -1854,7 +1856,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb); */ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb, - bool txpending) + bool txpending, u32 txdata_flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_data tx; @@ -1872,6 +1874,8 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, led_len = skb->len; res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); + tx.flags |= txdata_flags; + if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); return true; @@ -1933,7 +1937,8 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata, } void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb) + struct sta_info *sta, struct sk_buff *skb, + u32 txdata_flags) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -1968,7 +1973,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, } ieee80211_set_qos_hdr(sdata, skb); - ieee80211_tx(sdata, sta, skb, false); + ieee80211_tx(sdata, sta, skb, false, txdata_flags); } static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, @@ -2289,7 +2294,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, if (!ieee80211_parse_tx_radiotap(local, skb)) goto fail_rcu; - ieee80211_xmit(sdata, NULL, skb); + ieee80211_xmit(sdata, NULL, skb, 0); rcu_read_unlock(); return NETDEV_TX_OK; @@ -3648,7 +3653,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, ieee80211_tx_stats(dev, skb->len); - ieee80211_xmit(sdata, sta, skb); + ieee80211_xmit(sdata, sta, skb, 0); } goto out; out_free: @@ -3867,7 +3872,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, return true; } info->band = chanctx_conf->def.chan->band; - result = ieee80211_tx(sdata, NULL, skb, true); + result = ieee80211_tx(sdata, NULL, skb, true, 0); } else { struct sk_buff_head skbs; @@ -4783,7 +4788,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum nl80211_band band) + enum nl80211_band band, u32 txdata_flags) { int ac = ieee80211_ac_from_tid(tid); @@ -4800,7 +4805,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, */ local_bh_disable(); IEEE80211_SKB_CB(skb)->band = band; - ieee80211_xmit(sdata, NULL, skb); + ieee80211_xmit(sdata, NULL, skb, txdata_flags); local_bh_enable(); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 5e2e511c4a6f..3e68132a41fa 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1095,6 +1095,21 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; + case WLAN_EID_EXTENSION: + if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA && + elen >= (sizeof(*elems->mu_edca_param_set) + 1)) { + elems->mu_edca_param_set = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) { + elems->he_cap = (void *)&pos[1]; + elems->he_cap_len = elen - 1; + } else if (pos[0] == WLAN_EID_EXT_HE_OPERATION && + elen >= sizeof(*elems->he_operation) && + elen >= ieee80211_he_oper_size(&pos[1])) { + elems->he_operation = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) { + elems->uora_element = (void *)&pos[1]; + } + break; default: break; } @@ -1353,9 +1368,10 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, - size_t *offset) + size_t *offset, u32 flags) { struct ieee80211_supported_band *sband; + const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; size_t noffset; int supp_rates_len, i; @@ -1433,6 +1449,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, chandef->chan->center_freq); } + if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT) + goto done; + /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { @@ -1460,11 +1479,6 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->ht_cap.cap); } - /* - * If adding more here, adjust code in main.c - * that calculates local->scan_ies_len. - */ - /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { @@ -1507,9 +1521,43 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->vht_cap.cap); } + /* insert custom IEs that go before HE */ + if (ie && ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, + WLAN_EID_AP_CSN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_he, ARRAY_SIZE(before_he), + *offset); + if (end - pos < noffset - *offset) + goto out_err; + memcpy(pos, ie + *offset, noffset - *offset); + pos += noffset - *offset; + *offset = noffset; + } + + he_cap = ieee80211_get_he_sta_cap(sband); + if (he_cap) { + pos = ieee80211_ie_build_he_cap(pos, he_cap, end); + if (!pos) + goto out_err; + } + + /* + * If adding more here, adjust code in main.c + * that calculates local->scan_ies_len. + */ + return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); + done: return pos - buffer; } @@ -1518,7 +1566,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, u8 bands_used, u32 *rate_masks, - struct cfg80211_chan_def *chandef) + struct cfg80211_chan_def *chandef, + u32 flags) { size_t pos = 0, old_pos = 0, custom_ie_offset = 0; int i; @@ -1533,7 +1582,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, ie, ie_len, i, rate_masks[i], chandef, - &custom_ie_offset); + &custom_ie_offset, + flags); ie_desc->ies[i] = buffer + old_pos; ie_desc->len[i] = pos - old_pos; old_pos = pos; @@ -1561,7 +1611,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *chan, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, - bool directed) + u32 flags) { struct ieee80211_local *local = sdata->local; struct cfg80211_chan_def chandef; @@ -1577,7 +1627,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, * badly-behaved APs don't respond when this parameter is included. */ chandef.width = sdata->vif.bss_conf.chandef.width; - if (directed) + if (flags & IEEE80211_PROBE_FLAG_DIRECTED) chandef.chan = NULL; else chandef.chan = chan; @@ -1591,7 +1641,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), - rate_masks, &chandef); + rate_masks, &chandef, flags); skb_put(skb, ies_len); if (dst) { @@ -1605,27 +1655,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return skb; } -void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, - const u8 *src, const u8 *dst, - const u8 *ssid, size_t ssid_len, - const u8 *ie, size_t ie_len, - u32 ratemask, bool directed, u32 tx_flags, - struct ieee80211_channel *channel, bool scan) -{ - struct sk_buff *skb; - - skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, - ssid, ssid_len, - ie, ie_len, directed); - if (skb) { - IEEE80211_SKB_CB(skb)->flags |= tx_flags; - if (scan) - ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); - else - ieee80211_tx_skb(sdata, skb); - } -} - u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band band, u32 *basic_rates) @@ -2412,6 +2441,72 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos; } +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end) +{ + u8 n; + u8 ie_len; + u8 *orig_pos = pos; + + /* Make sure we have place for the IE */ + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + * IE. Get rid of it when it moves. + */ + if (!he_cap) + return orig_pos; + + n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); + ie_len = 2 + 1 + + sizeof(he_cap->he_cap_elem) + n + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + + if ((end - pos) < ie_len) + return orig_pos; + + *pos++ = WLAN_EID_EXTENSION; + pos++; /* We'll set the size later below */ + *pos++ = WLAN_EID_EXT_HE_CAPABILITY; + + /* Fixed data */ + memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem)); + pos += sizeof(he_cap->he_cap_elem); + + memcpy(pos, &he_cap->he_mcs_nss_supp, n); + pos += n; + + /* Check if PPE Threshold should be present */ + if ((he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + goto end; + + /* + * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: + * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) + */ + n = hweight8(he_cap->ppe_thres[0] & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. + */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + /* Copy PPE Thresholds */ + memcpy(pos, &he_cap->ppe_thres, n); + pos += n; + +end: + orig_pos[1] = (pos - orig_pos) - 2; + return pos; +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 168af54db975..dc240cb47ddf 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_conntrack_destroy); +bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + struct nf_ct_hook *ct_hook; + bool ret = false; + + rcu_read_lock(); + ct_hook = rcu_dereference(nf_ct_hook); + if (ct_hook) + ret = ct_hook->get_tuple_skb(dst_tuple, skb); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_ct_get_tuple_skb); + /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index a1086bdec242..5423b197d98a 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -32,7 +32,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, __be32 mask = 0; /* we're only interested in locally generated packets */ - if (skb->sk == NULL) + if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk))) goto out; if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) goto out; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3d5280425027..805500197c22 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1683,6 +1683,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; } +static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, + const struct sk_buff *skb) +{ + const struct nf_conntrack_tuple *src_tuple; + const struct nf_conntrack_tuple_hash *hash; + struct nf_conntrack_tuple srctuple; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + NFPROTO_IPV4, dev_net(skb->dev), + &srctuple)) + return false; + + hash = nf_conntrack_find_get(dev_net(skb->dev), + &nf_ct_zone_dflt, + &srctuple); + if (!hash) + return false; + + ct = nf_ct_tuplehash_to_ctrack(hash); + src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); + memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); + nf_ct_put(ct); + + return true; +} + /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), @@ -2204,6 +2239,7 @@ err_cachep: static struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, .destroy = destroy_conntrack, + .get_tuple_skb = nf_conntrack_get_tuple_skb, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index dc61399e30be..a8c5c846aec1 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -132,9 +132,10 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); -void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) +void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk) { - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 46f9df99d276..86df2a1666fd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -108,6 +108,7 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) struct flowi fl; unsigned int hh_len; struct dst_entry *dst; + struct sock *sk = skb->sk; int err; err = xfrm_decode_session(skb, &fl, family); @@ -119,7 +120,10 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - dst = xfrm_lookup(net, dst, &fl, skb->sk, 0); + if (sk && !net_eq(net, sock_net(sk))) + sk = NULL; + + dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 896d4a36081d..3f211e1025c1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -14,6 +14,7 @@ #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/vmalloc.h> +#include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 1105a23bda5e..2b94dcc43456 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -107,7 +107,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKUID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -123,7 +124,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKGID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -214,7 +216,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 74e1b3bd6954..998c2b546f6d 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -23,6 +23,9 @@ static void nft_socket_eval(const struct nft_expr *expr, struct sock *sk = skb->sk; u32 *dest = ®s->data[priv->dreg]; + if (sk && !net_eq(nft_net(pkt), sock_net(sk))) + sk = NULL; + if (!sk) switch(nft_pf(pkt)) { case NFPROTO_IPV4: @@ -39,7 +42,7 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } - if(!sk) { + if (!sk) { nft_reg_store8(dest, 0); return; } diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 7df2dece57d3..5d92e1781980 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -72,8 +72,9 @@ static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v0 *info = par->matchinfo; + struct sock *sk = skb->sk; - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ @@ -85,8 +86,9 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_cgroup_info_v1 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; + struct sock *sk = skb->sk; - if (!skb->sk || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 3d705c688a27..46686fb73784 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -67,7 +67,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) struct sock *sk = skb_to_full_sk(skb); struct net *net = xt_net(par); - if (sk == NULL || sk->sk_socket == NULL) + if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 07085c22b19c..f44de4bc2100 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -265,7 +265,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) } /* use TTL as seen before forwarding */ - if (xt_out(par) != NULL && skb->sk == NULL) + if (xt_out(par) != NULL && + (!skb->sk || !net_eq(net, sock_net(skb->sk)))) ttl++; spin_lock_bh(&recent_lock); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5c0779c4fa3c..0472f3472842 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; @@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 30a5df27116e..85ae53d8fd09 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb, clone_flow_key); } +/* When 'last' is true, clone() should always consume the 'skb'. + * Otherwise, clone() should keep 'skb' intact regardless what + * actions are executed within clone(). + */ +static int clone(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, + bool last) +{ + struct nlattr *actions; + struct nlattr *clone_arg; + int rem = nla_len(attr); + bool dont_clone_flow_key; + + /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ + clone_arg = nla_data(attr); + dont_clone_flow_key = nla_get_u32(clone_arg); + actions = nla_next(clone_arg, &rem); + + return clone_execute(dp, skb, key, 0, actions, rem, last, + !dont_clone_flow_key); +} + static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr) { @@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, consume_skb(skb); return 0; } + break; + + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = clone(dp, skb, key, a, last); + if (last) + return err; + + break; + } } if (unlikely(err)) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 492ab0c36f7c..a70097ecf33c 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return 0; } +static int validate_and_copy_clone(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) +{ + int start, err; + u32 exec; + + if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN) + return -EINVAL; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log); + if (start < 0) + return start; + + exec = last || !actions_may_change_flow(attr); + + err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec, + sizeof(exec), log); + if (err) + return err; + + err = __ovs_nla_copy_actions(net, attr, key, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + + add_nested_action_end(*sfa, start); + + return 0; +} + void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, bool reset_key, @@ -2516,7 +2550,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; + __be16 dst_opt_type; + dst_opt_type = 0; ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) @@ -2528,10 +2564,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, err = validate_geneve_opts(&key); if (err < 0) return err; + dst_opt_type = TUNNEL_GENEVE_OPT; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + dst_opt_type = TUNNEL_VXLAN_OPT; break; case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + dst_opt_type = TUNNEL_ERSPAN_OPT; break; } } @@ -2574,7 +2613,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, */ ip_tunnel_info_opts_set(tun_info, TUN_METADATA_OPTS(&key, key.tun_opts_len), - key.tun_opts_len); + key.tun_opts_len, dst_opt_type); add_nested_action_end(*sfa, start); return err; @@ -2844,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, [OVS_ACTION_ATTR_POP_NSH] = 0, [OVS_ACTION_ATTR_METER] = sizeof(u32), + [OVS_ACTION_ATTR_CLONE] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -3033,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Non-existent meters are simply ignored. */ break; + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_clone(net, a, key, sfa, + eth_type, vlan_tci, + log, last); + if (err) + return err; + skip_copy = true; + break; + } + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -3111,6 +3163,26 @@ out: return err; } +static int clone_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE); + if (!start) + return -EMSGSIZE; + + err = ovs_nla_put_actions(nla_data(attr), rem, skb); + + if (err) + nla_nest_cancel(skb, start); + else + nla_nest_end(skb, start); + + return err; +} + static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) { const struct nlattr *ovs_key = nla_data(a); @@ -3199,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) return err; break; + case OVS_ACTION_ATTR_CLONE: + err = clone_action_to_attr(a, skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9b27d0cd766d..e3e00d3a972e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po) return po->xmit == packet_direct_xmit; } -static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) { - return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; + return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) @@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb) __packet_pick_tx_queue); queue_index = netdev_cap_txqueue(dev, queue_index); } else { - queue_index = __packet_pick_tx_queue(dev, skb); + queue_index = __packet_pick_tx_queue(dev, skb, NULL); } return queue_index; @@ -1951,7 +1952,7 @@ retry: goto out_unlock; } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) @@ -1962,6 +1963,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); @@ -2457,6 +2459,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; + skb->tstamp = sockc->transmit_time; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; @@ -2633,7 +2636,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; - sockc.tsflags = po->sk.sk_tsflags; + sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) @@ -2829,7 +2832,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); sockc.mark = sk->sk_mark; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -2905,6 +2908,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sockc.mark; + skb->tstamp = sockc.transmit_time; if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index b4e421aa9727..1eaf2550a9f8 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -376,8 +376,6 @@ static void release_refill(struct rds_connection *conn) * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into * sockets. - * - * -1 is returned if posting fails due to temporary resource exhaustion. */ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { @@ -1025,7 +1023,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; struct rds_ib_connection *ic = conn->c_transport_data; - int ret = 0; rdsdebug("conn %p\n", conn); if (rds_conn_up(conn)) { @@ -1034,7 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp) rds_ib_stats_inc(s_ib_rx_refill_from_thread); } - return ret; + return 0; } int rds_ib_recv_init(void) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a01169fb5325..7af246764a35 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -183,6 +183,17 @@ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_ETF + tristate "Earliest TxTime First (ETF)" + help + Say Y here if you want to use the Earliest TxTime First (ETF) packet + scheduling algorithm. + + See the top of <file:net/sched/sch_etf.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_etf. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- @@ -284,6 +295,17 @@ config NET_SCH_FQ_CODEL If unsure, say N. +config NET_SCH_CAKE + tristate "Common Applications Kept Enhanced (CAKE)" + help + Say Y here if you want to use the Common Applications Kept Enhanced + (CAKE) queue management algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_cake. + + If unsure, say N. + config NET_SCH_FQ tristate "Fair Queue" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 8811d3804878..673ee7d26ff2 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -50,10 +50,12 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3f4cf930f809..148a89ab789b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } +static void tcf_free_cookie_rcu(struct rcu_head *p) +{ + struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); + + kfree(cookie->data); + kfree(cookie); +} + +static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, + struct tc_cookie *new_cookie) +{ + struct tc_cookie *old; + + old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); + if (old) + call_rcu(&old->rcu, tcf_free_cookie_rcu); +} + /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected @@ -65,44 +83,64 @@ static void free_tcf(struct tc_action *p) free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); - if (p->act_cookie) { - kfree(p->act_cookie->data); - kfree(p->act_cookie); - } + tcf_set_action_cookie(&p->act_cookie, NULL); if (p->goto_chain) tcf_action_goto_chain_fini(p); kfree(p); } -static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) +static void tcf_action_cleanup(struct tc_action *p) { - spin_lock(&idrinfo->lock); - idr_remove(&idrinfo->action_idr, p->tcfa_index); - spin_unlock(&idrinfo->lock); + if (p->ops->cleanup) + p->ops->cleanup(p); + gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } +static int __tcf_action_put(struct tc_action *p, bool bind) +{ + struct tcf_idrinfo *idrinfo = p->idrinfo; + + if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { + if (bind) + atomic_dec(&p->tcfa_bindcnt); + idr_remove(&idrinfo->action_idr, p->tcfa_index); + spin_unlock(&idrinfo->lock); + + tcf_action_cleanup(p); + return 1; + } + + if (bind) + atomic_dec(&p->tcfa_bindcnt); + + return 0; +} + int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; - ASSERT_RTNL(); - + /* Release with strict==1 and bind==0 is only called through act API + * interface (classifiers always bind). Only case when action with + * positive reference count and zero bind count can exist is when it was + * also created with act API (unbinding last classifier will destroy the + * action if it was created by classifier). So only case when bind count + * can be changed after initial check is when unbound action is + * destroyed by act API while classifier binds to action with same id + * concurrently. This result either creation of new action(same behavior + * as before), or reusing existing action if concurrent process + * increments reference count before action is deleted. Both scenarios + * are acceptable. + */ if (p) { - if (bind) - p->tcfa_bindcnt--; - else if (strict && p->tcfa_bindcnt > 0) + if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; - p->tcfa_refcnt--; - if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { - if (p->ops->cleanup) - p->ops->cleanup(p); - tcf_idr_remove(p->idrinfo, p); + if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; - } } return ret; @@ -111,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { + struct tc_cookie *act_cookie; u32 cookie_len = 0; - if (act->act_cookie) - cookie_len = nla_total_size(act->act_cookie->len); + rcu_read_lock(); + act_cookie = rcu_dereference(act->act_cookie); + + if (act_cookie) + cookie_len = nla_total_size(act_cookie->len); + rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ @@ -257,46 +300,77 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo) +static bool __tcf_idr_check(struct tc_action_net *tn, u32 index, + struct tc_action **a, int bind) { - struct tc_action *p = NULL; + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); + if (IS_ERR(p)) { + p = NULL; + } else if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + } spin_unlock(&idrinfo->lock); - return p; + if (p) { + *a = p; + return true; + } + return false; } int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { - struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); - - if (p) { - *a = p; - return 1; - } - return 0; + return __tcf_idr_check(tn, index, a, 0); } EXPORT_SYMBOL(tcf_idr_search); bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { + return __tcf_idr_check(tn, index, a, bind); +} +EXPORT_SYMBOL(tcf_idr_check); + +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index) +{ struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); + struct tc_action *p; + int ret = 0; - if (index && p) { - if (bind) - p->tcfa_bindcnt++; - p->tcfa_refcnt++; - *a = p; - return true; + spin_lock(&idrinfo->lock); + p = idr_find(&idrinfo->action_idr, index); + if (!p) { + spin_unlock(&idrinfo->lock); + return -ENOENT; } - return false; + + if (!atomic_read(&p->tcfa_bindcnt)) { + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + struct module *owner = p->ops->owner; + + WARN_ON(p != idr_remove(&idrinfo->action_idr, + p->tcfa_index)); + spin_unlock(&idrinfo->lock); + + tcf_action_cleanup(p); + module_put(owner); + return 0; + } + ret = 0; + } else { + ret = -EPERM; + } + + spin_unlock(&idrinfo->lock); + return ret; } -EXPORT_SYMBOL(tcf_idr_check); +EXPORT_SYMBOL(tcf_idr_delete_index); int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, @@ -304,14 +378,13 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct idr *idr = &idrinfo->action_idr; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; - p->tcfa_refcnt = 1; + refcount_set(&p->tcfa_refcnt, 1); if (bind) - p->tcfa_bindcnt = 1; + atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -322,20 +395,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, goto err2; } spin_lock_init(&p->tcfa_lock); - idr_preload(GFP_KERNEL); - spin_lock(&idrinfo->lock); - /* user doesn't specify an index */ - if (!index) { - index = 1; - err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC); - } else { - err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC); - } - spin_unlock(&idrinfo->lock); - idr_preload_end(); - if (err) - goto err3; - p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; @@ -345,7 +404,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) - goto err4; + goto err3; } p->idrinfo = idrinfo; @@ -353,8 +412,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, INIT_LIST_HEAD(&p->list); *a = p; return 0; -err4: - idr_remove(idr, index); err3: free_percpu(p->cpu_qstats); err2: @@ -370,11 +427,78 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) struct tcf_idrinfo *idrinfo = tn->idrinfo; spin_lock(&idrinfo->lock); - idr_replace(&idrinfo->action_idr, a, a->tcfa_index); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); +/* Cleanup idr index that was allocated but not initialized. */ + +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + + spin_lock(&idrinfo->lock); + /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); + spin_unlock(&idrinfo->lock); +} +EXPORT_SYMBOL(tcf_idr_cleanup); + +/* Check if action with specified index exists. If actions is found, increments + * its reference and bind counters, and return 1. Otherwise insert temporary + * error pointer (to prevent concurrent users from inserting actions with same + * index) and return 0. + */ + +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret; + +again: + spin_lock(&idrinfo->lock); + if (*index) { + p = idr_find(&idrinfo->action_idr, *index); + if (IS_ERR(p)) { + /* This means that another process allocated + * index but did not assign the pointer yet. + */ + spin_unlock(&idrinfo->lock); + goto again; + } + + if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + *a = p; + ret = 1; + } else { + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + *index, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, + ERR_PTR(-EBUSY), *index); + } + } else { + *index = 1; + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + UINT_MAX, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), + *index); + } + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_check_alloc); + void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { @@ -538,13 +662,15 @@ repeat: } EXPORT_SYMBOL(tcf_action_exec); -int tcf_action_destroy(struct list_head *actions, int bind) +int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; - struct tc_action *a, *tmp; - int ret = 0; + struct tc_action *a; + int ret = 0, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; + actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) @@ -555,6 +681,24 @@ int tcf_action_destroy(struct list_head *actions, int bind) return ret; } +static int tcf_action_put(struct tc_action *p) +{ + return __tcf_action_put(p, false); +} + +static void tcf_action_put_many(struct tc_action *actions[]) +{ + int i; + + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; + const struct tc_action_ops *ops = a->ops; + + if (tcf_action_put(a)) + module_put(ops->owner); + } +} + int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -567,16 +711,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; - if (a->act_cookie) { - if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, - a->act_cookie->data)) + + rcu_read_lock(); + cookie = rcu_dereference(a->act_cookie); + if (cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { + rcu_read_unlock(); goto nla_put_failure; + } } + rcu_read_unlock(); nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) @@ -593,14 +743,15 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref) { struct tc_action *a; - int err = -EINVAL; + int err = -EINVAL, i; struct nlattr *nest; - list_for_each_entry(a, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -638,6 +789,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action *a; @@ -688,9 +840,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("act_%s", act_name); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); a_o = tc_lookup_action_n(act_name); @@ -713,19 +867,15 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - extack); + rtnl_held, extack); else - err = a_o->init(net, nla, est, &a, ovr, bind, extack); + err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, + extack); if (err < 0) goto err_mod; - if (name == NULL && tb[TCA_ACT_COOKIE]) { - if (a->act_cookie) { - kfree(a->act_cookie->data); - kfree(a->act_cookie); - } - a->act_cookie = cookie; - } + if (!name && tb[TCA_ACT_COOKIE]) + tcf_set_action_cookie(&a->act_cookie, cookie); /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then @@ -737,10 +887,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { err = tcf_action_goto_chain_init(a, tp); if (err) { - LIST_HEAD(actions); + struct tc_action *actions[] = { a, NULL }; - list_add_tail(&a->list, &actions); - tcf_action_destroy(&actions, bind); + tcf_action_destroy(actions, bind); NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } @@ -758,21 +907,12 @@ err_out: return ERR_PTR(err); } -static void cleanup_a(struct list_head *actions, int ovr) -{ - struct tc_action *a; - - if (!ovr) - return; - - list_for_each_entry(a, actions, list) - a->tcfa_refcnt--; -} +/* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack) + struct tc_action *actions[], size_t *attr_size, + bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -786,25 +926,19 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - extack); + rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; } act->order = i; sz += tcf_action_fill_size(act); - if (ovr) - act->tcfa_refcnt++; - list_add_tail(&act->list, actions); + /* Start from index 0 */ + actions[i - 1] = act; } *attr_size = tcf_action_full_attrs_size(sz); - - /* Remove the temp refcnt which was necessary to protect against - * destroying an existing action which was being replaced - */ - cleanup_a(actions, ovr); - return 0; + return i - 1; err: tcf_action_destroy(actions, bind); @@ -855,7 +989,7 @@ errout: return -1; } -static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, +static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { @@ -891,7 +1025,7 @@ out_nlmsg_trim: static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct list_head *actions, int event, + struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -900,7 +1034,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, - 0, 0) <= 0) { + 0, 1) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; @@ -1027,9 +1161,41 @@ err_out: return err; } +static int tcf_action_delete(struct net *net, struct tc_action *actions[], + int *acts_deleted, struct netlink_ext_ack *extack) +{ + u32 act_index; + int ret, i; + + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; + const struct tc_action_ops *ops = a->ops; + + /* Actions can be deleted concurrently so we must save their + * type and id to search again after reference is released. + */ + act_index = a->tcfa_index; + + if (tcf_action_put(a)) { + /* last reference, action was deleted concurrently */ + module_put(ops->owner); + } else { + /* now do the delete */ + ret = ops->delete(net, act_index); + if (ret < 0) { + *acts_deleted = i + 1; + return ret; + } + } + } + *acts_deleted = i; + return 0; +} + static int -tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid, size_t attr_size, struct netlink_ext_ack *extack) +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], + int *acts_deleted, u32 portid, size_t attr_size, + struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; @@ -1040,14 +1206,14 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 1) <= 0) { + 0, 2) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; } /* now do the delete */ - ret = tcf_action_destroy(actions, 0); + ret = tcf_action_delete(net, actions, acts_deleted, extack); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); @@ -1069,7 +1235,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO + 1] = {}; + int acts_deleted = 0; ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) @@ -1091,27 +1258,27 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } act->order = i; attr_size += tcf_action_fill_size(act); - list_add_tail(&act->list, &actions); + actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) - ret = tcf_get_notify(net, portid, n, &actions, event, extack); + ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_del_notify(net, n, actions, &acts_deleted, portid, + attr_size, extack); if (ret) goto err; return ret; } err: - if (event != RTM_GETACTION) - tcf_action_destroy(&actions, 0); + tcf_action_put_many(&actions[acts_deleted]); return ret; } static int -tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1142,14 +1309,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, { size_t attr_size = 0; int ret = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, - &attr_size, extack); - if (ret) + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, + &attr_size, true, extack); + if (ret < 0) return ret; + ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); + if (ovr) + tcf_action_put_many(actions); - return tcf_add_notify(net, n, &actions, portid, attr_size, extack); + return ret; } static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 18089c02e557..06f743d8ed41 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, - .refcnt = prog->tcf_refcnt - ref, - .bindcnt = prog->tcf_bindcnt - bind, + .refcnt = refcount_read(&prog->tcf_refcnt) - ref, + .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, .action = prog->tcf_action, }; struct tcf_t tm; @@ -276,7 +276,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, struct netlink_ext_ack *extack) + int replace, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; @@ -298,21 +299,27 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_idr_check(tn, parm->index, act, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, act, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, act, &act_bpf_ops, bind, true); - if (ret < 0) + if (ret < 0) { + tcf_idr_cleanup(tn, parm->index); return ret; + } res = ACT_P_CREATED; - } else { + } else if (ret > 0) { /* Don't override defaults. */ if (bind) return 0; - tcf_idr_release(*act, bind); - if (!replace) + if (!replace) { + tcf_idr_release(*act, bind); return -EEXIST; + } + } else { + return ret; } is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; @@ -355,8 +362,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: - if (res == ACT_P_CREATED) - tcf_idr_release(*act, bind); + tcf_idr_release(*act, bind); return ret; } @@ -387,6 +393,13 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_bpf_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -397,6 +410,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .delete = tcf_bpf_delete, .size = sizeof(struct tcf_bpf), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e4b880fa51fe..1e31f0e448e2 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); @@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, a, &act_connmark_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ci = to_connmark(*a); ci->tcf_action = parm->action; @@ -131,16 +134,18 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; - } else { + } else if (ret > 0) { ci = to_connmark(*a); if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } /* replacing action and zone */ ci->tcf_action = parm->action; ci->zone = parm->zone; + ret = 0; } return ret; @@ -154,8 +159,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, struct tc_connmark opt = { .index = ci->tcf_index, - .refcnt = ci->tcf_refcnt - ref, - .bindcnt = ci->tcf_bindcnt - bind, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, .action = ci->tcf_action, .zone = ci->zone, }; @@ -193,6 +198,13 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_connmark_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -202,6 +214,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .delete = tcf_connmark_delete, .size = sizeof(struct tcf_connmark_info), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 6e7124e57918..4e8c383f379e 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,8 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_old, *params_new; @@ -66,18 +67,24 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_csum_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } + } else { + return err; } p = to_tcf_csum(*a); @@ -85,8 +92,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } params_old = rtnl_dereference(p->params); @@ -597,8 +603,8 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, .action = p->tcf_action, }; struct tcf_t t; @@ -653,6 +659,13 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act) return nla_total_size(sizeof(struct tc_csum)); } +static int tcf_csum_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -664,6 +677,7 @@ static struct tc_action_ops act_csum_ops = { .walk = tcf_csum_walker, .lookup = tcf_csum_search, .get_fill_size = tcf_csum_get_fill_size, + .delete = tcf_csum_delete, .size = sizeof(struct tcf_csum), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4dc4f153cad8..661b72b9147d 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; @@ -90,18 +91,24 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_gact_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } + } else { + return err; } gact = to_gact(*a); @@ -169,8 +176,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, - .refcnt = gact->tcf_refcnt - ref, - .bindcnt = gact->tcf_bindcnt - bind, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, .action = gact->tcf_action, }; struct tcf_t t; @@ -230,6 +237,13 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act) return sz; } +static int tcf_gact_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -241,6 +255,7 @@ static struct tc_action_ops act_gact_ops = { .walk = tcf_gact_walker, .lookup = tcf_gact_search, .get_fill_size = tcf_gact_get_fill_size, + .delete = tcf_gact_delete, .size = sizeof(struct tcf_gact), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 20d7d36b2fc9..3d6e265758c0 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -448,7 +448,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; @@ -483,7 +484,12 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!p) return -ENOMEM; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) { + kfree(p); + return err; + } + exists = err; if (exists && bind) { kfree(p); return 0; @@ -493,16 +499,15 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, bind, true); if (ret) { + tcf_idr_cleanup(tn, parm->index); kfree(p); return ret; } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) { - kfree(p); - return -EEXIST; - } + kfree(p); + return -EEXIST; } ife = to_ife(*a); @@ -547,6 +552,8 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + tcf_idr_release(*a, bind); + kfree(p); return err; } @@ -596,8 +603,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, - .refcnt = ife->tcf_refcnt - ref, - .bindcnt = ife->tcf_bindcnt - bind, + .refcnt = refcount_read(&ife->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, .action = ife->tcf_action, .flags = p->flags, }; @@ -843,6 +850,13 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ife_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ife_ops = { .kind = "ife", .type = TCA_ACT_IFE, @@ -853,6 +867,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .delete = tcf_ife_delete, .size = sizeof(struct tcf_ife_info), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 14c312d7908f..0dc787a57798 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -119,13 +119,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - exists = tcf_idr_check(tn, index, a, bind); + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } @@ -133,22 +138,27 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, index); return ret; + } ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } hook = nla_get_u32(tb[TCA_IPT_HOOK]); @@ -196,7 +206,8 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, bind); @@ -204,7 +215,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool unlocked, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, bind); @@ -280,8 +292,8 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (unlikely(!t)) goto nla_put_failure; - c.bindcnt = ipt->tcf_bindcnt - bind; - c.refcnt = ipt->tcf_refcnt - ref; + c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; + c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || @@ -322,6 +334,13 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ipt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -332,6 +351,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .delete = tcf_ipt_delete, .size = sizeof(struct tcf_ipt), }; @@ -372,6 +392,13 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_xt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -382,6 +409,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .delete = tcf_xt_delete, .size = sizeof(struct tcf_ipt), }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fd34015331ab..6afd89a36c69 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -68,8 +68,9 @@ static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; @@ -78,7 +79,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tcf_mirred *m; struct net_device *dev; bool exists = false; - int ret; + int ret, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); @@ -93,7 +94,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_MIRRED_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -106,6 +110,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } @@ -114,6 +120,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ENODEV; } mac_header_xmit = dev_is_mac_header_xmit(dev); @@ -123,18 +131,20 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!exists) { if (!dev) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } m = to_mirred(*a); @@ -250,8 +260,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, - .refcnt = m->tcf_refcnt - ref, - .bindcnt = m->tcf_bindcnt - bind, + .refcnt = refcount_read(&m->tcf_refcnt) - ref, + .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, .eaction = m->tcfm_eaction, .ifindex = dev ? dev->ifindex : 0, }; @@ -321,6 +331,13 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) return rtnl_dereference(m->tcfm_dev); } +static int tcf_mirred_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, @@ -334,6 +351,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .delete = tcf_mirred_delete, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4b5848b6c252..4dd9188a72fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; @@ -57,18 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_nat_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } + } else { + return err; } p = to_tcf_nat(*a); @@ -257,8 +263,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, .index = p->tcf_index, .action = p->tcf_action, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; @@ -294,6 +300,13 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_nat_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -303,6 +316,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .delete = tcf_nat_delete, .size = sizeof(struct tcf_nat), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 8a925c72db5f..cc8ffcd1ddb5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -132,20 +132,23 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; - struct nlattr *pattr; - struct tc_pedit *parm; - int ret = 0, err; - struct tcf_pedit *p; struct tc_pedit_key *keys = NULL; struct tcf_pedit_key_ex *keys_ex; + struct tc_pedit *parm; + struct nlattr *pattr; + struct tcf_pedit *p; + int ret = 0, err; int ksize; - if (nla == NULL) + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) @@ -154,47 +157,62 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; - if (!pattr) + if (!pattr) { + NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; + } parm = nla_data(pattr); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (nla_len(pattr) < sizeof(*parm) + ksize) + if (nla_len(pattr) < sizeof(*parm) + ksize) { + NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); return -EINVAL; + } keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); - if (!tcf_idr_check(tn, parm->index, a, bind)) { - if (!parm->nkeys) - return -EINVAL; + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { + if (!parm->nkeys) { + tcf_idr_cleanup(tn, parm->index); + NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); + ret = -EINVAL; + goto out_free; + } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); - if (ret) - return ret; + if (ret) { + tcf_idr_cleanup(tn, parm->index); + goto out_free; + } p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); - if (keys == NULL) { + if (!keys) { tcf_idr_release(*a, bind); - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) - return 0; - tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + goto out_free; + if (!ovr) { + tcf_idr_release(*a, bind); + ret = -EEXIST; + goto out_free; + } p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } } + } else { + return err; } spin_lock_bh(&p->tcf_lock); @@ -214,12 +232,17 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +out_free: + kfree(keys_ex); + return ret; + } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); kfree(p->tcfp_keys_ex); } @@ -284,11 +307,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; - enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_header_type htype = + TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr, _data; + u32 *ptr, hdata; int offset = tkey->off; int hoffset; u32 val; @@ -303,39 +327,39 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { - pr_info("tc filter pedit bad header type specified (0x%x)\n", + pr_info("tc action pedit bad header type specified (0x%x)\n", htype); goto bad; } if (tkey->offmask) { - char *d, _d; + u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { - pr_info("tc filter pedit 'at' offset %d out of bounds\n", + pr_info("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } - d = skb_header_pointer(skb, hoffset + tkey->at, 1, - &_d); + d = skb_header_pointer(skb, hoffset + tkey->at, + sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { - pr_info("tc filter pedit" - " offset must be on 32 bit boundaries\n"); + pr_info("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } if (!offset_valid(skb, hoffset + offset)) { - pr_info("tc filter pedit offset %d out of bounds\n", + pr_info("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } - ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data); + ptr = skb_header_pointer(skb, hoffset + offset, + sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ @@ -347,19 +371,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, val = (*ptr + tkey->val) & ~tkey->mask; break; default: - pr_info("tc filter pedit bad command (%d)\n", + pr_info("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &_data) + if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; - } else + } else { WARN(1, "pedit BUG: index %d\n", p->tcf_index); + } bad: p->tcf_qstats.overlimits++; @@ -391,8 +416,8 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->nkeys = p->tcfp_nkeys; opt->flags = p->tcfp_flags; opt->action = p->tcf_action; - opt->refcnt = p->tcf_refcnt - ref; - opt->bindcnt = p->tcf_bindcnt - bind; + opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; + opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (p->tcfp_keys_ex) { tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); @@ -435,6 +460,13 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_pedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -445,6 +477,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .delete = tcf_pedit_delete, .size = sizeof(struct tcf_pedit), }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 4e72bc2a0dfb..1f3192ea8df7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -75,7 +75,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { int ret = 0, err; @@ -101,20 +101,24 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, NULL, a, &act_police_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } police = to_police(*a); @@ -195,8 +199,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return err; } @@ -274,8 +277,8 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, .action = police->tcf_action, .mtu = police->tcfp_mtu, .burst = PSCHED_NS2TICKS(police->tcfp_burst), - .refcnt = police->tcf_refcnt - ref, - .bindcnt = police->tcf_bindcnt - bind, + .refcnt = refcount_read(&police->tcf_refcnt) - ref, + .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; @@ -314,6 +317,13 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_police_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_idr_delete_index(tn, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -327,6 +337,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .delete = tcf_police_delete, .size = sizeof(struct tcf_police), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 5db358497c9e..3079e7be5bde 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -37,7 +37,8 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; @@ -45,7 +46,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct tc_sample *parm; struct tcf_sample *s; bool exists = false; - int ret; + int ret, err; if (!nla) return -EINVAL; @@ -58,20 +59,24 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SAMPLE_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_sample_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } s = to_sample(*a); @@ -80,8 +85,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } RCU_INIT_POINTER(s->psample_group, psample_group); @@ -173,8 +177,8 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tc_sample opt = { .index = s->tcf_index, .action = s->tcf_action, - .refcnt = s->tcf_refcnt - ref, - .bindcnt = s->tcf_bindcnt - bind, + .refcnt = refcount_read(&s->tcf_refcnt) - ref, + .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; @@ -219,6 +223,13 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_sample_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_sample_ops = { .kind = "sample", .type = TCA_ACT_SAMPLE, @@ -229,6 +240,7 @@ static struct tc_action_ops act_sample_ops = { .cleanup = tcf_sample_cleanup, .walk = tcf_sample_walker, .lookup = tcf_sample_search, + .delete = tcf_sample_delete, .size = sizeof(struct tcf_sample), }; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 98c4afe7c15b..aa51152e0066 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; @@ -99,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_simp_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_defact(*a); ret = alloc_defdata(d, tb[TCA_DEF_DATA]); @@ -126,9 +134,10 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } else { d = to_defact(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } reset_policy(d, tb[TCA_DEF_DATA], parm); } @@ -145,8 +154,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; @@ -183,6 +192,13 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_simp_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -193,6 +209,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .delete = tcf_simp_delete, .size = sizeof(struct tcf_defact), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6138d1d71900..da56e6938c9e 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -23,6 +23,9 @@ #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/dsfield.h> #include <linux/tc_act/tc_skbedit.h> #include <net/tc_act/tc_skbedit.h> @@ -34,25 +37,54 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + int action; - spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); - bstats_update(&d->tcf_bstats, skb); - - if (d->flags & SKBEDIT_F_PRIORITY) - skb->priority = d->priority; - if (d->flags & SKBEDIT_F_QUEUE_MAPPING && - skb->dev->real_num_tx_queues > d->queue_mapping) - skb_set_queue_mapping(skb, d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) { - skb->mark &= ~d->mask; - skb->mark |= d->mark & d->mask; + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + rcu_read_lock(); + params = rcu_dereference(d->params); + action = READ_ONCE(d->tcf_action); + + if (params->flags & SKBEDIT_F_PRIORITY) + skb->priority = params->priority; + if (params->flags & SKBEDIT_F_INHERITDSFIELD) { + int wlen = skb_network_offset(skb); + + switch (tc_skb_protocol(skb)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + break; + + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + break; + } } - if (d->flags & SKBEDIT_F_PTYPE) - skb->pkt_type = d->ptype; - - spin_unlock(&d->tcf_lock); - return d->tcf_action; + if (params->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > params->queue_mapping) + skb_set_queue_mapping(skb, params->queue_mapping); + if (params->flags & SKBEDIT_F_MARK) { + skb->mark &= ~params->mask; + skb->mark |= params->mark & params->mask; + } + if (params->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = params->ptype; + +unlock: + rcu_read_unlock(); + return action; +err: + qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); + action = TC_ACT_SHOT; + goto unlock; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -62,13 +94,16 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + struct tcf_skbedit_params *params_old, *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -114,52 +149,76 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, mask = nla_data(tb[TCA_SKBEDIT_MASK]); } + if (tb[TCA_SKBEDIT_FLAGS] != NULL) { + u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); + + if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) + flags |= SKBEDIT_F_INHERITDSFIELD; + } + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!flags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_skbedit_ops, bind, false); - if (ret) + &act_skbedit_ops, bind, true); + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_skbedit(*a); ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } - spin_lock_bh(&d->tcf_lock); + ASSERT_RTNL(); + + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_idr_release(*a, bind); + return -ENOMEM; + } - d->flags = flags; + params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) - d->priority = *priority; + params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) - d->queue_mapping = *queue_mapping; + params_new->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) - d->mark = *mark; + params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) - d->ptype = *ptype; + params_new->ptype = *ptype; /* default behaviour is to use all the bits */ - d->mask = 0xffffffff; + params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) - d->mask = *mask; + params_new->mask = *mask; d->tcf_action = parm->action; - - spin_unlock_bh(&d->tcf_lock); + params_old = rtnl_dereference(d->params); + rcu_assign_pointer(d->params, params_new); + if (params_old) + kfree_rcu(params_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -171,30 +230,39 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; + u64 pure_flags = 0; struct tcf_t t; + params = rtnl_dereference(d->params); + if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PRIORITY) && - nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) + if ((params->flags & SKBEDIT_F_PRIORITY) && + nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) + goto nla_put_failure; + if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && - nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) + if ((params->flags & SKBEDIT_F_MARK) && + nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MARK) && - nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) + if ((params->flags & SKBEDIT_F_PTYPE) && + nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PTYPE) && - nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) + if ((params->flags & SKBEDIT_F_MASK) && + nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MASK) && - nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) + if (params->flags & SKBEDIT_F_INHERITDSFIELD) + pure_flags |= SKBEDIT_F_INHERITDSFIELD; + if (pure_flags != 0 && + nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); @@ -207,6 +275,16 @@ nla_put_failure: return -1; } +static void tcf_skbedit_cleanup(struct tc_action *a) +{ + struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + + params = rcu_dereference_protected(d->params, 1); + if (params) + kfree_rcu(params, rcu); +} + static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, @@ -225,6 +303,13 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -232,8 +317,10 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .cleanup = tcf_skbedit_cleanup, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .delete = tcf_skbedit_delete, .size = sizeof(struct tcf_skbedit), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ad050d7d4b46..cdc6bacfb190 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -84,7 +84,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; @@ -127,27 +128,33 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!lflags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_skbmod_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } d = to_skbmod(*a); @@ -155,8 +162,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } @@ -205,8 +211,8 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); struct tc_skbmod opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; @@ -252,6 +258,13 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbmod_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .type = TCA_ACT_SKBMOD, @@ -262,6 +275,7 @@ static struct tc_action_ops act_skbmod_ops = { .cleanup = tcf_skbmod_cleanup, .walk = tcf_skbmod_walker, .lookup = tcf_skbmod_search, + .delete = tcf_skbmod_delete, .size = sizeof(struct tcf_skbmod), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 9bc6c2ae98a5..f811850fd1d0 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/geneve.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -57,6 +58,135 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, return action; } +static const struct nla_policy +enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, + .len = 128 }, +}; + +static int +tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1]; + int err, data_len, opt_len; + u8 *data; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data"); + return -EINVAL; + } + + data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + if (data_len < 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); + return -ERANGE; + } + if (data_len % 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); + return -ERANGE; + } + + opt_len = sizeof(struct geneve_opt) + data_len; + if (dst) { + struct geneve_opt *opt = dst; + + WARN_ON(dst_len < opt_len); + + opt->opt_class = + nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]); + opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]); + opt->length = data_len / 4; /* length is in units of 4 bytes */ + opt->r1 = 0; + opt->r2 = 0; + opt->r3 = 0; + + memcpy(opt + 1, data, data_len); + } + + return opt_len; +} + +static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, + int dst_len, struct netlink_ext_ack *extack) +{ + int err, rem, opt_len, len = nla_len(nla), opts_len = 0; + const struct nlattr *attr, *head = nla_data(nla); + + err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + + nla_for_each_attr(attr, head, len, rem) { + switch (nla_type(attr)) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: + opt_len = tunnel_key_copy_geneve_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (dst) { + dst_len -= opt_len; + dst += opt_len; + } + break; + } + } + + if (!opts_len) { + NL_SET_ERR_MSG(extack, "Empty list of tunnel options"); + return -EINVAL; + } + + if (rem > 0) { + NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes"); + return -EINVAL; + } + + return opts_len; +} + +static int tunnel_key_get_opts_len(struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + return tunnel_key_copy_opts(nla, NULL, 0, extack); +} + +static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, + int opts_len, struct netlink_ext_ack *extack) +{ + info->options_len = opts_len; + switch (nla_type(nla_data(nla))) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif + default: + NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); + return -EINVAL; + } +} + static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, @@ -66,11 +196,15 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 }, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; @@ -81,24 +215,35 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; + int opts_len = 0; __be64 key_id; __be16 flags; + u8 tos, ttl; int ret = 0; int err; - if (!nla) + if (!nla) { + NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, - NULL); - if (err < 0) + extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes"); return err; + } - if (!tb[TCA_TUNNEL_KEY_PARMS]) + if (!tb[TCA_TUNNEL_KEY_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key parameters"); return -EINVAL; + } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -107,6 +252,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, break; case TCA_TUNNEL_KEY_ACT_SET: if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key id"); ret = -EINVAL; goto err_out; } @@ -121,6 +267,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); + if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) { + opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS], + extack); + if (opts_len < 0) { + ret = opts_len; + goto err_out; + } + } + + tos = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TOS]) + tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]); + ttl = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TTL]) + ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]); + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; @@ -129,9 +291,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); - metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl, dst_port, flags, - key_id, 0); + key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { struct in6_addr saddr; @@ -140,19 +302,33 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); - metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, + metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, key_id, 0); + } else { + NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); + ret = -EINVAL; + goto err_out; } if (!metadata) { - ret = -EINVAL; + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst"); + ret = -ENOMEM; goto err_out; } + if (opts_len) { + ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS], + &metadata->u.tun_info, + opts_len, extack); + if (ret < 0) + goto err_out; + } + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: + NL_SET_ERR_MSG(extack, "Unknown tunnel key action"); ret = -EINVAL; goto err_out; } @@ -160,14 +336,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_tunnel_key_ops, bind, true); - if (ret) - return ret; + if (ret) { + NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); + goto err_out; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + NL_SET_ERR_MSG(extack, "TC IDR already exists"); + return -EEXIST; } t = to_tunnel_key(*a); @@ -175,8 +353,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } @@ -199,6 +377,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, err_out: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return ret; } @@ -216,6 +396,61 @@ static void tunnel_key_release(struct tc_action *a) } } +static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + int len = info->options_len; + u8 *src = (u8 *)(info + 1); + struct nlattr *start; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); + if (!start) + return -EMSGSIZE; + + while (len > 0) { + struct geneve_opt *opt = (struct geneve_opt *)src; + + if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, + opt->type) || + nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, + opt->length * 4, opt + 1)) + return -EMSGSIZE; + + len -= sizeof(struct geneve_opt) + opt->length * 4; + src += sizeof(struct geneve_opt) + opt->length * 4; + } + + nla_nest_end(skb, start); + return 0; +} + +static int tunnel_key_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct nlattr *start; + int err; + + if (!info->options_len) + return 0; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS); + if (!start) + return -EMSGSIZE; + + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + err = tunnel_key_geneve_opts_dump(skb, info); + if (err) + return err; + } else { + return -EINVAL; + } + + nla_nest_end(skb, start); + return 0; +} + static int tunnel_key_dump_addresses(struct sk_buff *skb, const struct ip_tunnel_info *info) { @@ -252,8 +487,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_tunnel_key_params *params; struct tc_tunnel_key opt = { .index = t->tcf_index, - .refcnt = t->tcf_refcnt - ref, - .bindcnt = t->tcf_bindcnt - bind, + .refcnt = refcount_read(&t->tcf_refcnt) - ref, + .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, .action = t->tcf_action, }; struct tcf_t tm; @@ -266,8 +501,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { - struct ip_tunnel_key *key = - ¶ms->tcft_enc_metadata->u.tun_info.key; + struct ip_tunnel_info *info = + ¶ms->tcft_enc_metadata->u.tun_info; + struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || @@ -275,7 +511,14 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, ¶ms->tcft_enc_metadata->u.tun_info) || nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, - !(key->tun_flags & TUNNEL_CSUM))) + !(key->tun_flags & TUNNEL_CSUM)) || + tunnel_key_opts_dump(skb, info)) + goto nla_put_failure; + + if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos)) + goto nla_put_failure; + + if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl)) goto nla_put_failure; } @@ -309,6 +552,13 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tunnel_key_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", .type = TCA_ACT_TUNNEL_KEY, @@ -319,6 +569,7 @@ static struct tc_action_ops act_tunnel_key_ops = { .cleanup = tunnel_key_release, .walk = tunnel_key_walker, .lookup = tunnel_key_search, + .delete = tunnel_key_delete, .size = sizeof(struct tcf_tunnel_key), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1fb39e1f9d07..ad37f308175a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; @@ -133,7 +134,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -145,12 +149,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ERANGE; } @@ -163,6 +171,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EPROTONOSUPPORT; } } else { @@ -175,6 +185,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } action = parm->v_action; @@ -182,14 +194,15 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_vlan_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } v = to_vlan(*a); @@ -197,8 +210,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } @@ -239,8 +251,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, - .refcnt = v->tcf_refcnt - ref, - .bindcnt = v->tcf_bindcnt - bind, + .refcnt = refcount_read(&v->tcf_refcnt) - ref, + .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, .action = v->tcf_action, .v_action = p->tcfv_action, }; @@ -286,6 +298,13 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_vlan_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -296,6 +315,7 @@ static struct tc_action_ops act_vlan_ops = { .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .delete = tcf_vlan_delete, .size = sizeof(struct tcf_vlan), }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f74513a7c7a8..623fe2cfe529 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -277,18 +277,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, - enum tc_block_command command) + enum tc_block_command command, + struct netlink_ext_ack *extack) { struct tc_block_offload bo = {}; bo.command = command; bo.binder_type = ei->binder_type; bo.block = block; + bo.extack = extack; return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei) + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { struct net_device *dev = q->dev_queue->dev; int err; @@ -299,10 +302,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) + if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); return -EOPNOTSUPP; + } - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; return err; @@ -322,7 +327,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; return; @@ -612,7 +617,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, if (err) goto err_chain_head_change_cb_add; - err = tcf_block_offload_bind(block, q, ei); + err = tcf_block_offload_bind(block, q, ei, extack); if (err) goto err_block_offload_bind; @@ -746,18 +751,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) } EXPORT_SYMBOL(tcf_block_cb_decref); +static int +tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv, bool add, bool offload_in_use, + struct netlink_ext_ack *extack) +{ + struct tcf_chain *chain; + struct tcf_proto *tp; + int err; + + list_for_each_entry(chain, &block->chain_list, list) { + for (tp = rtnl_dereference(chain->filter_chain); tp; + tp = rtnl_dereference(tp->next)) { + if (tp->ops->reoffload) { + err = tp->ops->reoffload(tp, add, cb, cb_priv, + extack); + if (err && add) + goto err_playback_remove; + } else if (add && offload_in_use) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support"); + goto err_playback_remove; + } + } + } + + return 0; + +err_playback_remove: + tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, + extack); + return err; +} + struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; + int err; - /* At this point, playback of previous block cb calls is not supported, - * so forbid to register to block which already has some offloaded - * filters present. - */ - if (tcf_block_offload_in_use(block)) - return ERR_PTR(-EOPNOTSUPP); + /* Replay any already present rules */ + err = tcf_block_playback_offloads(block, cb, cb_priv, true, + tcf_block_offload_in_use(block), + extack); + if (err) + return ERR_PTR(err); block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); if (!block_cb) @@ -772,17 +812,22 @@ EXPORT_SYMBOL(__tcf_block_cb_register); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; - block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, + extack); return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; } EXPORT_SYMBOL(tcf_block_cb_register); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { + tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, + false, tcf_block_offload_in_use(block), + NULL); list_del(&block_cb->list); kfree(block_cb); } @@ -796,7 +841,7 @@ void tcf_block_cb_unregister(struct tcf_block *block, block_cb = tcf_block_cb_lookup(block, cb, cb_ident); if (!block_cb) return; - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); } EXPORT_SYMBOL(tcf_block_cb_unregister); @@ -1463,7 +1508,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; + arg.w.cookie = cb->args[2]; tp->ops->walk(tp, &arg.w); + cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) return false; @@ -1564,11 +1611,7 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - LIST_HEAD(actions); - - ASSERT_RTNL(); - tcf_exts_to_list(exts, &actions); - tcf_action_destroy(&actions, TCA_ACT_UNBIND); + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); exts->nr_actions = 0; #endif @@ -1587,7 +1630,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, extack); + TCA_ACT_BIND, true, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1595,17 +1638,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, exts->actions[0] = act; exts->nr_actions = 1; } else if (exts->action && tb[exts->action]) { - LIST_HEAD(actions); - int err, i = 0; + int err; err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, &attr_size, extack); - if (err) + exts->actions, &attr_size, true, + extack); + if (err < 0) return err; - list_for_each_entry(act, &actions, list) - exts->actions[i++] = act; - exts->nr_actions = i; + exts->nr_actions = err; } exts->net = net; } @@ -1654,14 +1695,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { - LIST_HEAD(actions); - nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - tcf_exts_to_list(exts, &actions); - if (tcf_action_dump(skb, &actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1aa7f6511065..66e0ac9811f9 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -43,6 +43,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; u32 gen_flags; + unsigned int in_hw_count; struct tcf_exts exts; u32 handle; u16 bpf_num_ops; @@ -174,6 +175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf_offload_cmd(tp, oldprog, prog, extack); return err; } else if (err > 0) { + prog->in_hw_count = err; tcf_block_offload_inc(block, &prog->gen_flags); } } @@ -652,6 +654,42 @@ skip: } } +static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_bpf_head *head = rtnl_dereference(tp->root); + struct tcf_block *block = tp->chain->block; + struct tc_cls_bpf_offload cls_bpf = {}; + struct cls_bpf_prog *prog; + int err; + + list_for_each_entry(prog, &head->plist, link) { + if (tc_skip_hw(prog->gen_flags)) + continue; + + tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, + extack); + cls_bpf.command = TC_CLSBPF_OFFLOAD; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = add ? prog->filter : NULL; + cls_bpf.oldprog = add ? NULL : prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + + err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv); + if (err) { + if (add && tc_skip_sw(prog->gen_flags)) + return err; + continue; + } + + tc_cls_offload_cnt_update(block, &prog->in_hw_count, + &prog->gen_flags, add); + } + + return 0; +} + static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .kind = "bpf", .owner = THIS_MODULE, @@ -662,6 +700,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .change = cls_bpf_change, .delete = cls_bpf_delete, .walk = cls_bpf_walk, + .reoffload = cls_bpf_reoffload, .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9e8b26a80fb3..38d74803e2df 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -35,6 +35,7 @@ struct fl_flow_key { struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; @@ -51,6 +52,7 @@ struct fl_flow_key { struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ip ip; + struct flow_dissector_key_ip enc_ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -87,6 +89,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -289,6 +292,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, fl_hw_destroy_filter(tp, f, NULL); return err; } else if (err > 0) { + f->in_hw_count = err; tcf_block_offload_inc(block, &f->flags); } @@ -447,6 +451,13 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -498,22 +509,26 @@ static int fl_set_key_mpls(struct nlattr **tb, } static void fl_set_key_vlan(struct nlattr **tb, + __be16 ethertype, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { #define VLAN_PRIORITY_MASK 0x7 - if (tb[TCA_FLOWER_KEY_VLAN_ID]) { + if (tb[vlan_id_key]) { key_val->vlan_id = - nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; + nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK; key_mask->vlan_id = VLAN_VID_MASK; } - if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { + if (tb[vlan_prio_key]) { key_val->vlan_priority = - nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & + nla_get_u8(tb[vlan_prio_key]) & VLAN_PRIORITY_MASK; key_mask->vlan_priority = VLAN_PRIORITY_MASK; } + key_val->vlan_tpid = ethertype; + key_mask->vlan_tpid = cpu_to_be16(~0); } static void fl_set_key_flag(u32 flower_key, u32 flower_mask, @@ -551,17 +566,17 @@ static int fl_set_key_flags(struct nlattr **tb, return 0; } -static void fl_set_key_ip(struct nlattr **tb, +static void fl_set_key_ip(struct nlattr **tb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, - &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, - sizeof(key->tos)); + int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; - fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, - &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, - sizeof(key->ttl)); + fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)); + fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)); } static int fl_set_key(struct net *net, struct nlattr **tb, @@ -590,12 +605,28 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); - if (ethertype == htons(ETH_P_8021Q)) { - fl_set_key_vlan(tb, &key->vlan, &mask->vlan); - fl_set_key_val(tb, &key->basic.n_proto, - TCA_FLOWER_KEY_VLAN_ETH_TYPE, - &mask->basic.n_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, + &mask->vlan); + + if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) { + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, + TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + &mask->basic.n_proto, + TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } + } } else { key->basic.n_proto = ethertype; mask->basic.n_proto = cpu_to_be16(~0); @@ -607,7 +638,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); - fl_set_key_ip(tb, &key->ip, &mask->ip); + fl_set_key_ip(tb, false, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { @@ -742,6 +773,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); + fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); @@ -821,6 +854,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask) FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_CVLAN, cvlan); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4); @@ -832,6 +867,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask) enc_control); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); skb_flow_dissector_init(&mask->dissector, keys, cnt); } @@ -1071,20 +1108,59 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; + + arg->count = arg->skip; + + while ((f = idr_get_next_ul(&head->handle_idr, + &arg->cookie)) != NULL) { + if (arg->fn(tp, f, arg) < 0) { + arg->stop = 1; + break; + } + arg->cookie = f->handle + 1; + arg->count++; + } +} + +static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; struct fl_flow_mask *mask; + struct cls_fl_filter *f; + int err; - list_for_each_entry_rcu(mask, &head->masks, list) { - list_for_each_entry_rcu(f, &mask->filters, list) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; - break; + list_for_each_entry(mask, &head->masks, list) { + list_for_each_entry(f, &mask->filters, list) { + if (tc_skip_hw(f->flags)) + continue; + + tc_cls_common_offload_init(&cls_flower.common, tp, + f->flags, extack); + cls_flower.command = add ? + TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; + cls_flower.cookie = (unsigned long)f; + cls_flower.dissector = &mask->dissector; + cls_flower.mask = &f->mkey; + cls_flower.key = &f->key; + cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; + + err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + if (err) { + if (add && tc_skip_sw(f->flags)) + return err; + continue; } -skip: - arg->count++; + + tc_cls_offload_cnt_update(block, &f->in_hw_count, + &f->flags, add); } } + + return 0; } static int fl_dump_key_val(struct sk_buff *skb, @@ -1141,20 +1217,24 @@ static int fl_dump_key_mpls(struct sk_buff *skb, return 0; } -static int fl_dump_key_ip(struct sk_buff *skb, +static int fl_dump_key_ip(struct sk_buff *skb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, - TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || - fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, - TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) + int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; + + if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) || + fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl))) return -1; return 0; } static int fl_dump_key_vlan(struct sk_buff *skb, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) { @@ -1163,13 +1243,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb, if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) return 0; if (vlan_mask->vlan_id) { - err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, + err = nla_put_u16(skb, vlan_id_key, vlan_key->vlan_id); if (err) return err; } if (vlan_mask->vlan_priority) { - err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, + err = nla_put_u8(skb, vlan_prio_key, vlan_key->vlan_priority); if (err) return err; @@ -1264,15 +1344,36 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) goto nla_put_failure; - if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan)) + goto nla_put_failure; + + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan) || + (mask->cvlan.vlan_tpid && + nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->cvlan.vlan_tpid))) goto nla_put_failure; + if (mask->basic.n_proto) { + if (mask->cvlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } else if (mask->vlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } + } + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)) || - fl_dump_key_ip(skb, &key->ip, &mask->ip))) + fl_dump_key_ip(skb, false, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && @@ -1397,7 +1498,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, TCA_FLOWER_KEY_ENC_UDP_DST_PORT, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, - sizeof(key->enc_tp.dst))) + sizeof(key->enc_tp.dst)) || + fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip)) goto nla_put_failure; if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) @@ -1438,6 +1540,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .change = fl_change, .delete = fl_delete, .walk = fl_walk, + .reoffload = fl_reoffload, .dump = fl_dump, .bind_class = fl_bind_class, .owner = THIS_MODULE, diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 47b207ef7762..af16f36ed578 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_head { struct tcf_result res; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; }; @@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, mall_destroy_hw_filter(tp, head, cookie, NULL); return err; } else if (err > 0) { + head->in_hw_count = err; tcf_block_offload_inc(block, &head->flags); } @@ -235,6 +237,35 @@ skip: arg->count++; } +static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + int err; + + if (tc_skip_hw(head->flags)) + return 0; + + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); + cls_mall.command = add ? + TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; + cls_mall.exts = &head->exts; + cls_mall.cookie = (unsigned long)head; + + err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv); + if (err) { + if (add && tc_skip_sw(head->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add); + + return 0; +} + static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { @@ -289,6 +320,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = { .change = mall_change, .delete = mall_delete, .walk = mall_walk, + .reoffload = mall_reoffload, .dump = mall_dump, .bind_class = mall_bind_class, .owner = THIS_MODULE, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index fb861f90fde6..d5d2a6dc3921 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -62,6 +62,7 @@ struct tc_u_knode { struct tc_u32_pcnt __percpu *pf; #endif u32 flags; + unsigned int in_hw_count; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; @@ -571,6 +572,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32_remove_hw_knode(tp, n, NULL); return err; } else if (err > 0) { + n->in_hw_count = err; tcf_block_offload_inc(block, &n->flags); } @@ -1199,6 +1201,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } +static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack); + cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE; + cls_u32.hnode.divisor = ht->divisor; + cls_u32.hnode.handle = ht->handle; + cls_u32.hnode.prio = ht->prio; + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err && add && tc_skip_sw(ht->flags)) + return err; + + return 0; +} + +static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); + struct tcf_block *block = tp->chain->block; + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); + cls_u32.command = add ? + TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE; + cls_u32.knode.handle = n->handle; + + if (add) { + cls_u32.knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + cls_u32.knode.val = n->val; + cls_u32.knode.mask = n->mask; +#else + cls_u32.knode.val = 0; + cls_u32.knode.mask = 0; +#endif + cls_u32.knode.sel = &n->sel; + cls_u32.knode.exts = &n->exts; + if (n->ht_down) + cls_u32.knode.link_handle = ht->handle; + } + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err) { + if (add && tc_skip_sw(n->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add); + + return 0; +} + +static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned int h; + int err; + + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { + if (ht->prio != tp->prio) + continue; + + /* When adding filters to a new dev, try to offload the + * hashtable first. When removing, do the filters before the + * hashtable. + */ + if (add && !tc_skip_hw(ht->flags)) { + err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv, + extack); + if (err) + return err; + } + + for (h = 0; h <= ht->divisor; h++) { + for (n = rtnl_dereference(ht->ht[h]); + n; + n = rtnl_dereference(n->next)) { + if (tc_skip_hw(n->flags)) + continue; + + err = u32_reoffload_knode(tp, n, add, cb, + cb_priv, extack); + if (err) + return err; + } + } + + if (!add && !tc_skip_hw(ht->flags)) + u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); + } + + return 0; +} + static void u32_bind_class(void *fh, u32 classid, unsigned long cl) { struct tc_u_knode *n = fh; @@ -1336,6 +1446,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .change = u32_change, .delete = u32_delete, .walk = u32_walk, + .reoffload = u32_reoffload, .dump = u32_dump, .bind_class = u32_bind_class, .owner = THIS_MODULE, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 54eca685420f..98541c6399db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c new file mode 100644 index 000000000000..539c9490c308 --- /dev/null +++ b/net/sched/sch_cake.c @@ -0,0 +1,3019 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* COMMON Applications Kept Enhanced (CAKE) discipline + * + * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> + * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> + * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> + * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> + * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> + * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> + * + * The CAKE Principles: + * (or, how to have your cake and eat it too) + * + * This is a combination of several shaping, AQM and FQ techniques into one + * easy-to-use package: + * + * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE + * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), + * eliminating the need for any sort of burst parameter (eg. token bucket + * depth). Burst support is limited to that necessary to overcome scheduling + * latency. + * + * - A Diffserv-aware priority queue, giving more priority to certain classes, + * up to a specified fraction of bandwidth. Above that bandwidth threshold, + * the priority is reduced to avoid starving other tins. + * + * - Each priority tin has a separate Flow Queue system, to isolate traffic + * flows from each other. This prevents a burst on one flow from increasing + * the delay to another. Flows are distributed to queues using a + * set-associative hash function. + * + * - Each queue is actively managed by Cobalt, which is a combination of the + * Codel and Blue AQM algorithms. This serves flows fairly, and signals + * congestion early via ECN (if available) and/or packet drops, to keep + * latency low. The codel parameters are auto-tuned based on the bandwidth + * setting, as is necessary at low bandwidths. + * + * The configuration parameters are kept deliberately simple for ease of use. + * Everything has sane defaults. Complete generality of configuration is *not* + * a goal. + * + * The priority queue operates according to a weighted DRR scheme, combined with + * a bandwidth tracker which reuses the shaper logic to detect which side of the + * bandwidth sharing threshold the tin is operating. This determines whether a + * priority-based weight (high) or a bandwidth-based weight (low) is used for + * that tin in the current pass. + * + * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly + * granted us permission to leverage. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/reciprocal_div.h> +#include <net/netlink.h> +#include <linux/version.h> +#include <linux/if_vlan.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> +#include <net/tcp.h> +#include <net/flow_dissector.h> + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack_core.h> +#endif + +#define CAKE_SET_WAYS (8) +#define CAKE_MAX_TINS (8) +#define CAKE_QUEUES (1024) +#define CAKE_FLOW_MASK 63 +#define CAKE_FLOW_NAT_FLAG 64 +#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */ + +/* struct cobalt_params - contains codel and blue parameters + * @interval: codel initial drop rate + * @target: maximum persistent sojourn time & blue update rate + * @mtu_time: serialisation delay of maximum-size packet + * @p_inc: increment of blue drop probability (0.32 fxp) + * @p_dec: decrement of blue drop probability (0.32 fxp) + */ +struct cobalt_params { + u64 interval; + u64 target; + u64 mtu_time; + u32 p_inc; + u32 p_dec; +}; + +/* struct cobalt_vars - contains codel and blue variables + * @count: codel dropping frequency + * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 + * @drop_next: time to drop next packet, or when we dropped last + * @blue_timer: Blue time to next drop + * @p_drop: BLUE drop probability (0.32 fxp) + * @dropping: set if in dropping state + * @ecn_marked: set if marked + */ +struct cobalt_vars { + u32 count; + u32 rec_inv_sqrt; + ktime_t drop_next; + ktime_t blue_timer; + u32 p_drop; + bool dropping; + bool ecn_marked; +}; + +enum { + CAKE_SET_NONE = 0, + CAKE_SET_SPARSE, + CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ + CAKE_SET_BULK, + CAKE_SET_DECAYING +}; + +struct cake_flow { + /* this stuff is all needed per-flow at dequeue time */ + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + s32 deficit; + u32 dropped; + struct cobalt_vars cvars; + u16 srchost; /* index into cake_host table */ + u16 dsthost; + u8 set; +}; /* please try to keep this structure <= 64 bytes */ + +struct cake_host { + u32 srchost_tag; + u32 dsthost_tag; + u16 srchost_refcnt; + u16 dsthost_refcnt; +}; + +struct cake_heap_entry { + u16 t:3, b:10; +}; + +struct cake_tin_data { + struct cake_flow flows[CAKE_QUEUES]; + u32 backlogs[CAKE_QUEUES]; + u32 tags[CAKE_QUEUES]; /* for set association */ + u16 overflow_idx[CAKE_QUEUES]; + struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ + u16 flow_quantum; + + struct cobalt_params cparams; + u32 drop_overlimit; + u16 bulk_flow_count; + u16 sparse_flow_count; + u16 decaying_flow_count; + u16 unresponsive_flow_count; + + u32 max_skblen; + + struct list_head new_flows; + struct list_head old_flows; + struct list_head decaying_flows; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + ktime_t time_next_packet; + u64 tin_rate_ns; + u64 tin_rate_bps; + u16 tin_rate_shft; + + u16 tin_quantum_prio; + u16 tin_quantum_band; + s32 tin_deficit; + u32 tin_backlog; + u32 tin_dropped; + u32 tin_ecn_mark; + + u32 packets; + u64 bytes; + + u32 ack_drops; + + /* moving averages */ + u64 avge_delay; + u64 peak_delay; + u64 base_delay; + + /* hash function stats */ + u32 way_directs; + u32 way_hits; + u32 way_misses; + u32 way_collisions; +}; /* number of tins is small, so size of this struct doesn't matter much */ + +struct cake_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct cake_tin_data *tins; + + struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; + u16 overflow_timeout; + + u16 tin_cnt; + u8 tin_mode; + u8 flow_mode; + u8 ack_filter; + u8 atm_mode; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + u16 rate_shft; + ktime_t time_next_packet; + ktime_t failsafe_next_packet; + u64 rate_ns; + u64 rate_bps; + u16 rate_flags; + s16 rate_overhead; + u16 rate_mpu; + u64 interval; + u64 target; + + /* resource tracking */ + u32 buffer_used; + u32 buffer_max_used; + u32 buffer_limit; + u32 buffer_config_limit; + + /* indices for dequeue */ + u16 cur_tin; + u16 cur_flow; + + struct qdisc_watchdog watchdog; + const u8 *tin_index; + const u8 *tin_order; + + /* bandwidth capacity estimate */ + ktime_t last_packet_time; + ktime_t avg_window_begin; + u64 avg_packet_interval; + u64 avg_window_bytes; + u64 avg_peak_bandwidth; + ktime_t last_reconfig_time; + + /* packet length stats */ + u32 avg_netoff; + u16 max_netlen; + u16 max_adjlen; + u16 min_netlen; + u16 min_adjlen; +}; + +enum { + CAKE_FLAG_OVERHEAD = BIT(0), + CAKE_FLAG_AUTORATE_INGRESS = BIT(1), + CAKE_FLAG_INGRESS = BIT(2), + CAKE_FLAG_WASH = BIT(3), + CAKE_FLAG_SPLIT_GSO = BIT(4) +}; + +/* COBALT operates the Codel and BLUE algorithms in parallel, in order to + * obtain the best features of each. Codel is excellent on flows which + * respond to congestion signals in a TCP-like way. BLUE is more effective on + * unresponsive flows. + */ + +struct cobalt_skb_cb { + ktime_t enqueue_time; + u32 adjusted_len; +}; + +static u64 us_to_ns(u64 us) +{ + return us * NSEC_PER_USEC; +} + +static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); + return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) +{ + return get_cobalt_cb(skb)->enqueue_time; +} + +static void cobalt_set_enqueue_time(struct sk_buff *skb, + ktime_t now) +{ + get_cobalt_cb(skb)->enqueue_time = now; +} + +static u16 quantum_div[CAKE_QUEUES + 1] = {0}; + +/* Diffserv lookup tables */ + +static const u8 precedence[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static const u8 diffserv8[] = { + 2, 5, 1, 2, 4, 2, 2, 2, + 0, 2, 1, 2, 1, 2, 1, 2, + 5, 2, 4, 2, 4, 2, 4, 2, + 3, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 2, 2, 6, 2, 6, 2, + 7, 2, 2, 2, 2, 2, 2, 2, + 7, 2, 2, 2, 2, 2, 2, 2, +}; + +static const u8 diffserv4[] = { + 0, 2, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 0, 0, 3, 0, 3, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 diffserv3[] = { + 0, 0, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, 2, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 besteffort[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* tin priority order for stats dumping */ + +static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; +static const u8 bulk_order[] = {1, 0, 2, 3}; + +#define REC_INV_SQRT_CACHE (16) +static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; + +/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots + * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) + * + * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 + */ + +static void cobalt_newton_step(struct cobalt_vars *vars) +{ + u32 invsqrt, invsqrt2; + u64 val; + + invsqrt = vars->rec_inv_sqrt; + invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; + val = (3LL << 32) - ((u64)vars->count * invsqrt2); + + val >>= 2; /* avoid overflow in following multiply */ + val = (val * invsqrt) >> (32 - 2 + 1); + + vars->rec_inv_sqrt = val; +} + +static void cobalt_invsqrt(struct cobalt_vars *vars) +{ + if (vars->count < REC_INV_SQRT_CACHE) + vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; + else + cobalt_newton_step(vars); +} + +/* There is a big difference in timing between the accurate values placed in + * the cache and the approximations given by a single Newton step for small + * count values, particularly when stepping from count 1 to 2 or vice versa. + * Above 16, a single Newton step gives sufficient accuracy in either + * direction, given the precision stored. + * + * The magnitude of the error when stepping up to count 2 is such as to give + * the value that *should* have been produced at count 4. + */ + +static void cobalt_cache_init(void) +{ + struct cobalt_vars v; + + memset(&v, 0, sizeof(v)); + v.rec_inv_sqrt = ~0U; + cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; + + for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + + cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; + } +} + +static void cobalt_vars_init(struct cobalt_vars *vars) +{ + memset(vars, 0, sizeof(*vars)); + + if (!cobalt_rec_inv_sqrt_cache[0]) { + cobalt_cache_init(); + cobalt_rec_inv_sqrt_cache[0] = ~0; + } +} + +/* CoDel control_law is t + interval/sqrt(count) + * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid + * both sqrt() and divide operation. + */ +static ktime_t cobalt_control(ktime_t t, + u64 interval, + u32 rec_inv_sqrt) +{ + return ktime_add_ns(t, reciprocal_scale(interval, + rec_inv_sqrt)); +} + +/* Call this when a packet had to be dropped due to queue overflow. Returns + * true if the BLUE state was quiescent before but active after this call. + */ +static bool cobalt_queue_full(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool up = false; + + if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + up = !vars->p_drop; + vars->p_drop += p->p_inc; + if (vars->p_drop < p->p_inc) + vars->p_drop = ~0; + vars->blue_timer = now; + } + vars->dropping = true; + vars->drop_next = now; + if (!vars->count) + vars->count = 1; + + return up; +} + +/* Call this when the queue was serviced but turned out to be empty. Returns + * true if the BLUE state was active before but quiescent after this call. + */ +static bool cobalt_queue_empty(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool down = false; + + if (vars->p_drop && + ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + if (vars->p_drop < p->p_dec) + vars->p_drop = 0; + else + vars->p_drop -= p->p_dec; + vars->blue_timer = now; + down = !vars->p_drop; + } + vars->dropping = false; + + if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + } + + return down; +} + +/* Call this with a freshly dequeued packet for possible congestion marking. + * Returns true as an instruction to drop the packet, false for delivery. + */ +static bool cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb, + u32 bulk_flows) +{ + bool next_due, over_target, drop = false; + ktime_t schedule; + u64 sojourn; + +/* The 'schedule' variable records, in its sign, whether 'now' is before or + * after 'drop_next'. This allows 'drop_next' to be updated before the next + * scheduling decision is actually branched, without destroying that + * information. Similarly, the first 'schedule' value calculated is preserved + * in the boolean 'next_due'. + * + * As for 'drop_next', we take advantage of the fact that 'interval' is both + * the delay between first exceeding 'target' and the first signalling event, + * *and* the scaling factor for the signalling frequency. It's therefore very + * natural to use a single mechanism for both purposes, and eliminates a + * significant amount of reference Codel's spaghetti code. To help with this, + * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close + * as possible to 1.0 in fixed-point. + */ + + sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + schedule = ktime_sub(now, vars->drop_next); + over_target = sojourn > p->target && + sojourn > p->mtu_time * bulk_flows * 2 && + sojourn > p->mtu_time * 4; + next_due = vars->count && ktime_to_ns(schedule) >= 0; + + vars->ecn_marked = false; + + if (over_target) { + if (!vars->dropping) { + vars->dropping = true; + vars->drop_next = cobalt_control(now, + p->interval, + vars->rec_inv_sqrt); + } + if (!vars->count) + vars->count = 1; + } else if (vars->dropping) { + vars->dropping = false; + } + + if (next_due && vars->dropping) { + /* Use ECN mark if possible, otherwise drop */ + drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + + vars->count++; + if (!vars->count) + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + } else { + while (next_due) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + next_due = vars->count && ktime_to_ns(schedule) >= 0; + } + } + + /* Simple BLUE implementation. Lack of ECN is deliberate. */ + if (vars->p_drop) + drop |= (prandom_u32() < vars->p_drop); + + /* Overload the drop_next field as an activity timeout */ + if (!vars->count) + vars->drop_next = ktime_add_ns(now, p->interval); + else if (ktime_to_ns(schedule) > 0 && !drop) + vars->drop_next = now; + + return drop; +} + +static void cake_update_flowkeys(struct flow_keys *keys, + const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct nf_conntrack_tuple tuple = {}; + bool rev = !skb->_nfct; + + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + return; + + if (!nf_ct_get_tuple_skb(&tuple, skb)) + return; + + keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + + if (keys->ports.ports) { + keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; + keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + } +#endif +} + +/* Cake has several subtle multiple bit settings. In these cases you + * would be matching triple isolate mode as well. + */ + +static bool cake_dsrc(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; +} + +static bool cake_ddst(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; +} + +static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, + int flow_mode) +{ + u32 flow_hash = 0, srchost_hash, dsthost_hash; + u16 reduced_hash, srchost_idx, dsthost_idx; + struct flow_keys keys, host_keys; + + if (unlikely(flow_mode == CAKE_FLOW_NONE)) + return 0; + + skb_flow_dissect_flow_keys(skb, &keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + if (flow_mode & CAKE_FLOW_NAT_FLAG) + cake_update_flowkeys(&keys, skb); + + /* flow_hash_from_keys() sorts the addresses by value, so we have + * to preserve their order in a separate data structure to treat + * src and dst host addresses as independently selectable. + */ + host_keys = keys; + host_keys.ports.ports = 0; + host_keys.basic.ip_proto = 0; + host_keys.keyid.keyid = 0; + host_keys.tags.flow_label = 0; + + switch (host_keys.control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + host_keys.addrs.v4addrs.src = 0; + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + host_keys.addrs.v4addrs.dst = 0; + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + memset(&host_keys.addrs.v6addrs.src, 0, + sizeof(host_keys.addrs.v6addrs.src)); + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + memset(&host_keys.addrs.v6addrs.dst, 0, + sizeof(host_keys.addrs.v6addrs.dst)); + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + default: + dsthost_hash = 0; + srchost_hash = 0; + } + + /* This *must* be after the above switch, since as a + * side-effect it sorts the src and dst addresses. + */ + if (flow_mode & CAKE_FLOW_FLOWS) + flow_hash = flow_hash_from_keys(&keys); + + if (!(flow_mode & CAKE_FLOW_FLOWS)) { + if (flow_mode & CAKE_FLOW_SRC_IP) + flow_hash ^= srchost_hash; + + if (flow_mode & CAKE_FLOW_DST_IP) + flow_hash ^= dsthost_hash; + } + + reduced_hash = flow_hash % CAKE_QUEUES; + + /* set-associative hashing */ + /* fast path if no hash collision (direct lookup succeeds) */ + if (likely(q->tags[reduced_hash] == flow_hash && + q->flows[reduced_hash].set)) { + q->way_directs++; + } else { + u32 inner_hash = reduced_hash % CAKE_SET_WAYS; + u32 outer_hash = reduced_hash - inner_hash; + bool allocate_src = false; + bool allocate_dst = false; + u32 i, k; + + /* check if any active queue in the set is reserved for + * this flow. + */ + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->tags[outer_hash + k] == flow_hash) { + if (i) + q->way_hits++; + + if (!q->flows[outer_hash + k].set) { + /* need to increment host refcnts */ + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + } + + goto found; + } + } + + /* no queue is reserved for this flow, look for an + * empty one. + */ + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->flows[outer_hash + k].set) { + q->way_misses++; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + goto found; + } + } + + /* With no empty queues, default to the original + * queue, accept the collision, update the host tags. + */ + q->way_collisions++; + q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--; + q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); +found: + /* reserve queue for future packets in same flow */ + reduced_hash = outer_hash + k; + q->tags[reduced_hash] = flow_hash; + + if (allocate_src) { + srchost_idx = srchost_hash % CAKE_QUEUES; + inner_hash = srchost_idx % CAKE_SET_WAYS; + outer_hash = srchost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].srchost_tag == + srchost_hash) + goto found_src; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].srchost_refcnt) + break; + } + q->hosts[outer_hash + k].srchost_tag = srchost_hash; +found_src: + srchost_idx = outer_hash + k; + q->hosts[srchost_idx].srchost_refcnt++; + q->flows[reduced_hash].srchost = srchost_idx; + } + + if (allocate_dst) { + dsthost_idx = dsthost_hash % CAKE_QUEUES; + inner_hash = dsthost_idx % CAKE_SET_WAYS; + outer_hash = dsthost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].dsthost_tag == + dsthost_hash) + goto found_dst; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].dsthost_refcnt) + break; + } + q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; +found_dst: + dsthost_idx = outer_hash + k; + q->hosts[dsthost_idx].dsthost_refcnt++; + q->flows[reduced_hash].dsthost = dsthost_idx; + } + } + + return reduced_hash; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ +/* remove one skb from head of slot queue */ + +static struct sk_buff *dequeue_head(struct cake_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + } + + return skb; +} + +/* add skb to flow queue (tail add) */ + +static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, + struct ipv6hdr *buf) +{ + unsigned int offset = skb_network_offset(skb); + struct iphdr *iph; + + iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); + + if (!iph) + return NULL; + + if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) + return skb_header_pointer(skb, offset + iph->ihl * 4, + sizeof(struct ipv6hdr), buf); + + else if (iph->version == 4) + return iph; + + else if (iph->version == 6) + return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), + buf); + + return NULL; +} + +static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, + void *buf, unsigned int bufsize) +{ + unsigned int offset = skb_network_offset(skb); + const struct ipv6hdr *ipv6h; + const struct tcphdr *tcph; + const struct iphdr *iph; + struct ipv6hdr _ipv6h; + struct tcphdr _tcph; + + ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h) + return NULL; + + if (ipv6h->version == 4) { + iph = (struct iphdr *)ipv6h; + offset += iph->ihl * 4; + + /* special-case 6in4 tunnelling, as that is a common way to get + * v6 connectivity in the home + */ + if (iph->protocol == IPPROTO_IPV6) { + ipv6h = skb_header_pointer(skb, offset, + sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + + } else if (iph->protocol != IPPROTO_TCP) { + return NULL; + } + + } else if (ipv6h->version == 6) { + if (ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + } else { + return NULL; + } + + tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (!tcph) + return NULL; + + return skb_header_pointer(skb, offset, + min(__tcp_hdrlen(tcph), bufsize), buf); +} + +static const void *cake_get_tcpopt(const struct tcphdr *tcph, + int code, int *oplen) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + if (opcode == code) { + *oplen = opsize; + return ptr; + } + + ptr += opsize - 2; + length -= opsize; + } + + return NULL; +} + +/* Compare two SACK sequences. A sequence is considered greater if it SACKs more + * bytes than the other. In the case where both sequences ACKs bytes that the + * other doesn't, A is considered greater. DSACKs in A also makes A be + * considered greater. + * + * @return -1, 0 or 1 as normal compare functions + */ +static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, + const struct tcphdr *tcph_b) +{ + const struct tcp_sack_block_wire *sack_a, *sack_b; + u32 ack_seq_a = ntohl(tcph_a->ack_seq); + u32 bytes_a = 0, bytes_b = 0; + int oplen_a, oplen_b; + bool first = true; + + sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); + sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); + + /* pointers point to option contents */ + oplen_a -= TCPOLEN_SACK_BASE; + oplen_b -= TCPOLEN_SACK_BASE; + + if (sack_a && oplen_a >= sizeof(*sack_a) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return -1; + else if (sack_b && oplen_b >= sizeof(*sack_b) && + (!sack_a || oplen_a < sizeof(*sack_a))) + return 1; + else if ((!sack_a || oplen_a < sizeof(*sack_a)) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return 0; + + while (oplen_a >= sizeof(*sack_a)) { + const struct tcp_sack_block_wire *sack_tmp = sack_b; + u32 start_a = get_unaligned_be32(&sack_a->start_seq); + u32 end_a = get_unaligned_be32(&sack_a->end_seq); + int oplen_tmp = oplen_b; + bool found = false; + + /* DSACK; always considered greater to prevent dropping */ + if (before(start_a, ack_seq_a)) + return -1; + + bytes_a += end_a - start_a; + + while (oplen_tmp >= sizeof(*sack_tmp)) { + u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); + u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); + + /* first time through we count the total size */ + if (first) + bytes_b += end_b - start_b; + + if (!after(start_b, start_a) && !before(end_b, end_a)) { + found = true; + if (!first) + break; + } + oplen_tmp -= sizeof(*sack_tmp); + sack_tmp++; + } + + if (!found) + return -1; + + oplen_a -= sizeof(*sack_a); + sack_a++; + first = false; + } + + /* If we made it this far, all ranges SACKed by A are covered by B, so + * either the SACKs are equal, or B SACKs more bytes. + */ + return bytes_b > bytes_a ? 1 : 0; +} + +static void cake_tcph_get_tstamp(const struct tcphdr *tcph, + u32 *tsval, u32 *tsecr) +{ + const u8 *ptr; + int opsize; + + ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); + + if (ptr && opsize == TCPOLEN_TIMESTAMP) { + *tsval = get_unaligned_be32(ptr); + *tsecr = get_unaligned_be32(ptr + 4); + } +} + +static bool cake_tcph_may_drop(const struct tcphdr *tcph, + u32 tstamp_new, u32 tsecr_new) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + u32 tstamp, tsecr; + + /* 3 reserved flags must be unset to avoid future breakage + * ACK must be set + * ECE/CWR are handled separately + * All other flags URG/PSH/RST/SYN/FIN must be unset + * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) + * 0x00C00000 = CWR/ECE (handled separately) + * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 + */ + if (((tcp_flag_word(tcph) & + cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) + return false; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + switch (opcode) { + case TCPOPT_MD5SIG: /* doesn't influence state */ + break; + + case TCPOPT_SACK: /* stricter checking performed later */ + if (opsize % 8 != 2) + return false; + break; + + case TCPOPT_TIMESTAMP: + /* only drop timestamps lower than new */ + if (opsize != TCPOLEN_TIMESTAMP) + return false; + tstamp = get_unaligned_be32(ptr); + tsecr = get_unaligned_be32(ptr + 4); + if (after(tstamp, tstamp_new) || + after(tsecr, tsecr_new)) + return false; + break; + + case TCPOPT_MSS: /* these should only be set on SYN */ + case TCPOPT_WINDOW: + case TCPOPT_SACK_PERM: + case TCPOPT_FASTOPEN: + case TCPOPT_EXP: + default: /* don't drop if any unknown options are present */ + return false; + } + + ptr += opsize - 2; + length -= opsize; + } + + return true; +} + +static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, + struct cake_flow *flow) +{ + bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; + struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; + struct sk_buff *skb_check, *skb_prev = NULL; + const struct ipv6hdr *ipv6h, *ipv6h_check; + unsigned char _tcph[64], _tcph_check[64]; + const struct tcphdr *tcph, *tcph_check; + const struct iphdr *iph, *iph_check; + struct ipv6hdr _iph, _iph_check; + const struct sk_buff *skb; + int seglen, num_found = 0; + u32 tstamp = 0, tsecr = 0; + __be32 elig_flags = 0; + int sack_comp; + + /* no other possible ACKs to filter */ + if (flow->head == flow->tail) + return NULL; + + skb = flow->tail; + tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); + iph = cake_get_iphdr(skb, &_iph); + if (!tcph) + return NULL; + + cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); + + /* the 'triggering' packet need only have the ACK flag set. + * also check that SYN is not set, as there won't be any previous ACKs. + */ + if ((tcp_flag_word(tcph) & + (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) + return NULL; + + /* the 'triggering' ACK is at the tail of the queue, we have already + * returned if it is the only packet in the flow. loop through the rest + * of the queue looking for pure ACKs with the same 5-tuple as the + * triggering one. + */ + for (skb_check = flow->head; + skb_check && skb_check != skb; + skb_prev = skb_check, skb_check = skb_check->next) { + iph_check = cake_get_iphdr(skb_check, &_iph_check); + tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, + sizeof(_tcph_check)); + + /* only TCP packets with matching 5-tuple are eligible, and only + * drop safe headers + */ + if (!tcph_check || iph->version != iph_check->version || + tcph_check->source != tcph->source || + tcph_check->dest != tcph->dest) + continue; + + if (iph_check->version == 4) { + if (iph_check->saddr != iph->saddr || + iph_check->daddr != iph->daddr) + continue; + + seglen = ntohs(iph_check->tot_len) - + (4 * iph_check->ihl); + } else if (iph_check->version == 6) { + ipv6h = (struct ipv6hdr *)iph; + ipv6h_check = (struct ipv6hdr *)iph_check; + + if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || + ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) + continue; + + seglen = ntohs(ipv6h_check->payload_len); + } else { + WARN_ON(1); /* shouldn't happen */ + continue; + } + + /* If the ECE/CWR flags changed from the previous eligible + * packet in the same flow, we should no longer be dropping that + * previous packet as this would lose information. + */ + if (elig_ack && (tcp_flag_word(tcph_check) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { + elig_ack = NULL; + elig_ack_prev = NULL; + num_found--; + } + + /* Check TCP options and flags, don't drop ACKs with segment + * data, and don't drop ACKs with a higher cumulative ACK + * counter than the triggering packet. Check ACK seqno here to + * avoid parsing SACK options of packets we are going to exclude + * anyway. + */ + if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || + (seglen - __tcp_hdrlen(tcph_check)) != 0 || + after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) + continue; + + /* Check SACK options. The triggering packet must SACK more data + * than the ACK under consideration, or SACK the same range but + * have a larger cumulative ACK counter. The latter is a + * pathological case, but is contained in the following check + * anyway, just to be safe. + */ + sack_comp = cake_tcph_sack_compare(tcph_check, tcph); + + if (sack_comp < 0 || + (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && + sack_comp == 0)) + continue; + + /* At this point we have found an eligible pure ACK to drop; if + * we are in aggressive mode, we are done. Otherwise, keep + * searching unless this is the second eligible ACK we + * found. + * + * Since we want to drop ACK closest to the head of the queue, + * save the first eligible ACK we find, even if we need to loop + * again. + */ + if (!elig_ack) { + elig_ack = skb_check; + elig_ack_prev = skb_prev; + elig_flags = (tcp_flag_word(tcph_check) + & (TCP_FLAG_ECE | TCP_FLAG_CWR)); + } + + if (num_found++ > 0) + goto found; + } + + /* We made it through the queue without finding two eligible ACKs . If + * we found a single eligible ACK we can drop it in aggressive mode if + * we can guarantee that this does not interfere with ECN flag + * information. We ensure this by dropping it only if the enqueued + * packet is consecutive with the eligible ACK, and their flags match. + */ + if (elig_ack && aggressive && elig_ack->next == skb && + (elig_flags == (tcp_flag_word(tcph) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)))) + goto found; + + return NULL; + +found: + if (elig_ack_prev) + elig_ack_prev->next = elig_ack->next; + else + flow->head = elig_ack->next; + + elig_ack->next = NULL; + + return elig_ack; +} + +static u64 cake_ewma(u64 avg, u64 sample, u32 shift) +{ + avg -= avg >> shift; + avg += sample >> shift; + return avg; +} + +static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) +{ + if (q->rate_flags & CAKE_FLAG_OVERHEAD) + len -= off; + + if (q->max_netlen < len) + q->max_netlen = len; + if (q->min_netlen > len) + q->min_netlen = len; + + len += q->rate_overhead; + + if (len < q->rate_mpu) + len = q->rate_mpu; + + if (q->atm_mode == CAKE_ATM_ATM) { + len += 47; + len /= 48; + len *= 53; + } else if (q->atm_mode == CAKE_ATM_PTM) { + /* Add one byte per 64 bytes or part thereof. + * This is conservative and easier to calculate than the + * precise value. + */ + len += (len + 63) / 64; + } + + if (q->max_adjlen < len) + q->max_adjlen = len; + if (q->min_adjlen > len) + q->min_adjlen = len; + + return len; +} + +static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int hdr_len, last_len = 0; + u32 off = skb_network_offset(skb); + u32 len = qdisc_pkt_len(skb); + u16 segs = 1; + + q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); + + if (!shinfo->gso_size) + return cake_calc_overhead(q, len, off); + + /* borrowed from qdisc_pkt_len_init() */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | + SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } + + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) + segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); + else + segs = shinfo->gso_segs; + + len = shinfo->gso_size + hdr_len; + last_len = skb->len - shinfo->gso_size * (segs - 1); + + return (cake_calc_overhead(q, len, off) * (segs - 1) + + cake_calc_overhead(q, last_len, off)); +} + +static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + struct cake_heap_entry jj = q->overflow_heap[j]; + + q->overflow_heap[i] = jj; + q->overflow_heap[j] = ii; + + q->tins[ii.t].overflow_idx[ii.b] = j; + q->tins[jj.t].overflow_idx[jj.b] = i; +} + +static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + + return q->tins[ii.t].backlogs[ii.b]; +} + +static void cake_heapify(struct cake_sched_data *q, u16 i) +{ + static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; + u32 mb = cake_heap_get_backlog(q, i); + u32 m = i; + + while (m < a) { + u32 l = m + m + 1; + u32 r = l + 1; + + if (l < a) { + u32 lb = cake_heap_get_backlog(q, l); + + if (lb > mb) { + m = l; + mb = lb; + } + } + + if (r < a) { + u32 rb = cake_heap_get_backlog(q, r); + + if (rb > mb) { + m = r; + mb = rb; + } + } + + if (m != i) { + cake_heap_swap(q, i, m); + i = m; + } else { + break; + } + } +} + +static void cake_heapify_up(struct cake_sched_data *q, u16 i) +{ + while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { + u16 p = (i - 1) >> 1; + u32 ib = cake_heap_get_backlog(q, i); + u32 pb = cake_heap_get_backlog(q, p); + + if (ib > pb) { + cake_heap_swap(q, i, p); + i = p; + } else { + break; + } + } +} + +static int cake_advance_shaper(struct cake_sched_data *q, + struct cake_tin_data *b, + struct sk_buff *skb, + ktime_t now, bool drop) +{ + u32 len = get_cobalt_cb(skb)->adjusted_len; + + /* charge packet bandwidth to this tin + * and to the global shaper. + */ + if (q->rate_ns) { + u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; + u64 global_dur = (len * q->rate_ns) >> q->rate_shft; + u64 failsafe_dur = global_dur + (global_dur >> 1); + + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = ktime_add_ns(b->time_next_packet, + tin_dur); + + else if (ktime_before(b->time_next_packet, + ktime_add_ns(now, tin_dur))) + b->time_next_packet = ktime_add_ns(now, tin_dur); + + q->time_next_packet = ktime_add_ns(q->time_next_packet, + global_dur); + if (!drop) + q->failsafe_next_packet = \ + ktime_add_ns(q->failsafe_next_packet, + failsafe_dur); + } + return len; +} + +static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + ktime_t now = ktime_get(); + u32 idx = 0, tin = 0, len; + struct cake_heap_entry qq; + struct cake_tin_data *b; + struct cake_flow *flow; + struct sk_buff *skb; + + if (!q->overflow_timeout) { + int i; + /* Build fresh max-heap */ + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + cake_heapify(q, i); + } + q->overflow_timeout = 65535; + + /* select longest queue for pruning */ + qq = q->overflow_heap[0]; + tin = qq.t; + idx = qq.b; + + b = &q->tins[tin]; + flow = &b->flows[idx]; + skb = dequeue_head(flow); + if (unlikely(!skb)) { + /* heap has gone wrong, rebuild it next time */ + q->overflow_timeout = 0; + return idx + (tin << 16); + } + + if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count++; + + len = qdisc_pkt_len(skb); + q->buffer_used -= skb->truesize; + b->backlogs[idx] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + qdisc_tree_reduce_backlog(sch, 1, len); + + flow->dropped++; + b->tin_dropped++; + sch->qstats.drops++; + + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, skb, now, true); + + __qdisc_drop(skb, to_free); + sch->q.qlen--; + + cake_heapify(q, 0); + + return idx + (tin << 16); +} + +static void cake_wash_diffserv(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + break; + case htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + break; + default: + break; + } +} + +static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) +{ + u8 dscp; + + switch (skb->protocol) { + case htons(ETH_P_IP): + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + if (wash && dscp) + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_IPV6): + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + if (wash && dscp) + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_ARP): + return 0x38; /* CS7 - Net Control */ + + default: + /* If there is no Diffserv field, treat as best-effort */ + return 0; + } +} + +static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, + struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + u32 tin; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->tin_cnt) { + tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; + + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) { + /* extract the Diffserv Precedence field, if it exists */ + /* and clear DSCP bits if washing */ + tin = q->tin_index[cake_handle_diffserv(skb, + q->rate_flags & CAKE_FLAG_WASH)]; + if (unlikely(tin >= q->tin_cnt)) + tin = 0; + } else { + tin = 0; + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } + + return &q->tins[tin]; +} + +static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, + struct sk_buff *skb, int flow_mode, int *qerr) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + u32 flow = 0; + int result; + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + goto hash; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= CAKE_QUEUES) + flow = TC_H_MIN(res.classid); + } +hash: + *t = cake_select_tin(sch, skb); + return flow ?: cake_hash(*t, skb, flow_mode) + 1; +} + +static void cake_reconfigure(struct Qdisc *sch); + +static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int len = qdisc_pkt_len(skb); + int uninitialized_var(ret); + struct sk_buff *ack = NULL; + ktime_t now = ktime_get(); + struct cake_tin_data *b; + struct cake_flow *flow; + u32 idx; + + /* choose flow to insert into */ + idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return ret; + } + idx--; + flow = &b->flows[idx]; + + /* ensure shaper state isn't stale */ + if (!b->tin_backlog) { + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = now; + + if (!sch->q.qlen) { + if (ktime_before(q->time_next_packet, now)) { + q->failsafe_next_packet = now; + q->time_next_packet = now; + } else if (ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = \ + min(ktime_to_ns(q->time_next_packet), + ktime_to_ns( + q->failsafe_next_packet)); + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } + } + } + + if (unlikely(len > b->max_skblen)) + b->max_skblen = len; + + if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + unsigned int slen = 0; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + cobalt_set_enqueue_time(segs, now); + get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, + segs); + flow_queue_add(flow, segs); + + sch->q.qlen++; + slen += segs->len; + q->buffer_used += segs->truesize; + b->packets++; + segs = nskb; + } + + /* stats */ + b->bytes += slen; + b->backlogs[idx] += slen; + b->tin_backlog += slen; + sch->qstats.backlog += slen; + q->avg_window_bytes += slen; + + qdisc_tree_reduce_backlog(sch, 1, len); + consume_skb(skb); + } else { + /* not splitting */ + cobalt_set_enqueue_time(skb, now); + get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); + flow_queue_add(flow, skb); + + if (q->ack_filter) + ack = cake_ack_filter(q, flow); + + if (ack) { + b->ack_drops++; + sch->qstats.drops++; + b->bytes += qdisc_pkt_len(ack); + len -= qdisc_pkt_len(ack); + q->buffer_used += skb->truesize - ack->truesize; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, ack, now, true); + + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + consume_skb(ack); + } else { + sch->q.qlen++; + q->buffer_used += skb->truesize; + } + + /* stats */ + b->packets++; + b->bytes += len; + b->backlogs[idx] += len; + b->tin_backlog += len; + sch->qstats.backlog += len; + q->avg_window_bytes += len; + } + + if (q->overflow_timeout) + cake_heapify_up(q, b->overflow_idx[idx]); + + /* incoming bandwidth capacity estimate */ + if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { + u64 packet_interval = \ + ktime_to_ns(ktime_sub(now, q->last_packet_time)); + + if (packet_interval > NSEC_PER_SEC) + packet_interval = NSEC_PER_SEC; + + /* filter out short-term bursts, eg. wifi aggregation */ + q->avg_packet_interval = \ + cake_ewma(q->avg_packet_interval, + packet_interval, + (packet_interval > q->avg_packet_interval ? + 2 : 8)); + + q->last_packet_time = now; + + if (packet_interval > q->avg_packet_interval) { + u64 window_interval = \ + ktime_to_ns(ktime_sub(now, + q->avg_window_begin)); + u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; + + do_div(b, window_interval); + q->avg_peak_bandwidth = + cake_ewma(q->avg_peak_bandwidth, b, + b > q->avg_peak_bandwidth ? 2 : 8); + q->avg_window_bytes = 0; + q->avg_window_begin = now; + + if (ktime_after(now, + ktime_add_ms(q->last_reconfig_time, + 250))) { + q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; + cake_reconfigure(sch); + } + } + } else { + q->avg_window_bytes = 0; + q->last_packet_time = now; + } + + /* flowchain */ + if (!flow->set || flow->set == CAKE_SET_DECAYING) { + struct cake_host *srchost = &b->hosts[flow->srchost]; + struct cake_host *dsthost = &b->hosts[flow->dsthost]; + u16 host_load = 1; + + if (!flow->set) { + list_add_tail(&flow->flowchain, &b->new_flows); + } else { + b->decaying_flow_count--; + list_move_tail(&flow->flowchain, &b->new_flows); + } + flow->set = CAKE_SET_SPARSE; + b->sparse_flow_count++; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + flow->deficit = (b->flow_quantum * + quantum_div[host_load]) >> 16; + } else if (flow->set == CAKE_SET_SPARSE_WAIT) { + /* this flow was empty, accounted as a sparse flow, but actually + * in the bulk rotation. + */ + flow->set = CAKE_SET_BULK; + b->sparse_flow_count--; + b->bulk_flow_count++; + } + + if (q->buffer_used > q->buffer_max_used) + q->buffer_max_used = q->buffer_used; + + if (q->buffer_used > q->buffer_limit) { + u32 dropped = 0; + + while (q->buffer_used > q->buffer_limit) { + dropped++; + cake_drop(sch, to_free); + } + b->drop_overlimit += dropped; + } + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_flow *flow = &b->flows[q->cur_flow]; + struct sk_buff *skb = NULL; + u32 len; + + if (flow->head) { + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + b->backlogs[q->cur_flow] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + q->buffer_used -= skb->truesize; + sch->q.qlen--; + + if (q->overflow_timeout) + cake_heapify(q, b->overflow_idx[q->cur_flow]); + } + return skb; +} + +/* Discard leftover packets from a tin no longer in use. */ +static void cake_clear_tin(struct Qdisc *sch, u16 tin) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + q->cur_tin = tin; + for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) + while (!!(skb = cake_dequeue_one(sch))) + kfree_skb(skb); +} + +static struct sk_buff *cake_dequeue(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_host *srchost, *dsthost; + ktime_t now = ktime_get(); + struct cake_flow *flow; + struct list_head *head; + bool first_flow = true; + struct sk_buff *skb; + u16 host_load; + u64 delay; + u32 len; + +begin: + if (!sch->q.qlen) + return NULL; + + /* global hard shaper */ + if (ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + return NULL; + } + + /* Choose a class to work on. */ + if (!q->rate_ns) { + /* In unlimited mode, can't rely on shaper timings, just balance + * with DRR + */ + bool wrapped = false, empty = true; + + while (b->tin_deficit < 0 || + !(b->sparse_flow_count + b->bulk_flow_count)) { + if (b->tin_deficit <= 0) + b->tin_deficit += b->tin_quantum_band; + if (b->sparse_flow_count + b->bulk_flow_count) + empty = false; + + q->cur_tin++; + b++; + if (q->cur_tin >= q->tin_cnt) { + q->cur_tin = 0; + b = q->tins; + + if (wrapped) { + /* It's possible for q->qlen to be + * nonzero when we actually have no + * packets anywhere. + */ + if (empty) + return NULL; + } else { + wrapped = true; + } + } + } + } else { + /* In shaped mode, choose: + * - Highest-priority tin with queue and meeting schedule, or + * - The earliest-scheduled tin with queue. + */ + ktime_t best_time = KTIME_MAX; + int tin, best_tin = 0; + + for (tin = 0; tin < q->tin_cnt; tin++) { + b = q->tins + tin; + if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { + ktime_t time_to_pkt = \ + ktime_sub(b->time_next_packet, now); + + if (ktime_to_ns(time_to_pkt) <= 0 || + ktime_compare(time_to_pkt, + best_time) <= 0) { + best_time = time_to_pkt; + best_tin = tin; + } + } + } + + q->cur_tin = best_tin; + b = q->tins + best_tin; + + /* No point in going further if no packets to deliver. */ + if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) + return NULL; + } + +retry: + /* service this class */ + head = &b->decaying_flows; + if (!first_flow || list_empty(head)) { + head = &b->new_flows; + if (list_empty(head)) { + head = &b->old_flows; + if (unlikely(list_empty(head))) { + head = &b->decaying_flows; + if (unlikely(list_empty(head))) + goto begin; + } + } + } + flow = list_first_entry(head, struct cake_flow, flowchain); + q->cur_flow = flow - b->flows; + first_flow = false; + + /* triple isolation (modified DRR++) */ + srchost = &b->hosts[flow->srchost]; + dsthost = &b->hosts[flow->dsthost]; + host_load = 1; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + WARN_ON(host_load > CAKE_QUEUES); + + /* flow isolation (DRR++) */ + if (flow->deficit <= 0) { + /* The shifted prandom_u32() is a way to apply dithering to + * avoid accumulating roundoff errors + */ + flow->deficit += (b->flow_quantum * quantum_div[host_load] + + (prandom_u32() >> 16)) >> 16; + list_move_tail(&flow->flowchain, &b->old_flows); + + /* Keep all flows with deficits out of the sparse and decaying + * rotations. No non-empty flow can go into the decaying + * rotation, so they can't get deficits + */ + if (flow->set == CAKE_SET_SPARSE) { + if (flow->head) { + b->sparse_flow_count--; + b->bulk_flow_count++; + flow->set = CAKE_SET_BULK; + } else { + /* we've moved it to the bulk rotation for + * correct deficit accounting but we still want + * to count it as a sparse flow, not a bulk one. + */ + flow->set = CAKE_SET_SPARSE_WAIT; + } + } + goto retry; + } + + /* Retrieve a packet via the AQM */ + while (1) { + skb = cake_dequeue_one(sch); + if (!skb) { + /* this queue was actually empty */ + if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count--; + + if (flow->cvars.p_drop || flow->cvars.count || + ktime_before(now, flow->cvars.drop_next)) { + /* keep in the flowchain until the state has + * decayed to rest + */ + list_move_tail(&flow->flowchain, + &b->decaying_flows); + if (flow->set == CAKE_SET_BULK) { + b->bulk_flow_count--; + b->decaying_flow_count++; + } else if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) { + b->sparse_flow_count--; + b->decaying_flow_count++; + } + flow->set = CAKE_SET_DECAYING; + } else { + /* remove empty queue from the flowchain */ + list_del_init(&flow->flowchain); + if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) + b->sparse_flow_count--; + else if (flow->set == CAKE_SET_BULK) + b->bulk_flow_count--; + else + b->decaying_flow_count--; + + flow->set = CAKE_SET_NONE; + srchost->srchost_refcnt--; + dsthost->dsthost_refcnt--; + } + goto begin; + } + + /* Last packet in queue may be marked, shouldn't be dropped */ + if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))) || + !flow->head) + break; + + /* drop this packet, get another one */ + if (q->rate_flags & CAKE_FLAG_INGRESS) { + len = cake_advance_shaper(q, b, skb, + now, true); + flow->deficit -= len; + b->tin_deficit -= len; + } + flow->dropped++; + b->tin_dropped++; + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); + qdisc_qstats_drop(sch); + kfree_skb(skb); + if (q->rate_flags & CAKE_FLAG_INGRESS) + goto retry; + } + + b->tin_ecn_mark += !!flow->cvars.ecn_marked; + qdisc_bstats_update(sch, skb); + + /* collect delay stats */ + delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + b->avge_delay = cake_ewma(b->avge_delay, delay, 8); + b->peak_delay = cake_ewma(b->peak_delay, delay, + delay > b->peak_delay ? 2 : 8); + b->base_delay = cake_ewma(b->base_delay, delay, + delay < b->base_delay ? 2 : 8); + + len = cake_advance_shaper(q, b, skb, now, false); + flow->deficit -= len; + b->tin_deficit -= len; + + if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } else if (!sch->q.qlen) { + int i; + + for (i = 0; i < q->tin_cnt; i++) { + if (q->tins[i].decaying_flow_count) { + ktime_t next = \ + ktime_add_ns(now, + q->tins[i].cparams.target); + + qdisc_watchdog_schedule_ns(&q->watchdog, + ktime_to_ns(next)); + break; + } + } + } + + if (q->overflow_timeout) + q->overflow_timeout--; + + return skb; +} + +static void cake_reset(struct Qdisc *sch) +{ + u32 c; + + for (c = 0; c < CAKE_MAX_TINS; c++) + cake_clear_tin(sch, c); +} + +static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { + [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, + [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_ATM] = { .type = NLA_U32 }, + [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, + [TCA_CAKE_RTT] = { .type = NLA_U32 }, + [TCA_CAKE_TARGET] = { .type = NLA_U32 }, + [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, + [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, + [TCA_CAKE_NAT] = { .type = NLA_U32 }, + [TCA_CAKE_RAW] = { .type = NLA_U32 }, + [TCA_CAKE_WASH] = { .type = NLA_U32 }, + [TCA_CAKE_MPU] = { .type = NLA_U32 }, + [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, + [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, +}; + +static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, + u64 target_ns, u64 rtt_est_ns) +{ + /* convert byte-rate into time-per-byte + * so it will always unwedge in reasonable time. + */ + static const u64 MIN_RATE = 64; + u32 byte_target = mtu; + u64 byte_target_ns; + u8 rate_shft = 0; + u64 rate_ns = 0; + + b->flow_quantum = 1514; + if (rate) { + b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); + rate_shft = 34; + rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; + rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); + while (!!(rate_ns >> 34)) { + rate_ns >>= 1; + rate_shft--; + } + } /* else unlimited, ie. zero delay */ + + b->tin_rate_bps = rate; + b->tin_rate_ns = rate_ns; + b->tin_rate_shft = rate_shft; + + byte_target_ns = (byte_target * rate_ns) >> rate_shft; + + b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); + b->cparams.interval = max(rtt_est_ns + + b->cparams.target - target_ns, + b->cparams.target * 2); + b->cparams.mtu_time = byte_target_ns; + b->cparams.p_inc = 1 << 24; /* 1/256 */ + b->cparams.p_dec = 1 << 20; /* 1/4096 */ +} + +static int cake_config_besteffort(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[0]; + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + + q->tin_cnt = 1; + + q->tin_index = besteffort; + q->tin_order = normal_order; + + cake_set_rate(b, rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + b->tin_quantum_band = 65535; + b->tin_quantum_prio = 65535; + + return 0; +} + +static int cake_config_precedence(struct Qdisc *sch) +{ + /* convert high-level (user visible) parameters into internal format */ + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + q->tin_index = precedence; + q->tin_order = normal_order; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +/* List of known Diffserv codepoints: + * + * Least Effort (CS1) + * Best Effort (CS0) + * Max Reliability & LLT "Lo" (TOS1) + * Max Throughput (TOS2) + * Min Delay (TOS4) + * LLT "La" (TOS5) + * Assured Forwarding 1 (AF1x) - x3 + * Assured Forwarding 2 (AF2x) - x3 + * Assured Forwarding 3 (AF3x) - x3 + * Assured Forwarding 4 (AF4x) - x3 + * Precedence Class 2 (CS2) + * Precedence Class 3 (CS3) + * Precedence Class 4 (CS4) + * Precedence Class 5 (CS5) + * Precedence Class 6 (CS6) + * Precedence Class 7 (CS7) + * Voice Admit (VA) + * Expedited Forwarding (EF) + + * Total 25 codepoints. + */ + +/* List of traffic classes in RFC 4594: + * (roughly descending order of contended priority) + * (roughly ascending order of uncontended throughput) + * + * Network Control (CS6,CS7) - routing traffic + * Telephony (EF,VA) - aka. VoIP streams + * Signalling (CS5) - VoIP setup + * Multimedia Conferencing (AF4x) - aka. video calls + * Realtime Interactive (CS4) - eg. games + * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch + * Broadcast Video (CS3) + * Low Latency Data (AF2x,TOS4) - eg. database + * Ops, Admin, Management (CS2,TOS1) - eg. ssh + * Standard Service (CS0 & unrecognised codepoints) + * High Throughput Data (AF1x,TOS2) - eg. web traffic + * Low Priority Data (CS1) - eg. BitTorrent + + * Total 12 traffic classes. + */ + +static int cake_config_diffserv8(struct Qdisc *sch) +{ +/* Pruned list of traffic classes for typical applications: + * + * Network Control (CS6, CS7) + * Minimum Latency (EF, VA, CS5, CS4) + * Interactive Shell (CS2, TOS1) + * Low Latency Transactions (AF2x, TOS4) + * Video Streaming (AF4x, AF3x, CS3) + * Bog Standard (CS0 etc.) + * High Throughput (AF1x, TOS2) + * Background Traffic (CS1) + * + * Total 8 traffic classes. + */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + + /* codepoint to class mapping */ + q->tin_index = diffserv8; + q->tin_order = normal_order; + + /* class characteristics */ + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +static int cake_config_diffserv4(struct Qdisc *sch) +{ +/* Further pruned list of traffic classes for four-class system: + * + * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) + * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) + * Best Effort (CS0, AF1x, TOS2, and those not specified) + * Background Traffic (CS1) + * + * Total 4 traffic classes. + */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 4; + + /* codepoint to class mapping */ + q->tin_index = diffserv4; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 1, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[3], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 2; + q->tins[3].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 1; + q->tins[3].tin_quantum_band = quantum >> 2; + + return 0; +} + +static int cake_config_diffserv3(struct Qdisc *sch) +{ +/* Simplified Diffserv structure with 3 tins. + * Low Priority (CS1) + * Best Effort + * Latency Sensitive (TOS4, VA, EF, CS6, CS7) + */ + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 3; + + /* codepoint to class mapping */ + q->tin_index = diffserv3; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 2; + + return 0; +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int c, ft; + + switch (q->tin_mode) { + case CAKE_DIFFSERV_BESTEFFORT: + ft = cake_config_besteffort(sch); + break; + + case CAKE_DIFFSERV_PRECEDENCE: + ft = cake_config_precedence(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV8: + ft = cake_config_diffserv8(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV4: + ft = cake_config_diffserv4(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV3: + default: + ft = cake_config_diffserv3(sch); + break; + } + + for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { + cake_clear_tin(sch, c); + q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; + } + + q->rate_ns = q->tins[ft].tin_rate_ns; + q->rate_shft = q->tins[ft].tin_rate_shft; + + if (q->buffer_config_limit) { + q->buffer_limit = q->buffer_config_limit; + } else if (q->rate_bps) { + u64 t = q->rate_bps * q->interval; + + do_div(t, USEC_PER_SEC / 4); + q->buffer_limit = max_t(u32, t, 4U << 20); + } else { + q->buffer_limit = ~0; + } + + sch->flags &= ~TCQ_F_CAN_BYPASS; + + q->buffer_limit = min(q->buffer_limit, + max(sch->limit * psched_mtu(qdisc_dev(sch)), + q->buffer_config_limit)); +} + +static int cake_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CAKE_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CAKE_NAT]) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; + q->flow_mode |= CAKE_FLOW_NAT_FLAG * + !!nla_get_u32(tb[TCA_CAKE_NAT]); +#else + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], + "No conntrack support in kernel"); + return -EOPNOTSUPP; +#endif + } + + if (tb[TCA_CAKE_BASE_RATE64]) + q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + + if (tb[TCA_CAKE_DIFFSERV_MODE]) + q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + + if (tb[TCA_CAKE_WASH]) { + if (!!nla_get_u32(tb[TCA_CAKE_WASH])) + q->rate_flags |= CAKE_FLAG_WASH; + else + q->rate_flags &= ~CAKE_FLAG_WASH; + } + + if (tb[TCA_CAKE_FLOW_MODE]) + q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & + CAKE_FLOW_MASK)); + + if (tb[TCA_CAKE_ATM]) + q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + + if (tb[TCA_CAKE_OVERHEAD]) { + q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); + q->rate_flags |= CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_RAW]) { + q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_MPU]) + q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + + if (tb[TCA_CAKE_RTT]) { + q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + + if (!q->interval) + q->interval = 1; + } + + if (tb[TCA_CAKE_TARGET]) { + q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); + + if (!q->target) + q->target = 1; + } + + if (tb[TCA_CAKE_AUTORATE]) { + if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) + q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + else + q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + } + + if (tb[TCA_CAKE_INGRESS]) { + if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) + q->rate_flags |= CAKE_FLAG_INGRESS; + else + q->rate_flags &= ~CAKE_FLAG_INGRESS; + } + + if (tb[TCA_CAKE_ACK_FILTER]) + q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + + if (tb[TCA_CAKE_MEMORY]) + q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + + if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD) + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + else + q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + + if (q->tins) { + sch_tree_lock(sch); + cake_reconfigure(sch); + sch_tree_unlock(sch); + } + + return 0; +} + +static void cake_destroy(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); + tcf_block_put(q->block); + kvfree(q->tins); +} + +static int cake_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int i, j, err; + + sch->limit = 10240; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; + q->flow_mode = CAKE_FLOW_TRIPLE; + + q->rate_bps = 0; /* unlimited by default */ + + q->interval = 100000; /* 100ms default */ + q->target = 5000; /* 5ms: codel RFC argues + * for 5 to 10% of interval + */ + + q->cur_tin = 0; + q->cur_flow = 0; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) { + int err = cake_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + quantum_div[0] = ~0; + for (i = 1; i <= CAKE_QUEUES; i++) + quantum_div[i] = 65535 / i; + + q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), + GFP_KERNEL); + if (!q->tins) + goto nomem; + + for (i = 0; i < CAKE_MAX_TINS; i++) { + struct cake_tin_data *b = q->tins + i; + + INIT_LIST_HEAD(&b->new_flows); + INIT_LIST_HEAD(&b->old_flows); + INIT_LIST_HEAD(&b->decaying_flows); + b->sparse_flow_count = 0; + b->bulk_flow_count = 0; + b->decaying_flow_count = 0; + + for (j = 0; j < CAKE_QUEUES; j++) { + struct cake_flow *flow = b->flows + j; + u32 k = j * CAKE_MAX_TINS + i; + + INIT_LIST_HEAD(&flow->flowchain); + cobalt_vars_init(&flow->cvars); + + q->overflow_heap[k].t = i; + q->overflow_heap[k].b = j; + b->overflow_idx[j] = k; + } + } + + cake_reconfigure(sch); + q->avg_peak_bandwidth = q->rate_bps; + q->min_netlen = ~0; + q->min_adjlen = ~0; + return 0; + +nomem: + cake_destroy(sch); + return -ENOMEM; +} + +static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, + TCA_CAKE_PAD)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, + q->flow_mode & CAKE_FLOW_MASK)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_AUTORATE, + !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_INGRESS, + !!(q->rate_flags & CAKE_FLAG_INGRESS))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_NAT, + !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_WASH, + !!(q->rate_flags & CAKE_FLAG_WASH))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) + goto nla_put_failure; + + if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, + !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tstats, *ts; + int i; + + if (!stats) + return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ + data, TCA_CAKE_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); + PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); + PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); + PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); + PUT_STAT_U32(MAX_NETLEN, q->max_netlen); + PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); + PUT_STAT_U32(MIN_NETLEN, q->min_netlen); + PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); + +#undef PUT_STAT_U32 +#undef PUT_STAT_U64 + + tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); + if (!tstats) + goto nla_put_failure; + +#define PUT_TSTAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_TSTAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ + data, TCA_CAKE_TIN_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[q->tin_order[i]]; + + ts = nla_nest_start(d->skb, i + 1); + if (!ts) + goto nla_put_failure; + + PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); + PUT_TSTAT_U64(SENT_BYTES64, b->bytes); + PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); + + PUT_TSTAT_U32(TARGET_US, + ktime_to_us(ns_to_ktime(b->cparams.target))); + PUT_TSTAT_U32(INTERVAL_US, + ktime_to_us(ns_to_ktime(b->cparams.interval))); + + PUT_TSTAT_U32(SENT_PACKETS, b->packets); + PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); + PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); + PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); + + PUT_TSTAT_U32(PEAK_DELAY_US, + ktime_to_us(ns_to_ktime(b->peak_delay))); + PUT_TSTAT_U32(AVG_DELAY_US, + ktime_to_us(ns_to_ktime(b->avge_delay))); + PUT_TSTAT_U32(BASE_DELAY_US, + ktime_to_us(ns_to_ktime(b->base_delay))); + + PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); + PUT_TSTAT_U32(WAY_MISSES, b->way_misses); + PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); + + PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + + b->decaying_flow_count); + PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); + PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); + PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); + + PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); + nla_nest_end(d->skb, ts); + } + +#undef PUT_TSTAT_U32 +#undef PUT_TSTAT_U64 + + nla_nest_end(d->skb, tstats); + return nla_nest_end(d->skb, stats); + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long cake_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void cake_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->block; +} + +static int cake_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct cake_sched_data *q = qdisc_priv(sch); + const struct cake_flow *flow = NULL; + struct gnet_stats_queue qs = { 0 }; + struct nlattr *stats; + u32 idx = cl - 1; + + if (idx < CAKE_QUEUES * q->tin_cnt) { + const struct cake_tin_data *b = \ + &q->tins[q->tin_order[idx / CAKE_QUEUES]]; + const struct sk_buff *skb; + + flow = &b->flows[idx % CAKE_QUEUES]; + + if (flow->head) { + sch_tree_lock(sch); + skb = flow->head; + while (skb) { + qs.qlen++; + skb = skb->next; + } + sch_tree_unlock(sch); + } + qs.backlog = b->backlogs[idx % CAKE_QUEUES]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) + return -1; + if (flow) { + ktime_t now = ktime_get(); + + stats = nla_nest_start(d->skb, TCA_STATS_APP); + if (!stats) + return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_S32(attr, data) do { \ + if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) + + PUT_STAT_S32(DEFICIT, flow->deficit); + PUT_STAT_U32(DROPPING, flow->cvars.dropping); + PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); + PUT_STAT_U32(P_DROP, flow->cvars.p_drop); + if (flow->cvars.p_drop) { + PUT_STAT_S32(BLUE_TIMER_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.blue_timer))); + } + if (flow->cvars.dropping) { + PUT_STAT_S32(DROP_NEXT_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.drop_next))); + } + + if (nla_nest_end(d->skb, stats) < 0) + return -1; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cake_sched_data *q = qdisc_priv(sch); + unsigned int i, j; + + if (arg->stop) + return; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[q->tin_order[i]]; + + for (j = 0; j < CAKE_QUEUES; j++) { + if (list_empty(&b->flows[j].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } +} + +static const struct Qdisc_class_ops cake_class_ops = { + .leaf = cake_leaf, + .find = cake_find, + .tcf_block = cake_tcf_block, + .bind_tcf = cake_bind, + .unbind_tcf = cake_unbind, + .dump = cake_dump_class, + .dump_stats = cake_dump_class_stats, + .walk = cake_walk, +}; + +static struct Qdisc_ops cake_qdisc_ops __read_mostly = { + .cl_ops = &cake_class_ops, + .id = "cake", + .priv_size = sizeof(struct cake_sched_data), + .enqueue = cake_enqueue, + .dequeue = cake_dequeue, + .peek = qdisc_peek_dequeued, + .init = cake_init, + .reset = cake_reset, + .destroy = cake_destroy, + .change = cake_change, + .dump = cake_dump, + .dump_stats = cake_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cake_module_init(void) +{ + return register_qdisc(&cake_qdisc_ops); +} + +static void __exit cake_module_exit(void) +{ + unregister_qdisc(&cake_qdisc_ops); +} + +module_init(cake_module_init) +module_exit(cake_module_exit) +MODULE_AUTHOR("Jonathan Morton"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("The CAKE shaper."); diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c new file mode 100644 index 000000000000..1538d6fa8165 --- /dev/null +++ b/net/sched/sch_etf.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_etf.c Earliest TxTime First queueing discipline. + * + * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> + * Vinicius Costa Gomes <vinicius.gomes@intel.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/rbtree.h> +#include <linux/skbuff.h> +#include <linux/posix-timers.h> +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) +#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) + +struct etf_sched_data { + bool offload; + bool deadline_mode; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. */ + struct rb_root head; + struct qdisc_watchdog watchdog; + ktime_t (*get_time)(void); +}; + +static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { + [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, +}; + +static inline int validate_input_params(struct tc_etf_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * Clockid and delta must be valid. + * + * * Dynamic clockids are not supported. + * + * * Delta must be a positive integer. + * + * Also note that for the HW offload case, we must + * expect that system clocks have been synchronized to PHC. + */ + if (qopt->clockid < 0) { + NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); + return -ENOTSUPP; + } + + if (qopt->clockid != CLOCK_TAI) { + NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); + return -EINVAL; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + + return 0; +} + +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (!sk) + return false; + + if (!sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. + */ + if (sk->sk_clockid != q->clockid) + return false; + + if (sk->sk_txtime_deadline_mode != q->deadline_mode) + return false; + + now = q->get_time(); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} + +static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} + +static void reset_watchdog(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = etf_peek_timesortedlist(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} + +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) +{ + struct sock_exterr_skb *serr; + struct sk_buff *clone; + ktime_t txtime = skb->tstamp; + + if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) + return; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + + serr = SKB_EXT_ERR(clone); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; + serr->ee.ee_type = 0; + serr->ee.ee_code = code; + serr->ee.ee_pad = 0; + serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ + serr->ee.ee_info = txtime; /* low part of tstamp */ + + if (sock_queue_err_skb(skb->sk, clone)) + kfree_skb(clone); +} + +static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) { + report_sock_error(nskb, EINVAL, + SO_EE_CODE_TXTIME_INVALID_PARAM); + return qdisc_drop(nskb, sch, to_free); + } + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + + if (drop) { + struct sk_buff *to_free = NULL; + + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } + + sch->q.qlen--; +} + +static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = etf_peek_timesortedlist(sch); + if (!skb) + return NULL; + + now = q->get_time(); + + /* Drop if packet has expired while in queue. */ + if (ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + /* When in deadline mode, dequeue as soon as possible and change the + * txtime from deadline to (now + delta). + */ + if (q->deadline_mode) { + timesortedlist_erase(sch, skb, false); + skb->tstamp = now; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only if now is within the [txtime - delta, txtime] range. */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return skb; +} + +static void etf_disable_offload(struct net_device *dev, + struct etf_sched_data *q) +{ + struct tc_etf_qopt_offload etf = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + etf.queue = q->queue; + etf.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) + pr_warn("Couldn't disable ETF offload for queue %d\n", + etf.queue); +} + +static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_etf_qopt_offload etf = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); + return -EOPNOTSUPP; + } + + etf.queue = q->queue; + etf.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); + return err; + } + + return 0; +} + +static int etf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_ETF_MAX + 1]; + struct tc_etf_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, + "Missing ETF qdisc options which are mandatory"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETF_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_ETF_PARMS]); + + pr_debug("delta %d clockid %d offload %s deadline %s\n", + qopt->delta, qopt->clockid, + OFFLOAD_IS_ON(qopt) ? "on" : "off", + DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + if (OFFLOAD_IS_ON(qopt)) { + err = etf_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); + q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} + +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} + +static void etf_reset(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. */ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} + +static void etf_destroy(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + etf_disable_offload(dev, q); +} + +static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct tc_etf_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_ETF_OFFLOAD_ON; + + if (q->deadline_mode) + opt.flags |= TC_ETF_DEADLINE_MODE_ON; + + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops etf_qdisc_ops __read_mostly = { + .id = "etf", + .priv_size = sizeof(struct etf_sched_data), + .enqueue = etf_enqueue_timesortedlist, + .dequeue = etf_dequeue_timesortedlist, + .peek = etf_peek_timesortedlist, + .init = etf_init, + .reset = etf_reset, + .destroy = etf_destroy, + .dump = etf_dump, + .owner = THIS_MODULE, +}; + +static int __init etf_module_init(void) +{ + return register_qdisc(&etf_qdisc_ops); +} + +static void __exit etf_module_exit(void) +{ + unregister_qdisc(&etf_qdisc_ops); +} +module_init(etf_module_init) +module_exit(etf_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 2a4ab7caf553..43c4bfe625a9 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -126,7 +126,6 @@ struct htb_class { union { struct htb_class_leaf { - struct list_head drop_list; int deficit[TC_HTB_MAXDEPTH]; struct Qdisc *q; } leaf; @@ -171,7 +170,6 @@ struct htb_sched { struct qdisc_watchdog watchdog; s64 now; /* cached dequeue time */ - struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ /* time of nearest event per level (row) */ s64 near_ev_cache[TC_HTB_MAXDEPTH]; @@ -562,8 +560,6 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; htb_activate_prios(q, cl); - list_add_tail(&cl->un.leaf.drop_list, - q->drops + cl->prio); } } @@ -579,7 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) htb_deactivate_prios(q, cl); cl->prio_activity = 0; - list_del_init(&cl->un.leaf.drop_list); } static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, @@ -981,7 +976,6 @@ static void htb_reset(struct Qdisc *sch) else { if (cl->un.leaf.q) qdisc_reset(cl->un.leaf.q); - INIT_LIST_HEAD(&cl->un.leaf.drop_list); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; @@ -993,8 +987,6 @@ static void htb_reset(struct Qdisc *sch) sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); } static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { @@ -1024,7 +1016,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; int err; - int i; qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); @@ -1050,8 +1041,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); qdisc_skb_head_init(&q->direct_queue); @@ -1224,7 +1213,6 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); - INIT_LIST_HEAD(&parent->un.leaf.drop_list); parent->un.leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; @@ -1418,7 +1406,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } cl->children = 0; - INIT_LIST_HEAD(&cl->un.leaf.drop_list); RB_CLEAR_NODE(&cl->pq_node); for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7d6801fc5340..ad18a2052416 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -68,6 +68,11 @@ Fabio Ludovici <fabio.ludovici at yahoo.it> */ +struct disttable { + u32 size; + s16 table[0]; +}; + struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; @@ -99,10 +104,7 @@ struct netem_sched_data { u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; - struct disttable { - u32 size; - s16 table[0]; - } *delay_dist; + struct disttable *delay_dist; enum { CLG_RANDOM, @@ -142,6 +144,7 @@ struct netem_sched_data { s32 bytes_left; } slot; + struct disttable *slot_dist; }; /* Time stamp put into socket buffer control block @@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state) u64 value, rho; unsigned long answer; - if (state->rho == 0) /* no correlation */ + if (!state || state->rho == 0) /* no correlation */ return prandom_u32(); value = prandom_u32(); @@ -601,10 +604,19 @@ finish_segs: static void get_slot_next(struct netem_sched_data *q, u64 now) { - q->slot.slot_next = now + q->slot_config.min_delay + - (prandom_u32() * - (q->slot_config.max_delay - - q->slot_config.min_delay) >> 32); + s64 next_delay; + + if (!q->slot_dist) + next_delay = q->slot_config.min_delay + + (prandom_u32() * + (q->slot_config.max_delay - + q->slot_config.min_delay) >> 32); + else + next_delay = tabledist(q->slot_config.dist_delay, + (s32)(q->slot_config.dist_jitter), + NULL, q->slot_dist); + + q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; } @@ -721,9 +733,9 @@ static void dist_free(struct disttable *d) * signed 16 bit values. */ -static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) +static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, + const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); spinlock_t *root_lock; @@ -744,7 +756,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - swap(q->delay_dist, d); + swap(*tbl, d); spin_unlock_bh(root_lock); dist_free(d); @@ -762,7 +774,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) q->slot_config.max_bytes = INT_MAX; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; - if (q->slot_config.min_delay | q->slot_config.max_delay) + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) q->slot.slot_next = ktime_get_ns(); else q->slot.slot_next = 0; @@ -926,16 +939,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_NETEM_DELAY_DIST]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); - if (ret) { - /* recover clg and loss_model, in case of - * q->clg and q->loss_model were modified - * in get_loss_clg() - */ - q->clg = old_clg; - q->loss_model = old_loss_model; - return ret; - } + ret = get_dist_table(sch, &q->delay_dist, + tb[TCA_NETEM_DELAY_DIST]); + if (ret) + goto get_table_failure; + } + + if (tb[TCA_NETEM_SLOT_DIST]) { + ret = get_dist_table(sch, &q->slot_dist, + tb[TCA_NETEM_SLOT_DIST]); + if (ret) + goto get_table_failure; } sch->limit = qopt->limit; @@ -983,6 +997,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, get_slot(q, tb[TCA_NETEM_SLOT]); return ret; + +get_table_failure: + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, @@ -1011,6 +1034,7 @@ static void netem_destroy(struct Qdisc *sch) if (q->qdisc) qdisc_destroy(q->qdisc); dist_free(q->delay_dist); + dist_free(q->slot_dist); } static int dump_loss_model(const struct netem_sched_data *q, @@ -1127,7 +1151,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (dump_loss_model(q, skb) != 0) goto nla_put_failure; - if (q->slot_config.min_delay | q->slot_config.max_delay) { + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) { slot = q->slot_config; if (slot.max_packets == INT_MAX) slot.max_packets = 0; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5d5a16204d50..297d9cf960b9 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init( /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; + asoc->flowlabel = sp->flowlabel; + asoc->dscp = sp->dscp; + /* Initialize default path MTU. */ asoc->pathmtu = sp->pathmtu; @@ -647,6 +650,18 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->sackdelay = asoc->sackdelay; peer->sackfreq = asoc->sackfreq; + if (addr->sa.sa_family == AF_INET6) { + __be32 info = addr->v6.sin6_flowinfo; + + if (info) { + peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK); + peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else { + peer->flowlabel = asoc->flowlabel; + } + } + peer->dscp = asoc->dscp; + /* Enable/disable heartbeat, SACK delay, and path MTU discovery * based on association setting. */ diff --git a/net/sctp/input.c b/net/sctp/input.c index ba8a6e6c36fa..9bbc5f92c941 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -56,6 +56,7 @@ #include <net/sctp/sm.h> #include <net/sctp/checksum.h> #include <net/net_namespace.h> +#include <linux/rhashtable.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0cd2e764f47f..fc6c5e4bffa5 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + __u8 tclass = np->tclass; int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); - IP6_ECN_flow_xmit(sk, fl6->flowlabel); + if (transport->dscp & SCTP_DSCP_SET_MASK) + tclass = transport->dscp & SCTP_DSCP_VAL_MASK; + + if (INET_ECN_is_capable(tclass)) + IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; @@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + tclass); rcu_read_unlock(); return res; } @@ -254,6 +259,17 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; + if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) + fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); + + if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { + struct ip6_flowlabel *flowlabel; + + flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); + if (!flowlabel) + goto out; + fl6_sock_release(flowlabel); + } pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 67f73d3a1356..e948db29ab53 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; + __u8 tos = inet_sk(sk)->tos; + if (t->dscp & SCTP_DSCP_SET_MASK) + tos = t->dscp & SCTP_DSCP_VAL_MASK; memset(fl4, 0x0, sizeof(struct flowi4)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } @@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), + RT_CONN_FLAGS_TOS(asoc->base.sk, tos), daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); @@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *transport) { struct inet_sock *inet = inet_sk(skb->sk); + __u8 dscp = inet->tos; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, - skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr); + skb->len, &transport->fl.u.ip4.saddr, + &transport->fl.u.ip4.daddr); + + if (transport->dscp & SCTP_DSCP_SET_MASK) + dscp = transport->dscp & SCTP_DSCP_VAL_MASK; inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(&inet->sk, skb, &transport->fl); + return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp); } static struct sctp_af sctp_af_inet; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ce620e878538..502c0d7cb105 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -66,6 +66,7 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/compat.h> +#include <linux/rhashtable.h> #include <net/ip.h> #include <net/icmp.h> @@ -1696,6 +1697,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_association *asoc; enum sctp_scope scope; struct cmsghdr *cmsg; + __be32 flowinfo = 0; struct sctp_af *af; int err; @@ -1780,6 +1782,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, if (!cmsgs->addrs_msg) return 0; + if (daddr->sa.sa_family == AF_INET6) + flowinfo = daddr->v6.sin6_flowinfo; + /* sendv addr list parse */ for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { struct sctp_transport *transport; @@ -1812,6 +1817,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, } dlen = sizeof(struct in6_addr); + daddr->v6.sin6_flowinfo = flowinfo; daddr->v6.sin6_family = AF_INET6; daddr->v6.sin6_port = htons(asoc->peer.port); memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); @@ -2392,6 +2398,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * uint32_t spp_pathmtu; * uint32_t spp_sackdelay; * uint32_t spp_flags; + * uint32_t spp_ipv6_flowlabel; + * uint8_t spp_dscp; * }; * * spp_assoc_id - (one-to-many style socket) This is filled in the @@ -2471,6 +2479,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. + * + * SPP_IPV6_FLOWLABEL: Setting this flag enables the + * setting of the IPV6 flow label value. The value is + * contained in the spp_ipv6_flowlabel field. + * Upon retrieval, this flag will be set to indicate that + * the spp_ipv6_flowlabel field has a valid value returned. + * If a specific destination address is set (in the + * spp_address field), then the value returned is that of + * the address. If just an association is specified (and + * no address), then the association's default flow label + * is returned. If neither an association nor a destination + * is specified, then the socket's default flow label is + * returned. For non-IPv6 sockets, this flag will be left + * cleared. + * + * SPP_DSCP: Setting this flag enables the setting of the + * Differentiated Services Code Point (DSCP) value + * associated with either the association or a specific + * address. The value is obtained in the spp_dscp field. + * Upon retrieval, this flag will be set to indicate that + * the spp_dscp field has a valid value returned. If a + * specific destination address is set when called (in the + * spp_address field), then that specific destination + * address's DSCP value is returned. If just an association + * is specified, then the association's default DSCP is + * returned. If neither an association nor a destination is + * specified, then the socket's default DSCP is returned. + * + * spp_ipv6_flowlabel + * - This field is used in conjunction with the + * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. + * The 20 least significant bits are used for the flow + * label. This setting has precedence over any IPv6-layer + * setting. + * + * spp_dscp - This field is used in conjunction with the SPP_DSCP flag + * and contains the DSCP. The 6 most significant bits are + * used for the DSCP. This setting has precedence over any + * IPv4- or IPv6- layer setting. */ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, struct sctp_transport *trans, @@ -2610,6 +2657,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, } } + if (params->spp_flags & SPP_IPV6_FLOWLABEL) { + if (trans && trans->ipaddr.sa.sa_family == AF_INET6) { + trans->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else if (asoc) { + list_for_each_entry(trans, + &asoc->peer.transport_addr_list, + transports) { + if (trans->ipaddr.sa.sa_family != AF_INET6) + continue; + trans->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } + asoc->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) { + sp->flowlabel = params->spp_ipv6_flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK; + } + } + + if (params->spp_flags & SPP_DSCP) { + if (trans) { + trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + trans->dscp |= SCTP_DSCP_SET_MASK; + } else if (asoc) { + list_for_each_entry(trans, + &asoc->peer.transport_addr_list, + transports) { + trans->dscp = params->spp_dscp & + SCTP_DSCP_VAL_MASK; + trans->dscp |= SCTP_DSCP_SET_MASK; + } + asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + asoc->dscp |= SCTP_DSCP_SET_MASK; + } else { + sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; + sp->dscp |= SCTP_DSCP_SET_MASK; + } + } + return 0; } @@ -2624,11 +2716,18 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, int error; int hb_change, pmtud_change, sackdelay_change; - if (optlen != sizeof(struct sctp_paddrparams)) + if (optlen == sizeof(params)) { + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4)) { + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) + return -EINVAL; + } else { return -EINVAL; - - if (copy_from_user(¶ms, optval, optlen)) - return -EFAULT; + } /* Validate flags and value parameters. */ hb_change = params.spp_flags & SPP_HB; @@ -4169,6 +4268,28 @@ out: return retval; } +static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + int val; + + if (!sctp_style(sk, TCP)) + return -EOPNOTSUPP; + + if (sctp_sk(sk)->ep->base.bind_addr.port) + return -EFAULT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->reuse = !!val; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4363,6 +4484,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_interleaving_supported(sk, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_setsockopt_reuse_port(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -5427,6 +5551,45 @@ out: * also that this field is mutually exclusive to * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. + * + * SPP_IPV6_FLOWLABEL: Setting this flag enables the + * setting of the IPV6 flow label value. The value is + * contained in the spp_ipv6_flowlabel field. + * Upon retrieval, this flag will be set to indicate that + * the spp_ipv6_flowlabel field has a valid value returned. + * If a specific destination address is set (in the + * spp_address field), then the value returned is that of + * the address. If just an association is specified (and + * no address), then the association's default flow label + * is returned. If neither an association nor a destination + * is specified, then the socket's default flow label is + * returned. For non-IPv6 sockets, this flag will be left + * cleared. + * + * SPP_DSCP: Setting this flag enables the setting of the + * Differentiated Services Code Point (DSCP) value + * associated with either the association or a specific + * address. The value is obtained in the spp_dscp field. + * Upon retrieval, this flag will be set to indicate that + * the spp_dscp field has a valid value returned. If a + * specific destination address is set when called (in the + * spp_address field), then that specific destination + * address's DSCP value is returned. If just an association + * is specified, then the association's default DSCP is + * returned. If neither an association nor a destination is + * specified, then the socket's default DSCP is returned. + * + * spp_ipv6_flowlabel + * - This field is used in conjunction with the + * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. + * The 20 least significant bits are used for the flow + * label. This setting has precedence over any IPv6-layer + * setting. + * + * spp_dscp - This field is used in conjunction with the SPP_DSCP flag + * and contains the DSCP. The 6 most significant bits are + * used for the DSCP. This setting has precedence over any + * IPv4- or IPv6- layer setting. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -5436,9 +5599,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); - if (len < sizeof(struct sctp_paddrparams)) + if (len >= sizeof(params)) + len = sizeof(params); + else if (len >= ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4)) + len = ALIGN(offsetof(struct sctp_paddrparams, + spp_ipv6_flowlabel), 4); + else return -EINVAL; - len = sizeof(struct sctp_paddrparams); + if (copy_from_user(¶ms, optval, len)) return -EFAULT; @@ -5473,6 +5642,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = trans->param_flags; + if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = trans->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (trans->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } else if (asoc) { /* Fetch association values. */ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); @@ -5482,6 +5660,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = asoc->param_flags; + if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = asoc->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (asoc->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } else { /* Fetch socket values. */ params.spp_hbinterval = sp->hbinterval; @@ -5491,6 +5678,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = sp->param_flags; + if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) { + params.spp_ipv6_flowlabel = sp->flowlabel & + SCTP_FLOWLABEL_VAL_MASK; + params.spp_flags |= SPP_IPV6_FLOWLABEL; + } + if (sp->dscp & SCTP_DSCP_SET_MASK) { + params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK; + params.spp_flags |= SPP_DSCP; + } } if (copy_to_user(optval, ¶ms, len)) @@ -7196,6 +7392,26 @@ out: return retval; } +static int sctp_getsockopt_reuse_port(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = sctp_sk(sk)->reuse; + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7391,6 +7607,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_interleaving_supported(sk, len, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7428,6 +7647,7 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { + bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse); struct sctp_bind_hashbucket *head; /* hash list */ struct sctp_bind_bucket *pp; unsigned short snum; @@ -7500,13 +7720,11 @@ pp_found: * used by other socket (pp->owner not empty); that other * socket is going to be sk2. */ - int reuse = sk->sk_reuse; struct sock *sk2; pr_debug("%s: found a possible match\n", __func__); - if (pp->fastreuse && sk->sk_reuse && - sk->sk_state != SCTP_SS_LISTENING) + if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) goto success; /* Run through the list of sockets bound to the port @@ -7524,7 +7742,7 @@ pp_found: ep2 = sctp_sk(sk2)->ep; if (sk == sk2 || - (reuse && sk2->sk_reuse && + (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) && sk2->sk_state != SCTP_SS_LISTENING)) continue; @@ -7548,12 +7766,12 @@ pp_not_found: * SO_REUSEADDR on this socket -sk-). */ if (hlist_empty(&pp->owner)) { - if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING) + if (reuse && sk->sk_state != SCTP_SS_LISTENING) pp->fastreuse = 1; else pp->fastreuse = 0; } else if (pp->fastreuse && - (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING)) + (!reuse || sk->sk_state == SCTP_SS_LISTENING)) pp->fastreuse = 0; /* We are set, so fill up all the data in the hash table @@ -7684,7 +7902,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) err = 0; sctp_unhash_endpoint(ep); sk->sk_state = SCTP_SS_CLOSED; - if (sk->sk_reuse) + if (sk->sk_reuse || sctp_sk(sk)->reuse) sctp_sk(sk)->bind_hash->fastreuse = 1; goto out; } @@ -8551,6 +8769,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; + sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; newsk->sk_destruct = sctp_destruct_sock; diff --git a/net/smc/Makefile b/net/smc/Makefile index 188104654b54..4df96b4b8130 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 05e4ffe5aabd..143b2220c0c8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -23,6 +23,7 @@ #include <linux/workqueue.h> #include <linux/in.h> #include <linux/sched/signal.h> +#include <linux/if_vlan.h> #include <net/sock.h> #include <net/tcp.h> @@ -35,6 +36,7 @@ #include "smc_cdc.h" #include "smc_core.h" #include "smc_ib.h" +#include "smc_ism.h" #include "smc_pnet.h" #include "smc_tx.h" #include "smc_rx.h" @@ -381,8 +383,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) return 0; } -static void smc_conn_save_peer_info(struct smc_sock *smc, - struct smc_clc_msg_accept_confirm *clc) +static void smcr_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->rmbe_size); @@ -393,6 +395,28 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } +static void smcd_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + int bufsize = smc_uncompress_bufsize(clc->dmbe_size); + + smc->conn.peer_rmbe_idx = clc->dmbe_idx; + smc->conn.peer_token = clc->token; + /* msg header takes up space in the buffer */ + smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; +} + +static void smc_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + if (smc->conn.lgr->is_smcd) + smcd_conn_save_peer_info(smc, clc); + else + smcr_conn_save_peer_info(smc, clc); +} + static void smc_link_save_peer_info(struct smc_link *link, struct smc_clc_msg_accept_confirm *clc) { @@ -463,15 +487,51 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, return reason_code; } +/* check if there is an ISM device available for this connection. */ +/* called for connect and listen */ +static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev) +{ + /* Find ISM device with same PNETID as connecting interface */ + smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev); + if (!(*ismdev)) + return SMC_CLC_DECL_CNFERR; /* configuration error */ + return 0; +} + +/* Check for VLAN ID and register it on ISM device just for CLC handshake */ +static int smc_connect_ism_vlan_setup(struct smc_sock *smc, + struct smcd_dev *ismdev, + unsigned short vlan_id) +{ + if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) + return SMC_CLC_DECL_CNFERR; + return 0; +} + +/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is + * used, the VLAN ID will be registered again during the connection setup. + */ +static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, + struct smcd_dev *ismdev, + unsigned short vlan_id) +{ + if (!is_smcd) + return 0; + if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) + return SMC_CLC_DECL_CNFERR; + return 0; +} + /* CLC handshake during connect */ -static int smc_connect_clc(struct smc_sock *smc, +static int smc_connect_clc(struct smc_sock *smc, int smc_type, struct smc_clc_msg_accept_confirm *aclc, - struct smc_ib_device *ibdev, u8 ibport) + struct smc_ib_device *ibdev, u8 ibport, + struct smcd_dev *ismdev) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, ibdev, ibport); + rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, ismdev); if (rc) return rc; /* receive SMC Accept CLC message */ @@ -488,8 +548,8 @@ static int smc_connect_rdma(struct smc_sock *smc, int reason_code = 0; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, - aclc->hdr.flag); + local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, + ibport, &aclc->lcl, NULL, 0); if (local_contact < 0) { if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -504,7 +564,7 @@ static int smc_connect_rdma(struct smc_sock *smc, smc_conn_save_peer_info(smc, aclc); /* create send buffer and rmb */ - if (smc_buf_create(smc)) + if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); if (local_contact == SMC_FIRST_CONTACT) @@ -551,11 +611,50 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; } +/* setup for ISM connection of client */ +static int smc_connect_ism(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smcd_dev *ismdev) +{ + int local_contact = SMC_FIRST_CONTACT; + int rc = 0; + + mutex_lock(&smc_create_lgr_pending); + local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, + NULL, ismdev, aclc->gid); + if (local_contact < 0) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0); + + /* Create send and receive buffers */ + if (smc_buf_create(smc, true)) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); + + smc_conn_save_peer_info(smc, aclc); + smc_close_init(smc); + smc_rx_init(smc); + smc_tx_init(smc); + + rc = smc_clc_send_confirm(smc); + if (rc) + return smc_connect_abort(smc, rc, local_contact); + mutex_unlock(&smc_create_lgr_pending); + + smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + + return 0; +} + /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { + bool ism_supported = false, rdma_supported = false; struct smc_clc_msg_accept_confirm aclc; struct smc_ib_device *ibdev; + struct smcd_dev *ismdev; + unsigned short vlan; + int smc_type; int rc = 0; u8 ibport; @@ -572,20 +671,52 @@ static int __smc_connect(struct smc_sock *smc) if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); - /* check if a RDMA device is available; if not, fall back */ - if (smc_check_rdma(smc, &ibdev, &ibport)) + /* check for VLAN ID */ + if (smc_vlan_by_tcpsk(smc->clcsock, &vlan)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + + /* check if there is an ism device available */ + if (!smc_check_ism(smc, &ismdev) && + !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) { + /* ISM is supported for this connection */ + ism_supported = true; + smc_type = SMC_TYPE_D; + } + + /* check if there is a rdma device available */ + if (!smc_check_rdma(smc, &ibdev, &ibport)) { + /* RDMA is supported for this connection */ + rdma_supported = true; + if (ism_supported) + smc_type = SMC_TYPE_B; /* both */ + else + smc_type = SMC_TYPE_R; /* only RDMA */ + } + + /* if neither ISM nor RDMA are supported, fallback */ + if (!rdma_supported && !ism_supported) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); /* perform CLC handshake */ - rc = smc_connect_clc(smc, &aclc, ibdev, ibport); - if (rc) + rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, ismdev); + if (rc) { + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); + } - /* connect using rdma */ - rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); - if (rc) + /* depending on previous steps, connect using rdma or ism */ + if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) + rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) + rc = smc_connect_ism(smc, &aclc, ismdev); + else + rc = SMC_CLC_DECL_CNFERR; + if (rc) { + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return smc_connect_decline_fallback(smc, rc); + } + smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan); return 0; } @@ -953,7 +1084,8 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, int *local_contact) { /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); + *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, + &pclc->lcl, NULL, 0); if (*local_contact < 0) { if (*local_contact == -ENOMEM) return SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -961,12 +1093,50 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, } /* create send buffer and rmb */ - if (smc_buf_create(new_smc)) + if (smc_buf_create(new_smc, false)) return SMC_CLC_DECL_MEM; return 0; } +/* listen worker: initialize connection and buffers for SMC-D */ +static int smc_listen_ism_init(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smcd_dev *ismdev, + int *local_contact) +{ + struct smc_clc_msg_smcd *pclc_smcd; + + pclc_smcd = smc_get_clc_msg_smcd(pclc); + *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL, + ismdev, pclc_smcd->gid); + if (*local_contact < 0) { + if (*local_contact == -ENOMEM) + return SMC_CLC_DECL_MEM;/* insufficient memory*/ + return SMC_CLC_DECL_INTERR; /* other error */ + } + + /* Check if peer can be reached via ISM device */ + if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, + new_smc->conn.lgr->vlan_id, + new_smc->conn.lgr->smcd)) { + if (*local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + smc_conn_free(&new_smc->conn); + return SMC_CLC_DECL_CNFERR; + } + + /* Create send and receive buffers */ + if (smc_buf_create(new_smc, true)) { + if (*local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + smc_conn_free(&new_smc->conn); + return SMC_CLC_DECL_MEM; + } + + return 0; +} + /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { @@ -1025,6 +1195,8 @@ static void smc_listen_work(struct work_struct *work) struct smc_clc_msg_accept_confirm cclc; struct smc_clc_msg_proposal *pclc; struct smc_ib_device *ibdev; + bool ism_supported = false; + struct smcd_dev *ismdev; u8 buf[SMC_CLC_MAX_LEN]; int local_contact = 0; int reason_code = 0; @@ -1065,12 +1237,21 @@ static void smc_listen_work(struct work_struct *work) smc_rx_init(new_smc); smc_tx_init(new_smc); + /* check if ISM is available */ + if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) && + !smc_check_ism(new_smc, &ismdev) && + !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) { + ism_supported = true; + } + /* check if RDMA is available */ - if (smc_check_rdma(new_smc, &ibdev, &ibport) || - smc_listen_rdma_check(new_smc, pclc) || - smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, - &local_contact) || - smc_listen_rdma_reg(new_smc, local_contact)) { + if (!ism_supported && + ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) || + smc_check_rdma(new_smc, &ibdev, &ibport) || + smc_listen_rdma_check(new_smc, pclc) || + smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, + &local_contact) || + smc_listen_rdma_reg(new_smc, local_contact))) { /* SMC not supported, decline */ mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); @@ -1095,7 +1276,8 @@ static void smc_listen_work(struct work_struct *work) } /* finish worker */ - smc_listen_rdma_finish(new_smc, &cclc, local_contact); + if (!ism_supported) + smc_listen_rdma_finish(new_smc, &cclc, local_contact); smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); smc_listen_out_connected(new_smc); diff --git a/net/smc/smc.h b/net/smc/smc.h index d7ca26570482..be20acd7b5ab 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,8 +21,6 @@ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ -#define SMC_MAX_PORTS 2 /* Max # of ports */ - extern struct proto smc_proto; extern struct proto smc_proto6; @@ -185,6 +183,11 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ + u8 rx_off; /* receive offset: + * 0 for SMC-R, 32 for SMC-D + */ + u64 peer_token; /* SMC-D token of peer */ }; struct smc_connect_info { diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a7e8d63fc8ae..621d8cca570b 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -117,7 +117,7 @@ int smc_cdc_msg_send(struct smc_connection *conn, return rc; } -int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; @@ -130,6 +130,21 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) return smc_cdc_msg_send(conn, wr_buf, pend); } +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +{ + int rc; + + if (conn->lgr->is_smcd) { + spin_lock_bh(&conn->send_lock); + rc = smcd_cdc_msg_send(conn); + spin_unlock_bh(&conn->send_lock); + } else { + rc = smcr_cdc_get_slot_and_msg_send(conn); + } + + return rc; +} + static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, unsigned long data) { @@ -157,6 +172,45 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) (unsigned long)conn); } +/* Send a SMC-D CDC header. + * This increments the free space available in our send buffer. + * Also update the confirmed receive buffer with what was sent to the peer. + */ +int smcd_cdc_msg_send(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smcd_cdc_msg cdc; + int rc, diff; + + memset(&cdc, 0, sizeof(cdc)); + cdc.common.type = SMC_CDC_MSG_TYPE; + cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap; + cdc.prod_count = conn->local_tx_ctrl.prod.count; + + cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap; + cdc.cons_count = conn->local_tx_ctrl.cons.count; + cdc.prod_flags = conn->local_tx_ctrl.prod_flags; + cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags; + rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); + if (rc) + return rc; + smc_curs_write(&conn->rx_curs_confirmed, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn); + /* Calculate transmitted data and increment free send buffer space */ + diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, + &conn->tx_curs_sent); + /* increased by confirmed number of bytes */ + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_write(&conn->tx_curs_fin, + smc_curs_read(&conn->tx_curs_sent, conn), conn); + + smc_tx_sndbuf_nonfull(smc); + return rc; +} + /********************************* receive ***********************************/ static inline bool smc_cdc_before(u16 seq1, u16 seq2) @@ -178,7 +232,7 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, if (!sock_flag(&smc->sk, SOCK_URGINLINE)) /* we'll skip the urgent byte, so don't account for it */ (*diff_prod)--; - base = (char *)conn->rmb_desc->cpu_addr; + base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off; if (conn->urg_curs.count) conn->urg_rx_byte = *(base + conn->urg_curs.count - 1); else @@ -276,6 +330,34 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) sock_put(&smc->sk); /* no free sk in softirq-context */ } +/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ + * handler to indicate update in the DMBE. + * + * Context: + * - tasklet context + */ +static void smcd_cdc_rx_tsklet(unsigned long data) +{ + struct smc_connection *conn = (struct smc_connection *)data; + struct smcd_cdc_msg cdc; + struct smc_sock *smc; + + if (!conn) + return; + + memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc)); + smc = container_of(conn, struct smc_sock, conn); + smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc); +} + +/* Initialize receive tasklet. Called from ISM device IRQ handler to start + * receiver side. + */ +void smcd_cdc_rx_init(struct smc_connection *conn) +{ + tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn); +} + /***************************** init, exit, misc ******************************/ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index f60082fee5b8..8fbce4fee3e4 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -50,6 +50,20 @@ struct smc_cdc_msg { u8 reserved[18]; } __packed; /* format defined in RFC7609 */ +/* CDC message for SMC-D */ +struct smcd_cdc_msg { + struct smc_wr_rx_hdr common; /* Type = 0xFE */ + u8 res1[7]; + u16 prod_wrap; + u32 prod_count; + u8 res2[2]; + u16 cons_wrap; + u32 cons_count; + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + u8 res3[8]; +} __packed; + static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort || @@ -204,9 +218,9 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, smc_curs_write(local, smc_curs_read(&temp, conn), conn); } -static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, - struct smc_cdc_msg *peer, - struct smc_connection *conn) +static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) { local->common.type = peer->common.type; local->len = peer->len; @@ -218,6 +232,27 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, local->conn_state_flags = peer->conn_state_flags; } +static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smcd_cdc_msg *peer) +{ + local->prod.wrap = peer->prod_wrap; + local->prod.count = peer->prod_count; + local->cons.wrap = peer->cons_wrap; + local->cons.count = peer->cons_count; + local->prod_flags = peer->prod_flags; + local->conn_state_flags = peer->conn_state_flags; +} + +static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) +{ + if (conn->lgr->is_smcd) + smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer); + else + smcr_cdc_msg_to_host(local, peer, conn); +} + struct smc_cdc_tx_pend; int smc_cdc_get_free_slot(struct smc_connection *conn, @@ -227,6 +262,8 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); +int smcd_cdc_msg_send(struct smc_connection *conn); int smc_cdc_init(void) __init; +void smcd_cdc_rx_init(struct smc_connection *conn); #endif /* SMC_CDC_H */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ae5d168653ce..ad39efdb4f1c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -23,9 +23,15 @@ #include "smc_core.h" #include "smc_clc.h" #include "smc_ib.h" +#include "smc_ism.h" + +#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 +#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 /* eye catcher "SMCR" EBCDIC for CLC messages */ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; +/* eye catcher "SMCD" EBCDIC for CLC messages */ +static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers @@ -38,10 +44,14 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; - if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; switch (clcm->type) { case SMC_CLC_PROPOSAL: + if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && + clcm->path != SMC_TYPE_B) + return false; pclc = (struct smc_clc_msg_proposal *)clcm; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (ntohs(pclc->hdr.length) != @@ -56,10 +66,16 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: + if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D) + return false; clc = (struct smc_clc_msg_accept_confirm *)clcm; - if (ntohs(clc->hdr.length) != sizeof(*clc)) + if ((clcm->path == SMC_TYPE_R && + ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || + (clcm->path == SMC_TYPE_D && + ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) return false; - trl = &clc->trl; + trl = (struct smc_clc_msg_trail *) + ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; @@ -70,7 +86,8 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) default: return false; } - if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER))) + if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; return true; } @@ -296,6 +313,9 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || (datlen > buflen) || + (clcm->version != SMC_CLC_V1) || + (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && + clcm->path != SMC_TYPE_B) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -357,17 +377,18 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) } /* send CLC PROPOSAL message across internal TCP socket */ -int smc_clc_send_proposal(struct smc_sock *smc, - struct smc_ib_device *smcibdev, - u8 ibport) +int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, + struct smc_ib_device *ibdev, u8 ibport, + struct smcd_dev *ismdev) { struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; + struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal pclc; struct smc_clc_msg_trail trl; int len, i, plen, rc; int reason_code = 0; - struct kvec vec[4]; + struct kvec vec[5]; struct msghdr msg; /* retrieve ip prefixes for CLC proposal msg */ @@ -382,18 +403,34 @@ int smc_clc_send_proposal(struct smc_sock *smc, memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.length = htons(plen); pclc.hdr.version = SMC_CLC_V1; /* SMC version */ - memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); - pclc.iparea_offset = htons(0); + pclc.hdr.path = smc_type; + if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) { + /* add SMC-R specifics */ + memcpy(pclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&pclc.lcl.gid, &ibdev->gid[ibport - 1], SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN); + pclc.iparea_offset = htons(0); + } + if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { + /* add SMC-D specifics */ + memset(&pclc_smcd, 0, sizeof(pclc_smcd)); + plen += sizeof(pclc_smcd); + pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); + pclc_smcd.gid = ismdev->local_gid; + } + pclc.hdr.length = htons(plen); memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); i = 0; vec[i].iov_base = &pclc; vec[i++].iov_len = sizeof(pclc); + if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { + vec[i].iov_base = &pclc_smcd; + vec[i++].iov_len = sizeof(pclc_smcd); + } vec[i].iov_base = &pclc_prfx; vec[i++].iov_len = sizeof(pclc_prfx); if (pclc_prfx.ipv6_prefixes_cnt > 0) { @@ -429,35 +466,56 @@ int smc_clc_send_confirm(struct smc_sock *smc) struct kvec vec; int len; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; /* send SMC Confirm CLC msg */ memset(&cclc, 0, sizeof(cclc)); - memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.type = SMC_CLC_CONFIRM; - cclc.hdr.length = htons(sizeof(cclc)); cclc.hdr.version = SMC_CLC_V1; /* SMC version */ - memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); - memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(cclc.qpn, link->roce_qp->qp_num); - cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ - cclc.rmbe_alert_token = htonl(conn->alert_token_local); - cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); - cclc.rmbe_size = conn->rmbe_size_short; - cclc.rmb_dma_addr = cpu_to_be64( - (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(cclc.psn, link->psn_initial); - - memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + if (smc->conn.lgr->is_smcd) { + /* SMC-D specific settings */ + memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + cclc.hdr.path = SMC_TYPE_D; + cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + cclc.gid = conn->lgr->smcd->local_gid; + cclc.token = conn->rmb_desc->token; + cclc.dmbe_size = conn->rmbe_size_short; + cclc.dmbe_idx = 0; + memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + } else { + /* SMC-R specific settings */ + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + cclc.hdr.path = SMC_TYPE_R; + cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(cclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(cclc.qpn, link->roce_qp->qp_num); + cclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_alert_token = htonl(conn->alert_token_local); + cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); + cclc.rmbe_size = conn->rmbe_size_short; + cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + hton24(cclc.psn, link->psn_initial); + memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + } memset(&msg, 0, sizeof(msg)); vec.iov_base = &cclc; - vec.iov_len = sizeof(cclc); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc)); - if (len < sizeof(cclc)) { + vec.iov_len = ntohs(cclc.hdr.length); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, + ntohs(cclc.hdr.length)); + if (len < ntohs(cclc.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; @@ -480,35 +538,58 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) int rc = 0; int len; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; memset(&aclc, 0, sizeof(aclc)); - memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.type = SMC_CLC_ACCEPT; - aclc.hdr.length = htons(sizeof(aclc)); aclc.hdr.version = SMC_CLC_V1; /* SMC version */ if (srv_first_contact) aclc.hdr.flag = 1; - memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], - SMC_GID_SIZE); - memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(aclc.qpn, link->roce_qp->qp_num); - aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ - aclc.rmbe_alert_token = htonl(conn->alert_token_local); - aclc.qp_mtu = link->path_mtu; - aclc.rmbe_size = conn->rmbe_size_short, - aclc.rmb_dma_addr = cpu_to_be64( - (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(aclc.psn, link->psn_initial); - memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + if (new_smc->conn.lgr->is_smcd) { + /* SMC-D specific settings */ + aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + aclc.hdr.path = SMC_TYPE_D; + aclc.gid = conn->lgr->smcd->local_gid; + aclc.token = conn->rmb_desc->token; + aclc.dmbe_size = conn->rmbe_size_short; + aclc.dmbe_idx = 0; + memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + } else { + /* SMC-R specific settings */ + aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + aclc.hdr.path = SMC_TYPE_R; + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memcpy(aclc.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(aclc.qpn, link->roce_qp->qp_num); + aclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ + aclc.rmbe_alert_token = htonl(conn->alert_token_local); + aclc.qp_mtu = link->path_mtu; + aclc.rmbe_size = conn->rmbe_size_short, + aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + hton24(aclc.psn, link->psn_initial); + memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + } memset(&msg, 0, sizeof(msg)); vec.iov_base = &aclc; - vec.iov_len = sizeof(aclc); - len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc)); - if (len < sizeof(aclc)) { + vec.iov_len = ntohs(aclc.hdr.length); + len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, + ntohs(aclc.hdr.length)); + if (len < ntohs(aclc.hdr.length)) { if (len >= 0) new_smc->sk.sk_err = EPROTO; else diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 41ff9ea96139..100e988ad1a8 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -23,6 +23,9 @@ #define SMC_CLC_DECLINE 0x04 #define SMC_CLC_V1 0x1 /* SMC version */ +#define SMC_TYPE_R 0 /* SMC-R only */ +#define SMC_TYPE_D 1 /* SMC-D only */ +#define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ #define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */ @@ -42,9 +45,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */ #if defined(__BIG_ENDIAN_BITFIELD) u8 version : 4, flag : 1, - rsvd : 3; + rsvd : 1, + path : 2; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 rsvd : 3, + u8 path : 2, + rsvd : 1, flag : 1, version : 4; #endif @@ -77,6 +82,11 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ } __aligned(4); +struct smc_clc_msg_smcd { /* SMC-D GID information */ + u64 gid; /* ISM GID of requestor */ + u8 res[32]; +}; + struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_hdr hdr; struct smc_clc_msg_local lcl; @@ -94,23 +104,45 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; - struct smc_clc_msg_local lcl; - u8 qpn[3]; /* QP number */ - __be32 rmb_rkey; /* RMB rkey */ - u8 rmbe_idx; /* Index of RMBE in RMB */ - __be32 rmbe_alert_token;/* unique connection id */ + union { + struct { /* SMC-R */ + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 rmbe_idx; /* Index of RMBE in RMB */ + __be32 rmbe_alert_token;/* unique connection id */ #if defined(__BIG_ENDIAN_BITFIELD) - u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ - qp_mtu : 4; /* QP mtu */ + u8 rmbe_size : 4, /* buf size (compressed) */ + qp_mtu : 4; /* QP mtu */ #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 qp_mtu : 4, - rmbe_size : 4; + u8 qp_mtu : 4, + rmbe_size : 4; #endif - u8 reserved; - __be64 rmb_dma_addr; /* RMB virtual address */ - u8 reserved2; - u8 psn[3]; /* initial packet sequence number */ - struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* packet sequence number */ + struct smc_clc_msg_trail smcr_trl; + /* eye catcher "SMCR" EBCDIC */ + } __packed; + struct { /* SMC-D */ + u64 gid; /* Sender GID */ + u64 token; /* DMB token */ + u8 dmbe_idx; /* DMBE index */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 dmbe_size : 4, /* buf size (compressed) */ + reserved3 : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved3 : 4, + dmbe_size : 4; +#endif + u16 reserved4; + u32 linkid; /* Link identifier */ + u32 reserved5[3]; + struct smc_clc_msg_trail smcd_trl; + /* eye catcher "SMCD" EBCDIC */ + } __packed; + }; } __packed; /* format defined in RFC7609 */ struct smc_clc_msg_decline { /* clc decline message */ @@ -129,13 +161,26 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } +/* get SMC-D info from proposal message */ +static inline struct smc_clc_msg_smcd * +smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) +{ + if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd)) + return NULL; + + return (struct smc_clc_msg_smcd *)(prop + 1); +} + +struct smcd_dev; + int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); -int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, - u8 ibport); +int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, + struct smc_ib_device *smcibdev, u8 ibport, + struct smcd_dev *ismdev); int smc_clc_send_confirm(struct smc_sock *smc); int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index add82b0266f3..66741e61a3b0 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,6 +25,7 @@ #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" +#include "smc_ism.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -46,8 +47,8 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ mod_delayed_work(system_wq, &lgr->free_work, - lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : - SMC_LGR_FREE_DELAY_SERV); + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } /* Register connection's alert token in our lookup structure. @@ -153,16 +154,18 @@ static void smc_lgr_free_work(struct work_struct *work) free: spin_unlock_bh(&smc_lgr_list.lock); if (!delayed_work_pending(&lgr->free_work)) { - if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) + if (!lgr->is_smcd && + lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); smc_lgr_free(lgr); } } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, +static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, struct smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id) + char *peer_systemid, unsigned short vlan_id, + struct smcd_dev *smcismdev, u64 peer_gid) { struct smc_link_group *lgr; struct smc_link *lnk; @@ -170,17 +173,23 @@ static int smc_lgr_create(struct smc_sock *smc, int rc = 0; int i; + if (is_smcd && vlan_id) { + rc = smc_ism_get_vlan(smcismdev, vlan_id); + if (rc) + goto out; + } + lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = -ENOMEM; goto out; } - lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; + lgr->is_smcd = is_smcd; lgr->sync_err = 0; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); + rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); @@ -189,36 +198,44 @@ static int smc_lgr_create(struct smc_sock *smc, memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; - - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); - rc = smc_llc_link_init(lnk); - if (rc) - goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; - + if (is_smcd) { + /* SMC-D specific settings */ + lgr->peer_gid = peer_gid; + lgr->smcd = smcismdev; + } else { + /* SMC-R specific settings */ + lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; + memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* initialize link */ + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = SMC_SINGLE_LINK; + lnk->smcibdev = smcibdev; + lnk->ibport = ibport; + lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; + if (!smcibdev->initialized) + smc_ib_setup_per_ibdev(smcibdev); + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_llc_link_init(lnk); + if (rc) + goto free_lgr; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + } smc->conn.lgr = lgr; - rwlock_init(&lgr->conns_lock); spin_lock_bh(&smc_lgr_list.lock); list_add(&lgr->list, &smc_lgr_list.list); spin_unlock_bh(&smc_lgr_list.lock); @@ -264,7 +281,12 @@ void smc_conn_free(struct smc_connection *conn) { if (!conn->lgr) return; - smc_cdc_tx_dismiss_slots(conn); + if (conn->lgr->is_smcd) { + smc_ism_unset_conn(conn); + tasklet_kill(&conn->rx_tsklet); + } else { + smc_cdc_tx_dismiss_slots(conn); + } smc_lgr_unregister_conn(conn); smc_buf_unuse(conn); } @@ -280,8 +302,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, - struct smc_buf_desc *buf_desc) +static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; @@ -301,6 +323,28 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, kfree(buf_desc); } +static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, + struct smc_buf_desc *buf_desc) +{ + if (is_dmb) { + /* restore original buf len */ + buf_desc->len += sizeof(struct smcd_cdc_msg); + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + } else { + kfree(buf_desc->cpu_addr); + } + kfree(buf_desc); +} + +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + if (lgr->is_smcd) + smcd_buf_free(lgr, is_rmb, buf_desc); + else + smcr_buf_free(lgr, is_rmb, buf_desc); +} + static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; @@ -332,7 +376,10 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + if (lgr->is_smcd) + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + else + smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } @@ -357,7 +404,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) lgr->terminating = 1; if (!list_empty(&lgr->list)) /* forget lgr */ list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); @@ -374,7 +422,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); + if (!lgr->is_smcd) + wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); smc_lgr_schedule_free_work(lgr); } @@ -392,17 +441,44 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + if (!lgr->is_smcd && + lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) __smc_lgr_terminate(lgr); } spin_unlock_bh(&smc_lgr_list.lock); } +/* Called when SMC-D device is terminated or peer is lost */ +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) +{ + struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); + + /* run common cleanup function and build free list */ + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->is_smcd && lgr->smcd == dev && + (!peer_gid || lgr->peer_gid == peer_gid) && + !list_empty(&lgr->list)) { + __smc_lgr_terminate(lgr); + list_move(&lgr->list, &lgr_free_list); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + /* cancel the regular free workers and actually free lgrs */ + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + cancel_delayed_work_sync(&lgr->free_work); + smc_lgr_free(lgr); + } +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ -static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; @@ -477,10 +553,30 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) return -ENODEV; } +static bool smcr_lgr_match(struct smc_link_group *lgr, + struct smc_clc_msg_local *lcl, + enum smc_lgr_role role) +{ + return !memcmp(lgr->peer_systemid, lcl->id_for_peer, + SMC_SYSTEMID_LEN) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, + SMC_GID_SIZE) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, + sizeof(lcl->mac)) && + lgr->role == role; +} + +static bool smcd_lgr_match(struct smc_link_group *lgr, + struct smcd_dev *smcismdev, u64 peer_gid) +{ + return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; +} + /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact) + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid) { struct smc_connection *conn = &smc->conn; int local_contact = SMC_FIRST_CONTACT; @@ -502,17 +598,12 @@ int smc_conn_create(struct smc_sock *smc, spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); - if (!memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && + if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : + smcr_lgr_match(lgr, lcl, role)) && !lgr->sync_err && - (lgr->role == role) && - (lgr->vlan_id == vlan_id) && - ((role == SMC_CLNT) || - (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { + lgr->vlan_id == vlan_id && + (role == SMC_CLNT || + lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; @@ -535,16 +626,21 @@ int smc_conn_create(struct smc_sock *smc, create: if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, smcibdev, ibport, - lcl->id_for_peer, vlan_id); + rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, + lcl->id_for_peer, vlan_id, smcd, peer_gid); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ - rc = smc_link_determine_gid(conn->lgr); + if (!is_smcd) + rc = smc_link_determine_gid(conn->lgr); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + if (is_smcd) { + conn->rx_off = sizeof(struct smcd_cdc_msg); + smcd_cdc_rx_init(conn); /* init tasklet for this conn */ + } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif @@ -609,8 +705,8 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, - bool is_rmb, int bufsize) +static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, + bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; struct smc_link *lnk; @@ -668,7 +764,44 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, return buf_desc; } -static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) +#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ + +static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, + bool is_dmb, int bufsize) +{ + struct smc_buf_desc *buf_desc; + int rc; + + if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) + return ERR_PTR(-EAGAIN); + + /* try to alloc a new DMB */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + if (is_dmb) { + rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); + if (rc) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->pages = virt_to_page(buf_desc->cpu_addr); + /* CDC header stored in buf. So, pretend it was smaller */ + buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); + } else { + buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!buf_desc->cpu_addr) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->len = bufsize; + } + return buf_desc; +} + +static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; @@ -706,7 +839,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) break; /* found reusable slot */ } - buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize); + if (is_smcd) + buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); + else + buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); + if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) @@ -727,7 +864,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); - conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); + conn->rmbe_update_limit = + smc_rmb_wnd_update_limit(buf_desc->len); + if (is_smcd) + smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; @@ -740,6 +880,8 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -748,6 +890,8 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -756,6 +900,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -764,6 +910,8 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -774,16 +922,16 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) * the Linux implementation uses just one RMB-element per RMB, i.e. uses an * extra RMB for every connection in a link group */ -int smc_buf_create(struct smc_sock *smc) +int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ - rc = __smc_buf_create(smc, false); + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; /* create rmb */ - rc = __smc_buf_create(smc, true); + rc = __smc_buf_create(smc, is_smcd, true); if (rc) smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; @@ -865,7 +1013,8 @@ void smc_core_exit(void) spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 93cb3523bf50..8b47e0168fc3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -124,15 +124,28 @@ struct smc_buf_desc { void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ u32 used; /* currently used / unused */ u8 reused : 1; /* new created / reused */ u8 regerr : 1; /* err during registration */ + union { + struct { /* SMC-R */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + }; + struct { /* SMC-D */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ + }; + }; }; struct smc_rtoken { /* address/key of remote RMB */ @@ -148,12 +161,10 @@ struct smc_rtoken { /* address/key of remote RMB */ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ +struct smcd_dev; + struct smc_link_group { struct list_head list; - enum smc_lgr_role role; /* client or server */ - struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ - char peer_systemid[SMC_SYSTEMID_LEN]; - /* unique system_id of peer */ struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ @@ -163,17 +174,35 @@ struct smc_link_group { rwlock_t sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ rwlock_t rmbs_lock; /* protects rx buffers */ - struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] - [SMC_LINKS_PER_LGR_MAX]; - /* remote addr/key pairs */ - unsigned long rtokens_used_mask[BITS_TO_LONGS( - SMC_RMBS_PER_LGR_MAX)]; - /* used rtoken elements */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + + bool is_smcd; /* SMC-R or SMC-D */ + union { + struct { /* SMC-R */ + enum smc_lgr_role role; + /* client or server */ + struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; + /* smc link */ + char peer_systemid[SMC_SYSTEMID_LEN]; + /* unique system_id of peer */ + struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] + [SMC_LINKS_PER_LGR_MAX]; + /* remote addr/key pairs */ + unsigned long rtokens_used_mask[BITS_TO_LONGS + (SMC_RMBS_PER_LGR_MAX)]; + /* used rtoken elements */ + }; + struct { /* SMC-D */ + u64 peer_gid; + /* Peer GID (remote) */ + struct smcd_dev *smcd; + /* ISM device for VLAN reg. */ + }; + }; }; /* Find the connection associated with the given alert token in the link group. @@ -217,7 +246,8 @@ void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); -int smc_buf_create(struct smc_sock *smc); +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid); +int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); @@ -227,9 +257,13 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); +int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id); + void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact); + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid); +void smcd_conn_free(struct smc_connection *conn); void smc_core_exit(void); #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 839354402215..6d83eef1b743 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,7 +136,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr && + if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, @@ -155,6 +156,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } + if (smc->conn.lgr && smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && + !list_empty(&smc->conn.lgr->list)) { + struct smc_connection *conn = &smc->conn; + struct smcd_diag_dmbinfo dinfo = { + .linkid = *((u32 *)conn->lgr->id), + .peer_gid = conn->lgr->peer_gid, + .my_gid = conn->lgr->smcd->local_gid, + .token = conn->rmb_desc->token, + .peer_token = conn->peer_token + }; + + if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) + goto errout; + } nlmsg_end(skb, nlh); return 0; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 0eed7ab9f28b..36de2fd76170 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -143,6 +143,62 @@ out: return rc; } +static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct ib_gid_attr gattr; + int rc; + + rc = ib_query_gid(smcibdev->ibdev, ibport, 0, + &smcibdev->gid[ibport - 1], &gattr); + if (rc || !gattr.ndev) + return -ENODEV; + + memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); + dev_put(gattr.ndev); + return 0; +} + +/* Create an identifier unique for this instance of SMC-R. + * The MAC-address of the first active registered IB device + * plus a random 2-byte number is used to create this identifier. + * This name is delivered to the peer during connection initialization. + */ +static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, + u8 ibport) +{ + memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], + sizeof(smcibdev->mac[ibport - 1])); + get_random_bytes(&local_systemid[0], 2); +} + +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) +{ + return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; +} + +static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) +{ + int rc; + + memset(&smcibdev->pattr[ibport - 1], 0, + sizeof(smcibdev->pattr[ibport - 1])); + rc = ib_query_port(smcibdev->ibdev, ibport, + &smcibdev->pattr[ibport - 1]); + if (rc) + goto out; + /* the SMC protocol requires specification of the RoCE MAC address */ + rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); + if (rc) + goto out; + if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, + sizeof(local_systemid)) && + smc_ib_port_active(smcibdev, ibport)) + /* create unique system identifier */ + smc_ib_define_local_systemid(smcibdev, ibport); +out: + return rc; +} + /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -370,62 +426,6 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; } -static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct ib_gid_attr gattr; - int rc; - - rc = ib_query_gid(smcibdev->ibdev, ibport, 0, - &smcibdev->gid[ibport - 1], &gattr); - if (rc || !gattr.ndev) - return -ENODEV; - - memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); - dev_put(gattr.ndev); - return 0; -} - -/* Create an identifier unique for this instance of SMC-R. - * The MAC-address of the first active registered IB device - * plus a random 2-byte number is used to create this identifier. - * This name is delivered to the peer during connection initialization. - */ -static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, - u8 ibport) -{ - memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], - sizeof(smcibdev->mac[ibport - 1])); - get_random_bytes(&local_systemid[0], 2); -} - -bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) -{ - return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; -} - -int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) -{ - int rc; - - memset(&smcibdev->pattr[ibport - 1], 0, - sizeof(smcibdev->pattr[ibport - 1])); - rc = ib_query_port(smcibdev->ibdev, ibport, - &smcibdev->pattr[ibport - 1]); - if (rc) - goto out; - /* the SMC protocol requires specification of the RoCE MAC address */ - rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); - if (rc) - goto out; - if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, - sizeof(local_systemid)) && - smc_ib_port_active(smcibdev, ibport)) - /* create unique system identifier */ - smc_ib_define_local_systemid(smcibdev, ibport); -out: - return rc; -} - long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { @@ -454,9 +454,6 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smcibdev->roce_cq_recv = NULL; goto err; } - INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, - smc_ib_global_event_handler); - ib_register_event_handler(&smcibdev->event_handler); smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; return rc; @@ -472,7 +469,6 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) return; smcibdev->initialized = 0; smc_wr_remove_dev(smcibdev); - ib_unregister_event_handler(&smcibdev->event_handler); ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); } @@ -483,6 +479,8 @@ static struct ib_client smc_ib_client; static void smc_ib_add_dev(struct ib_device *ibdev) { struct smc_ib_device *smcibdev; + u8 port_cnt; + int i; if (ibdev->node_type != RDMA_NODE_IB_CA) return; @@ -498,6 +496,21 @@ static void smc_ib_add_dev(struct ib_device *ibdev) list_add_tail(&smcibdev->list, &smc_ib_devices.list); spin_unlock(&smc_ib_devices.lock); ib_set_client_data(ibdev, &smc_ib_client, smcibdev); + INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, + smc_ib_global_event_handler); + ib_register_event_handler(&smcibdev->event_handler); + + /* trigger reading of the port attributes */ + port_cnt = smcibdev->ibdev->phys_port_cnt; + for (i = 0; + i < min_t(size_t, port_cnt, SMC_MAX_PORTS); + i++) { + set_bit(i, &smcibdev->port_event_mask); + /* determine pnetids of the port */ + smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i]); + } + schedule_work(&smcibdev->port_event_work); } /* callback function for ib_register_client() */ @@ -512,6 +525,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_unlock(&smc_ib_devices.lock); smc_pnet_remove_by_ibdev(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); + ib_unregister_event_handler(&smcibdev->event_handler); kfree(smcibdev); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index e90630dadf8e..7c1223c91229 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -15,6 +15,7 @@ #include <linux/interrupt.h> #include <linux/if_ether.h> #include <rdma/ib_verbs.h> +#include <net/smc.h> #define SMC_MAX_PORTS 2 /* Max # of ports */ #define SMC_GID_SIZE sizeof(union ib_gid) @@ -40,6 +41,8 @@ struct smc_ib_device { /* ib-device infos for smc */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ + u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; + /* pnetid per port */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; @@ -51,7 +54,6 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c new file mode 100644 index 000000000000..cfade7fdcc6d --- /dev/null +++ b/net/smc/smc_ism.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * Functions for ISM device. + * + * Copyright IBM Corp. 2018 + */ + +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <asm/page.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_ism.h" +#include "smc_pnet.h" + +struct smcd_dev_list smcd_dev_list = { + .list = LIST_HEAD_INIT(smcd_dev_list.list), + .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock) +}; + +/* Test if an ISM communication is possible. */ +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) +{ + return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, + vlan_id); +} + +int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos, + void *data, size_t len) +{ + int rc; + + rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal, + pos->offset, data, len); + + return rc < 0 ? rc : 0; +} + +/* Set a connection using this DMBE. */ +void smc_ism_set_conn(struct smc_connection *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Unset a connection using this DMBE. */ +void smc_ism_unset_conn(struct smc_connection *conn) +{ + unsigned long flags; + + if (!conn->rmb_desc) + return; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Register a VLAN identifier with the ISM device. Use a reference count + * and add a VLAN identifier only when the first DMB using this VLAN is + * registered. + */ +int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *new_vlan, *vlan; + unsigned long flags; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + /* create new vlan entry, in case we need it */ + new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + new_vlan->vlanid = vlanid; + refcount_set(&new_vlan->refcnt, 1); + + /* if there is an existing entry, increase count and return */ + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + refcount_inc(&vlan->refcnt); + kfree(new_vlan); + goto out; + } + } + + /* no existing entry found. + * add new entry to device; might fail, e.g., if HW limit reached + */ + if (smcd->ops->add_vlan_id(smcd, vlanid)) { + kfree(new_vlan); + rc = -EIO; + goto out; + } + list_add_tail(&new_vlan->list, &smcd->vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +/* Unregister a VLAN identifier with the ISM device. Use a reference count + * and remove a VLAN identifier only when the last DMB using this VLAN is + * unregistered. + */ +int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *vlan; + unsigned long flags; + bool found = false; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + if (!refcount_dec_and_test(&vlan->refcnt)) + goto out; + found = true; + break; + } + } + if (!found) { + rc = -ENOENT; + goto out; /* VLAN id not in table */ + } + + /* Found and the last reference just gone */ + if (smcd->ops->del_vlan_id(smcd, vlanid)) + rc = -EIO; + list_del(&vlan->list); + kfree(vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = dmb_desc->token; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.cpu_addr = dmb_desc->cpu_addr; + dmb.dma_addr = dmb_desc->dma_addr; + dmb.dmb_len = dmb_desc->len; + return smcd->ops->unregister_dmb(smcd, &dmb); +} + +int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, + struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_len = dmb_len; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.vlan_id = lgr->vlan_id; + dmb.rgid = lgr->peer_gid; + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +struct smc_ism_event_work { + struct work_struct work; + struct smcd_dev *smcd; + struct smcd_event event; +}; + +/* worker for SMC-D events */ +static void smc_ism_event_work(struct work_struct *work) +{ + struct smc_ism_event_work *wrk = + container_of(work, struct smc_ism_event_work, work); + + switch (wrk->event.type) { + case ISM_EVENT_GID: /* GID event, token is peer GID */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok); + break; + case ISM_EVENT_DMB: + break; + } + kfree(wrk); +} + +static void smcd_release(struct device *dev) +{ + struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); + + kfree(smcd->conn); + kfree(smcd); +} + +struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); + if (!smcd->conn) { + kfree(smcd); + return NULL; + } + + smcd->dev.parent = parent; + smcd->dev.release = smcd_release; + device_initialize(&smcd->dev); + dev_set_name(&smcd->dev, name); + smcd->ops = ops; + smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + + spin_lock_init(&smcd->lock); + INIT_LIST_HEAD(&smcd->vlan); + smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", + WQ_MEM_RECLAIM, name); + return smcd; +} +EXPORT_SYMBOL_GPL(smcd_alloc_dev); + +int smcd_register_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_add_tail(&smcd->list, &smcd_dev_list.list); + spin_unlock(&smcd_dev_list.lock); + + return device_add(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_register_dev); + +void smcd_unregister_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_del(&smcd->list); + spin_unlock(&smcd_dev_list.lock); + flush_workqueue(smcd->event_wq); + destroy_workqueue(smcd->event_wq); + smc_smcd_terminate(smcd, 0); + + device_del(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_unregister_dev); + +void smcd_free_dev(struct smcd_dev *smcd) +{ + put_device(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_free_dev); + +/* SMCD Device event handler. Called from ISM device interrupt handler. + * Parameters are smcd device pointer, + * - event->type (0 --> DMB, 1 --> GID), + * - event->code (event code), + * - event->tok (either DMB token when event type 0, or GID when event type 1) + * - event->time (time of day) + * - event->info (debug info). + * + * Context: + * - Function called in IRQ context from ISM device driver event handler. + */ +void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +{ + struct smc_ism_event_work *wrk; + + /* copy event to event work queue, and let it be handled there */ + wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + if (!wrk) + return; + INIT_WORK(&wrk->work, smc_ism_event_work); + wrk->smcd = smcd; + wrk->event = *event; + queue_work(smcd->event_wq, &wrk->work); +} +EXPORT_SYMBOL_GPL(smcd_handle_event); + +/* SMCD Device interrupt handler. Called from ISM device interrupt handler. + * Parameters are smcd device pointer and DMB number. Find the connection and + * schedule the tasklet for this connection. + * + * Context: + * - Function called in IRQ context from ISM device driver IRQ handler. + */ +void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) +{ + struct smc_connection *conn = NULL; + unsigned long flags; + + spin_lock_irqsave(&smcd->lock, flags); + conn = smcd->conn[dmbno]; + if (conn) + tasklet_schedule(&conn->rx_tsklet); + spin_unlock_irqrestore(&smcd->lock, flags); +} +EXPORT_SYMBOL_GPL(smcd_handle_irq); diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h new file mode 100644 index 000000000000..aee45b860b79 --- /dev/null +++ b/net/smc/smc_ism.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * SMC-D ISM device structure definitions. + * + * Copyright IBM Corp. 2018 + */ + +#ifndef SMCD_ISM_H +#define SMCD_ISM_H + +#include <linux/uio.h> + +#include "smc.h" + +struct smcd_dev_list { /* List of SMCD devices */ + struct list_head list; + spinlock_t lock; /* Protects list of devices */ +}; + +extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ + +struct smc_ism_vlanid { /* VLAN id set on ISM device */ + struct list_head list; + unsigned short vlanid; /* Vlan id */ + refcount_t refcnt; /* Reference count */ +}; + +struct smc_ism_position { /* ISM device position to write to */ + u64 token; /* Token of DMB */ + u32 offset; /* Offset into DMBE */ + u8 index; /* Index of DMBE */ + u8 signal; /* Generate interrupt on owner side */ +}; + +struct smcd_dev; + +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); +void smc_ism_set_conn(struct smc_connection *conn); +void smc_ism_unset_conn(struct smc_connection *conn); +int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, + struct smc_buf_desc *dmb_desc); +int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, + void *data, size_t len); +#endif diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index d7b88b2d1b22..1b6c066d3495 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -22,13 +22,12 @@ #include "smc_pnet.h" #include "smc_ib.h" - -#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */ +#include "smc_ism.h" static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, - .len = SMC_MAX_PNET_ID_LEN - 1 + .len = SMC_MAX_PNETID_LEN - 1 }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, @@ -65,7 +64,7 @@ static struct smc_pnettable { */ struct smc_pnetentry { struct list_head list; - char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct net_device *ndev; struct smc_ib_device *smcibdev; u8 ib_port; @@ -209,7 +208,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) return false; while (--end >= bf && isspace(*end)) ; - if (end - bf >= SMC_MAX_PNET_ID_LEN) + if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) @@ -358,9 +357,6 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) kfree(pnetelem); return rc; } - rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port); - if (rc) - smc_pnet_remove_by_pnetid(pnetelem->pnet_name); return rc; } @@ -485,10 +481,10 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); + return NOTIFY_OK; default: - break; + return NOTIFY_DONE; } - return NOTIFY_DONE; } static struct notifier_block smc_netdev_notifier = { @@ -515,26 +511,91 @@ void smc_pnet_exit(void) genl_unregister_family(&smc_pnet_nl_family); } -/* PNET table analysis for a given sock: - * determine ib_device and port belonging to used internal TCP socket - * ethernet interface. +/* Determine one base device for stacked net devices. + * If the lower device level contains more than one devices + * (for instance with bonding slaves), just the first device + * is used to reach a base device. */ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport) +static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { - struct dst_entry *dst = sk_dst_get(sk); - struct smc_pnetentry *pnetelem; + int i, nest_lvl; - *smcibdev = NULL; - *ibport = 0; + rtnl_lock(); + nest_lvl = dev_get_nest_level(ndev); + for (i = 0; i < nest_lvl; i++) { + struct list_head *lower = &ndev->adj_list.lower; + + if (list_empty(lower)) + break; + lower = lower->next; + ndev = netdev_lower_get_next(ndev, &lower); + } + rtnl_unlock(); + return ndev; +} + +/* Determine the corresponding IB device port based on the hardware PNETID. + * Searching stops at the first matching active IB device port. + */ +static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, + struct smc_ib_device **smcibdev, + u8 *ibport) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smc_ib_device *ibdev; + int i; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid, + SMC_MAX_PNETID_LEN) && + smc_ib_port_active(ibdev, i)) { + *smcibdev = ibdev; + *ibport = i; + break; + } + } + } + spin_unlock(&smc_ib_devices.lock); +} + +static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, + struct smcd_dev **smcismdev) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smcd_dev *ismdev; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(ismdev, &smcd_dev_list.list, list) { + if (!memcmp(ismdev->pnetid, ndev_pnetid, SMC_MAX_PNETID_LEN)) { + *smcismdev = ismdev; + break; + } + } + spin_unlock(&smcd_dev_list.lock); +} + +/* Lookup of coupled ib_device via SMC pnet table */ +static void smc_pnet_find_roce_by_table(struct net_device *netdev, + struct smc_ib_device **smcibdev, + u8 *ibport) +{ + struct smc_pnetentry *pnetelem; - if (!dst) - return; - if (!dst->dev) - goto out_rel; read_lock(&smc_pnettable.lock); list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { - if (dst->dev == pnetelem->ndev) { + if (netdev == pnetelem->ndev) { if (smc_ib_port_active(pnetelem->smcibdev, pnetelem->ib_port)) { *smcibdev = pnetelem->smcibdev; @@ -544,6 +605,54 @@ void smc_pnet_find_roce_resource(struct sock *sk, } } read_unlock(&smc_pnettable.lock); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. + */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport); + if (*smcibdev) + goto out_rel; + + /* lookup via SMC PNET table */ + smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport); + +out_rel: + dst_release(dst); +out: + return; +} + +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcismdev = NULL; + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + out_rel: dst_release(dst); +out: + return; } diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5a29519db976..1e94fd4df7bc 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -12,12 +12,28 @@ #ifndef _SMC_PNET_H #define _SMC_PNET_H +#if IS_ENABLED(CONFIG_HAVE_PNETID) +#include <asm/pnet.h> +#endif + struct smc_ib_device; +struct smcd_dev; + +static inline int smc_pnetid_by_dev_port(struct device *dev, + unsigned short port, u8 *pnetid) +{ +#if IS_ENABLED(CONFIG_HAVE_PNETID) + return pnet_id_by_dev_port(dev, port, pnetid); +#else + return -ENOENT; +#endif +} int smc_pnet_init(void) __init; void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_ib_device **smcibdev, u8 *ibport); +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); #endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 3d77b383cccd..b329803c8339 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -305,7 +305,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ - rcvbuf_base = conn->rmb_desc->cpu_addr; + rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; do { /* while (read_remaining) */ if (read_done >= target || (pipe && read_done)) diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f82886b7d1d8..142bcb134dd6 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,7 @@ #include "smc.h" #include "smc_wr.h" #include "smc_cdc.h" +#include "smc_ism.h" #include "smc_tx.h" #define SMC_TX_WORK_DELAY HZ @@ -250,6 +251,24 @@ out_err: /***************************** sndbuf consumer *******************************/ +/* sndbuf consumer: actual data transfer of one target chunk with ISM write */ +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal) +{ + struct smc_ism_position pos; + int rc; + + memset(&pos, 0, sizeof(pos)); + pos.token = conn->peer_token; + pos.index = conn->peer_rmbe_idx; + pos.offset = conn->tx_off + offset; + pos.signal = signal; + rc = smc_ism_write(conn->lgr->smcd, &pos, data, len); + if (rc) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + return rc; +} + /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_sge sges[]) @@ -297,21 +316,104 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn, smc_curs_add(conn->sndbuf_desc->len, sent, len); } +/* SMC-R helper for smc_tx_rdma_writes() */ +static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len) +{ + dma_addr_t dma_addr = + sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + int src_len_sum = src_len, dst_len_sum = dst_len; + struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; + int sent_count = src_off; + int srcchunk, dstchunk; + int num_sges; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + num_sges = 0; + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + sges[srcchunk].addr = dma_addr + src_off; + sges[srcchunk].length = src_len; + sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; + num_sges++; + + src_off += src_len; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); + if (rc) + return rc; + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - + sent_count); + src_len_sum = src_len; + } + return 0; +} + +/* SMC-D helper for smc_tx_rdma_writes() */ +static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, + size_t src_off, size_t src_len, + size_t dst_off, size_t dst_len) +{ + int src_len_sum = src_len, dst_len_sum = dst_len; + int srcchunk, dstchunk; + int rc; + + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + void *data = conn->sndbuf_desc->cpu_addr + src_off; + + rc = smcd_tx_ism_write(conn, data, src_len, dst_off + + sizeof(struct smcd_cdc_msg), 0); + if (rc) + return rc; + dst_off += src_len; + src_off += src_len; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off); + src_len_sum = src_len; + } + return 0; +} + /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; * usable snd_wnd as max transmit */ static int smc_tx_rdma_writes(struct smc_connection *conn) { - size_t src_off, src_len, dst_off, dst_len; /* current chunk values */ - size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk; + size_t len, src_len, dst_off, dst_len; /* current chunk values */ union smc_host_cursor sent, prep, prod, cons; - struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; - struct smc_link_group *lgr = conn->lgr; struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; - struct smc_link *link; - dma_addr_t dma_addr; - int num_sges; int rc; /* source: sndbuf */ @@ -341,7 +443,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) len = min(to_send, rmbespace); /* initialize variables for first iteration of subsequent nested loop */ - link = &lgr->lnk[SMC_SINGLE_LINK]; dst_off = prod.count; if (prod.wrap == cons.wrap) { /* the filled destination area is unwrapped, @@ -358,8 +459,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) */ dst_len = len; } - dst_len_sum = dst_len; - src_off = sent.count; /* dst_len determines the maximum src_len */ if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ @@ -368,38 +467,15 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ src_len = conn->sndbuf_desc->len - sent.count; } - src_len_sum = src_len; - dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); - for (dstchunk = 0; dstchunk < 2; dstchunk++) { - num_sges = 0; - for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sges[srcchunk].addr = dma_addr + src_off; - sges[srcchunk].length = src_len; - sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; - num_sges++; - src_off += src_len; - if (src_off >= conn->sndbuf_desc->len) - src_off -= conn->sndbuf_desc->len; - /* modulo in send ring */ - if (src_len_sum == dst_len) - break; /* either on 1st or 2nd iteration */ - /* prepare next (== 2nd) iteration */ - src_len = dst_len - src_len; /* remainder */ - src_len_sum += src_len; - } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); - if (rc) - return rc; - if (dst_len_sum == len) - break; /* either on 1st or 2nd iteration */ - /* prepare next (== 2nd) iteration */ - dst_off = 0; /* modulo offset in RMBE ring buffer */ - dst_len = len - dst_len; /* remainder */ - dst_len_sum += dst_len; - src_len = min_t(int, - dst_len, conn->sndbuf_desc->len - sent.count); - src_len_sum = src_len; - } + + if (conn->lgr->is_smcd) + rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len); + else + rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, + dst_off, dst_len); + if (rc) + return rc; if (conn->urg_tx_pend && len == to_send) pflags->urg_data_present = 1; @@ -420,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags; struct smc_cdc_tx_pend *pend; @@ -467,6 +543,37 @@ out_unlock: return rc; } +static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + int rc = 0; + + spin_lock_bh(&conn->send_lock); + if (!pflags->urg_data_present) + rc = smc_tx_rdma_writes(conn); + if (!rc) + rc = smcd_cdc_msg_send(conn); + + if (!rc && pflags->urg_data_present) { + pflags->urg_data_pending = 0; + pflags->urg_data_present = 0; + } + spin_unlock_bh(&conn->send_lock); + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + if (conn->lgr->is_smcd) + rc = smcd_tx_sndbuf_nonempty(conn); + else + rc = smcr_tx_sndbuf_nonempty(conn); + + return rc; +} + /* Wakeup sndbuf consumers from process context * since there is more data to transmit */ diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 9d2238909fa0..b22bdc5694c4 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -33,5 +33,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); +int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, + u32 offset, int signal); #endif /* SMC_TX_H */ diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 625acb27efcc..3a512936eea9 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -140,11 +140,13 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, /* We are going to append to the frags_list of head. * Need to unshare the frag_list. */ - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - return 0; + if (skb_has_frag_list(head)) { + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + return 0; + } } if (unlikely(skb_shinfo(head)->frag_list)) { @@ -201,14 +203,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, memset(stm, 0, sizeof(*stm)); stm->strp.offset = orig_offset + eaten; } else { - /* Unclone since we may be appending to an skb that we + /* Unclone if we are appending to an skb that we * already share a frag_list with. */ - err = skb_unclone(skb, GFP_ATOMIC); - if (err) { - STRP_STATS_INCR(strp->stats.mem_fail); - desc->error = err; - break; + if (skb_has_frag_list(skb)) { + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.mem_fail); + desc->error = err; + break; + } } stm = _strp_msg(head); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 2dfb492a7c94..fd6d8f18955c 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -610,6 +610,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, case NETDEV_CHANGE: if (netif_carrier_ok(dev)) break; + /* else: fall through */ case NETDEV_UP: test_and_set_bit_lock(0, &b->up); break; diff --git a/net/tipc/group.c b/net/tipc/group.c index d7a7befeddd4..8f43e7d6046b 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -159,11 +159,6 @@ u32 tipc_group_exclude(struct tipc_group *grp) return 0; } -int tipc_group_size(struct tipc_group *grp) -{ - return grp->member_cnt; -} - struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) @@ -918,3 +913,35 @@ void tipc_group_member_evt(struct tipc_group *grp, } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } + +int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) +{ + struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP); + + if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, + grp->type) || + nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, + grp->instance) || + nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, + grp->bc_snd_nxt)) + goto group_msg_cancel; + + if (grp->scope == TIPC_NODE_SCOPE) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) + goto group_msg_cancel; + + if (grp->scope == TIPC_CLUSTER_SCOPE) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) + goto group_msg_cancel; + + if (*grp->open) + if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) + goto group_msg_cancel; + + nla_nest_end(skb, group); + return 0; + +group_msg_cancel: + nla_nest_cancel(skb, group); + return -1; +} diff --git a/net/tipc/group.h b/net/tipc/group.h index 5996af6e9f1d..76b4e5a7b39d 100644 --- a/net/tipc/group.h +++ b/net/tipc/group.h @@ -72,4 +72,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq); u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); void tipc_group_update_member(struct tipc_member *m, int len); +int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index 695acb783969..6987ffc8e7a1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -106,7 +106,8 @@ struct tipc_stats { * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages * @last_retransmitted: sequence number of most recently retransmitted message - * @stale_count: # of identical retransmit requests made by peer + * @stale_cnt: counter for number of identical retransmit attempts + * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. * @rcv_nxt: next sequence number to expect for inbound messages @@ -127,14 +128,17 @@ struct tipc_link { struct net *net; /* Management and link supervision data */ - u32 peer_session; - u32 session; + u16 peer_session; + u16 session; + u16 snd_nxt_state; + u16 rcv_nxt_state; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; u32 abort_limit; u32 state; u16 peer_caps; + bool in_session; bool active; u32 silent_intv_cnt; char if_name[TIPC_MAX_IF_NAME]; @@ -161,7 +165,8 @@ struct tipc_link { u16 snd_nxt; u16 last_retransm; u16 window; - u32 stale_count; + u16 stale_cnt; + unsigned long stale_limit; /* Reception */ u16 rcv_nxt; @@ -212,11 +217,6 @@ enum { */ #define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) -/* Wildcard value for link session numbers. When it is known that - * peer endpoint is down, any session number must be accepted. - */ -#define ANY_SESSION 0x10000 - /* Link FSM states: */ enum { @@ -297,11 +297,6 @@ static bool link_is_bc_rcvlink(struct tipc_link *l) return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l)); } -int tipc_link_is_active(struct tipc_link *l) -{ - return l->active; -} - void tipc_link_set_active(struct tipc_link *l, bool active) { l->active = active; @@ -337,6 +332,11 @@ char tipc_link_plane(struct tipc_link *l) return l->net_plane; } +void tipc_link_update_caps(struct tipc_link *l, u16 capabilities) +{ + l->peer_caps = capabilities; +} + void tipc_link_add_bc_peer(struct tipc_link *snd_l, struct tipc_link *uc_l, struct sk_buff_head *xmitq) @@ -469,7 +469,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->addr = peer; l->peer_caps = peer_caps; l->net = net; - l->peer_session = ANY_SESSION; + l->in_session = false; l->bearer_id = bearer_id; l->tolerance = tolerance; l->net_plane = net_plane; @@ -838,7 +838,7 @@ void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { - l->peer_session = ANY_SESSION; + l->in_session = false; l->session++; l->mtu = l->advertised_mtu; __skb_queue_purge(&l->transmq); @@ -857,10 +857,12 @@ void tipc_link_reset(struct tipc_link *l) l->rcv_unacked = 0; l->snd_nxt = 1; l->rcv_nxt = 1; + l->snd_nxt_state = 1; + l->rcv_nxt_state = 1; l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stale_count = 0; + l->stale_cnt = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); @@ -997,39 +999,41 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb) msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); } -int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker, +/* tipc_link_retrans() - retransmit one or more packets + * @l: the link to transmit on + * @r: the receiving link ordering the retransmit. Same as l if unicast + * @from: retransmit from (inclusive) this sequence number + * @to: retransmit to (inclusive) this sequence number + * xmitq: queue for accumulating the retransmitted packets + */ +int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, u16 from, u16 to, struct sk_buff_head *xmitq) { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); - struct tipc_msg *hdr; - u16 ack = l->rcv_nxt - 1; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + u16 ack = l->rcv_nxt - 1; + struct tipc_msg *hdr; if (!skb) return 0; /* Detect repeated retransmit failures on same packet */ - if (nacker->last_retransm != buf_seqno(skb)) { - nacker->last_retransm = buf_seqno(skb); - nacker->stale_count = 1; - } else if (++nacker->stale_count > 100) { + if (r->last_retransm != buf_seqno(skb)) { + r->last_retransm = buf_seqno(skb); + r->stale_limit = jiffies + msecs_to_jiffies(l->tolerance); + } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { link_retransmit_failure(l, skb); - nacker->stale_count = 0; if (link_is_bc_sndlink(l)) return TIPC_LINK_DOWN_EVT; return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } - /* Move forward to where retransmission should start */ skb_queue_walk(&l->transmq, skb) { - if (!less(buf_seqno(skb), from)) - break; - } - - skb_queue_walk_from(&l->transmq, skb) { - if (more(buf_seqno(skb), to)) - break; hdr = buf_msg(skb); + if (less(msg_seqno(hdr), from)) + continue; + if (more(msg_seqno(hdr), to)) + break; _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; @@ -1063,6 +1067,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, skb_queue_tail(mc_inputq, skb); return true; } + /* else: fall through */ case CONN_MANAGER: skb_queue_tail(inputq, skb); return true; @@ -1271,6 +1276,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Forward queues and wake up waiting users */ if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { + l->stale_cnt = 0; tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); @@ -1347,6 +1353,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); if (mtyp == STATE_MSG) { + if (l->peer_caps & TIPC_LINK_PROTO_SEQNO) + msg_set_seqno(hdr, l->snd_nxt_state++); msg_set_seq_gap(hdr, rcvgap); msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); msg_set_probe(hdr, probe); @@ -1438,6 +1446,44 @@ tnl: } } +/* tipc_link_validate_msg(): validate message against current link state + * Returns true if message should be accepted, otherwise false + */ +bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) +{ + u16 curr_session = l->peer_session; + u16 session = msg_session(hdr); + int mtyp = msg_type(hdr); + + if (msg_user(hdr) != LINK_PROTOCOL) + return true; + + switch (mtyp) { + case RESET_MSG: + if (!l->in_session) + return true; + /* Accept only RESET with new session number */ + return more(session, curr_session); + case ACTIVATE_MSG: + if (!l->in_session) + return true; + /* Accept only ACTIVATE with new or current session number */ + return !less(session, curr_session); + case STATE_MSG: + /* Accept only STATE with current session number */ + if (!l->in_session) + return false; + if (session != curr_session) + return false; + if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) + return true; + /* Accept only STATE with new sequence number */ + return !less(msg_seqno(hdr), l->rcv_nxt_state); + default: + return false; + } +} + /* tipc_link_proto_rcv(): receive link level protocol message : * Note that network plane id propagates through the network, and may * change at any time. The node with lowest numerical id determines @@ -1471,17 +1517,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, hdr = buf_msg(skb); data = msg_data(hdr); + if (!tipc_link_validate_msg(l, hdr)) + goto exit; + switch (mtyp) { case RESET_MSG: - - /* Ignore duplicate RESET with old session number */ - if ((less_eq(msg_session(hdr), l->peer_session)) && - (l->peer_session != ANY_SESSION)) - break; - /* fall thru' */ - case ACTIVATE_MSG: - /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) @@ -1509,12 +1550,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = TIPC_LINK_UP_EVT; l->peer_session = msg_session(hdr); + l->in_session = true; l->peer_bearer_id = msg_bearer_id(hdr); if (l->mtu > msg_max_pkt(hdr)) l->mtu = msg_max_pkt(hdr); break; case STATE_MSG: + l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) diff --git a/net/tipc/link.h b/net/tipc/link.h index ec59348a81e8..7bc494a33fdf 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -110,6 +110,8 @@ char *tipc_link_name(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); +void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); +bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b6c45dccba3d..b61891054709 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -416,26 +416,31 @@ bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu) */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { - struct tipc_msg *msg; - int imsz, offset; + struct tipc_msg *hdr, *ihdr; + int imsz; *iskb = NULL; if (unlikely(skb_linearize(skb))) goto none; - msg = buf_msg(skb); - offset = msg_hdr_sz(msg) + *pos; - if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE))) + hdr = buf_msg(skb); + if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE))) goto none; - *iskb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!*iskb)) + ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos); + imsz = msg_size(ihdr); + + if ((*pos + imsz) > msg_data_sz(hdr)) goto none; - skb_pull(*iskb, offset); - imsz = msg_size(buf_msg(*iskb)); - skb_trim(*iskb, imsz); + + *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC); + if (!*iskb) + goto none; + + skb_copy_to_linear_data(*iskb, ihdr, imsz); if (unlikely(!tipc_msg_validate(iskb))) goto none; + *pos += align(imsz); return true; none: @@ -531,12 +536,6 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) msg_set_hdr_sz(hdr, BASIC_H_SIZE); } - if (skb_cloned(_skb) && - pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) - goto exit; - - /* reassign after skb header modifications */ - hdr = buf_msg(_skb); /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); msg_set_non_seq(hdr, 0); @@ -595,10 +594,6 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) if (!skb_cloned(skb)) return true; - /* Unclone buffer in case it was bundled */ - if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) - return false; - return true; } diff --git a/net/tipc/node.c b/net/tipc/node.c index 0453bd451ce8..68014f1b6976 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -45,6 +45,7 @@ #include "netlink.h" #define INVALID_NODE_SIG 0x10000 +#define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down @@ -96,6 +97,7 @@ struct tipc_bclink_entry { * @link_id: local and remote bearer ids of changing link, if any * @publ_list: list of publications * @rcu: rcu struct for tipc_node + * @delete_at: indicates the time for deleting a down node */ struct tipc_node { u32 addr; @@ -121,6 +123,7 @@ struct tipc_node { unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; + unsigned long delete_at; }; /* Node FSM states and events: @@ -160,6 +163,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); +static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; @@ -359,13 +363,24 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; + struct tipc_link *l; + int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); if (n) { + if (n->capabilities == capabilities) + goto exit; /* Same node may come back with new capabilities */ + write_lock_bh(&n->lock); n->capabilities = capabilities; + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + l = n->links[bearer_id].link; + if (l) + tipc_link_update_caps(l, capabilities); + } + write_unlock_bh(&n->lock); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -390,6 +405,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; + n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; @@ -433,11 +449,16 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } -static void tipc_node_delete(struct tipc_node *node) +static void tipc_node_delete_from_list(struct tipc_node *node) { list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); +} + +static void tipc_node_delete(struct tipc_node *node) +{ + tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); @@ -544,6 +565,42 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) tipc_node_put(node); } +static void tipc_node_clear_links(struct tipc_node *node) +{ + int i; + + for (i = 0; i < MAX_BEARERS; i++) { + struct tipc_link_entry *le = &node->links[i]; + + if (le->link) { + kfree(le->link); + le->link = NULL; + node->link_cnt--; + } + } +} + +/* tipc_node_cleanup - delete nodes that does not + * have active links for NODE_CLEANUP_AFTER time + */ +static int tipc_node_cleanup(struct tipc_node *peer) +{ + struct tipc_net *tn = tipc_net(peer->net); + bool deleted = false; + + spin_lock_bh(&tn->node_list_lock); + tipc_node_write_lock(peer); + + if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { + tipc_node_clear_links(peer); + tipc_node_delete_from_list(peer); + deleted = true; + } + tipc_node_write_unlock(peer); + spin_unlock_bh(&tn->node_list_lock); + return deleted; +} + /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) @@ -551,21 +608,29 @@ static void tipc_node_timeout(struct timer_list *t) struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; + int remains = n->link_cnt; int bearer_id; int rc = 0; + if (!node_is_up(n) && tipc_node_cleanup(n)) { + /*Removing the reference of Timer*/ + tipc_node_put(n); + return; + } + __skb_queue_head_init(&xmitq); - for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; - spin_lock_bh(&le->lock); if (le->link) { + spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); + spin_unlock_bh(&le->lock); + remains--; } - spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); if (rc & TIPC_LINK_DOWN_EVT) @@ -1174,6 +1239,7 @@ static void node_lost_contact(struct tipc_node *n, uint i; pr_debug("Lost contact with %x\n", n->addr); + n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); @@ -1481,7 +1547,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id * tipc_node_check_state - check and if necessary update node state * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet - * Returns true if state is ok, otherwise consumes buffer and returns false + * Returns true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) @@ -1515,6 +1581,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, } } + if (!tipc_link_validate_msg(l, hdr)) + return false; + /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) @@ -1743,7 +1812,6 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) struct tipc_node *peer; u32 addr; int err; - int i; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) @@ -1778,15 +1846,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) goto err_out; } - for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link_entry *le = &peer->links[i]; - - if (le->link) { - kfree(le->link); - le->link = NULL; - peer->link_cnt--; - } - } + tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); diff --git a/net/tipc/node.h b/net/tipc/node.h index 846c8f240872..48b3298a248d 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -49,14 +49,16 @@ enum { TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), - TIPC_NODE_ID128 = (1 << 5) + TIPC_NODE_ID128 = (1 << 5), + TIPC_LINK_PROTO_SEQNO = (1 << 6) }; -#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ - TIPC_BCAST_STATE_NACK | \ - TIPC_BCAST_RCAST | \ - TIPC_BLOCK_FLOWCTL | \ - TIPC_NODE_ID128) +#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ + TIPC_BCAST_STATE_NACK | \ + TIPC_BCAST_RCAST | \ + TIPC_BLOCK_FLOWCTL | \ + TIPC_NODE_ID128 | \ + TIPC_LINK_PROTO_SEQNO) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 930852c54d7a..3d21414ba357 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3320,6 +3320,11 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, goto stat_msg_cancel; nla_nest_end(skb, stat); + + if (tsk->group) + if (tipc_group_fill_sock_diag(tsk->group, skb)) + goto stat_msg_cancel; + nla_nest_end(skb, attrs); return 0; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a7a8f8e20ff3..1e968d238adf 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,9 +52,12 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); + if (ctx->tx_conf == TLS_HW) + kfree(tls_offload_ctx_tx(ctx)); + + if (ctx->rx_conf == TLS_HW) + kfree(tls_offload_ctx_rx(ctx)); - kfree(offload_ctx); kfree(ctx); } @@ -71,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work) list_for_each_entry_safe(ctx, tmp, &gc_list, list) { struct net_device *netdev = ctx->netdev; - if (netdev) { + if (netdev && ctx->tx_conf == TLS_HW) { netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); + ctx->netdev = NULL; } list_del(&ctx->list); @@ -82,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work) } } +static void tls_device_attach(struct tls_context *ctx, struct sock *sk, + struct net_device *netdev) +{ + if (sk->sk_destruct != tls_device_sk_destruct) { + refcount_set(&ctx->refcount, 1); + dev_hold(netdev); + ctx->netdev = netdev; + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_device_sk_destruct; + } +} + static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { unsigned long flags; @@ -125,7 +145,7 @@ static void destroy_record(struct tls_record_info *record) kfree(record); } -static void delete_all_records(struct tls_offload_context *offload_ctx) +static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; @@ -141,14 +161,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; - struct tls_offload_context *ctx; + struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; - ctx = tls_offload_ctx(tls_ctx); + ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; @@ -179,15 +199,17 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - if (ctx->open_record) - destroy_record(ctx->open_record); + tls_ctx->sk_destruct(sk); - delete_all_records(ctx); - crypto_free_aead(ctx->aead_send); - ctx->sk_destruct(sk); - clean_acked_data_disable(inet_csk(sk)); + if (tls_ctx->tx_conf == TLS_HW) { + if (ctx->open_record) + destroy_record(ctx->open_record); + delete_all_records(ctx); + crypto_free_aead(ctx->aead_send); + clean_acked_data_disable(inet_csk(sk)); + } if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); @@ -219,7 +241,7 @@ static void tls_append_frag(struct tls_record_info *record, static int tls_push_record(struct sock *sk, struct tls_context *ctx, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, struct page_frag *pfrag, int flags, @@ -264,7 +286,7 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } -static int tls_create_new_record(struct tls_offload_context *offload_ctx, +static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -290,7 +312,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx, } static int tls_do_allocation(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -324,7 +346,7 @@ static int tls_push_data(struct sock *sk, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; @@ -477,7 +499,7 @@ out: return rc; } -struct tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; @@ -520,11 +542,123 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev = tls_ctx->netdev; + struct tls_offload_context_rx *rx_ctx; + u32 is_req_pending; + s64 resync_req; + u32 req_seq; + + if (tls_ctx->rx_conf != TLS_HW) + return; + + rx_ctx = tls_offload_ctx_rx(tls_ctx); + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); + is_req_pending = resync_req; + + if (unlikely(is_req_pending) && req_seq == seq && + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, + seq + TLS_HEADER_SIZE - 1, + rcd_sn); +} + +static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + int err = 0, offset = rxm->offset, copy, nsg; + struct sk_buff *skb_iter, *unused; + struct scatterlist sg[1]; + char *orig_buf, *buf; + + orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation); + if (!orig_buf) + return -ENOMEM; + buf = orig_buf; + + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + err = nsg; + goto free_buf; + } + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], buf, + rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE); + skb_copy_bits(skb, offset, buf, + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + + /* We are interested only in the decrypted data not the auth */ + err = decrypt_skb(sk, skb, sg); + if (err != -EBADMSG) + goto free_buf; + else + err = 0; + + copy = min_t(int, skb_pagelen(skb) - offset, + rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb->decrypted) + skb_store_bits(skb, offset, buf, copy); + + offset += copy; + buf += copy; + + skb_walk_frags(skb, skb_iter) { + copy = min_t(int, skb_iter->len, + rxm->full_len - offset + rxm->offset - + TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb_iter->decrypted) + skb_store_bits(skb_iter, offset, buf, copy); + + offset += copy; + buf += copy; + } + +free_buf: + kfree(orig_buf); + return err; +} + +int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); + int is_decrypted = skb->decrypted; + int is_encrypted = !is_decrypted; + struct sk_buff *skb_iter; + + /* Skip if it is already decrypted */ + if (ctx->sw.decrypted) + return 0; + + /* Check if all the data is decrypted already */ + skb_walk_frags(skb, skb_iter) { + is_decrypted &= skb_iter->decrypted; + is_encrypted &= !skb_iter->decrypted; + } + + ctx->sw.decrypted |= is_decrypted; + + /* Return immedeatly if the record is either entirely plaintext or + * entirely ciphertext. Otherwise handle reencrypt partially decrypted + * record. + */ + return (is_encrypted || is_decrypted) ? 0 : + tls_device_reencrypt(sk, skb); +} + int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; struct tls_record_info *start_marker_record; - struct tls_offload_context *offload_ctx; + struct tls_offload_context_tx *offload_ctx; struct tls_crypto_info *crypto_info; struct net_device *netdev; char *iv, *rec_seq; @@ -546,7 +680,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto out; } - offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); + offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; @@ -609,7 +743,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; - offload_ctx->sk_destruct = sk->sk_destruct; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. @@ -619,8 +752,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (skb) TCP_SKB_CB(skb)->eor = 1; - refcount_set(&ctx->refcount, 1); - /* We support starting offload on multiple sockets * concurrently, so we only need a read lock here. * This lock must precede get_netdev_for_sock to prevent races between @@ -655,19 +786,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (rc) goto release_netdev; - ctx->netdev = netdev; - - spin_lock_irq(&tls_device_lock); - list_add_tail(&ctx->list, &tls_device_list); - spin_unlock_irq(&tls_device_lock); + tls_device_attach(ctx, sk, netdev); - sk->sk_validate_xmit_skb = tls_validate_xmit_skb; /* following this assignment tls_is_sk_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ - smp_store_release(&sk->sk_destruct, - &tls_device_sk_destruct); + smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); + dev_put(netdev); up_read(&device_offload_lock); goto out; @@ -690,6 +816,105 @@ out: return rc; } +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) +{ + struct tls_offload_context_rx *context; + struct net_device *netdev; + int rc = 0; + + /* We support starting offload on multiple sockets + * concurrently, so we only need a read lock here. + * This lock must precede get_netdev_for_sock to prevent races between + * NETDEV_DOWN and setsockopt. + */ + down_read(&device_offload_lock); + netdev = get_netdev_for_sock(sk); + if (!netdev) { + pr_err_ratelimited("%s: netdev not found\n", __func__); + rc = -EINVAL; + goto release_lock; + } + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: netdev %s with no TLS offload\n", + __func__, netdev->name); + rc = -ENOTSUPP; + goto release_netdev; + } + + /* Avoid offloading if the device is down + * We don't want to offload new flows after + * the NETDEV_DOWN event + */ + if (!(netdev->flags & IFF_UP)) { + rc = -EINVAL; + goto release_netdev; + } + + context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL); + if (!context) { + rc = -ENOMEM; + goto release_netdev; + } + + ctx->priv_ctx_rx = context; + rc = tls_set_sw_offload(sk, ctx, 0); + if (rc) + goto release_ctx; + + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, + &ctx->crypto_recv, + tcp_sk(sk)->copied_seq); + if (rc) { + pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", + __func__); + goto free_sw_resources; + } + + tls_device_attach(ctx, sk, netdev); + goto release_netdev; + +free_sw_resources: + tls_sw_free_resources_rx(sk); +release_ctx: + ctx->priv_ctx_rx = NULL; +release_netdev: + dev_put(netdev); +release_lock: + up_read(&device_offload_lock); + return rc; +} + +void tls_device_offload_cleanup_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (!netdev) + goto out; + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", + __func__); + goto out; + } + + netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, + TLS_OFFLOAD_CTX_DIR_RX); + + if (tls_ctx->tx_conf != TLS_HW) { + dev_put(netdev); + tls_ctx->netdev = NULL; + } +out: + up_read(&device_offload_lock); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + tls_sw_release_resources_rx(sk); +} + static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; @@ -710,8 +935,12 @@ static int tls_device_down(struct net_device *netdev) spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { - netdev->tlsdev_ops->tls_dev_del(netdev, ctx, - TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->tx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->rx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_RX); ctx->netdev = NULL; dev_put(netdev); list_del_init(&ctx->list); @@ -732,12 +961,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & NETIF_F_HW_TLS_TX)) + if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: + if ((dev->features & NETIF_F_HW_TLS_RX) && + !dev->tlsdev_ops->tls_dev_resync_rx) + return NOTIFY_BAD; + if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 748914abdb60..e3313c45663f 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) static int fill_sg_in(struct scatterlist *sg_in, struct sk_buff *skb, - struct tls_offload_context *ctx, + struct tls_offload_context_tx *ctx, u64 *rcd_sn, s32 *sync_size, int *resync_sgs) @@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, s32 sync_size, u64 rcd_sn) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; void *buf, *iv, *aad, *dummy_buf; struct aead_request *aead_req; @@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; struct scatterlist *sg_in, sg_out[3]; struct sk_buff *nskb = NULL; @@ -413,9 +413,10 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, return tls_sw_fallback(sk, skb); } +EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) { const u8 *key; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 301f22430469..b09867c8b817 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -51,15 +51,6 @@ enum { TLSV6, TLS_NUM_PROTS, }; -enum { - TLS_BASE, - TLS_SW, -#ifdef CONFIG_TLS_DEVICE - TLS_HW, -#endif - TLS_HW_RECORD, - TLS_NUM_CONFIG, -}; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); @@ -290,7 +281,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } #ifdef CONFIG_TLS_DEVICE - if (ctx->tx_conf != TLS_HW) { + if (ctx->rx_conf == TLS_HW) + tls_device_offload_cleanup_rx(sk); + + if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) { #else { #endif @@ -470,8 +464,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, conf = TLS_SW; } } else { - rc = tls_set_sw_offload(sk, ctx, 0); - conf = TLS_SW; +#ifdef CONFIG_TLS_DEVICE + rc = tls_set_device_offload_rx(sk, ctx); + conf = TLS_HW; + if (rc) { +#else + { +#endif + rc = tls_set_sw_offload(sk, ctx, 0); + conf = TLS_SW; + } } if (rc) @@ -629,6 +631,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; + + prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; + + prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW]; + + prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 4618f1c31137..0c2d029c9d4c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -53,18 +53,14 @@ static int tls_do_decryption(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm = strp_msg(skb); struct aead_request *aead_req; int ret; - unsigned int req_size = sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_recv); - aead_req = kzalloc(req_size, flags); + aead_req = aead_request_alloc(ctx->aead_recv, flags); if (!aead_req) return -ENOMEM; - aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, @@ -74,19 +70,7 @@ static int tls_do_decryption(struct sock *sk, ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - if (ret < 0) - goto out; - - rxm->offset += tls_ctx->rx.prepend_size; - rxm->full_len -= tls_ctx->rx.overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx); - - ctx->decrypted = true; - - ctx->saved_data_ready(sk); - -out: - kfree(aead_req); + aead_request_free(aead_req); return ret; } @@ -224,8 +208,7 @@ static int tls_push_record(struct sock *sk, int flags, struct aead_request *req; int rc; - req = kzalloc(sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); + req = aead_request_alloc(ctx->aead_send, sk->sk_allocation); if (!req) return -ENOMEM; @@ -267,7 +250,7 @@ static int tls_push_record(struct sock *sk, int flags, tls_advance_record_sn(sk, &tls_ctx->tx); out_req: - kfree(req); + aead_request_free(req); return rc; } @@ -280,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, int length, int *pages_used, unsigned int *size_used, struct scatterlist *to, int to_max_pages, - bool charge) + bool charge, bool revert) { struct page *pages[MAX_SKB_FRAGS]; @@ -331,6 +314,8 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, out: *size_used = size; *pages_used = num_elem; + if (revert) + iov_iter_revert(from, size); return rc; } @@ -432,7 +417,7 @@ alloc_encrypted: &ctx->sg_plaintext_size, ctx->sg_plaintext_data, ARRAY_SIZE(ctx->sg_plaintext_data), - true); + true, false); if (ret) goto fallback_to_reg_send; @@ -670,8 +655,38 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } -static int decrypt_skb(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) +static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout, bool *zc) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int err = 0; + +#ifdef CONFIG_TLS_DEVICE + err = tls_device_decrypted(sk, skb); + if (err < 0) + return err; +#endif + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + } else { + *zc = false; + } + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + ctx->decrypted = true; + ctx->saved_data_ready(sk); + + return err; +} + +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -817,11 +832,11 @@ int tls_sw_recvmsg(struct sock *sk, err = zerocopy_from_iter(sk, &msg->msg_iter, to_copy, &pages, &chunk, &sgin[1], - MAX_SKB_FRAGS, false); + MAX_SKB_FRAGS, false, true); if (err < 0) goto fallback_to_reg_recv; - err = decrypt_skb(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin, &zc); for (; pages > 0; pages--) put_page(sg_page(&sgin[pages])); if (err < 0) { @@ -830,7 +845,7 @@ int tls_sw_recvmsg(struct sock *sk, } } else { fallback_to_reg_recv: - err = decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); goto recv_end; @@ -885,6 +900,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int err = 0; long timeo; int chunk; + bool zc; lock_sock(sk); @@ -901,7 +917,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); @@ -947,7 +963,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - char header[tls_ctx->rx.prepend_size]; + char header[TLS_HEADER_SIZE + MAX_IV_SIZE]; struct strp_msg *rxm = strp_msg(skb); size_t cipher_overhead; size_t data_len = 0; @@ -957,6 +973,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) return 0; + /* Sanity-check size of on-stack buffer. */ + if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) { + ret = -EINVAL; + goto read_failure; + } + /* Linearize header to local buffer */ ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); @@ -984,6 +1006,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } +#ifdef CONFIG_TLS_DEVICE + handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, + *(u64*)tls_ctx->rx.rec_seq); +#endif return data_len + TLS_HEADER_SIZE; read_failure: @@ -996,9 +1022,6 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm; - - rxm = strp_msg(skb); ctx->decrypted = false; @@ -1028,7 +1051,7 @@ void tls_sw_free_resources_tx(struct sock *sk) kfree(ctx); } -void tls_sw_free_resources_rx(struct sock *sk) +void tls_sw_release_resources_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -1047,6 +1070,14 @@ void tls_sw_free_resources_rx(struct sock *sk) strp_done(&ctx->strp); lock_sock(sk); } +} + +void tls_sw_free_resources_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + + tls_sw_release_resources_rx(sk); kfree(ctx); } @@ -1071,28 +1102,38 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } if (tx) { - sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); - if (!sw_ctx_tx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_tx) { + sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); + if (!sw_ctx_tx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_tx = sw_ctx_tx; + } else { + sw_ctx_tx = + (struct tls_sw_context_tx *)ctx->priv_ctx_tx; } - crypto_init_wait(&sw_ctx_tx->async_wait); - ctx->priv_ctx_tx = sw_ctx_tx; } else { - sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); - if (!sw_ctx_rx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx_rx) { + sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); + if (!sw_ctx_rx) { + rc = -ENOMEM; + goto out; + } + ctx->priv_ctx_rx = sw_ctx_rx; + } else { + sw_ctx_rx = + (struct tls_sw_context_rx *)ctx->priv_ctx_rx; } - crypto_init_wait(&sw_ctx_rx->async_wait); - ctx->priv_ctx_rx = sw_ctx_rx; } if (tx) { + crypto_init_wait(&sw_ctx_tx->async_wait); crypto_info = &ctx->crypto_send; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { + crypto_init_wait(&sw_ctx_rx->async_wait); crypto_info = &ctx->crypto_recv; cctx = &ctx->rx; aead = &sw_ctx_rx->aead_recv; @@ -1117,7 +1158,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } /* Sanity-check the IV size for stack allocations. */ - if (iv_size > MAX_IV_SIZE) { + if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) { rc = -EINVAL; goto free_priv; } diff --git a/net/wireless/core.c b/net/wireless/core.c index 48e8097339ab..a88551f3bc43 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2015 Intel Deutschland GmbH + * Copyright 2015-2017 Intel Deutschland GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { + u16 types = 0; + sband = wiphy->bands[band]; if (!sband) continue; @@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].band = band; } + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *iftd; + + iftd = &sband->iftype_data[i]; + + if (WARN_ON(!iftd->types_mask)) + return -EINVAL; + if (WARN_ON(types & iftd->types_mask)) + return -EINVAL; + + /* at least one piece of information must be present */ + if (WARN_ON(!iftd->he_cap.has_he)) + return -EINVAL; + + types |= iftd->types_mask; + } + have_band = true; } diff --git a/net/wireless/core.h b/net/wireless/core.h index 63eb1b5fdd04..7f52ef569320 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -76,7 +76,7 @@ struct cfg80211_registered_device { struct cfg80211_scan_request *scan_req; /* protected by RTNL */ struct sk_buff *scan_msg; struct list_head sched_scan_req_list; - unsigned long suspend_at; + time64_t suspend_at; struct work_struct scan_done_wk; struct genl_info *cur_cmd_info; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4eece06be1e7..e4e5f025d16b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 }, [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, + [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, + .len = NL80211_HE_MAX_CAPABILITY_LEN }, }; /* policy for the key attributes */ @@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg, return 0; } +static int +nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_sband_iftype_data *iftdata) +{ + const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; + + if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, + iftdata->types_mask)) + return -ENOBUFS; + + if (he_cap->has_he) { + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC, + sizeof(he_cap->he_cap_elem.mac_cap_info), + he_cap->he_cap_elem.mac_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY, + sizeof(he_cap->he_cap_elem.phy_cap_info), + he_cap->he_cap_elem.phy_cap_info) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET, + sizeof(he_cap->he_mcs_nss_supp), + &he_cap->he_mcs_nss_supp) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, + sizeof(he_cap->ppe_thres), he_cap->ppe_thres)) + return -ENOBUFS; + } + + return 0; +} + static int nl80211_send_band_rateinfo(struct sk_buff *msg, struct ieee80211_supported_band *sband) { @@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, sband->vht_cap.cap))) return -ENOBUFS; + if (sband->n_iftype_data) { + struct nlattr *nl_iftype_data = + nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA); + int err; + + if (!nl_iftype_data) + return -ENOBUFS; + + for (i = 0; i < sband->n_iftype_data; i++) { + struct nlattr *iftdata; + + iftdata = nla_nest_start(msg, i + 1); + if (!iftdata) + return -ENOBUFS; + + err = nl80211_send_iftype_data(msg, + &sband->iftype_data[i]); + if (err) + return err; + + nla_nest_end(msg, iftdata); + } + + nla_nest_end(msg, nl_iftype_data); + } + /* add bitrates */ nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES); if (!nl_rates) @@ -2757,7 +2813,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) || nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->devlist_generation ^ - (cfg80211_rdev_list_generation << 2))) + (cfg80211_rdev_list_generation << 2)) || + nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) goto nla_put_failure; if (rdev->ops->get_channel) { @@ -4471,6 +4528,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, case RATE_INFO_BW_160: rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH; break; + case RATE_INFO_BW_HE_RU: + rate_flg = 0; + WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS)); } if (rate_flg && nla_put_flag(msg, rate_flg)) @@ -4490,6 +4550,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, if (info->flags & RATE_INFO_FLAGS_SHORT_GI && nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI)) return false; + } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm)) + return false; + if (info->bw == RATE_INFO_BW_HE_RU && + nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC, + info->he_ru_alloc)) + return false; } nla_nest_end(msg, rate); @@ -4546,13 +4619,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, #define PUT_SINFO(attr, memb, type) do { \ BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \ - if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \ sinfo->memb)) \ goto nla_put_failure; \ } while (0) #define PUT_SINFO_U64(attr, memb) do { \ - if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \ sinfo->memb, NL80211_STA_INFO_PAD)) \ goto nla_put_failure; \ @@ -4561,14 +4634,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(CONNECTED_TIME, connected_time, u32); PUT_SINFO(INACTIVE_TIME, inactive_time, u32); - if (sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES) | - BIT(NL80211_STA_INFO_RX_BYTES64)) && + if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES, (u32)sinfo->rx_bytes)) goto nla_put_failure; - if (sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES) | - BIT(NL80211_STA_INFO_TX_BYTES64)) && + if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) && nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES, (u32)sinfo->tx_bytes)) goto nla_put_failure; @@ -4588,24 +4661,24 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, default: break; } - if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) { if (!nl80211_put_signal(msg, sinfo->chains, sinfo->chain_signal, NL80211_STA_INFO_CHAIN_SIGNAL)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { if (!nl80211_put_signal(msg, sinfo->chains, sinfo->chain_signal_avg, NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) { if (!nl80211_put_sta_rate(msg, &sinfo->txrate, NL80211_STA_INFO_TX_BITRATE)) goto nla_put_failure; } - if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) { if (!nl80211_put_sta_rate(msg, &sinfo->rxrate, NL80211_STA_INFO_RX_BITRATE)) goto nla_put_failure; @@ -4621,7 +4694,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); - if (sinfo->filled & BIT(NL80211_STA_INFO_BSS_PARAM)) { + if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM); if (!bss_param) goto nla_put_failure; @@ -4640,7 +4713,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, nla_nest_end(msg, bss_param); } - if ((sinfo->filled & BIT(NL80211_STA_INFO_STA_FLAGS)) && + if ((sinfo->filled & BIT_ULL(NL80211_STA_INFO_STA_FLAGS)) && nla_put(msg, NL80211_STA_INFO_STA_FLAGS, sizeof(struct nl80211_sta_flag_update), &sinfo->sta_flags)) @@ -4886,7 +4959,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; if (params->supported_rates) return -EINVAL; - if (params->ext_capab || params->ht_capa || params->vht_capa) + if (params->ext_capab || params->ht_capa || params->vht_capa || + params->he_capa) return -EINVAL; } @@ -5092,6 +5166,15 @@ static int nl80211_set_station_tdls(struct genl_info *info, if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) params->vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params->he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params->he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } err = nl80211_parse_sta_channel_info(info, params); if (err) @@ -5319,6 +5402,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]); + if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) { + params.he_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + params.he_capa_len = + nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + + /* max len is validated in nla policy */ + if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN) + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -5351,6 +5445,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { params.ht_capa = NULL; params.vht_capa = NULL; + + /* HE requires WME */ + if (params.he_capa_len) + return -EINVAL; } /* When you run into this, adjust the code below for the new flag */ @@ -6848,6 +6946,16 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) return regulatory_pre_cac_allowed(wdev->wiphy); } +static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag, + enum nl80211_ext_feature_index feat) +{ + if (!(flags & flag)) + return true; + if (wiphy_ext_feature_isset(wiphy, feat)) + return true; + return false; +} + static int nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, void *request, struct nlattr **attrs, @@ -6882,15 +6990,33 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_LOW_POWER) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_LOW_POWER_SCAN)) || - ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN))) + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_LOW_SPAN, + NL80211_EXT_FEATURE_LOW_SPAN_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_LOW_POWER, + NL80211_EXT_FEATURE_LOW_POWER_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_HIGH_ACCURACY, + NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_RANDOM_SN, + NL80211_EXT_FEATURE_SCAN_RANDOM_SN) || + !nl80211_check_scan_feat(wiphy, *flags, + NL80211_SCAN_FLAG_MIN_PREQ_CONTENT, + NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT)) return -EOPNOTSUPP; if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { @@ -6905,26 +7031,6 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, return err; } - if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION)) - return -EOPNOTSUPP; - - if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) && - !wiphy_ext_feature_isset(wiphy, - NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) - return -EOPNOTSUPP; - return 0; } @@ -10147,7 +10253,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (err) return err; - if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) wdev->cqm_config->last_rssi_event_value = (s8) sinfo.rx_beacon_signal_avg; } diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 570a2b67ca10..6ab32f6a1961 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -102,7 +102,7 @@ static int wiphy_suspend(struct device *dev) struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; - rdev->suspend_at = get_seconds(); + rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); if (rdev->wiphy.registered) { @@ -130,7 +130,7 @@ static int wiphy_resume(struct device *dev) int ret = 0; /* Age scan results with time spent in suspend */ - cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at); + cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); if (rdev->wiphy.registered && rdev->ops->resume) diff --git a/net/wireless/util.c b/net/wireless/util.c index 3c654cd7ba56..e0825a019e9f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -4,6 +4,7 @@ * * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH */ #include <linux/export.h> #include <linux/bitops.h> @@ -1142,6 +1143,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) return 0; } +static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) +{ +#define SCALE 2048 + u16 mcs_divisors[12] = { + 34133, /* 16.666666... */ + 17067, /* 8.333333... */ + 11378, /* 5.555555... */ + 8533, /* 4.166666... */ + 5689, /* 2.777777... */ + 4267, /* 2.083333... */ + 3923, /* 1.851851... */ + 3413, /* 1.666666... */ + 2844, /* 1.388888... */ + 2560, /* 1.250000... */ + 2276, /* 1.111111... */ + 2048, /* 1.000000... */ + }; + u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; + u32 rates_969[3] = { 480388888, 453700000, 408333333 }; + u32 rates_484[3] = { 229411111, 216666666, 195000000 }; + u32 rates_242[3] = { 114711111, 108333333, 97500000 }; + u32 rates_106[3] = { 40000000, 37777777, 34000000 }; + u32 rates_52[3] = { 18820000, 17777777, 16000000 }; + u32 rates_26[3] = { 9411111, 8888888, 8000000 }; + u64 tmp; + u32 result; + + if (WARN_ON_ONCE(rate->mcs > 11)) + return 0; + + if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) + return 0; + if (WARN_ON_ONCE(rate->he_ru_alloc > + NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) + return 0; + if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) + return 0; + + if (rate->bw == RATE_INFO_BW_160) + result = rates_160M[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_80 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) + result = rates_969[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_40 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) + result = rates_484[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_20 || + (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) + result = rates_242[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) + result = rates_106[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) + result = rates_52[rate->he_gi]; + else if (rate->bw == RATE_INFO_BW_HE_RU && + rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) + result = rates_26[rate->he_gi]; + else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", + rate->bw, rate->he_ru_alloc)) + return 0; + + /* now scale to the appropriate MCS */ + tmp = result; + tmp *= SCALE; + do_div(tmp, mcs_divisors[rate->mcs]); + result = tmp; + + /* and take NSS, DCM into account */ + result = (result * rate->nss) / 8; + if (rate->he_dcm) + result /= 2; + + return result; +} + u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) @@ -1150,6 +1230,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_60g(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); + if (rate->flags & RATE_INFO_FLAGS_HE_MCS) + return cfg80211_calculate_bitrate_he(rate); return rate->legacy; } @@ -1791,8 +1873,9 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { - sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)), - IEEE80211_NUM_TIDS + 1, gfp); + sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, + sizeof(*(sinfo->pertid)), + gfp); if (!sinfo->pertid) return -ENOMEM; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 05186a47878f..167f7025ac98 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1278,7 +1278,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))) + if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) return -EOPNOTSUPP; rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); @@ -1320,7 +1320,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { int sig = sinfo.signal; wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; @@ -1334,7 +1334,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) break; } case CFG80211_SIGNAL_TYPE_UNSPEC: - if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) { + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.level = sinfo.signal; @@ -1347,9 +1347,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) } wstats.qual.updated |= IW_QUAL_NOISE_INVALID; - if (sinfo.filled & BIT(NL80211_STA_INFO_RX_DROP_MISC)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC)) wstats.discard.misc = sinfo.rx_dropped_misc; - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_FAILED)) + if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; return &wstats; |