From 9403cf2302588022d06f1878b072d3f6933021f0 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 19 Mar 2019 16:05:44 +0100 Subject: tcp: free request sock directly upon TFO or syncookies error Since the request socket is created locally, it'd make more sense to use reqsk_free() instead of reqsk_put() in TFO and syncookies' error path. However, tcp_get_cookie_sock() may set ->rsk_refcnt before freeing the socket; tcp_conn_request() may also have non-null ->rsk_refcnt because of tcp_try_fastopen(). In both cases 'req' hasn't been exposed to the outside world and is safe to free immediately, but that'd trigger the WARN_ON_ONCE in reqsk_free(). Define __reqsk_free() for these situations where we know nobody's referencing the socket, even though ->rsk_refcnt might be non-null. Now we can consolidate the error path of tcp_get_cookie_sock() and tcp_conn_request(). Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 21a5243fecd1..9dfd7960d90a 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -106,10 +106,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, return req; } -static inline void reqsk_free(struct request_sock *req) +static inline void __reqsk_free(struct request_sock *req) { - WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); - req->rsk_ops->destructor(req); if (req->rsk_listener) sock_put(req->rsk_listener); @@ -117,6 +115,12 @@ static inline void reqsk_free(struct request_sock *req) kmem_cache_free(req->rsk_ops->slab, req); } +static inline void reqsk_free(struct request_sock *req) +{ + WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); + __reqsk_free(req); +} + static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) -- cgit v1.2.3 From 03f1eccc7a69c965351e6bee41c62afa2844752f Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Tue, 19 Mar 2019 12:37:12 -0400 Subject: ipv6: Add icmp_echo_ignore_multicast support for ICMPv6 IPv4 has icmp_echo_ignore_broadcast to prevent responding to broadcast pings. IPv6 needs a similar mechanism. v1->v2: - Remove NET_IPV6_ICMP_ECHO_IGNORE_MULTICAST. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index b028a1dc150d..e29aff15acc9 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -33,6 +33,7 @@ struct netns_sysctl_ipv6 { int auto_flowlabels; int icmpv6_time; int icmpv6_echo_ignore_all; + int icmpv6_echo_ignore_multicast; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; -- cgit v1.2.3 From f295b3ae9f5927e084bd5decdff82390e3471801 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 20 Mar 2019 02:03:36 +0000 Subject: net/tls: Add support of AES128-CCM based ciphers Added support for AES128-CCM based record encryption. AES128-CCM is similar to AES128-GCM. Both of them have same salt/iv/mac size. The notable difference between the two is that while invoking AES128-CCM operation, the salt||nonce (which is passed as IV) has to be prefixed with a hardcoded value '2'. Further, CCM implementation in kernel requires IV passed in crypto_aead_request() to be full '16' bytes. Therefore, the record structure 'struct tls_rec' has been modified to reserve '16' bytes for IV. This works for both GCM and CCM based cipher. Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index a5a938583295..3ce71d78414c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -60,6 +60,17 @@ #define TLS_AAD_SPACE_SIZE 13 #define TLS_DEVICE_NAME_MAX 32 +#define MAX_IV_SIZE 16 + +/* For AES-CCM, the full 16-bytes of IV is made of '4' fields of given sizes. + * + * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3] + * + * The field 'length' is encoded in field 'b0' as '(length width - 1)'. + * Hence b0 contains (3 - 1) = 2. + */ +#define TLS_AES_CCM_IV_B0_BYTE 2 + /* * This structure defines the routines for Inline TLS driver. * The following routines are optional and filled with a @@ -123,8 +134,7 @@ struct tls_rec { struct scatterlist sg_content_type; char aad_space[TLS_AAD_SPACE_SIZE]; - u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE + - TLS_CIPHER_AES_GCM_128_SALT_SIZE]; + u8 iv_data[MAX_IV_SIZE]; struct aead_request aead_req; u8 aead_req_ctx[]; }; @@ -219,6 +229,7 @@ struct tls_prot_info { u16 tag_size; u16 overhead_size; u16 iv_size; + u16 salt_size; u16 rec_seq_size; u16 aad_size; u16 tail_size; -- cgit v1.2.3 From 0b03a5ca8b14321366eec4a903922d2b46d585ff Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 20 Mar 2019 10:29:27 -0400 Subject: ipv6: Add icmp_echo_ignore_anycast for ICMPv6 In addition to icmp_echo_ignore_multicast, there is a need to also prevent responding to pings to anycast addresses for security. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e29aff15acc9..64e29b58bb5e 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -34,6 +34,7 @@ struct netns_sysctl_ipv6 { int icmpv6_time; int icmpv6_echo_ignore_all; int icmpv6_echo_ignore_multicast; + int icmpv6_echo_ignore_anycast; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; -- cgit v1.2.3 From c7a1ce397adacaf5d4bb2eab0a738b5f80dc3e43 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 21 Mar 2019 05:21:35 -0700 Subject: ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create Change addrconf_f6i_alloc to generate a fib6_config and call ip6_route_info_create. addrconf_f6i_alloc is the last caller to fib6_info_alloc besides ip6_route_info_create, and there is no reason for it to do its own initialization on a fib6_info. Host routes need to be created even if the device is down, so add a new flag, fc_ignore_dev_down, to fib6_config and update fib6_nh_init to not error out if device is not up. Notes on the conversion: - ip_fib_metrics_init is the same as fib6_config has fc_mx set to NULL and fc_mx_len set to 0 - dst_nocount is handled by the RTF_ADDRCONF flag - dst_host is handled by fc_dst_len = 128 nh_gw does not get set after the conversion to ip6_route_info_create but it should not be set in addrconf_f6i_alloc since this is a host route not a gateway route. Everything else is a straight forward map between fib6_info and fib6_config. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 84097010237c..2acb78a762ee 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -50,7 +50,8 @@ struct fib6_config { u32 fc_protocol; u16 fc_type; /* only 8 bits are used */ u16 fc_delete_all_nh : 1, - __unused : 15; + fc_ignore_dev_down:1, + __unused : 14; struct in6_addr fc_dst; struct in6_addr fc_src; -- cgit v1.2.3 From 9ab948a91b2c2abc8e82845c0e61f4b1683e3a4f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 20 Mar 2019 09:18:59 -0700 Subject: ipv4: Allow amount of dirty memory from fib resizing to be controllable fib_trie implementation calls synchronize_rcu when a certain amount of pages are dirty from freed entries. The number of pages was determined experimentally in 2009 (commit c3059477fce2d). At the current setting, synchronize_rcu is called often -- 51 times in a second in one test with an average of an 8 msec delay adding a fib entry. The total impact is a lot of slow down modifying the fib. This is seen in the output of 'time' - the difference between real time and sys+user. For example, using 720,022 single path routes and 'ip -batch'[1]: $ time ./ip -batch ipv4/routes-1-hops real 0m14.214s user 0m2.513s sys 0m6.783s So roughly 35% of the actual time to install the routes is from the ip command getting scheduled out, most notably due to synchronize_rcu (this is observed using 'perf sched timehist'). This patch makes the amount of dirty memory configurable between 64k where the synchronize_rcu is called often (small, low end systems that are memory sensitive) to 64M where synchronize_rcu is called rarely during a large FIB change (for high end systems with lots of memory). The default is 512kB which corresponds to the current setting of 128 pages with a 4kB page size. As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu in a second blocking for up to 30 msec in a single instance, and a total of almost 100 msec across the 4 calls in the second. The trade off is allowing FIB entries to consume more memory in a given time window but but with much better fib insertion rates (~30% increase in prefixes/sec). With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch file runs in: $ time ./ip -batch ipv4/routes-1-hops real 0m9.692s user 0m2.491s sys 0m6.769s So the dead time is reduced to about 1/2 second or <5% of the real time. [1] 'ip' modified to not request ACK messages which improves route insertion times by about 20% Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index be3cad9c2e4c..aa09ae5f01a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -38,6 +38,10 @@ #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ +extern unsigned int sysctl_fib_sync_mem; +extern unsigned int sysctl_fib_sync_mem_min; +extern unsigned int sysctl_fib_sync_mem_max; + struct sock; struct inet_skb_parm { -- cgit v1.2.3 From 02afc7ad45bd6cfc9fd51fdbc132455371b63469 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 20 Mar 2019 20:02:56 +0100 Subject: net: dst: remove gc leftovers Get rid of some obsolete gc-related documentation and macros that were missed in commit 5b7c9a8ff828 ("net: remove dst gc related code"). CC: Wei Wang Signed-off-by: Julian Wiedmann Acked-by: Wei Wang Signed-off-by: David S. Miller --- include/net/dst.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 6cf0870414c7..12b31c602cb0 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -19,17 +19,6 @@ #include #include -#define DST_GC_MIN (HZ/10) -#define DST_GC_INC (HZ/2) -#define DST_GC_MAX (120*HZ) - -/* Each dst_entry has reference count and sits in some parent list(s). - * When it is removed from parent list, it is "freed" (dst_free). - * After this it enters dead state (dst->obsolete > 0) and if its refcnt - * is zero, it can be destroyed immediately, otherwise it is added - * to gc list and garbage collector periodically checks the refcnt. - */ - struct sk_buff; struct dst_entry { -- cgit v1.2.3 From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Mar 2019 22:51:02 +0100 Subject: genetlink: make policy common to family Since maxattr is common, the policy can't really differ sanely, so make it common as well. The only user that did in fact manage to make a non-common policy is taskstats, which has to be really careful about it (since it's still using a common maxattr!). This is no longer supported, but we can fake it using pre_doit. This reduces the size of e.g. nl80211.o (which has lots of commands): text data bss dec hex filename 398745 14323 2240 415308 6564c net/wireless/nl80211.o (before) 397913 14331 2240 414484 65314 net/wireless/nl80211.o (after) -------------------------------- -832 +8 0 -824 Which is obviously just 8 bytes for each command, and an added 8 bytes for the new policy pointer. I'm not sure why the ops list is counted as .text though. Most of the code transformations were done using the following spatch: @ops@ identifier OPS; expression POLICY; @@ struct genl_ops OPS[] = { ..., { - .policy = POLICY, }, ... }; @@ identifier ops.OPS; expression ops.POLICY; identifier fam; expression M; @@ struct genl_family fam = { .ops = OPS, .maxattr = M, + .policy = POLICY, ... }; This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing the cb->data as ops, which we want to change in a later genl patch. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index aa2e5888f18d..6850c7b1a3a6 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -26,6 +26,7 @@ struct genl_info; * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported + * @policy: netlink policy * @netnsok: set to true if the family can handle network * namespaces and should be presented in all of them * @parallel_ops: operations can be called in parallel and aren't @@ -56,6 +57,7 @@ struct genl_family { unsigned int maxattr; bool netnsok; bool parallel_ops; + const struct nla_policy *policy; int (*pre_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -124,14 +126,12 @@ static inline int genl_err_attr(struct genl_info *info, int err, * @cmd: command identifier * @internal_flags: flags used by the family * @flags: flags - * @policy: attribute validation policy * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps */ struct genl_ops { - const struct nla_policy *policy; int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*start)(struct netlink_callback *cb); -- cgit v1.2.3 From 974eff2b5793eeaa2eb433bca7eba9640d890c4a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 21 Mar 2019 15:51:36 -0700 Subject: net: Move the definition of the default Geneve udp port to public header file Move the definition of the default Geneve udp port from the geneve source to the header file, so we can re-use it from drivers. Modify existing drivers to use it. Signed-off-by: Moshe Shemesh Reviewed-by: Or Gerlitz Cc: John Hurley Cc: Jakub Kicinski Reviewed-by: Tariq Toukan Acked-by: Jakub Kicinski Signed-off-by: Saeed Mahameed --- include/net/geneve.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/geneve.h b/include/net/geneve.h index fc6a7e0a874a..bced0b1d9fe4 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -4,6 +4,8 @@ #include +#define GENEVE_UDP_PORT 6081 + /* Geneve Header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |Ver| Opt Len |O|C| Rsvd. | Protocol Type | -- cgit v1.2.3 From bea964107fa78ffe484ef8659ecc26f9ae2bcd2f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 21 Mar 2019 15:51:39 -0700 Subject: net: Add IANA_VXLAN_UDP_PORT definition to vxlan header file Added IANA_VXLAN_UDP_PORT (4789) definition to vxlan header file so it can be used by drivers instead of local definition. Updated drivers which locally defined it as 4789 to use it. Signed-off-by: Moshe Shemesh Reviewed-by: Or Gerlitz Cc: John Hurley Cc: Jakub Kicinski Cc: Yunsheng Lin Cc: Peng Li Reviewed-by: Tariq Toukan Acked-by: Jakub Kicinski Signed-off-by: Saeed Mahameed --- include/net/vxlan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 00254a58824b..83b5999a2587 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -8,6 +8,8 @@ #include #include +#define IANA_VXLAN_UDP_PORT 4789 + /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R| Reserved | -- cgit v1.2.3 From 28cff537ef2eed9307bc7e4e40745075637bec56 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 22 Mar 2019 16:01:55 +0100 Subject: net: sched: add empty status flag for NOLOCK qdisc The queue is marked not empty after acquiring the seqlock, and it's up to the NOLOCK qdisc clearing such flag on dequeue. Since the empty status lays on the same cache-line of the seqlock, it's always hot on cache during the updates. This makes the empty flag update a little bit loosy. Given the lack of synchronization between enqueue and dequeue, this is unavoidable. v2 -> v3: - qdisc_is_empty() has a const argument (Eric) v1 -> v2: - use really an 'empty' flag instead of 'not_empty', as suggested by Eric Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller --- include/net/sch_generic.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 31284c078d06..e227475e78ca 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -113,6 +113,9 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; + + /* for NOLOCK qdisc, true if there are no enqueued skbs */ + bool empty; struct rcu_head rcu; }; @@ -143,11 +146,19 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } +static inline bool qdisc_is_empty(const struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_NOLOCK) + return qdisc->empty; + return !qdisc->q.qlen; +} + static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { if (!spin_trylock(&qdisc->seqlock)) return false; + qdisc->empty = false; } else if (qdisc_is_running(qdisc)) { return false; } -- cgit v1.2.3 From dc05360fee660a9dbe59824b3f7896534210432b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:38 -0700 Subject: net: convert rps_needed and rfs_needed to new static branch api We prefer static_branch_unlikely() over static_key_false() these days. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 8de5ee258b93..fecdf639225c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -966,7 +966,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash) static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - if (static_key_false(&rfs_needed)) { + if (static_branch_unlikely(&rfs_needed)) { /* Reading sk->sk_rxhash might incur an expensive cache line * miss. * -- cgit v1.2.3 From 472c2e07eef045145bc1493cc94a01c87140780a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:39 -0700 Subject: tcp: add one skb cache for tx On hosts with a lot of cores, RPC workloads suffer from heavy contention on slab spinlocks. 20.69% [kernel] [k] queued_spin_lock_slowpath 5.64% [kernel] [k] _raw_spin_lock 3.83% [kernel] [k] syscall_return_via_sysret 3.48% [kernel] [k] __entry_text_start 1.76% [kernel] [k] __netif_receive_skb_core 1.64% [kernel] [k] __fget For each sendmsg(), we allocate one skb, and free it at the time ACK packet comes. In many cases, ACK packets are handled by another cpus, and this unfortunately incurs heavy costs for slab layer. This patch uses an extra pointer in socket structure, so that we try to reuse the same skb and avoid these expensive costs. We cache at most one skb per socket so this should be safe as far as memory pressure is concerned. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index fecdf639225c..314c47a8f5d1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -414,6 +414,7 @@ struct sock { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; + struct sk_buff *sk_tx_skb_cache; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; @@ -1463,6 +1464,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { + if (!sk->sk_tx_skb_cache) { + sk->sk_tx_skb_cache = skb; + return; + } sock_set_flag(sk, SOCK_QUEUE_SHRUNK); sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); -- cgit v1.2.3 From 8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:40 -0700 Subject: tcp: add one skb cache for rx Often times, recvmsg() system calls and BH handling for a particular TCP socket are done on different cpus. This means the incoming skb had to be allocated on a cpu, but freed on another. This incurs a high spinlock contention in slab layer for small rpc, but also a high number of cache line ping pongs for larger packets. A full size GRO packet might use 45 page fragments, meaning that up to 45 put_page() can be involved. More over performing the __kfree_skb() in the recvmsg() context adds a latency for user applications, and increase probability of trapping them in backlog processing, since the BH handler might found the socket owned by the user. This patch, combined with the prior one increases the rpc performance by about 10 % on servers with large number of cores. (tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps instead of 8 Mpps) This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two cpus working in tandem : - CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed. - CPU in recvmsg() system call, essentially 100 % busy copying out data to user space. Having at most one skb in a per-socket cache has very little risk of memory exhaustion, and since it is protected by socket lock, its management is essentially free. Note that if rps/rfs is used, we do not enable this feature, because there is high chance that the same cpu is handling both the recvmsg() system call and the TCP rx path, but that another cpu did the skb allocations in the device driver right before the RPS/RFS logic. To properly handle this case, it seems we would need to record on which cpu skb was allocated, and use a different channel to give skbs back to this cpu. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 314c47a8f5d1..577d91fb5626 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -368,6 +368,7 @@ struct sock { atomic_t sk_drops; int sk_rcvlowat; struct sk_buff_head sk_error_queue; + struct sk_buff *sk_rx_skb_cache; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -2438,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); + if ( +#ifdef CONFIG_RPS + !static_branch_unlikely(&rps_needed) && +#endif + !sk->sk_rx_skb_cache) { + sk->sk_rx_skb_cache = skb; + skb_orphan(skb); + return; + } __kfree_skb(skb); } -- cgit v1.2.3 From b8f975545cdbcc316cf20e827e7966d4410b5c5a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 24 Mar 2019 11:14:37 +0100 Subject: net: devlink: add port type spinlock Add spinlock to protect port type and type_dev pointer consistency. Without that, userspace may see inconsistent type and type_dev combinations. Signed-off-by: Jiri Pirko v1->v2: - rebased Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 63de99e09f04..cb9b060033e1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,9 @@ struct devlink_port { struct devlink *devlink; unsigned index; bool registered; + spinlock_t type_lock; /* Protects type and type_dev + * pointer consistency. + */ enum devlink_port_type type; enum devlink_port_type desired_type; void *type_dev; -- cgit v1.2.3 From f6b19b354d50c5ae46ad66b5273f92e563fbc847 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 24 Mar 2019 11:14:38 +0100 Subject: net: devlink: select NET_DEVLINK from drivers Some drivers are becoming more dependent on NET_DEVLINK being selected in configuration. With upcoming compat functions, the behavior would be wrong in case devlink was not compiled in. So make the drivers select NET_DEVLINK and rely on the functions being there, not just stubs. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 495 +------------------------------------------------- 1 file changed, 3 insertions(+), 492 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index cb9b060033e1..03fb16f4fb6c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -549,17 +549,13 @@ static inline struct devlink *priv_to_devlink(void *priv) static inline struct devlink *netdev_to_devlink(struct net_device *dev) { -#if IS_ENABLED(CONFIG_NET_DEVLINK) if (dev->netdev_ops->ndo_get_devlink) return dev->netdev_ops->ndo_get_devlink(dev); -#endif return NULL; } struct ib_device; -#if IS_ENABLED(CONFIG_NET_DEVLINK) - struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); @@ -728,500 +724,14 @@ void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +#if IS_ENABLED(CONFIG_NET_DEVLINK) + void devlink_compat_running_version(struct net_device *dev, char *buf, size_t len); int devlink_compat_flash_update(struct net_device *dev, const char *file_name); #else -static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, - size_t priv_size) -{ - return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL); -} - -static inline int devlink_register(struct devlink *devlink, struct device *dev) -{ - return 0; -} - -static inline void devlink_unregister(struct devlink *devlink) -{ -} - -static inline void devlink_params_publish(struct devlink *devlink) -{ -} - -static inline void devlink_params_unpublish(struct devlink *devlink) -{ -} - -static inline void devlink_free(struct devlink *devlink) -{ - kfree(devlink); -} - -static inline int devlink_port_register(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index) -{ - return 0; -} - -static inline void devlink_port_unregister(struct devlink_port *devlink_port) -{ -} - -static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev) -{ -} - -static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port, - struct ib_device *ibdev) -{ -} - -static inline void devlink_port_type_clear(struct devlink_port *devlink_port) -{ -} - -static inline void devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour, - u32 port_number, bool split, - u32 split_subport_number) -{ -} - -static inline int -devlink_port_get_phys_port_name(struct devlink_port *devlink_port, - char *name, size_t len) -{ - return -EOPNOTSUPP; -} - -static inline int devlink_sb_register(struct devlink *devlink, - unsigned int sb_index, u32 size, - u16 ingress_pools_count, - u16 egress_pools_count, - u16 ingress_tc_count, - u16 egress_tc_count) -{ - return 0; -} - -static inline void devlink_sb_unregister(struct devlink *devlink, - unsigned int sb_index) -{ -} - -static inline int -devlink_dpipe_table_register(struct devlink *devlink, - const char *table_name, - struct devlink_dpipe_table_ops *table_ops, - void *priv, bool counter_control_extern) -{ - return 0; -} - -static inline void devlink_dpipe_table_unregister(struct devlink *devlink, - const char *table_name) -{ -} - -static inline int devlink_dpipe_headers_register(struct devlink *devlink, - struct devlink_dpipe_headers * - dpipe_headers) -{ - return 0; -} - -static inline void devlink_dpipe_headers_unregister(struct devlink *devlink) -{ -} - -static inline bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, - const char *table_name) -{ - return false; -} - -static inline int -devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - return 0; -} - -static inline int -devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, - struct devlink_dpipe_entry *entry) -{ - return 0; -} - -static inline int -devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - return 0; -} - -static inline void -devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) -{ -} - -static inline int -devlink_dpipe_action_put(struct sk_buff *skb, - struct devlink_dpipe_action *action) -{ - return 0; -} - -static inline int -devlink_dpipe_match_put(struct sk_buff *skb, - struct devlink_dpipe_match *match) -{ - return 0; -} - -static inline int -devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - return 0; -} - -static inline void -devlink_resources_unregister(struct devlink *devlink, - struct devlink_resource *resource) -{ -} - -static inline int -devlink_resource_size_get(struct devlink *devlink, u64 resource_id, - u64 *p_resource_size) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_dpipe_table_resource_set(struct devlink *devlink, - const char *table_name, u64 resource_id, - u64 resource_units) -{ - return -EOPNOTSUPP; -} - -static inline void -devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ -} - -static inline void -devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ -} - -static inline int -devlink_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - return 0; -} - -static inline void -devlink_params_unregister(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - -} - -static inline int -devlink_port_params_register(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ - return 0; -} - -static inline void -devlink_port_params_unregister(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ -} - -static inline int -devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value *init_val) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value init_val) -{ - return -EOPNOTSUPP; -} - -static inline void -devlink_param_value_changed(struct devlink *devlink, u32 param_id) -{ -} - -static inline void -devlink_port_param_value_changed(struct devlink_port *devlink_port, - u32 param_id) -{ -} - -static inline void -devlink_param_value_str_fill(union devlink_param_value *dst_val, - const char *src) -{ -} - -static inline struct devlink_region * -devlink_region_create(struct devlink *devlink, - const char *region_name, - u32 region_max_snapshots, - u64 region_size) -{ - return NULL; -} - -static inline void -devlink_region_destroy(struct devlink_region *region) -{ -} - -static inline u32 -devlink_region_shapshot_id_get(struct devlink *devlink) -{ - return 0; -} - -static inline int -devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, - u8 *data, u32 snapshot_id, - devlink_snapshot_data_dest_t *data_destructor) -{ - return 0; -} - -static inline int -devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) -{ - return 0; -} - -static inline int -devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) -{ - return 0; -} - -static inline int -devlink_info_version_fixed_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return 0; -} - -static inline int -devlink_info_version_stored_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return 0; -} - -static inline int -devlink_info_version_running_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return 0; -} - -static inline int -devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) -{ - return 0; -} - -static inline int -devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name) -{ - return 0; -} - -static inline int -devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) -{ - return 0; -} - -static inline int -devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) -{ - return 0; -} - -static inline int -devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) -{ - return 0; -} - -static inline int -devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) -{ - return 0; -} - -static inline int -devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, - bool value) -{ - return 0; -} - -static inline int -devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, - u8 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, - u32 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, - u64 value) -{ - return 0; -} - -static inline int -devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, - const char *value) -{ - return 0; -} - -static inline int -devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len) -{ - return 0; -} - -static inline struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) -{ - return NULL; -} - -static inline void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ -} - -static inline void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return NULL; -} - -static inline int -devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - return 0; -} - -static inline void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) -{ -} - static inline void devlink_compat_running_version(struct net_device *dev, char *buf, size_t len) { @@ -1232,6 +742,7 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name) { return -EOPNOTSUPP; } + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From 4f661542a40217713f2cee0bb6678fbb30d9d367 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Mar 2019 08:34:55 -0700 Subject: tcp: fix zerocopy and notsent_lowat issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My recent patch had at least three problems : 1) TX zerocopy wants notification when skb is acknowledged, thus we need to call skb_zcopy_clear() if the skb is cached into sk->sk_tx_skb_cache 2) Some applications might expect precise EPOLLOUT notifications, so we need to update sk->sk_wmem_queued and call sk_mem_uncharge() from sk_wmem_free_skb() in all cases. The SOCK_QUEUE_SHRUNK flag must also be set. 3) Reuse of saved skb should have used skb_cloned() instead of simply checking if the fast clone has been freed. Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Tested-by: Holger Hoffstätte Signed-off-by: David S. Miller --- include/net/sock.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 577d91fb5626..7fa223278522 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1465,13 +1465,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + sk->sk_wmem_queued -= skb->truesize; + sk_mem_uncharge(sk, skb->truesize); if (!sk->sk_tx_skb_cache) { + skb_zcopy_clear(skb, true); sk->sk_tx_skb_cache = skb; return; } - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - sk->sk_wmem_queued -= skb->truesize; - sk_mem_uncharge(sk, skb->truesize); __kfree_skb(skb); } -- cgit v1.2.3 From df453700e8d81b1bdafdf684365ee2b9431fb702 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Mar 2019 12:40:33 -0700 Subject: inet: switch IP ID generator to siphash According to Amit Klein and Benny Pinkas, IP ID generation is too weak and might be used by attackers. Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix()) having 64bit key and Jenkins hash is risky. It is time to switch to siphash and its 128bit keys. Signed-off-by: Eric Dumazet Reported-by: Amit Klein Reported-by: Benny Pinkas Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 104a6669e344..7698460a3dd1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -9,6 +9,7 @@ #include #include #include +#include struct tcpm_hash_bucket; struct ctl_table_header; @@ -217,5 +218,6 @@ struct netns_ipv4 { unsigned int ipmr_seq; /* protected by rtnl_mutex */ atomic_t rt_genid; + siphash_key_t ip_id_key; }; #endif -- cgit v1.2.3 From 5dc37bb9b03586e8fdeb47d25e8d2a0399984936 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Mar 2019 13:56:36 +0100 Subject: net: replace ndo_get_devlink with ndo_get_devlink_port Follow-up patch is going to need a devlink port instance according to a netdev. Devlink port instance should be always available when devlink is used. So change the recently introduced ndo_get_devlink to ndo_get_devlink_port. With that, adjust the wrapper for the only user to get devlink pointer. Signed-off-by: Jiri Pirko Reviewed-by: Michal Kubecek Reviewed-by: Florian Fainelli Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 03fb16f4fb6c..81b5ed04a341 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -547,10 +547,20 @@ static inline struct devlink *priv_to_devlink(void *priv) return container_of(priv, struct devlink, priv); } +static inline struct devlink_port * +netdev_to_devlink_port(struct net_device *dev) +{ + if (dev->netdev_ops->ndo_get_devlink_port) + return dev->netdev_ops->ndo_get_devlink_port(dev); + return NULL; +} + static inline struct devlink *netdev_to_devlink(struct net_device *dev) { - if (dev->netdev_ops->ndo_get_devlink) - return dev->netdev_ops->ndo_get_devlink(dev); + struct devlink_port *devlink_port = netdev_to_devlink_port(dev); + + if (devlink_port) + return devlink_port->devlink; return NULL; } -- cgit v1.2.3 From af3836df9a59e7339d60c9c46729a7d9094d0582 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Mar 2019 13:56:37 +0100 Subject: net: devlink: introduce devlink_compat_phys_port_name_get() Introduce devlink_compat_phys_port_name_get() helper that gets the physical port name for specified netdevice according to devlink port attributes. Call this helper from dev_get_phys_port_name() in case ndo_get_phys_port_name is not defined. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 81b5ed04a341..85e577d6ec3b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -739,6 +739,8 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, void devlink_compat_running_version(struct net_device *dev, char *buf, size_t len); int devlink_compat_flash_update(struct net_device *dev, const char *file_name); +int devlink_compat_phys_port_name_get(struct net_device *dev, + char *name, size_t len); #else @@ -753,6 +755,13 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name) return -EOPNOTSUPP; } +static inline int +devlink_compat_phys_port_name_get(struct net_device *dev, + char *name, size_t len) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From 14c03ac4c100e4b81ec4747f5ec861701ff52de2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Mar 2019 13:56:40 +0100 Subject: net: devlink: remove unused devlink_port_get_phys_port_name() function Now it is unused, remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 85e577d6ec3b..31d5cec4d06b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -583,8 +583,6 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour, u32 port_number, bool split, u32 split_subport_number); -int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, - char *name, size_t len); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From 717700d183d65bd2e6511566aa6d32395419d158 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Tue, 26 Mar 2019 11:31:13 -0700 Subject: netfilter: Export nf_ct_{set,destroy}_timeout() This patch exports nf_ct_set_timeout() and nf_ct_destroy_timeout(). The two functions are derived from xt_ct_destroy_timeout() and xt_ct_set_timeout() in xt_CT.c, and moved to nf_conntrack_timeout.c without any functional change. It would be useful for other users (i.e. OVS) that utilizes the finer-grain conntrack timeout feature. CC: Pablo Neira Ayuso CC: Pravin Shelar Signed-off-by: Yi-Hung Wei Signed-off-by: David S. Miller --- include/net/netfilter/nf_conntrack_timeout.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 3394d75e1c80..00a8fbb2d735 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -88,6 +88,9 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) int nf_conntrack_timeout_init(void); void nf_conntrack_timeout_fini(void); void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); +int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, + const char *timeout_name); +void nf_ct_destroy_timeout(struct nf_conn *ct); #else static inline int nf_conntrack_timeout_init(void) { @@ -98,6 +101,18 @@ static inline void nf_conntrack_timeout_fini(void) { return; } + +static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, + u8 l3num, u8 l4num, + const char *timeout_name) +{ + return -EOPNOTSUPP; +} + +static inline void nf_ct_destroy_timeout(struct nf_conn *ct) +{ + return; +} #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -- cgit v1.2.3 From e4516ef65490ef29d48a98ad4d7c90dccf39068f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:48 -0700 Subject: ipv4: Create init helper for fib_nh Consolidate the fib_nh initialization which is duplicated between fib_create_info for single path and fib_get_nhs for multipath. Export the helper to allow for use with nexthop objects in the future. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9c8214d2116d..1af1f552644a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -416,6 +416,10 @@ void fib_select_multipath(struct fib_result *res, int hash); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); +int fib_nh_init(struct net *net, struct fib_nh *fib_nh, + struct fib_config *cfg, int nh_weight, + struct netlink_ext_ack *extack); + /* Exported by fib_trie.c */ void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); -- cgit v1.2.3 From faa041a40b9fa64913789fcc0161c7c73161f57e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:49 -0700 Subject: ipv4: Create cleanup helper for fib_nh Move the fib_nh cleanup code from free_fib_info_rcu into a new helper, fib_nh_release. Move classid accounting into fib_nh_release which is called per fib_nh to make accounting symmetrical with fib_nh_init. Export the helper to allow for use with nexthop objects in the future. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1af1f552644a..5a4df0ba175e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -419,6 +419,7 @@ void fib_select_path(struct net *net, struct fib_result *res, int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); +void fib_nh_release(struct net *net, struct fib_nh *fib_nh); /* Exported by fib_trie.c */ void fib_trie_init(void); -- cgit v1.2.3 From 83c442515917812d4ff643e90cd456c630b7e762 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:50 -0700 Subject: ipv6: Create init helper for fib6_nh Similar to IPv4, consolidate the fib6_nh initialization into a helper. As a new standalone function, add a cleanup path to put lwtstate on error. To avoid modifying fib6_config flags, move the reject check to a helper that is invoked once by fib6_nh_init to reset the device and then again in ip6_route_info_create to set the fib6_flags. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2acb78a762ee..ce1f81345c8e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -444,6 +444,10 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) return f6i->fib6_nh.nh_dev; } +int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); + static inline struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) { -- cgit v1.2.3 From dac7d0f27075ce54017a7efdd6ae0a55352a0f80 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:51 -0700 Subject: ipv6: Create cleanup helper for fib6_nh Move the fib6_nh cleanup code to a new helper, fib6_nh_release. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ce1f81345c8e..2d2a468b3d6d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -447,6 +447,7 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); +void fib6_nh_release(struct fib6_nh *fib6_nh); static inline struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) -- cgit v1.2.3 From 2b2450ca4a2d9d772dc45e1220c04cb3ba761843 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:52 -0700 Subject: ipv6: Move gateway checks to a fib6_nh setting The gateway setting is not per fib6_info entry but per-fib6_nh. Add a new fib_nh_has_gw flag to fib6_nh and convert references to RTF_GATEWAY to the new flag. For IPv6 address the flag is cheaper than checking that nh_gw is non-0 like IPv4 does. While this increases fib6_nh by 8-bytes, the effective allocation size of a fib6_info is unchanged. The 8 bytes is recovered later with a fib_nh_common change. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/ip6_route.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2d2a468b3d6d..3b04b318cf13 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -126,6 +126,7 @@ struct rt6_exception { struct fib6_nh { struct in6_addr nh_gw; + bool fib_nh_has_gw; struct net_device *nh_dev; struct lwtunnel_state *nh_lwtstate; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 7ab119936e69..95cd8a2f6284 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,8 +68,8 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; + return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) && + f6i->fib6_nh.fib_nh_has_gw; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3 From 6d3d07b45c86f984424ccbad110ca500397fd18c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:53 -0700 Subject: ipv6: Refactor fib6_ignore_linkdown fib6_ignore_linkdown takes a fib6_info but only looks at the net_device and its IPv6 config. Change it to take a net_device over a fib6_info as its input argument. In addition, move it to a header file to make the check inline and usable later with IPv4 code without going through the ipv6 stub, and rename to ip6_ignore_linkdown since it is only checking the setting based on the ipv6 struct on a device. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/addrconf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 269ec27385e9..ec8e6784a6f7 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -425,6 +425,14 @@ static inline void in6_dev_hold(struct inet6_dev *idev) refcount_inc(&idev->refcnt); } +/* called with rcu_read_lock held */ +static inline bool ip6_ignore_linkdown(const struct net_device *dev) +{ + const struct inet6_dev *idev = __in6_dev_get(dev); + + return !!idev->cnf.ignore_routes_with_linkdown; +} + void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) -- cgit v1.2.3 From b75ed8b1aa9c3a99702159c3be8b0c1d54972ae5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:55 -0700 Subject: ipv4: Rename fib_nh entries Rename fib_nh entries that will be moved to a fib_nh_common struct. Specifically, the device, oif, gateway, flags, scope, lwtstate, nh_weight and nh_upper_bound are common with all nexthop definitions. In the process shorten fib_nh_lwtstate to fib_nh_lws to avoid really long lines. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 5a4df0ba175e..029acd333d29 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -77,26 +77,26 @@ struct fnhe_hash_bucket { #define FNHE_RECLAIM_DEPTH 5 struct fib_nh { - struct net_device *nh_dev; + struct net_device *fib_nh_dev; struct hlist_node nh_hash; struct fib_info *nh_parent; - unsigned int nh_flags; - unsigned char nh_scope; + unsigned int fib_nh_flags; + unsigned char fib_nh_scope; #ifdef CONFIG_IP_ROUTE_MULTIPATH - int nh_weight; - atomic_t nh_upper_bound; + int fib_nh_weight; + atomic_t fib_nh_upper_bound; #endif #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif - int nh_oif; - __be32 nh_gw; + int fib_nh_oif; + __be32 fib_nh_gw4; __be32 nh_saddr; int nh_saddr_genid; struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; - struct lwtunnel_state *nh_lwtstate; + struct lwtunnel_state *fib_nh_lws; }; /* @@ -125,7 +125,7 @@ struct fib_info { int fib_nhs; struct rcu_head rcu; struct fib_nh fib_nh[0]; -#define fib_dev fib_nh[0].nh_dev +#define fib_dev fib_nh[0].fib_nh_dev }; @@ -180,9 +180,9 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); atomic_read(&(net)->ipv4.dev_addr_genid)) ? \ FIB_RES_NH(res).nh_saddr : \ fib_info_update_nh_saddr((net), &FIB_RES_NH(res))) -#define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw) -#define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev) -#define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif) +#define FIB_RES_GW(res) (FIB_RES_NH(res).fib_nh_gw4) +#define FIB_RES_DEV(res) (FIB_RES_NH(res).fib_nh_dev) +#define FIB_RES_OIF(res) (FIB_RES_NH(res).fib_nh_oif) #define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \ FIB_RES_SADDR(net, res)) -- cgit v1.2.3 From ad1601ae0260551f85691ca1ac814773fdcec239 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:56 -0700 Subject: ipv6: Rename fib6_nh entries Rename fib6_nh entries that will be moved to a fib_nh_common struct. Specifically, the device, gateway, flags, and lwtstate are common with all nexthop definitions. In some places new temporary variables are declared or local variables renamed to maintain line lengths. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 16 ++++++++-------- include/net/ip6_route.h | 8 +++++--- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3b04b318cf13..aff8570725c8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -125,14 +125,14 @@ struct rt6_exception { #define FIB6_MAX_DEPTH 5 struct fib6_nh { - struct in6_addr nh_gw; + struct in6_addr fib_nh_gw6; bool fib_nh_has_gw; - struct net_device *nh_dev; - struct lwtunnel_state *nh_lwtstate; + struct net_device *fib_nh_dev; + struct lwtunnel_state *fib_nh_lws; - unsigned int nh_flags; - atomic_t nh_upper_bound; - int nh_weight; + unsigned int fib_nh_flags; + atomic_t fib_nh_upper_bound; + int fib_nh_weight; }; struct fib6_info { @@ -442,7 +442,7 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) { - return f6i->fib6_nh.nh_dev; + return f6i->fib6_nh.fib_nh_dev; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, @@ -453,7 +453,7 @@ void fib6_nh_release(struct fib6_nh *fib6_nh); static inline struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) { - return f6i->fib6_nh.nh_lwtstate; + return f6i->fib6_nh.fib_nh_lws; } void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 95cd8a2f6284..342180a7285c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -274,9 +274,11 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { - return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && - !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); + struct fib6_nh *nha = &a->fib6_nh, *nhb = &b->fib6_nh; + + return nha->fib_nh_dev == nhb->fib_nh_dev && + ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) && + !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws); } static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) -- cgit v1.2.3 From f1741730dd18828fe3ea5fa91c22f41cf001c625 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:57 -0700 Subject: net: Add fib_nh_common and update fib_nh and fib6_nh Add fib_nh_common struct with common nexthop attributes. Convert fib_nh and fib6_nh to use it. Use macros to move existing fib_nh_* references to the new nh_common.nhc_*. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 10 ++-------- include/net/ip_fib.h | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index aff8570725c8..58dbb4e82908 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -125,14 +126,7 @@ struct rt6_exception { #define FIB6_MAX_DEPTH 5 struct fib6_nh { - struct in6_addr fib_nh_gw6; - bool fib_nh_has_gw; - struct net_device *fib_nh_dev; - struct lwtunnel_state *fib_nh_lws; - - unsigned int fib_nh_flags; - atomic_t fib_nh_upper_bound; - int fib_nh_weight; + struct fib_nh_common nh_common; }; struct fib6_info { diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 029acd333d29..70548b1a6322 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -76,27 +76,48 @@ struct fnhe_hash_bucket { #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 +struct fib_nh_common { + struct net_device *nhc_dev; + int nhc_oif; + unsigned int nhc_flags; + struct lwtunnel_state *nhc_lwtstate; + unsigned char nhc_scope; + u8 nhc_family; + u8 nhc_has_gw:1, + unused:7; + union { + __be32 ipv4; + struct in6_addr ipv6; + } nhc_gw; + + int nhc_weight; + atomic_t nhc_upper_bound; +}; + struct fib_nh { - struct net_device *fib_nh_dev; + struct fib_nh_common nh_common; struct hlist_node nh_hash; struct fib_info *nh_parent; - unsigned int fib_nh_flags; - unsigned char fib_nh_scope; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int fib_nh_weight; - atomic_t fib_nh_upper_bound; -#endif #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif - int fib_nh_oif; - __be32 fib_nh_gw4; __be32 nh_saddr; int nh_saddr_genid; struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; - struct lwtunnel_state *fib_nh_lws; +#define fib_nh_family nh_common.nhc_family +#define fib_nh_dev nh_common.nhc_dev +#define fib_nh_oif nh_common.nhc_oif +#define fib_nh_flags nh_common.nhc_flags +#define fib_nh_lws nh_common.nhc_lwtstate +#define fib_nh_scope nh_common.nhc_scope +#define fib_nh_family nh_common.nhc_family +#define fib_nh_has_gw nh_common.nhc_has_gw +#define fib_nh_gw4 nh_common.nhc_gw.ipv4 +#define fib_nh_gw6 nh_common.nhc_gw.ipv6 +#define fib_nh_weight nh_common.nhc_weight +#define fib_nh_upper_bound nh_common.nhc_upper_bound }; /* -- cgit v1.2.3 From 979e276ebebd537782797c439c9cb42b6d3aba27 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:58 -0700 Subject: net: Use common nexthop init and release helpers With fib_nh_common in place, move common initialization and release code into helpers used by both ipv4 and ipv6. For the moment, the init is just the lwt encap and the release is both the netdev reference and the the lwt state reference. More will be added later. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 70548b1a6322..12a6d759cf57 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -441,6 +441,10 @@ int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); void fib_nh_release(struct net *net, struct fib_nh *fib_nh); +int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *fc_encap, + u16 fc_encap_type, void *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); +void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ void fib_trie_init(void); -- cgit v1.2.3 From 3616d08bcbb564c7765187cd45ad392e49bad73a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 22 Mar 2019 06:06:09 -0700 Subject: ipv6: Move ipv6 stubs to a separate header file The number of stubs is growing and has nothing to do with addrconf. Move the definition of the stubs to a separate header file and update users. In the move, drop the vxlan specific comment before ipv6_stub. Code move only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 47 ------------------------------------ include/net/ipv6_stubs.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ include/net/udp_tunnel.h | 2 +- 3 files changed, 64 insertions(+), 48 deletions(-) create mode 100644 include/net/ipv6_stubs.h (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index ec8e6784a6f7..2f67ae854ff0 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -238,53 +238,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, void ipv6_mc_dad_complete(struct inet6_dev *idev); -/* A stub used by vxlan module. This is ugly, ideally these - * symbols should be built into the core kernel. - */ -struct ipv6_stub { - int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex, - const struct in6_addr *addr); - int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, - const struct in6_addr *addr); - int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, - struct dst_entry **dst, struct flowi6 *fl6); - int (*ipv6_route_input)(struct sk_buff *skb); - - struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); - struct fib6_info *(*fib6_lookup)(struct net *net, int oif, - struct flowi6 *fl6, int flags); - struct fib6_info *(*fib6_table_lookup)(struct net *net, - struct fib6_table *table, - int oif, struct flowi6 *fl6, - int flags); - struct fib6_info *(*fib6_multipath_select)(const struct net *net, - struct fib6_info *f6i, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict); - u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr); - - void (*udpv6_encap_enable)(void); - void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, - const struct in6_addr *solicited_addr, - bool router, bool solicited, bool override, bool inc_opt); - struct neigh_table *nd_tbl; -}; -extern const struct ipv6_stub *ipv6_stub __read_mostly; - -/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ -struct ipv6_bpf_stub { - int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); - struct sock *(*udp6_lib_lookup)(struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, __be16 dport, - int dif, int sdif, struct udp_table *tbl, - struct sk_buff *skb); -}; -extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; - /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h new file mode 100644 index 000000000000..d8d9c0b0e8c0 --- /dev/null +++ b/include/net/ipv6_stubs.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IPV6_STUBS_H +#define _IPV6_STUBS_H + +#include +#include +#include +#include +#include +#include +#include + +/* structs from net/ip6_fib.h */ +struct fib6_info; + +/* This is ugly, ideally these symbols should be built + * into the core kernel. + */ +struct ipv6_stub { + int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex, + const struct in6_addr *addr); + int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, + const struct in6_addr *addr); + int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, + struct dst_entry **dst, struct flowi6 *fl6); + int (*ipv6_route_input)(struct sk_buff *skb); + + struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); + struct fib6_info *(*fib6_lookup)(struct net *net, int oif, + struct flowi6 *fl6, int flags); + struct fib6_info *(*fib6_table_lookup)(struct net *net, + struct fib6_table *table, + int oif, struct flowi6 *fl6, + int flags); + struct fib6_info *(*fib6_multipath_select)(const struct net *net, + struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict); + u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); + + void (*udpv6_encap_enable)(void); + void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, bool inc_opt); + struct neigh_table *nd_tbl; +}; +extern const struct ipv6_stub *ipv6_stub __read_mostly; + +/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ +struct ipv6_bpf_stub { + int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); + struct sock *(*udp6_lib_lookup)(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, int sdif, struct udp_table *tbl, + struct sk_buff *skb); +}; +extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; + +#endif diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index b8137953fea3..4b1f95e08307 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -7,7 +7,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include -#include +#include #endif struct udp_port_cfg { -- cgit v1.2.3 From a2c7023f7075ca9b80f944d3f20f60e6574538e2 Mon Sep 17 00:00:00 2001 From: Xiaofei Shen Date: Fri, 29 Mar 2019 11:04:58 +0530 Subject: net: dsa: read mac address from DT for slave device Before creating a slave netdevice, get the mac address from DTS and apply in case it is valid. Signed-off-by: Xiaofei Shen Signed-off-by: Vinod Koul Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index ae480bba11f5..0cfc2f828b87 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -140,6 +140,7 @@ struct dsa_port { unsigned int index; const char *name; const struct dsa_port *cpu_dp; + const char *mac; struct device_node *dn; unsigned int ageing_time; u8 stp_state; -- cgit v1.2.3 From eba618abacade71669eb67c3360eecfee810cc88 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:55 -0700 Subject: ipv4: Add fib_nh_common to fib_result Most of the ipv4 code only needs data from fib_nh_common. Add fib_nh_common selection to fib_result and update users to use it. Right now, fib_nh_common in fib_result will point to a fib_nh struct that is embedded within a fib_info: fib_info --> fib_nh fib_nh ... fib_nh ^ fib_result->nhc ----+ Later, nhc can point to a fib_nh within a nexthop struct: fib_info --> nexthop --> fib_nh ^ fib_result->nhc ---------------+ or for a nexthop group: fib_info --> nexthop --> nexthop --> fib_nh nexthop --> fib_nh ... nexthop --> fib_nh ^ fib_result->nhc ---------------------------+ In all cases nhsel within fib_result will point to which leg in the multipath route is used. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 12a6d759cf57..1f4a3b8bf584 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -156,15 +156,16 @@ struct fib_rule; struct fib_table; struct fib_result { - __be32 prefix; - unsigned char prefixlen; - unsigned char nh_sel; - unsigned char type; - unsigned char scope; - u32 tclassid; - struct fib_info *fi; - struct fib_table *table; - struct hlist_head *fa_head; + __be32 prefix; + unsigned char prefixlen; + unsigned char nh_sel; + unsigned char type; + unsigned char scope; + u32 tclassid; + struct fib_nh_common *nhc; + struct fib_info *fi; + struct fib_table *table; + struct hlist_head *fa_head; }; struct fib_result_nl { @@ -182,11 +183,10 @@ struct fib_result_nl { int err; }; -#ifdef CONFIG_IP_ROUTE_MULTIPATH -#define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) -#else /* CONFIG_IP_ROUTE_MULTIPATH */ -#define FIB_RES_NH(res) ((res).fi->fib_nh[0]) -#endif /* CONFIG_IP_ROUTE_MULTIPATH */ +static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel) +{ + return &fi->fib_nh[nhsel].nh_common; +} #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 @@ -195,18 +195,11 @@ struct fib_result_nl { #endif __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); +__be32 fib_result_prefsrc(struct net *net, struct fib_result *res); -#define FIB_RES_SADDR(net, res) \ - ((FIB_RES_NH(res).nh_saddr_genid == \ - atomic_read(&(net)->ipv4.dev_addr_genid)) ? \ - FIB_RES_NH(res).nh_saddr : \ - fib_info_update_nh_saddr((net), &FIB_RES_NH(res))) -#define FIB_RES_GW(res) (FIB_RES_NH(res).fib_nh_gw4) -#define FIB_RES_DEV(res) (FIB_RES_NH(res).fib_nh_dev) -#define FIB_RES_OIF(res) (FIB_RES_NH(res).fib_nh_oif) - -#define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \ - FIB_RES_SADDR(net, res)) +#define FIB_RES_NHC(res) ((res).nhc) +#define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) +#define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ @@ -453,10 +446,12 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID + struct fib_nh_common *nhc = res->nhc; + struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif - *itag = FIB_RES_NH(*res).nh_tclassid<<16; + *itag = nh->nh_tclassid << 16; #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) -- cgit v1.2.3 From c0a720770c01e67374b15f348f17a52409f6545c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:58 -0700 Subject: ipv6: Flip to fib_nexthop_info Export fib_nexthop_info and fib_add_nexthop for use by IPv6 code. Remove rt6_nexthop_info and rt6_add_nexthop in favor of the IPv4 versions. Update fib_nexthop_info for IPv6 linkdown check and RTA_GATEWAY for AF_INET6. Signed-off-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1f4a3b8bf584..3ce07841dc3b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -492,4 +492,9 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb); + +int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, + unsigned int *flags, bool skip_oif); +int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, + int nh_weight); #endif /* _NET_FIB_H */ -- cgit v1.2.3 From 407dd706fb5245c138f3a972f8aaa1c8a09a574c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Apr 2019 14:24:15 +0200 Subject: net: devlink: convert devlink_port_attrs bools to bits In order to save space in the struct, convert bools to bits. Signed-off-by: Jiri Pirko Reviewed-by: Florian Fainelli Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 31d5cec4d06b..4a1e3452a4ce 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -41,10 +41,10 @@ struct devlink { }; struct devlink_port_attrs { - bool set; + u8 set:1, + split:1; enum devlink_port_flavour flavour; u32 port_number; /* same value as "split group" */ - bool split; u32 split_subport_number; }; -- cgit v1.2.3 From bec5267cded268acdf679b651778c300d204e9f2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Apr 2019 14:24:16 +0200 Subject: net: devlink: extend port attrs for switch ID Extend devlink_port_attrs_set() to pass switch ID for ports which are part of switch and store it in port attrs. For other ports, this is NULL. Note that this allows the driver to group devlink ports into one or more switches according to the actual topology. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4a1e3452a4ce..0f7968761204 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -42,10 +42,12 @@ struct devlink { struct devlink_port_attrs { u8 set:1, - split:1; + split:1, + switch_port:1; enum devlink_port_flavour flavour; u32 port_number; /* same value as "split group" */ u32 split_subport_number; + struct netdev_phys_item_id switch_id; }; struct devlink_port { @@ -582,7 +584,9 @@ void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour, u32 port_number, bool split, - u32 split_subport_number); + u32 split_subport_number, + const unsigned char *switch_id, + unsigned char switch_id_len); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From 7e1146e8c10c00f859843817da8ecc5d902ea409 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Apr 2019 14:24:17 +0200 Subject: net: devlink: introduce devlink_compat_switch_id_get() helper Introduce devlink_compat_switch_id_get() helper which fills up switch_id according to passed netdev pointer. Call it directly from dev_get_port_parent_id() as a fallback when ndo_get_port_parent_id is not defined for given netdev. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 0f7968761204..70c7d1ac8344 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -743,6 +743,8 @@ void devlink_compat_running_version(struct net_device *dev, int devlink_compat_flash_update(struct net_device *dev, const char *file_name); int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len); +int devlink_compat_switch_id_get(struct net_device *dev, + struct netdev_phys_item_id *ppid); #else @@ -764,6 +766,13 @@ devlink_compat_phys_port_name_get(struct net_device *dev, return -EOPNOTSUPP; } +static inline int +devlink_compat_switch_id_get(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From b262a69582a4676c7378a73077b7bb186c7c5b2a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:22 +0100 Subject: xfrm: place af number into xfrm_mode struct This will be useful to know if we're supposed to decode ipv4 or ipv6. While at it, make the unregister function return void, all module_exit functions did just BUG(); there is never a point in doing error checks if there is no way to handle such error. Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 85386becbaea..9a155063c25f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -482,8 +482,9 @@ struct xfrm_mode { struct xfrm_state_afinfo *afinfo; struct module *owner; - unsigned int encap; - int flags; + u8 encap; + u8 family; + u8 flags; }; /* Flags for xfrm_mode. */ @@ -491,8 +492,8 @@ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; -int xfrm_register_mode(struct xfrm_mode *mode, int family); -int xfrm_unregister_mode(struct xfrm_mode *mode, int family); +int xfrm_register_mode(struct xfrm_mode *mode); +void xfrm_unregister_mode(struct xfrm_mode *mode); static inline int xfrm_af2proto(unsigned int family) { -- cgit v1.2.3 From c2d305e51038167dd9de8d476c72f667d84cad8b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:24 +0100 Subject: xfrm: remove input indirection from xfrm_mode No need for any indirection or abstraction here, both functions are pretty much the same and quite small, they also have no external dependencies. xfrm_prepare_input can then be made static. With allmodconfig build, size increase of vmlinux is 25 byte: Before: text data bss dec filename 15730207 6936924 4046908 26714039 vmlinux After: 15730208 6936948 4046908 26714064 vmlinux v2: Fix INET_XFRM_MODE_TRANSPORT name in is-enabled test (Sabrina Dubroca) change copied comment to refer to transport and network header, not skb->{h,nh}, which don't exist anymore. (Sabrina) make xfrm_prepare_input static (Eyal Birger) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9a155063c25f..2c5fc9cc367d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -436,16 +436,6 @@ struct xfrm_mode { */ int (*input2)(struct xfrm_state *x, struct sk_buff *skb); - /* - * This is the actual input entry point. - * - * For transport mode and equivalent this would be identical to - * input2 (which does not need to be set). While tunnel mode - * and equivalent would set this to the tunnel encapsulation function - * xfrm4_prepare_input that would in turn call input2. - */ - int (*input)(struct xfrm_state *x, struct sk_buff *skb); - /* * Add encapsulation header. * @@ -1606,7 +1596,6 @@ int xfrm_init_replay(struct xfrm_state *x); int xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); -int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); int xfrm_trans_queue(struct sk_buff *skb, -- cgit v1.2.3 From 0c620e97b3490890facbbe06d5deed9b024de255 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:25 +0100 Subject: xfrm: remove output indirection from xfrm_mode Same is input indirection. Only exception: we need to export xfrm_outer_mode_output for pktgen. Increases size of vmlinux by about 163 byte: Before: text data bss dec filename 15730208 6936948 4046908 26714064 vmlinux After: 15730311 6937008 4046908 26714227 vmlinux xfrm_inner_extract_output has no more external callers, make it static. v2: add IS_ENABLED(IPV6) guard in xfrm6_prepare_output add two missing breaks in xfrm_outer_mode_output (Sabrina Dubroca) add WARN_ON_ONCE for 'call AF_INET6 related output function, but CONFIG_IPV6=n' case. make xfrm_inner_extract_output static Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2c5fc9cc367d..01e7e9c0e8a9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -449,17 +449,6 @@ struct xfrm_mode { */ int (*output2)(struct xfrm_state *x,struct sk_buff *skb); - /* - * This is the actual output entry point. - * - * For transport mode and equivalent this would be identical to - * output2 (which does not need to be set). While tunnel mode - * and equivalent would set this to a tunnel encapsulation function - * (xfrm4_prepare_output or xfrm6_prepare_output) that would in turn - * call output2. - */ - int (*output)(struct xfrm_state *x, struct sk_buff *skb); - /* * Adjust pointers into the packet and do GSO segmentation. */ @@ -1603,7 +1592,11 @@ int xfrm_trans_queue(struct sk_buff *skb, struct sk_buff *)); int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); -int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); + +#if IS_ENABLED(CONFIG_NET_PKTGEN) +int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); +#endif + void xfrm_local_error(struct sk_buff *skb, int mtu); int xfrm4_extract_header(struct sk_buff *skb); int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); @@ -1622,7 +1615,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) } int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err); @@ -1649,7 +1641,6 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, -- cgit v1.2.3 From 303c5fab1272888b22088fbdd08cb770205ccb7a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:26 +0100 Subject: xfrm: remove xmit indirection from xfrm_mode There are only two versions (tunnel and transport). The ip/ipv6 versions are only differ in sizeof(iphdr) vs ipv6hdr. Place this in the core and use x->outer_mode->encap type to call the correct adjustment helper. Before: text data bss dec filename 15730311 6937008 4046908 26714227 vmlinux After: 15730428 6937008 4046908 26714344 vmlinux (about 117 byte increase) v2: use family from x->outer_mode, not inner Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 01e7e9c0e8a9..07966a27e4a4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -454,11 +454,6 @@ struct xfrm_mode { */ struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features); - /* - * Adjust pointers into the packet when IPsec is done at layer2. - */ - void (*xmit)(struct xfrm_state *x, struct sk_buff *skb); - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; -- cgit v1.2.3 From 7613b92b1ae37141704948b77e8762c5de896510 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:27 +0100 Subject: xfrm: remove gso_segment indirection from xfrm_mode These functions are small and we only have versions for tunnel and transport mode for ipv4 and ipv6 respectively. Just place the 'transport or tunnel' conditional in the protocol specific function instead of using an indirection. Before: 3226 12 0 3238 net/ipv4/esp4_offload.o 7004 492 0 7496 net/ipv4/ip_vti.o 3339 12 0 3351 net/ipv6/esp6_offload.o 11294 460 0 11754 net/ipv6/ip6_vti.o 1180 72 0 1252 net/ipv4/xfrm4_mode_beet.o 428 48 0 476 net/ipv4/xfrm4_mode_transport.o 1271 48 0 1319 net/ipv4/xfrm4_mode_tunnel.o 1083 60 0 1143 net/ipv6/xfrm6_mode_beet.o 172 48 0 220 net/ipv6/xfrm6_mode_ro.o 429 48 0 477 net/ipv6/xfrm6_mode_transport.o 1164 48 0 1212 net/ipv6/xfrm6_mode_tunnel.o 15730428 6937008 4046908 26714344 vmlinux After: 3461 12 0 3473 net/ipv4/esp4_offload.o 7000 492 0 7492 net/ipv4/ip_vti.o 3574 12 0 3586 net/ipv6/esp6_offload.o 11295 460 0 11755 net/ipv6/ip6_vti.o 1180 64 0 1244 net/ipv4/xfrm4_mode_beet.o 171 40 0 211 net/ipv4/xfrm4_mode_transport.o 1163 40 0 1203 net/ipv4/xfrm4_mode_tunnel.o 1083 52 0 1135 net/ipv6/xfrm6_mode_beet.o 172 40 0 212 net/ipv6/xfrm6_mode_ro.o 172 40 0 212 net/ipv6/xfrm6_mode_transport.o 1056 40 0 1096 net/ipv6/xfrm6_mode_tunnel.o 15730424 6937008 4046908 26714340 vmlinux Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 07966a27e4a4..de103a6d1ef8 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -449,11 +449,6 @@ struct xfrm_mode { */ int (*output2)(struct xfrm_state *x,struct sk_buff *skb); - /* - * Adjust pointers into the packet and do GSO segmentation. - */ - struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features); - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; -- cgit v1.2.3 From b3284df1c86f7ac078dcb8fb250fe3d6437e740c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:28 +0100 Subject: xfrm: remove input2 indirection from xfrm_mode No external dependencies on any module, place this in the core. Increase is about 1800 byte for xfrm_input.o. The beet helpers get added to internal header, as they can be reused from xfrm_output.c in the next patch (kernel contains several copies of them in the xfrm{4,6}_mode_beet.c files). Before: text data bss dec filename 5578 176 2364 8118 net/xfrm/xfrm_input.o 1180 64 0 1244 net/ipv4/xfrm4_mode_beet.o 171 40 0 211 net/ipv4/xfrm4_mode_transport.o 1163 40 0 1203 net/ipv4/xfrm4_mode_tunnel.o 1083 52 0 1135 net/ipv6/xfrm6_mode_beet.o 172 40 0 212 net/ipv6/xfrm6_mode_ro.o 172 40 0 212 net/ipv6/xfrm6_mode_transport.o 1056 40 0 1096 net/ipv6/xfrm6_mode_tunnel.o After: text data bss dec filename 7373 200 2364 9937 net/xfrm/xfrm_input.o 587 44 0 631 net/ipv4/xfrm4_mode_beet.o 171 32 0 203 net/ipv4/xfrm4_mode_transport.o 649 32 0 681 net/ipv4/xfrm4_mode_tunnel.o 625 44 0 669 net/ipv6/xfrm6_mode_beet.o 172 32 0 204 net/ipv6/xfrm6_mode_ro.o 172 32 0 204 net/ipv6/xfrm6_mode_transport.o 599 32 0 631 net/ipv6/xfrm6_mode_tunnel.o v2: pass inner_mode to xfrm_inner_mode_encap_remove to fix AF_UNSPEC selector breakage (bisected by Benedict Wong) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index de103a6d1ef8..bdda545cf740 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - /* - * Remove encapsulation header. - * - * The IP header will be moved over the top of the encapsulation - * header. - * - * On entry, the transport header shall point to where the IP header - * should be and the network header shall be set to where the IP - * header currently is. skb->data shall point to the start of the - * payload. - */ - int (*input2)(struct xfrm_state *x, struct sk_buff *skb); - /* * Add encapsulation header. * -- cgit v1.2.3 From 1de70830066b72b6a8e259e5363f6c0bc4ba7bbc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:29 +0100 Subject: xfrm: remove output2 indirection from xfrm_mode similar to previous patch: no external module dependencies, so we can avoid the indirection by placing this in the core. This change removes the last indirection from xfrm_mode and the xfrm4|6_mode_{beet,tunnel}.c modules contain (almost) no code anymore. Before: text data bss dec hex filename 3957 136 0 4093 ffd net/xfrm/xfrm_output.o 587 44 0 631 277 net/ipv4/xfrm4_mode_beet.o 649 32 0 681 2a9 net/ipv4/xfrm4_mode_tunnel.o 625 44 0 669 29d net/ipv6/xfrm6_mode_beet.o 599 32 0 631 277 net/ipv6/xfrm6_mode_tunnel.o After: text data bss dec hex filename 5359 184 0 5543 15a7 net/xfrm/xfrm_output.o 171 24 0 195 c3 net/ipv4/xfrm4_mode_beet.o 171 24 0 195 c3 net/ipv4/xfrm4_mode_tunnel.o 172 24 0 196 c4 net/ipv6/xfrm6_mode_beet.o 172 24 0 196 c4 net/ipv6/xfrm6_mode_tunnel.o v2: fold the *encap_add functions into xfrm*_prepare_output preserve (move) output2 comment (Sabrina) use x->outer_mode->encap, not inner fix a build breakage on ppc (kbuild robot) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index bdda545cf740..4351444c10fc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - /* - * Add encapsulation header. - * - * On exit, the transport header will be set to the start of the - * encapsulation header to be filled in by x->type->output and - * the mac header will be set to the nextheader (protocol for - * IPv4) field of the extension header directly preceding the - * encapsulation header, or in its absence, that of the top IP - * header. The value of the network header will always point - * to the top IP header while skb->data will point to the payload. - */ - int (*output2)(struct xfrm_state *x,struct sk_buff *skb); - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; -- cgit v1.2.3 From 733a5fac2f15b55b9059230d098ed04341d2d884 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:30 +0100 Subject: xfrm: remove afinfo pointer from xfrm_mode Adds an EXPORT_SYMBOL for afinfo_get_rcu, as it will now be called from ipv6 in case of CONFIG_IPV6=m. This change has virtually no effect on vmlinux size, but it reduces afinfo size and allows followup patch to make xfrm modes const. v2: mark if (afinfo) tests as likely (Sabrina) re-fetch afinfo according to inner_mode in xfrm_prepare_input(). Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4351444c10fc..8d1c9506bcf6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,7 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; u8 family; -- cgit v1.2.3 From 4c145dce26013763490df88f2473714f5bc7857d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:31 +0100 Subject: xfrm: make xfrm modes builtin after previous changes, xfrm_mode contains no function pointers anymore and all modules defining such struct contain no code except an init/exit functions to register the xfrm_mode struct with the xfrm core. Just place the xfrm modes core and remove the modules, the run-time xfrm_mode register/unregister functionality is removed. Before: text data bss dec filename 7523 200 2364 10087 net/xfrm/xfrm_input.o 40003 628 440 41071 net/xfrm/xfrm_state.o 15730338 6937080 4046908 26714326 vmlinux 7389 200 2364 9953 net/xfrm/xfrm_input.o 40574 656 440 41670 net/xfrm/xfrm_state.o 15730084 6937068 4046908 26714060 vmlinux The xfrm*_mode_{transport,tunnel,beet} modules are gone. v2: replace CONFIG_INET6_XFRM_MODE_* IS_ENABLED guards with CONFIG_IPV6 ones rather than removing them. Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8d1c9506bcf6..4ca79cdc3460 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -234,9 +234,9 @@ struct xfrm_state { /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; - struct xfrm_mode *inner_mode; - struct xfrm_mode *inner_mode_iaf; - struct xfrm_mode *outer_mode; + const struct xfrm_mode *inner_mode; + const struct xfrm_mode *inner_mode_iaf; + const struct xfrm_mode *outer_mode; const struct xfrm_type_offload *type_offload; @@ -347,7 +347,6 @@ struct xfrm_state_afinfo { struct module *owner; const struct xfrm_type *type_map[IPPROTO_MAX]; const struct xfrm_type_offload *type_offload_map[IPPROTO_MAX]; - struct xfrm_mode *mode_map[XFRM_MODE_MAX]; int (*init_flags)(struct xfrm_state *x); void (*init_tempsel)(struct xfrm_selector *sel, @@ -423,7 +422,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - struct module *owner; u8 encap; u8 family; u8 flags; @@ -434,9 +432,6 @@ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; -int xfrm_register_mode(struct xfrm_mode *mode); -void xfrm_unregister_mode(struct xfrm_mode *mode); - static inline int xfrm_af2proto(unsigned int family) { switch(family) { @@ -449,7 +444,7 @@ static inline int xfrm_af2proto(unsigned int family) } } -static inline struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) +static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) -- cgit v1.2.3 From c9500d7b7de8ff6ac88ee3e38b782889f1616593 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:32 +0100 Subject: xfrm: store xfrm_mode directly, not its address This structure is now only 4 bytes, so its more efficient to cache a copy rather than its address. No significant size difference in allmodconfig vmlinux. With non-modular kernel that has all XFRM options enabled, this series reduces vmlinux image size by ~11kb. All xfrm_mode indirections are gone and all modes are built-in. before (ipsec-next master): text data bss dec filename 21071494 7233140 11104324 39408958 vmlinux.master after this series: 21066448 7226772 11104324 39397544 vmlinux.patched With allmodconfig kernel, the size increase is only 362 bytes, even all the xfrm config options removed in this series are modular. before: text data bss dec filename 15731286 6936912 4046908 26715106 vmlinux.master after this series: 15731492 6937068 4046908 26715468 vmlinux Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4ca79cdc3460..77eb578a0384 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -132,6 +132,17 @@ struct xfrm_state_offload { u8 flags; }; +struct xfrm_mode { + u8 encap; + u8 family; + u8 flags; +}; + +/* Flags for xfrm_mode. */ +enum { + XFRM_MODE_FLAG_TUNNEL = 1, +}; + /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; @@ -234,9 +245,9 @@ struct xfrm_state { /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; - const struct xfrm_mode *inner_mode; - const struct xfrm_mode *inner_mode_iaf; - const struct xfrm_mode *outer_mode; + struct xfrm_mode inner_mode; + struct xfrm_mode inner_mode_iaf; + struct xfrm_mode outer_mode; const struct xfrm_type_offload *type_offload; @@ -421,17 +432,6 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); -struct xfrm_mode { - u8 encap; - u8 family; - u8 flags; -}; - -/* Flags for xfrm_mode. */ -enum { - XFRM_MODE_FLAG_TUNNEL = 1, -}; - static inline int xfrm_af2proto(unsigned int family) { switch(family) { @@ -448,9 +448,9 @@ static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, i { if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) - return x->inner_mode; + return &x->inner_mode; else - return x->inner_mode_iaf; + return &x->inner_mode_iaf; } struct xfrm_tmpl { @@ -1990,7 +1990,7 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, tunnel = true; break; } - if (tunnel && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)) + if (tunnel && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL)) return -EINVAL; return 0; -- cgit v1.2.3 From 1e1b11b6a1111cd9e8af1fd6ccda270a9fa3eacf Mon Sep 17 00:00:00 2001 From: vamsi krishna Date: Fri, 1 Feb 2019 18:34:51 +0530 Subject: nl80211/cfg80211: Specify band specific min RSSI thresholds with sched scan This commit adds the support to specify the RSSI thresholds per band for each match set. This enhances the current behavior which specifies a single rssi_threshold across all the bands by introducing the rssi_threshold_per_band. These per band rssi thresholds are referred through NL80211_BAND_* (enum nl80211_band) variables as attribute types. Such attributes/values per each band are nested through NL80211_ATTR_SCHED_SCAN_MIN_RSSI. These band specific rssi thresholds shall take precedence over the current rssi_thold per match set. Drivers indicate this support through %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These per band rssi attributes/values does not specify "default RSSI filter" as done by NL80211_SCHED_SCAN_MATCH_ATTR_RSSI to stay backward compatible. That said, these per band rssi values have to be specified for the corresponding matchset. Signed-off-by: vamsi krishna Signed-off-by: Srinivas Dasari [rebase on refactoring, add policy] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bb307a11ee63..b13234a486e7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1832,11 +1832,19 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match * or no match (RSSI only) * @rssi_thold: don't report scan results below this threshold (in s32 dBm) + * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied + * for filtering out scan results received. Drivers advertize this support + * of band specific rssi based filtering through the feature capability + * %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band + * specific rssi thresholds take precedence over rssi_thold, if specified. + * If not specified for any band, it will be assigned with rssi_thold of + * corresponding matchset. */ struct cfg80211_match_set { struct cfg80211_ssid ssid; u8 bssid[ETH_ALEN]; s32 rssi_thold; + s32 per_band_rssi_thold[NUM_NL80211_BANDS]; }; /** -- cgit v1.2.3 From ab60633c7136c300f15a390f3469d7c4be15a055 Mon Sep 17 00:00:00 2001 From: Narayanraddi Masti Date: Thu, 7 Feb 2019 12:16:05 -0800 Subject: mac80211: Add support for NL80211_STA_INFO_AIRTIME_LINK_METRIC Add support for mesh airtime link metric attribute NL80211_STA_INFO_AIRTIME_LINK_METRIC. Signed-off-by: Narayanraddi Masti Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b13234a486e7..5859a5e02454 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1327,6 +1327,7 @@ struct cfg80211_tid_stats { * @fcs_err_count: number of packets (MPDUs) received from this station with * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. + * @airtime_link_metric: mesh airtime link metric. */ struct station_info { u64 filled; @@ -1381,6 +1382,8 @@ struct station_info { u32 rx_mpdu_count; u32 fcs_err_count; + + u32 airtime_link_metric; }; #if IS_ENABLED(CONFIG_CFG80211) -- cgit v1.2.3 From cb74e9775871f8c82a1297cf76209f10ab5bbe3d Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Wed, 20 Feb 2019 16:18:07 +0530 Subject: cfg80211/nl80211: Offload OWE processing to user space in AP mode This interface allows the host driver to offload OWE processing to user space. This intends to support OWE (Opportunistic Wireless Encryption) AKM by the drivers that implement SME but rely on the user space for the cryptographic/OWE processing in AP mode. Such drivers are not capable of processing/deriving the DH IE. A new NL80211 command - NL80211_CMD_UPDATE_OWE_INFO is introduced to send the request/event between the host driver and user space. Driver shall provide the OWE info (MAC address and DH IE) of the peer to user space for cryptographic processing of the DH IE through the event. Accordingly, the user space shall update the OWE info/DH IE to the driver. Following is the sequence in AP mode for OWE authentication. Driver passes the OWE info obtained from the peer in the Association Request to the user space through the event cfg80211_update_owe_info_event. User space shall process the OWE info received and generate new OWE info. This OWE info is passed to the driver through NL80211_CMD_UPDATE_OWE_INFO request. Driver eventually uses this OWE info to send the Association Response to the peer. This OWE info in the command interface carries the IEs that include PMKID of the peer if the PMKSA is still valid or an updated DH IE for generating a new PMKSA with the peer. Signed-off-by: Liangwei Dong Signed-off-by: Sunil Dutt Signed-off-by: Srinivas Dasari [remove policy initialization - no longer exists] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5859a5e02454..70432fd638af 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3110,6 +3110,32 @@ struct cfg80211_pmsr_request { struct cfg80211_pmsr_request_peer peers[]; }; +/** + * struct cfg80211_update_owe_info - OWE Information + * + * This structure provides information needed for the drivers to offload OWE + * (Opportunistic Wireless Encryption) processing to the user space. + * + * Commonly used across update_owe_info request and event interfaces. + * + * @peer: MAC address of the peer device for which the OWE processing + * has to be done. + * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info + * processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space + * cannot give you the real status code for failures. Used only for + * OWE update request command interface (user space to driver). + * @ie: IEs obtained from the peer or constructed by the user space. These are + * the IEs of the remote peer in the event from the host driver and + * the constructed IEs by the user space in the request interface. + * @ie_len: Length of IEs in octets. + */ +struct cfg80211_update_owe_info { + u8 peer[ETH_ALEN] __aligned(2); + u16 status; + const u8 *ie; + size_t ie_len; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -3447,6 +3473,10 @@ struct cfg80211_pmsr_request { * Statistics should be cumulative, currently no way to reset is provided. * @start_pmsr: start peer measurement (e.g. FTM) * @abort_pmsr: abort peer measurement + * + * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME + * but offloading OWE processing to the user space will get the updated + * DH IE through this interface. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3761,6 +3791,8 @@ struct cfg80211_ops { struct cfg80211_pmsr_request *request); void (*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request); + int (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_update_owe_info *owe_info); }; /* @@ -7219,4 +7251,14 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, #define wiphy_WARN(wiphy, format, args...) \ WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args); +/** + * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space + * @netdev: network device + * @owe_info: peer's owe info + * @gfp: allocation flags + */ +void cfg80211_update_owe_info_event(struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info, + gfp_t gfp); + #endif /* __NET_CFG80211_H */ -- cgit v1.2.3 From fd69c399c7d6262086b6b820757c6aeaa71feeba Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 8 Apr 2019 10:15:59 +0200 Subject: datagram: remove rendundant 'peeked' argument After commit a297569fe00a ("net/udp: do not touch skb->peeked unless really needed") the 'peeked' argument of __skb_try_recv_datagram() and friends is always equal to !!'flags & MSG_PEEK'. Since such argument is really a boolean info, and the callers have already 'flags & MSG_PEEK' handy, we can remove it and clean-up the code a bit. Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/udp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index fd6d948755c8..d8ce937bc395 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -269,13 +269,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *peeked, int *off, int *err); + int noblock, int *off, int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *err) { - int peeked, off = 0; + int off = 0; - return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err); + return __skb_recv_udp(sk, flags, noblock, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); -- cgit v1.2.3 From 84c0d5e96f3ae20344fb3a79161eab18905dae56 Mon Sep 17 00:00:00 2001 From: Jacky Hu Date: Tue, 26 Mar 2019 18:31:21 +0800 Subject: ipvs: allow tunneling with gue encapsulation ipip packets are blocked in some public cloud environments, this patch allows gue encapsulation with the tunneling method, which would make tunneling working in those environments. Signed-off-by: Jacky Hu Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 047f9a5ccaad..2ac40135b576 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -600,6 +600,9 @@ struct ip_vs_dest_user_kern { /* Address family of addr */ u16 af; + + u16 tun_type; /* tunnel type */ + __be16 tun_port; /* tunnel port */ }; @@ -660,6 +663,8 @@ struct ip_vs_dest { atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ atomic_t last_weight; /* server latest weight */ + __u16 tun_type; /* tunnel type */ + __be16 tun_port; /* tunnel port */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ -- cgit v1.2.3 From d164385ec572cbe3335a635ac308760e126d4ec0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Mar 2019 09:22:24 +0100 Subject: netfilter: nat: add inet family nat support We need minimal support from the nat core for this, as we do not want to register additional base hooks. When an inet hook is registered, interally register ipv4 and ipv6 hooks for them and unregister those when inet hooks are removed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index cf332c4e0b32..423cda2c6542 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -69,9 +69,9 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum, #endif } -int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, +int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *nat_ops, unsigned int ops_count); -void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, +void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count); unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -98,6 +98,9 @@ void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops); int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops); +int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops); +void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops); + unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); -- cgit v1.2.3 From c1deb065cf3b5bcd483e3f03479f930edb151b99 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Mar 2019 09:22:25 +0100 Subject: netfilter: nf_tables: merge route type into core very little code, so it really doesn't make sense to have extra modules or even a kconfig knob for this. Merge them and make functionality available unconditionally. The merge makes inet family route support trivial, so add it as well here. Before: text data bss dec hex filename 835 832 0 1667 683 nft_chain_route_ipv4.ko 870 832 0 1702 6a6 nft_chain_route_ipv6.ko 111568 2556 529 114653 1bfdd nf_tables.ko After: text data bss dec hex filename 113133 2556 529 116218 1c5fa nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3e9ab643eedf..55dff3ab44c7 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1411,4 +1411,6 @@ struct nft_trans_flowtable { int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); +void __init nft_chain_route_init(void); +void nft_chain_route_fini(void); #endif /* _NET_NF_TABLES_H */ -- cgit v1.2.3 From 4806e975729f99c7908d1688a143f1e16d464e6c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Mar 2019 09:22:26 +0100 Subject: netfilter: replace NF_NAT_NEEDED with IS_ENABLED(CONFIG_NF_NAT) NF_NAT_NEEDED is true whenever nat support for either ipv4 or ipv6 is enabled. Now that the af-specific nat configuration switches have been removed, IS_ENABLED(CONFIG_NF_NAT) has the same effect. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 006e430d1cdf..93ce6b0daaba 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -48,7 +48,7 @@ struct nf_conntrack_expect { /* Expectation class */ unsigned int class; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) union nf_inet_addr saved_addr; /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ -- cgit v1.2.3 From 3b0a081db1f730373993c7a27936778402a3322c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 Apr 2019 10:58:20 +0200 Subject: netfilter: make two functions static They have no external callers anymore. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 55dff3ab44c7..2d5a0a1a87b8 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -475,8 +475,6 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); -void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding, bool commit); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** -- cgit v1.2.3 From 1aefd3de7bc667115bb77cb0bc21e874c7e190fc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:24 -0700 Subject: ipv6: Add fib6_nh_init and release to stubs Add fib6_nh_init and fib6_nh_release to ipv6_stubs. If fib6_nh_init fails, callers should not invoke fib6_nh_release, so there is no reason to have a dummy stub for the IPv6 is not enabled case. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ipv6_stubs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index d8d9c0b0e8c0..453b55bf6723 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -12,6 +12,8 @@ /* structs from net/ip6_fib.h */ struct fib6_info; +struct fib6_nh; +struct fib6_config; /* This is ugly, ideally these symbols should be built * into the core kernel. @@ -40,6 +42,10 @@ struct ipv6_stub { u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, struct in6_addr *saddr); + int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); + void (*fib6_nh_release)(struct fib6_nh *fib6_nh); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, -- cgit v1.2.3 From 71df5777aaaeff673c242a49b945b1b96fe81718 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:25 -0700 Subject: ipv6: Add neighbor helpers that use the ipv6 stub Add ipv6 helpers to handle ndisc references via the stub. Update bpf_ipv6_fib_lookup to use __ipv6_neigh_lookup_noref_stub instead of the open code ___neigh_lookup_noref with the stub. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ndisc.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index ddfbb591e2c5..366150053043 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -2,6 +2,8 @@ #ifndef _NDISC_H #define _NDISC_H +#include + /* * ICMP codes for neighbour discovery messages */ @@ -379,6 +381,14 @@ static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } +static inline +struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev, + const void *pkey) +{ + return ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, pkey, dev); +} + static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey) { struct neighbour *n; @@ -409,6 +419,36 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev, rcu_read_unlock_bh(); } +static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, + const void *pkey) +{ + struct neighbour *n; + + rcu_read_lock_bh(); + n = __ipv6_neigh_lookup_noref_stub(dev, pkey); + if (n) { + unsigned long now = jiffies; + + /* avoid dirtying neighbour */ + if (n->confirmed != now) + n->confirmed = now; + } + rcu_read_unlock_bh(); +} + +/* uses ipv6_stub and is meant for use outside of IPv6 core */ +static inline struct neighbour *ip_neigh_gw6(struct net_device *dev, + const void *addr) +{ + struct neighbour *neigh; + + neigh = __ipv6_neigh_lookup_noref_stub(dev, addr); + if (unlikely(!neigh)) + neigh = __neigh_create(ipv6_stub->nd_tbl, addr, dev, false); + + return neigh; +} + int ndisc_init(void); int ndisc_late_init(void); -- cgit v1.2.3 From bdf004677107e3b847c5db09c9fbf8edefa24996 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:26 -0700 Subject: net: Replace nhc_has_gw with nhc_gw_family Allow the gateway in a fib_nh_common to be from a different address family than the outer fib{6}_nh. To that end, replace nhc_has_gw with nhc_gw_family and update users of nhc_has_gw to check nhc_gw_family. Now nhc_family is used to know if the nh_common is part of a fib_nh or fib6_nh (used for container_of to get to route family specific data), and nhc_gw_family represents the address family for the gateway. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- include/net/ip_fib.h | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 342180a7285c..5909fc421305 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -69,7 +69,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) && - f6i->fib6_nh.fib_nh_has_gw; + f6i->fib6_nh.fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 3ce07841dc3b..c68a40435ee0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -83,8 +83,8 @@ struct fib_nh_common { struct lwtunnel_state *nhc_lwtstate; unsigned char nhc_scope; u8 nhc_family; - u8 nhc_has_gw:1, - unused:7; + u8 nhc_gw_family; + union { __be32 ipv4; struct in6_addr ipv6; @@ -112,8 +112,7 @@ struct fib_nh { #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate #define fib_nh_scope nh_common.nhc_scope -#define fib_nh_family nh_common.nhc_family -#define fib_nh_has_gw nh_common.nhc_has_gw +#define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw4 nh_common.nhc_gw.ipv4 #define fib_nh_gw6 nh_common.nhc_gw.ipv6 #define fib_nh_weight nh_common.nhc_weight -- cgit v1.2.3 From 1550c171935d264f522581fd037db5e64a716bb6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:27 -0700 Subject: ipv4: Prepare rtable for IPv6 gateway To allow the gateway to be either an IPv4 or IPv6 address, remove rt_uses_gateway from rtable and replace with rt_gw_family. If rt_gw_family is set it implies rt_uses_gateway. Rename rt_gateway to rt_gw4 to represent the IPv4 version. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/route.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 9883dc82f723..96912b099c08 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -55,12 +55,12 @@ struct rtable { unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; - __u8 rt_uses_gateway; + u8 rt_gw_family; int rt_iif; /* Info on neighbour */ - __be32 rt_gateway; + __be32 rt_gw4; /* Miscellaneous cached information */ u32 rt_mtu_locked:1, @@ -82,8 +82,8 @@ static inline bool rt_is_output_route(const struct rtable *rt) static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) { - if (rt->rt_gateway) - return rt->rt_gateway; + if (rt->rt_gw_family == AF_INET) + return rt->rt_gw4; return daddr; } -- cgit v1.2.3 From f35b794b3b405e2478654ea875bc0b29fe1a1bc5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:28 -0700 Subject: ipv4: Prepare fib_config for IPv6 gateway Similar to rtable, fib_config needs to allow the gateway to be either an IPv4 or an IPv6 address. To that end, rename fc_gw to fc_gw4 to mean an IPv4 address and add fc_gw_family. Checks on 'is a gateway set' are changed to see if fc_gw_family is set. In the process prepare the code for a fc_gw_family == AF_INET6. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index c68a40435ee0..1f72ad553c31 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -32,10 +32,11 @@ struct fib_config { u8 fc_protocol; u8 fc_scope; u8 fc_type; - /* 3 bytes unused */ + u8 fc_gw_family; + /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; - __be32 fc_gw; + __be32 fc_gw4; int fc_oif; u32 fc_flags; u32 fc_priority; -- cgit v1.2.3 From 0f5f7d7bf6e6bda4dffe7b42812a16ada6ea9816 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:29 -0700 Subject: ipv4: Add support to rtable for ipv6 gateway Add support for an IPv6 gateway to rtable. Since a gateway is either IPv4 or IPv6, make it a union with rt_gw4 where rt_gw_family decides which address is in use. When dumping the route data, encode an ipv6 nexthop using RTA_VIA. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/route.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 96912b099c08..5d28a2509b58 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -60,7 +60,10 @@ struct rtable { int rt_iif; /* Info on neighbour */ - __be32 rt_gw4; + union { + __be32 rt_gw4; + struct in6_addr rt_gw6; + }; /* Miscellaneous cached information */ u32 rt_mtu_locked:1, -- cgit v1.2.3 From a4ea5d43c807be28545625c1e0641905022fa0d1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:30 -0700 Subject: ipv4: Add support to fib_config for IPv6 gateway Add support for an IPv6 gateway to fib_config. Since a gateway is either IPv4 or IPv6, make it a union with fc_gw4 where fc_gw_family decides which address is in use. Update current checks on family and gw4 to handle ipv6 as well. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1f72ad553c31..f1c452f618a9 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -36,7 +36,10 @@ struct fib_config { /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; - __be32 fc_gw4; + union { + __be32 fc_gw4; + struct in6_addr fc_gw6; + }; int fc_oif; u32 fc_flags; u32 fc_priority; -- cgit v1.2.3 From 0353f28231c79416191326810e7fe656b69c63b7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:33 -0700 Subject: neighbor: Add skip_cache argument to neigh_output A later patch allows an IPv6 gateway with an IPv4 route. The neighbor entry will exist in the v6 ndisc table and the cached header will contain the ipv6 protocol which is wrong for an IPv4 packet. For an IPv4 packet to use the v6 neighbor entry, neigh_output needs to skip the cached header and just use the output callback for the neigh entry. A future patchset can look at expanding the hh_cache to handle 2 protocols. For now, IPv6 gateways with an IPv4 route will take the extra overhead of generating the header. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7c1ab9edba03..3e5438bd0101 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -498,11 +498,12 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb return dev_queue_xmit(skb); } -static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) +static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, + bool skip_cache) { const struct hh_cache *hh = &n->hh; - if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) + if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache) return neigh_hh_output(hh, skb); else return n->output(n, skb); -- cgit v1.2.3 From 5c9f7c1dfc2e0776551ef1ceb335187c6698d1ff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:34 -0700 Subject: ipv4: Add helpers for neigh lookup for nexthop A common theme in the output path is looking up a neigh entry for a nexthop, either the gateway in an rtable or a fallback to the daddr in the skb: nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); To allow the nexthop to be an IPv6 address we need to consider the family of the nexthop and then call __ipv{4,6}_neigh_lookup_noref based on it. To make this simpler, add a ip_neigh_gw4 helper similar to ip_neigh_gw6 added in an earlier patch which handles: neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); And then add a second one, ip_neigh_for_gw, that calls either ip_neigh_gw4 or ip_neigh_gw6 based on the address family of the gateway. Update the output paths in the VRF driver and core v4 code to use ip_neigh_for_gw simplifying the family based lookup and making both ready for a v6 nexthop. ipv4_neigh_lookup has a different need - the potential to resolve a passed in address in addition to any gateway in the rtable or skb. Since this is a one-off, add ip_neigh_gw4 and ip_neigh_gw6 diectly. The difference between __neigh_create used by the helpers and neigh_create called by ipv4_neigh_lookup is taking a refcount, so add rcu_read_lock_bh and bump the refcnt on the neigh entry. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 5d28a2509b58..96f6c9ae33c2 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -350,4 +352,34 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) return hoplimit; } +static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, + __be32 daddr) +{ + struct neighbour *neigh; + + neigh = __ipv4_neigh_lookup_noref(dev, daddr); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &daddr, dev, false); + + return neigh; +} + +static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt, + struct sk_buff *skb, + bool *is_v6gw) +{ + struct net_device *dev = rt->dst.dev; + struct neighbour *neigh; + + if (likely(rt->rt_gw_family == AF_INET)) { + neigh = ip_neigh_gw4(dev, rt->rt_gw4); + } else if (rt->rt_gw_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &rt->rt_gw6); + *is_v6gw = true; + } else { + neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr); + } + return neigh; +} + #endif /* _ROUTE_H */ -- cgit v1.2.3 From 19a9d136f198cd7c4e26ea6897a0cf067d3f7ecb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:39 -0700 Subject: ipv4: Flag fib_info with a fib_nh using IPv6 gateway Until support is added to the offload drivers, they need to be able to reject routes with an IPv6 gateway. To that end add a flag to fib_info that indicates if any fib_nh has a v6 gateway. The flag allows the drivers to efficiently know the use of a v6 gateway without walking all fib_nh tied to a fib_info each time a route is added. Update mlxsw and rocker to reject the routes with extack message as to why. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f1c452f618a9..337106469ec5 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -147,6 +147,7 @@ struct fib_info { #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; + bool fib_nh_is_v6; struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].fib_nh_dev -- cgit v1.2.3 From d15662682db232da77136cd348f4c9df312ca6f9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:40 -0700 Subject: ipv4: Allow ipv6 gateway with ipv4 routes Add support for RTA_VIA and allow an IPv6 nexthop for v4 routes: $ ip ro add 172.16.1.0/24 via inet6 2001:db8::1 dev eth0 $ ip ro ls ... 172.16.1.0/24 via inet6 2001:db8::1 dev eth0 For convenience and simplicity, userspace can use RTA_VIA to specify AF_INET or AF_INET6 gateway. The common fib_nexthop_info dump function compares the gateway address family to the nh_common family to know if the gateway should be encoded as RTA_VIA or RTA_GATEWAY. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 337106469ec5..d8195c77e247 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -401,6 +401,8 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); +int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, + struct netlink_ext_ack *extack); __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, -- cgit v1.2.3 From 1f5e6fdd6aec7929e67afad1e42e35d894a119ae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:38 +0200 Subject: net: sched: prefer qdisc_is_empty() over direct qlen access When checking for root qdisc queue length, do not access directly q.qlen. In the following patches we will move back qlen accounting to per CPU values for NOLOCK qdiscs. Instead, prefer the qdisc_is_empty() helper usage. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0aea0e262452..7ecb6127e980 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -747,7 +747,7 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev) struct netdev_queue *txq = netdev_get_tx_queue(dev, i); const struct Qdisc *q = rcu_dereference(txq->qdisc); - if (q->q.qlen) { + if (!qdisc_is_empty(q)) { rcu_read_unlock(); return false; } -- cgit v1.2.3 From 9c01c9f1f2a3ddbddbf3b233cc6bfa86f5a59af0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:39 +0200 Subject: net: sched: always do stats accounting according to TCQ_F_CPUSTATS The core sched implementation checks independently for NOLOCK flag to acquire/release the root spin lock and for qdisc_is_percpu_stats() to account per CPU values in many places. This change update the last few places checking the TCQ_F_NOLOCK to do per CPU stats accounting according to qdisc_is_percpu_stats() value. The above allows to clean dev_requeue_skb() implementation a bit and makes stats update always consistent with a single flag. v1 -> v2: - do not move qdisc_is_empty definition, fix build issue Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7ecb6127e980..ed56474cfe3b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -146,9 +146,14 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } +static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) +{ + return q->flags & TCQ_F_CPUSTATS; +} + static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { - if (qdisc->flags & TCQ_F_NOLOCK) + if (qdisc_is_percpu_stats(qdisc)) return qdisc->empty; return !qdisc->q.qlen; } @@ -490,7 +495,7 @@ static inline u32 qdisc_qlen_sum(const struct Qdisc *q) { u32 qlen = q->qstats.qlen; - if (q->flags & TCQ_F_NOLOCK) + if (qdisc_is_percpu_stats(q)) qlen += atomic_read(&q->q.atomic_qlen); else qlen += q->q.qlen; @@ -817,11 +822,6 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return sch->enqueue(skb, sch, to_free); } -static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) -{ - return q->flags & TCQ_F_CPUSTATS; -} - static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, __u64 bytes, __u32 packets) { @@ -1113,8 +1113,13 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) if (skb) { skb = __skb_dequeue(&sch->gso_skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; + if (qdisc_is_percpu_stats(sch)) { + qdisc_qstats_cpu_backlog_dec(sch, skb); + qdisc_qstats_atomic_qlen_dec(sch); + } else { + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + } } else { skb = sch->dequeue(sch); } -- cgit v1.2.3 From 8a53e616de294873fec1a75ddb77ecb3d225cee0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:40 +0200 Subject: net: sched: when clearing NOLOCK, clear TCQ_F_CPUSTATS, too Since stats updating is always consistent with TCQ_F_CPUSTATS flag, we can disable it at qdisc creation time flipping such bit. In my experiments, if the NOLOCK flag is cleared, per CPU stats accounting does not give any measurable performance gain, but it waste some memory. Let's clear TCQ_F_CPUSTATS together with NOLOCK, when enslaving a NOLOCK qdisc to 'lock' one. Use stats update helper inside pfifo_fast, to cope correctly with TCQ_F_CPUSTATS flag change. As a side effect, q.qlen value for any child qdiscs is always consistent for all lock classfull qdiscs. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ed56474cfe3b..f069011524ba 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1106,6 +1106,32 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) return skb; } +static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, + struct sk_buff *skb) +{ + if (qdisc_is_percpu_stats(sch)) { + qdisc_qstats_cpu_backlog_dec(sch, skb); + qdisc_bstats_cpu_update(sch, skb); + qdisc_qstats_atomic_qlen_dec(sch); + } else { + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + } +} + +static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, + unsigned int pkt_len) +{ + if (qdisc_is_percpu_stats(sch)) { + qdisc_qstats_atomic_qlen_inc(sch); + this_cpu_add(sch->cpu_qstats->backlog, pkt_len); + } else { + sch->qstats.backlog += pkt_len; + sch->q.qlen++; + } +} + /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { -- cgit v1.2.3 From 73eb628ddfd3884d1e58a8022de2e78de7807fc6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:41 +0200 Subject: Revert: "net: sched: put back q.qlen into a single location" This revert commit 46b1c18f9deb ("net: sched: put back q.qlen into a single location"). After the previous patch, when a NOLOCK qdisc is enslaved to a locking qdisc it switches to global stats accounting. As a consequence, when a classful qdisc accesses directly a child qdisc's qlen, such qdisc is not doing per CPU accounting and qlen value is consistent. In the control path nobody uses directly qlen since commit e5f0e8f8e45 ("net: sched: introduce and use qdisc tree flush/purge helpers"), so we can remove the contented atomic ops from the datapath. v1 -> v2: - complete the qdisc_qstats_atomic_qlen_dec() -> qdisc_qstats_cpu_qlen_dec() replacement, fix build issue - more descriptive commit message Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f069011524ba..e8f85cd2afce 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -52,10 +52,7 @@ struct qdisc_size_table { struct qdisc_skb_head { struct sk_buff *head; struct sk_buff *tail; - union { - u32 qlen; - atomic_t atomic_qlen; - }; + __u32 qlen; spinlock_t lock; }; @@ -486,19 +483,27 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } +static inline int qdisc_qlen_cpu(const struct Qdisc *q) +{ + return this_cpu_ptr(q->cpu_qstats)->qlen; +} + static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } -static inline u32 qdisc_qlen_sum(const struct Qdisc *q) +static inline int qdisc_qlen_sum(const struct Qdisc *q) { - u32 qlen = q->qstats.qlen; + __u32 qlen = q->qstats.qlen; + int i; - if (qdisc_is_percpu_stats(q)) - qlen += atomic_read(&q->q.atomic_qlen); - else + if (qdisc_is_percpu_stats(q)) { + for_each_possible_cpu(i) + qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; + } else { qlen += q->q.qlen; + } return qlen; } @@ -889,14 +894,14 @@ static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } -static inline void qdisc_qstats_atomic_qlen_inc(struct Qdisc *sch) +static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) { - atomic_inc(&sch->q.atomic_qlen); + this_cpu_inc(sch->cpu_qstats->qlen); } -static inline void qdisc_qstats_atomic_qlen_dec(struct Qdisc *sch) +static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) { - atomic_dec(&sch->q.atomic_qlen); + this_cpu_dec(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) @@ -1112,7 +1117,7 @@ static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_bstats_cpu_update(sch, skb); - qdisc_qstats_atomic_qlen_dec(sch); + qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); @@ -1124,7 +1129,7 @@ static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, unsigned int pkt_len) { if (qdisc_is_percpu_stats(sch)) { - qdisc_qstats_atomic_qlen_inc(sch); + qdisc_qstats_cpu_qlen_inc(sch); this_cpu_add(sch->cpu_qstats->backlog, pkt_len); } else { sch->qstats.backlog += pkt_len; @@ -1141,7 +1146,7 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) skb = __skb_dequeue(&sch->gso_skb); if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); - qdisc_qstats_atomic_qlen_dec(sch); + qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; -- cgit v1.2.3 From bf8981a2aa082d9d64771b47c8a1c9c388d8cd40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Apr 2019 10:44:06 +0200 Subject: netfilter: nf_nat: merge ip/ip6 masquerade headers Both are now implemented by nf_nat_masquerade.c, so no need to keep different headers. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_nat_masquerade.h | 15 --------------- include/net/netfilter/ipv6/nf_nat_masquerade.h | 11 ----------- include/net/netfilter/nf_nat_masquerade.h | 21 +++++++++++++++++++++ 3 files changed, 21 insertions(+), 26 deletions(-) delete mode 100644 include/net/netfilter/ipv4/nf_nat_masquerade.h delete mode 100644 include/net/netfilter/ipv6/nf_nat_masquerade.h create mode 100644 include/net/netfilter/nf_nat_masquerade.h (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h deleted file mode 100644 index 13d55206bb9f..000000000000 --- a/include/net/netfilter/ipv4/nf_nat_masquerade.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NF_NAT_MASQUERADE_IPV4_H_ -#define _NF_NAT_MASQUERADE_IPV4_H_ - -#include - -unsigned int -nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range2 *range, - const struct net_device *out); - -int nf_nat_masquerade_ipv4_register_notifier(void); -void nf_nat_masquerade_ipv4_unregister_notifier(void); - -#endif /*_NF_NAT_MASQUERADE_IPV4_H_ */ diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h deleted file mode 100644 index 2917bf95c437..000000000000 --- a/include/net/netfilter/ipv6/nf_nat_masquerade.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NF_NAT_MASQUERADE_IPV6_H_ -#define _NF_NAT_MASQUERADE_IPV6_H_ - -unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, - const struct net_device *out); -int nf_nat_masquerade_ipv6_register_notifier(void); -void nf_nat_masquerade_ipv6_unregister_notifier(void); - -#endif /* _NF_NAT_MASQUERADE_IPV6_H_ */ diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h new file mode 100644 index 000000000000..cafe71822a53 --- /dev/null +++ b/include/net/netfilter/nf_nat_masquerade.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NF_NAT_MASQUERADE_H_ +#define _NF_NAT_MASQUERADE_H_ + +#include + +unsigned int +nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct nf_nat_range2 *range, + const struct net_device *out); + +int nf_nat_masquerade_ipv4_register_notifier(void); +void nf_nat_masquerade_ipv4_unregister_notifier(void); + +unsigned int +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, + const struct net_device *out); +int nf_nat_masquerade_ipv6_register_notifier(void); +void nf_nat_masquerade_ipv6_unregister_notifier(void); + +#endif /*_NF_NAT_MASQUERADE_H_ */ -- cgit v1.2.3 From 610a43149cabd0c7aa7bed19cbcf05a0249ab32a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Apr 2019 10:44:08 +0200 Subject: netfilter: nf_nat_masquerade: unify ipv4/6 notifier registration Only reason for having two different register functions was because of ipt_MASQUERADE and ip6t_MASQUERADE being two different modules. Previous patch merged those into xt_MASQUERADE, so we can merge this too. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_masquerade.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h index cafe71822a53..54a14d643c34 100644 --- a/include/net/netfilter/nf_nat_masquerade.h +++ b/include/net/netfilter/nf_nat_masquerade.h @@ -9,13 +9,11 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out); -int nf_nat_masquerade_ipv4_register_notifier(void); -void nf_nat_masquerade_ipv4_unregister_notifier(void); +int nf_nat_masquerade_inet_register_notifiers(void); +void nf_nat_masquerade_inet_unregister_notifiers(void); unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); -int nf_nat_masquerade_ipv6_register_notifier(void); -void nf_nat_masquerade_ipv6_unregister_notifier(void); #endif /*_NF_NAT_MASQUERADE_H_ */ -- cgit v1.2.3 From cc3a86c802f0ba9a2627aef256d95ae3b3fa6e91 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 9 Apr 2019 14:41:12 -0700 Subject: ipv6: Change rt6_probe to take a fib6_nh rt6_probe sends probes for gateways in a nexthop. As such it really depends on a fib6_nh, not a fib entry. Move last_probe to fib6_nh and update rt6_probe to a fib6_nh struct. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 58dbb4e82908..2e9235adfa0d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -127,6 +127,10 @@ struct rt6_exception { struct fib6_nh { struct fib_nh_common nh_common; + +#ifdef CONFIG_IPV6_ROUTER_PREF + unsigned long last_probe; +#endif }; struct fib6_info { @@ -155,10 +159,6 @@ struct fib6_info { struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; -#ifdef CONFIG_IPV6_ROUTER_PREF - unsigned long last_probe; -#endif - u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; -- cgit v1.2.3 From 971502d77faa50a37c89bc6d172450294ad9a5fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Apr 2019 16:36:41 +0200 Subject: bridge: netfilter: unroll NF_HOOK helper in bridge input path Replace NF_HOOK() based invocation of the netfilter hooks with a private copy of nf_hook_slow(). This copy has one difference: it can return the rx handler value expected by the stack, i.e. RX_HANDLER_CONSUMED or RX_HANDLER_PASS. This is needed by the next patch to invoke the ebtables "broute" table via the standard netfilter hooks rather than the custom "br_should_route_hook" indirection that is used now. When the skb is to be "brouted", we must return RX_HANDLER_PASS from the bridge rx input handler, but there is no way to indicate this via NF_HOOK(), unless perhaps by some hack such as exposing bridge_cb in the netfilter core or a percpu flag. text data bss dec filename 3369 56 0 3425 net/bridge/br_input.o.before 3458 40 0 3498 net/bridge/br_input.o.after This allows removal of the "br_should_route_hook" in the next patch. Signed-off-by: Florian Westphal Acked-by: David S. Miller Acked-by: Nikolay Aleksandrov Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index a50a69f5334c..7239105d9d2e 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -119,4 +119,7 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, return queue; } +int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, + const struct nf_hook_entries *entries, unsigned int index, + unsigned int verdict); #endif /* _NF_QUEUE_H */ -- cgit v1.2.3 From 013b96ec64616b57fc631b304dfcecc5bc288f90 Mon Sep 17 00:00:00 2001 From: David Miller Date: Thu, 11 Apr 2019 15:02:07 -0700 Subject: sctp: Pass sk_buff_head explicitly to sctp_ulpq_tail_event(). Now the SKB list implementation assumption can be removed. And now that we know that the list head is always non-NULL we can remove the code blocks dealing with that as well. Signed-off-by: David S. Miller Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/ulpqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index bb0ecba3db2b..f4ac7117ff29 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -59,7 +59,7 @@ void sctp_ulpq_free(struct sctp_ulpq *); int sctp_ulpq_tail_data(struct sctp_ulpq *, struct sctp_chunk *, gfp_t); /* Add a new event for propagation to the ULP. */ -int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sctp_ulpevent *ev); +int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sk_buff_head *skb_list); /* Renege previously received chunks. */ void sctp_ulpq_renege(struct sctp_ulpq *, struct sctp_chunk *, gfp_t); -- cgit v1.2.3 From 9dde27de3e5efa0d032f3c891a0ca833a0d31911 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Apr 2019 17:15:07 +0800 Subject: sctp: implement memory accounting on rx path sk_forward_alloc's updating is also done on rx path, but to be consistent we change to use sk_mem_charge() in sctp_skb_set_owner_r(). In sctp_eat_data(), it's not enough to check sctp_memory_pressure only, which doesn't work for mem_cgroup_sockets_enabled, so we change to use sk_under_memory_pressure(). When it's under memory pressure, sk_mem_reclaim() and sk_rmem_schedule() should be called on both RENEGE or CHUNK DELIVERY path exit the memory pressure status as soon as possible. Note that sk_rmem_schedule() is using datalen to make things easy there. Reported-by: Matteo Croce Tested-by: Matteo Croce Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 1d13ec3f2707..eefdfa5abf6e 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -421,7 +421,7 @@ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) /* * This mimics the behavior of skb_set_owner_r */ - sk->sk_forward_alloc -= event->rmem_len; + sk_mem_charge(sk, event->rmem_len); } /* Tests if the list has one and only one entry. */ -- cgit v1.2.3 From b1d40991506aa9f1de310a2e74ef8e3bec6ba215 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:35:59 -0700 Subject: ipv6: Rename fib6_multipath_select and pass fib6_result Add 'struct fib6_result' to hold the fib entry and fib6_nh from a fib lookup as separate entries, similar to what IPv4 now has with fib_result. Rename fib6_multipath_select to fib6_select_path, pass fib6_result to it, and set f6i and nh in the result once a path selection is done. Call fib6_select_path unconditionally for path selection which means moving the sibling and oif check to fib6_select_path. To handle the two different call paths (2 only call multipath_select if flowi6_oif == 0 and the other always calls it), add a new have_oif_match that controls the sibling walk if relevant. Update callers of fib6_multipath_select accordingly and have them use the fib6_info and fib6_nh from the result. This is needed for multipath nexthop objects where a single f6i can point to multiple fib6_nh (similar to IPv4). Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 13 ++++++++----- include/net/ipv6_stubs.h | 9 ++++----- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2e9235adfa0d..c4d818041663 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -190,6 +190,11 @@ struct rt6_info { unsigned short rt6i_nfheader_len; }; +struct fib6_result { + struct fib6_nh *nh; + struct fib6_info *f6i; +}; + #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ rt = rcu_dereference(rt->fib6_next)) @@ -391,11 +396,9 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int strict); -struct fib6_info *fib6_multipath_select(const struct net *net, - struct fib6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, int strict); - +void fib6_select_path(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool have_oif_match, + const struct sk_buff *skb, int strict); struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 453b55bf6723..5df36d6a2613 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -14,6 +14,7 @@ struct fib6_info; struct fib6_nh; struct fib6_config; +struct fib6_result; /* This is ugly, ideally these symbols should be built * into the core kernel. @@ -34,11 +35,9 @@ struct ipv6_stub { struct fib6_table *table, int oif, struct flowi6 *fl6, int flags); - struct fib6_info *(*fib6_multipath_select)(const struct net *net, - struct fib6_info *f6i, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict); + void (*fib6_select_path)(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool oif_match, + const struct sk_buff *skb, int strict); u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, struct in6_addr *saddr); -- cgit v1.2.3 From b748f26092626332f73e71d75e4390de6b8bdf9b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:36:06 -0700 Subject: ipv6: Pass fib6_result to ip6_mtu_from_fib6 and fib6_mtu Change ip6_mtu_from_fib6 and fib6_mtu to take a fib6_result over a fib6_info. Update both to use the fib6_nh from fib6_result. Since the signature of ip6_mtu_from_fib6 is already changing, add const to daddr and saddr. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 5 +++-- include/net/ipv6_stubs.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 5909fc421305..46bbd8ff9cc6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -302,8 +302,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } -u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr); +u32 ip6_mtu_from_fib6(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr); struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 5df36d6a2613..0d16b9ec0485 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -38,8 +38,9 @@ struct ipv6_stub { void (*fib6_select_path)(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool oif_match, const struct sk_buff *skb, int strict); - u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr); + u32 (*ip6_mtu_from_fib6)(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr); int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, -- cgit v1.2.3 From effda4dd97e878ab83336bec7411cc41b5cc6d37 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:36:10 -0700 Subject: ipv6: Pass fib6_result to fib lookups Change fib6_lookup and fib6_table_lookup to take a fib6_result and set f6i and nh rather than returning a fib6_info. For now both always return 0. A later patch set can make these more like the IPv4 counterparts and return EINVAL, EACCESS, etc based on fib6_type. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 9 +++++---- include/net/ipv6_stubs.h | 11 +++++------ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c4d818041663..cb3277cd1413 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -389,12 +389,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, /* called with rcu lock held; can return error pointer * caller needs to select path */ -struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags); +int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags); /* called with rcu lock held; caller needs to select path */ -struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int strict); +int fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, struct fib6_result *res, + int strict); void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 0d16b9ec0485..6c0c4fde16f8 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -29,12 +29,11 @@ struct ipv6_stub { int (*ipv6_route_input)(struct sk_buff *skb); struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); - struct fib6_info *(*fib6_lookup)(struct net *net, int oif, - struct flowi6 *fl6, int flags); - struct fib6_info *(*fib6_table_lookup)(struct net *net, - struct fib6_table *table, - int oif, struct flowi6 *fl6, - int flags); + int (*fib6_lookup)(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags); + int (*fib6_table_lookup)(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags); void (*fib6_select_path)(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool oif_match, const struct sk_buff *skb, int strict); -- cgit v1.2.3 From 7d21fec90438941b44b699ae73673d2f8a3a9d76 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:36:11 -0700 Subject: ipv6: Add fib6_type and fib6_flags to fib6_result Add the fib6_flags and fib6_type to fib6_result. Update the lookup helpers to set them and update post fib lookup users to use the version from the result. This allows nexthop objects to have blackhole nexthop. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index cb3277cd1413..6b7557b71c8c 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -193,6 +193,8 @@ struct rt6_info { struct fib6_result { struct fib6_nh *nh; struct fib6_info *f6i; + u32 fib6_flags; + u8 fib6_type; }; #define for_each_fib6_node_rt_rcu(fn) \ -- cgit v1.2.3 From b8fb1ab46169ac016a8552a6455bb0bfc401f8e2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 17:31:43 -0700 Subject: net ipv6: Prevent neighbor add if protocol is disabled on device Disabling IPv6 on an interface removes existing entries but nothing prevents new entries from being manually added. To that end, add a new neigh_table operation, allow_add, that is called on RTM_NEWNEIGH to see if neighbor entries are allowed on a given device. If IPv6 is disabled on the device, allow_add returns false and passes a message back to the user via extack. $ echo 1 > /proc/sys/net/ipv6/conf/eth1/disable_ipv6 $ ip -6 neigh add fe80::4c88:bff:fe21:2704 dev eth1 lladdr de:ad:be:ef:01:01 Error: IPv6 is disabled on this device. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3e5438bd0101..50a67bd6a434 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -205,6 +205,8 @@ struct neigh_table { int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); + bool (*allow_add)(const struct net_device *dev, + struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; -- cgit v1.2.3 From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 17 Apr 2019 16:35:49 -0400 Subject: ipv6: Add rate limit mask for ICMPv6 messages To make ICMPv6 closer to ICMPv4, add ratemask parameter. Since the ICMP message types use larger numeric values, a simple bitmask doesn't fit. I use large bitmap. The input and output are the in form of list of ranges. Set the default to rate limit all error messages but Packet Too Big. For Packet Too Big, use ratemask instead of hard-coded. There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow() aren't called. This patch only adds them to icmpv6_echo_reply(). Rate limiting error messages is mandated by RFC 4443 but RFC 4890 says that it is also acceptable to rate limit informational messages. Thus, I removed the current hard-coded behavior of icmpv6_mask_allow() that doesn't rate limit informational messages. v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL isn't defined, expand the description in ip-sysctl.txt and remove unnecessary conditional before kfree(). v3: Inline the bitmap instead of dynamically allocated. Still is a pointer to it is needed because of the way proc_do_large_bitmap work. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 64e29b58bb5e..5e61b5a8635d 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -8,6 +8,7 @@ #ifndef __NETNS_IPV6_H__ #define __NETNS_IPV6_H__ #include +#include struct ctl_table_header; @@ -35,6 +36,8 @@ struct netns_sysctl_ipv6 { int icmpv6_echo_ignore_all; int icmpv6_echo_ignore_multicast; int icmpv6_echo_ignore_anycast; + DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); + unsigned long *icmpv6_ratemask_ptr; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; -- cgit v1.2.3 From c7cbdbf29f488a19982cd9f4a109887f18028bbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:51:48 +0200 Subject: net: rework SIOCGSTAMP ioctl handling The SIOCGSTAMP/SIOCGSTAMPNS ioctl commands are implemented by many socket protocol handlers, and all of those end up calling the same sock_get_timestamp()/sock_get_timestampns() helper functions, which results in a lot of duplicate code. With the introduction of 64-bit time_t on 32-bit architectures, this gets worse, as we then need four different ioctl commands in each socket protocol implementation. To simplify that, let's add a new .gettstamp() operation in struct proto_ops, and move ioctl implementation into the common sock_ioctl()/compat_sock_ioctl_trans() functions that these all go through. We can reuse the sock_get_timestamp() implementation, but generalize it so it can deal with both native and compat mode, as well as timeval and timespec structures. Acked-by: Stefan Schmidt Acked-by: Neil Horman Acked-by: Marc Kleine-Budde Link: https://lore.kernel.org/lkml/CAK8P3a038aDQQotzua_QtKGhq8O9n+rdiz2=WDCp82ys8eUT+A@mail.gmail.com/ Signed-off-by: Arnd Bergmann Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/compat.h | 3 --- include/net/sock.h | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/compat.h b/include/net/compat.h index 4c6d75612b6c..f277653c7e17 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -30,9 +30,6 @@ struct compat_cmsghdr { compat_int_t cmsg_type; }; -int compat_sock_get_timestamp(struct sock *, struct timeval __user *); -int compat_sock_get_timestampns(struct sock *, struct timespec __user *); - #else /* defined(CONFIG_COMPAT) */ /* * To avoid compiler warnings: diff --git a/include/net/sock.h b/include/net/sock.h index bdd77bbce7d8..784cd19d5ff7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1614,6 +1614,8 @@ int sock_setsockopt(struct socket *sock, int level, int op, int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, @@ -2503,8 +2505,6 @@ static inline bool sk_listener(const struct sock *sk) } void sock_enable_timestamp(struct sock *sk, int flag); -int sock_get_timestamp(struct sock *, struct timeval __user *); -int sock_get_timestampns(struct sock *, struct timespec __user *); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); -- cgit v1.2.3 From 4e54507ab1a9da05238b986292f6cb702e6696c7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 21 Apr 2019 08:49:01 -0700 Subject: ipv6: Simplify rt6_qualify_for_ecmp After commit c7a1ce397ada ("ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create"), the gateway is no longer filled in for fib6_nh structs in a prefix route. Accordingly, the RTF_ADDRCONF flag check can be dropped from the 'rt6_qualify_for_ecmp'. Further, RTF_DYNAMIC is only set in rt6_info instances, so it can be removed from the check as well. This reduces rt6_qualify_for_ecmp and the mlxsw version to just checking if the nexthop has a gateway which is the real indication of whether entries can be coalesced into a multipath route. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 46bbd8ff9cc6..518d97fbe074 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) && - f6i->fib6_nh.fib_nh_gw_family; + return f6i->fib6_nh.fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3 From be659b8d3c79afc54e087ebf8d849685d7b0d395 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 21 Apr 2019 17:39:18 -0700 Subject: ipv6: Restore RTF_ADDRCONF check in rt6_qualify_for_ecmp The RTF_ADDRCONF flag filters out routes added by RA's in determining which routes can be appended to an existing one to create a multipath route. Restore the flag check and add a comment to document the RA piece. Fixes: 4e54507ab1a9 ("ipv6: Simplify rt6_qualify_for_ecmp") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 518d97fbe074..df9cebc2b20c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,7 +68,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return f6i->fib6_nh.fib_nh_gw_family; + /* the RTF_ADDRCONF flag filters out RA's */ + return !(f6i->fib6_flags & RTF_ADDRCONF) && + f6i->fib6_nh.fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3 From 0b13c9bb96f6edf03018030f07929a16b4dd77a6 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 21 Apr 2019 00:50:42 +0900 Subject: include/net/tcp.h: whitespace cleanup at tcp_v4_check This patch makes trivial whitespace fix to the function tcp_v4_check at include/net/tcp.h file. It has stylistic issue, which is "space required after that ','" and it can be confirmed with ./scripts/checkpatch.pl tool. ERROR: space required after that ',' (ctx:VxV) #29: FILE: include/net/tcp.h:1317: + return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); ^ Signed-off-by: Daniel T. Lee Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 68ee02523b87..7cf1181630a3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1314,7 +1314,7 @@ static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { - return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); + return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) -- cgit v1.2.3 From 7e5f4cdb284be5ff862f84ccda084e2847f73fbb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 20 Apr 2019 09:27:27 -0700 Subject: ipv6: Remove fib6_info_nh_lwt fib6_info_nh_lwt is no longer used; remove it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6b7557b71c8c..352f767bea81 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -450,12 +450,6 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); -static inline -struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) -{ - return f6i->fib6_nh.fib_nh_lws; -} - void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); -- cgit v1.2.3 From 3c618c1dbb8859625c643121ac80af9a6723533f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 20 Apr 2019 09:28:20 -0700 Subject: net: Rename net/nexthop.h net/rtnh.h The header contains rtnh_ macros so rename the file accordingly. Allows a later patch to use the nexthop.h name for the new nexthop code. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 34 ---------------------------------- include/net/rtnh.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) delete mode 100644 include/net/nexthop.h create mode 100644 include/net/rtnh.h (limited to 'include/net') diff --git a/include/net/nexthop.h b/include/net/nexthop.h deleted file mode 100644 index 902ff382a6dc..000000000000 --- a/include/net/nexthop.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NET_NEXTHOP_H -#define __NET_NEXTHOP_H - -#include -#include - -static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining) -{ - return remaining >= (int)sizeof(*rtnh) && - rtnh->rtnh_len >= sizeof(*rtnh) && - rtnh->rtnh_len <= remaining; -} - -static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh, - int *remaining) -{ - int totlen = NLA_ALIGN(rtnh->rtnh_len); - - *remaining -= totlen; - return (struct rtnexthop *) ((char *) rtnh + totlen); -} - -static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh) -{ - return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh))); -} - -static inline int rtnh_attrlen(const struct rtnexthop *rtnh) -{ - return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh)); -} - -#endif diff --git a/include/net/rtnh.h b/include/net/rtnh.h new file mode 100644 index 000000000000..aa2cfc508f7c --- /dev/null +++ b/include/net/rtnh.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_RTNH_H +#define __NET_RTNH_H + +#include +#include + +static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining) +{ + return remaining >= (int)sizeof(*rtnh) && + rtnh->rtnh_len >= sizeof(*rtnh) && + rtnh->rtnh_len <= remaining; +} + +static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh, + int *remaining) +{ + int totlen = NLA_ALIGN(rtnh->rtnh_len); + + *remaining -= totlen; + return (struct rtnexthop *) ((char *) rtnh + totlen); +} + +static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh) +{ + return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh))); +} + +static inline int rtnh_attrlen(const struct rtnexthop *rtnh) +{ + return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh)); +} + +#endif -- cgit v1.2.3 From a79eda3aaf30bc73752487fff5160cf67e99e313 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:42 -0400 Subject: net: psample: drop include of module.h from psample.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. There doesn't appear to be anything in psample.h that is module related, and build coverage doesn't appear to show any other files/drivers relying implicitly on getting it from here. So it appears we are simply free to just remove it in this case. Cc: Yotam Gigi Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/psample.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/psample.h b/include/net/psample.h index 9b80f814ab04..37a4df2325b2 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -3,7 +3,6 @@ #define __NET_PSAMPLE_H #include -#include #include struct psample_group { -- cgit v1.2.3 From c517796ea91d11dd3f0ae7ff61a12fe5d4941eb0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:43 -0400 Subject: net: ife: drop include of module.h from net/ife.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. There doesn't appear to be anything in net/ife.h that is module related, and build coverage doesn't appear to show any other files/drivers relying implicitly on getting it from here. So it appears we are simply free to just remove it in this case. Cc: Yotam Gigi Cc: Jamal Hadi Salim Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/ife.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ife.h b/include/net/ife.h index e117617e3c34..7e2538d8585b 100644 --- a/include/net/ife.h +++ b/include/net/ife.h @@ -4,7 +4,6 @@ #include #include -#include #include #if IS_ENABLED(CONFIG_NET_IFE) -- cgit v1.2.3 From 113e63286697893127c3ee83471b45ad0cf8d75f Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:44 -0400 Subject: net: fib: drop include of module.h from fib_notifier.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. Since fib_notifier.h is not going into a module struct looking for specific fields, we can just let it know that module is a struct, just like about 60 other include/linux headers already do. Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index c91ec732afd6..c49d7bfb5c30 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -2,10 +2,11 @@ #define __NET_FIB_NOTIFIER_H #include -#include #include #include +struct module; + struct fib_notifier_info { struct net *net; int family; -- cgit v1.2.3 From a130f9b27545f56880034c345da9a4efc2ba2b24 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:45 -0400 Subject: net: tc_act: drop include of module.h from tc_ife.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. Since tc_ife.h is not going into a module struct looking for specific fields, we can just let it know that module is a struct, just like about 60 other include/linux headers already do. Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 86d13b01b39d..c7f24a2da1ca 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -5,7 +5,8 @@ #include #include #include -#include + +struct module; struct tcf_ife_params { u8 eth_dst[ETH_ALEN]; -- cgit v1.2.3 From f2ad1a522e9817fba7799008e0a8dc6f8a32bf7d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 22 Apr 2019 12:08:39 +0000 Subject: net: devlink: Add extack to shared buffer operations Add extack to shared buffer set operations, so that meaningful error messages could be propagated to the user. Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Reviewed-by: Petr Machata Cc: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 70c7d1ac8344..4f5e41613503 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -491,13 +491,14 @@ struct devlink_ops { struct devlink_sb_pool_info *pool_info); int (*sb_pool_set)(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, - enum devlink_sb_threshold_type threshold_type); + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack); int (*sb_port_pool_get)(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*sb_port_pool_set)(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, - u32 threshold); + u32 threshold, struct netlink_ext_ack *extack); int (*sb_tc_pool_bind_get)(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, @@ -507,7 +508,8 @@ struct devlink_ops { unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, - u16 pool_index, u32 threshold); + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack); int (*sb_occ_snapshot)(struct devlink *devlink, unsigned int sb_index); int (*sb_occ_max_clear)(struct devlink *devlink, -- cgit v1.2.3 From f24ea52873c726bf7b54318f00ec45050222b367 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:37 +0200 Subject: xfrm: remove tos indirection from afinfo_policy Only used by ipv4, we can read the fl4 tos value directly instead. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 77eb578a0384..652da5861772 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -329,7 +329,6 @@ struct xfrm_policy_afinfo { void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); - int (*get_tos)(const struct flowi *fl); int (*init_path)(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len); -- cgit v1.2.3 From 2e8b4aa816eaaf480fe68b1086614259caf1bf3c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:38 +0200 Subject: xfrm: remove init_path indirection from afinfo_policy handle this directly, its only used by ipv6. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 652da5861772..b8de1622141a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -329,9 +329,6 @@ struct xfrm_policy_afinfo { void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); - int (*init_path)(struct xfrm_dst *path, - struct dst_entry *dst, - int nfheader_len); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); -- cgit v1.2.3 From c53ac41e3720926301c623d6682bb87ce992a3b3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:39 +0200 Subject: xfrm: remove decode_session indirection from afinfo_policy No external dependencies, might as well handle this directly. xfrm_afinfo_policy is now 40 bytes on x86_64. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b8de1622141a..18d6b33501b9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -326,9 +326,6 @@ struct xfrm_policy_afinfo { xfrm_address_t *saddr, xfrm_address_t *daddr, u32 mark); - void (*decode_session)(struct sk_buff *skb, - struct flowi *fl, - int reverse); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); -- cgit v1.2.3 From bb9cd077e216b886438c5698e1cd75f762ecd3c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 17 Apr 2019 11:45:13 +0200 Subject: xfrm: remove unneeded export_symbols None of them have any external callers, make them static. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 18d6b33501b9..eb5018b1cf9c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); -int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); @@ -1584,7 +1583,6 @@ int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); void xfrm6_local_error(struct sk_buff *skb, u32 mtu); -int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family); -- cgit v1.2.3 From 089b19a9204fc090793d389a265f54124eacb05d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:44 -0700 Subject: flow_dissector: switch kernel context to struct bpf_flow_dissector struct bpf_flow_dissector has a small subset of sk_buff fields that flow dissector BPF program is allowed to access and an optional pointer to real skb. Real skb is used only in bpf_skb_load_bytes helper to read non-linear data. The real motivation for this is to be able to call flow dissector from eth_get_headlen context where we don't have an skb and need to dissect raw bytes. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/net/flow_dissector.h | 7 +++++++ include/net/sch_generic.h | 11 ++++------- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 2b26979efb48..7c5a8d9a8d2a 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -305,4 +305,11 @@ static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissec return ((char *)target_container) + flow_dissector->offset[key_id]; } +struct bpf_flow_dissector { + struct bpf_flow_keys *flow_keys; + const struct sk_buff *skb; + void *data; + void *data_end; +}; + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e8f85cd2afce..21f434f3ac9e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -364,13 +364,10 @@ struct tcf_proto { }; struct qdisc_skb_cb { - union { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; - struct bpf_flow_keys *flow_keys; + struct { + unsigned int pkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; -- cgit v1.2.3 From f05713e0916ca46f127641b6afa74bd1a0772423 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Apr 2019 18:35:03 -0700 Subject: ipv6: convert fib6_ref to refcount_t We suspect some issues involving fib6_ref 0 -> 1 transitions might cause strange syzbot reports. Lets convert fib6_ref to refcount_t to catch them earlier. Signed-off-by: Eric Dumazet Cc: Wei Wang Acked-by: Wei Wang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 352f767bea81..5a4a67b38712 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -146,7 +146,7 @@ struct fib6_info { struct list_head fib6_siblings; unsigned int fib6_nsiblings; - atomic_t fib6_ref; + refcount_t fib6_ref; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] @@ -284,17 +284,17 @@ void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { - atomic_inc(&f6i->fib6_ref); + refcount_inc(&f6i->fib6_ref); } static inline bool fib6_info_hold_safe(struct fib6_info *f6i) { - return atomic_inc_not_zero(&f6i->fib6_ref); + return refcount_inc_not_zero(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } -- cgit v1.2.3 From ffa8ce54be3aaf6b15abae3bbd08282b867d3a4f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 08:23:41 -0700 Subject: lwtunnel: Pass encap and encap type attributes to lwtunnel_fill_encap Currently, lwtunnel_fill_encap hardcodes the encap and encap type attributes as RTA_ENCAP and RTA_ENCAP_TYPE, respectively. The nexthop objects want to re-use this code but the encap attributes passed to userspace as NHA_ENCAP and NHA_ENCAP_TYPE. Since that is the only difference, change lwtunnel_fill_encap to take the attribute type as an input. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 671113bcb2cc..5d6c5b1fc695 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -118,8 +118,8 @@ int lwtunnel_build_state(u16 encap_type, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack); -int lwtunnel_fill_encap(struct sk_buff *skb, - struct lwtunnel_state *lwtstate); +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); @@ -219,7 +219,8 @@ static inline int lwtunnel_build_state(u16 encap_type, } static inline int lwtunnel_fill_encap(struct sk_buff *skb, - struct lwtunnel_state *lwtstate) + struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr) { return 0; } -- cgit v1.2.3 From ecc5663cce8c7d7e4eba32af4e1e3cab296c64b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 08:48:09 -0700 Subject: net: Change nhc_flags to unsigned char nhc_flags holds the RTNH_F flags for a given nexthop (fib{6}_nh). All of the RTNH_F_ flags fit in an unsigned char, and since the API to userspace (rtnh_flags and lower byte of rtm_flags) is 1 byte it can not grow. Make nhc_flags in fib_nh_common an unsigned char and shrink the size of the struct by 8, from 56 to 48 bytes. Update the flags arguments for up netdevice events and fib_nexthop_info which determines the RTNH_F flags to return on a dump/event. The RTNH_F flags are passed in the lower byte of rtm_flags which is an unsigned int so use a temp variable for the flags to fib_nexthop_info and combine with rtm_flags in the caller. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- include/net/ip_fib.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index df9cebc2b20c..4790beaa86e0 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -182,7 +182,7 @@ int rt6_dump_route(struct fib6_info *f6i, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); -void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); +void rt6_sync_up(struct net_device *dev, unsigned char nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); void rt6_multipath_rebalance(struct fib6_info *f6i); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index d8195c77e247..772a9e61bd84 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -83,11 +83,11 @@ struct fnhe_hash_bucket { struct fib_nh_common { struct net_device *nhc_dev; int nhc_oif; - unsigned int nhc_flags; - struct lwtunnel_state *nhc_lwtstate; unsigned char nhc_scope; u8 nhc_family; u8 nhc_gw_family; + unsigned char nhc_flags; + struct lwtunnel_state *nhc_lwtstate; union { __be32 ipv4; @@ -425,7 +425,7 @@ int fib_unmerge(struct net *net); int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); -int fib_sync_up(struct net_device *dev, unsigned int nh_flags); +int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -500,7 +500,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct netlink_callback *cb); int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, - unsigned int *flags, bool skip_oif); + unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, int nh_weight); #endif /* _NET_FIB_H */ -- cgit v1.2.3 From a65120bae4b7425a39c5783aa3d4fc29677eef0e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 18:05:33 -0700 Subject: ipv6: Use result arg in fib_lookup_arg consistently arg.result is sometimes used as fib6_result and sometimes used to hold the rt6_info. Add rt6_info to fib6_result and make the use of arg.result consistent through ipv6 rules. The rt6 entry is filled in for lookups returning a dst_entry, but not for direct fib_lookups that just want a fib6_info. Fixes: effda4dd97e8 ("ipv6: Pass fib6_result to fib lookups") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5a4a67b38712..40105738e2f6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -195,6 +195,7 @@ struct fib6_result { struct fib6_info *f6i; u32 fib6_flags; u8 fib6_type; + struct rt6_info *rt6; }; #define for_each_fib6_node_rt_rcu(fn) \ -- cgit v1.2.3 From d5bb334a8e171b262e48f378bd2096c0ea458265 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 24 Apr 2019 22:19:17 +0200 Subject: Bluetooth: Align minimum encryption key size for LE and BR/EDR connections The minimum encryption key size for LE connections is 56 bits and to align LE with BR/EDR, enforce 56 bits of minimum encryption key size for BR/EDR connections as well. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg Cc: stable@vger.kernel.org --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 094e61e07030..05b1b96f4d9e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -190,6 +190,9 @@ struct adv_info { #define HCI_MAX_SHORT_NAME_LENGTH 10 +/* Min encryption key size to match with SMP */ +#define HCI_MIN_ENC_KEY_SIZE 7 + /* Default LE RPA expiry time, 15 minutes */ #define HCI_DEFAULT_RPA_TIMEOUT (15 * 60) -- cgit v1.2.3 From f7dacfb11475ba777e1e84ccec2e14b0ba5a17a3 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Fri, 15 Mar 2019 17:39:03 +0200 Subject: cfg80211: support non-inheritance element Subelement profile may specify element IDs it doesn't inherit from the management frame. Support it. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70432fd638af..777c4f021610 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5534,6 +5534,14 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid, u64_to_ether_addr(new_bssid_u64, new_bssid); } +/** + * cfg80211_is_element_inherited - returns if element ID should be inherited + * @element: element to check + * @non_inherit_element: non inheritance element + */ +bool cfg80211_is_element_inherited(const struct element *element, + const struct element *non_inherit_element); + /** * enum cfg80211_bss_frame_type - frame type that the BSS data came from * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is -- cgit v1.2.3 From fe806e4992c9047affd263bcc13b2c047029a726 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Fri, 15 Mar 2019 17:39:05 +0200 Subject: cfg80211: support profile split between elements Since an element is limited to 255 octets, a profile may be split split to several elements. Support the split as defined in the 11ax draft 3. Detect legacy split and print a net-rate limited warning, since there is no ROI in supporting this probably non-existent split. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 777c4f021610..f6665f8eba5a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5542,6 +5542,20 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid, bool cfg80211_is_element_inherited(const struct element *element, const struct element *non_inherit_element); +/** + * cfg80211_merge_profile - merges a MBSSID profile if it is split between IEs + * @ie: ies + * @ielen: length of IEs + * @mbssid_elem: current MBSSID element + * @sub_elem: current MBSSID subelement (profile) + * @merged_ie: location of the merged profile + * @max_copy_len: max merged profile length + */ +size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, + const struct element *mbssid_elem, + const struct element *sub_elem, + u8 **merged_ie, size_t max_copy_len); + /** * enum cfg80211_bss_frame_type - frame type that the BSS data came from * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is -- cgit v1.2.3 From f2af2df800d3648b1d68e02d5b8a5d77cfee8970 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 16 Mar 2019 18:06:32 +0100 Subject: mac80211: calculate hash for fq without holding fq->lock in itxq enqueue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces lock contention on enqueue/dequeue of iTXQ packets Signed-off-by: Felix Fietkau Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index be7c0fab3478..2caa86660ab0 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -107,21 +107,23 @@ begin: return skb; } +static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) +{ + u32 hash = skb_get_hash_perturb(skb, fq->perturbation); + + return reciprocal_scale(hash, fq->flows_cnt); +} + static struct fq_flow *fq_flow_classify(struct fq *fq, - struct fq_tin *tin, + struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_flow_get_default_t get_default_func) { struct fq_flow *flow; - u32 hash; - u32 idx; lockdep_assert_held(&fq->lock); - hash = skb_get_hash_perturb(skb, fq->perturbation); - idx = reciprocal_scale(hash, fq->flows_cnt); flow = &fq->flows[idx]; - if (flow->tin && flow->tin != tin) { flow = get_default_func(fq, tin, idx, skb); tin->collisions++; @@ -153,7 +155,7 @@ static void fq_recalc_backlog(struct fq *fq, } static void fq_tin_enqueue(struct fq *fq, - struct fq_tin *tin, + struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_skb_free_t free_func, fq_flow_get_default_t get_default_func) @@ -163,7 +165,7 @@ static void fq_tin_enqueue(struct fq *fq, lockdep_assert_held(&fq->lock); - flow = fq_flow_classify(fq, tin, skb, get_default_func); + flow = fq_flow_classify(fq, tin, idx, skb, get_default_func); flow->tin = tin; flow->backlog += skb->len; -- cgit v1.2.3 From 6cdd3979a2bdc16116c5b2eb09475abf54ba9e70 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Tue, 19 Mar 2019 21:34:07 +0100 Subject: nl80211/cfg80211: Extended Key ID support Add support for IEEE 802.11-2016 "Extended Key ID for Individually Addressed Frames". Extend cfg80211 and nl80211 to allow pairwise keys to be installed for Rx only, enable Tx separately and allow Key ID 1 for pairwise keys. Signed-off-by: Alexander Wetzel [use NLA_POLICY_RANGE() for NL80211_KEY_MODE] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f6665f8eba5a..2b039802ae2e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -485,6 +485,7 @@ struct vif_params { * with the get_key() callback, must be in little endian, * length given by @seq_len. * @seq_len: length of @seq. + * @mode: key install mode (RX_TX, NO_TX or SET_TX) */ struct key_params { const u8 *key; @@ -492,6 +493,7 @@ struct key_params { int key_len; int seq_len; u32 cipher; + enum nl80211_key_mode mode; }; /** -- cgit v1.2.3 From 96fc6efb9ad9d0cd8cbb4462f0eb2a07092649e6 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Tue, 19 Mar 2019 21:34:08 +0100 Subject: mac80211: IEEE 802.11 Extended Key ID support Add support for Extended Key ID as defined in IEEE 802.11-2016. - Implement the nl80211 API for Extended Key ID - Extend mac80211 API to allow drivers to support Extended Key ID - Enable Extended Key ID by default for drivers only supporting SW crypto (e.g. mac80211_hwsim) - Allow unicast Tx usage to be supressed (IEEE80211_KEY_FLAG_NO_AUTO_TX) - Select the decryption key based on the MPDU keyid - Enforce existing assumptions in the code that rekeys don't change the cipher Signed-off-by: Alexander Wetzel [remove module parameter] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ac2ed8ec662b..c10abca55fde 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1697,6 +1697,7 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for * a TKIP key if it only requires MIC space. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. + * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), @@ -1708,6 +1709,7 @@ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_RX_MGMT = BIT(6), IEEE80211_KEY_FLAG_RESERVE_TAILROOM = BIT(7), IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), + IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), }; /** @@ -2243,6 +2245,9 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID * only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set. * + * @IEEE80211_HW_EXT_KEY_ID_NATIVE: Driver and hardware are supporting Extended + * Key ID and can handle two unicast keys per station for Rx and Tx. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2294,6 +2299,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN, IEEE80211_HW_SUPPORTS_MULTI_BSSID, IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, + IEEE80211_HW_EXT_KEY_ID_NATIVE, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From e96d1cd2635c05efdd01b4eafcfc50c22c40751f Mon Sep 17 00:00:00 2001 From: Ashok Raj Nagarajan Date: Fri, 29 Mar 2019 16:18:21 +0530 Subject: cfg80211: Add support to set tx power for a station associated This patch adds support to set transmit power setting type and transmit power level attributes to NL80211_CMD_SET_STATION in order to facilitate adjusting the transmit power level of a station associated to the AP. The added attributes allow selection of automatic and limited transmit power level, with the level defined in dBm format. Co-developed-by: Balaji Pothunoori Signed-off-by: Ashok Raj Nagarajan Signed-off-by: Balaji Pothunoori Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2b039802ae2e..2ea04e94b522 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -975,6 +975,27 @@ enum station_parameters_apply_mask { STATION_PARAM_APPLY_UAPSD = BIT(0), STATION_PARAM_APPLY_CAPABILITY = BIT(1), STATION_PARAM_APPLY_PLINK_STATE = BIT(2), + STATION_PARAM_APPLY_STA_TXPOWER = BIT(3), +}; + +/** + * struct sta_txpwr - station txpower configuration + * + * Used to configure txpower for station. + * + * @power: tx power (in dBm) to be used for sending data traffic. If tx power + * is not provided, the default per-interface tx power setting will be + * overriding. Driver should be picking up the lowest tx power, either tx + * power per-interface or per-station. + * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power + * will be less than or equal to specified from userspace, whereas if TPC + * %type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power. + * NL80211_TX_POWER_FIXED is not a valid configuration option for + * per peer TPC. + */ +struct sta_txpwr { + s16 power; + enum nl80211_tx_power_setting type; }; /** @@ -1049,6 +1070,7 @@ struct station_parameters { const struct ieee80211_he_cap_elem *he_capa; u8 he_capa_len; u16 airtime_weight; + struct sta_txpwr txpwr; }; /** -- cgit v1.2.3 From ba905bf432f662cb907fd692a4f160e612c0408b Mon Sep 17 00:00:00 2001 From: Ashok Raj Nagarajan Date: Fri, 29 Mar 2019 16:19:09 +0530 Subject: mac80211: store tx power value from user to station This patch introduce a new driver callback drv_sta_set_txpwr. This API will copy the transmit power value passed from user space and call the driver callback to set the tx power for the station. Co-developed-by: Balaji Pothunoori Signed-off-by: Ashok Raj Nagarajan Signed-off-by: Balaji Pothunoori Signed-off-by: Johannes Berg --- include/net/mac80211.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c10abca55fde..d66fbfe8d55d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1889,6 +1889,24 @@ struct ieee80211_sta_rates { } rate[IEEE80211_TX_RATE_TABLE_SIZE]; }; +/** + * struct ieee80211_sta_txpwr - station txpower configuration + * + * Used to configure txpower for station. + * + * @power: indicates the tx power, in dBm, to be used when sending data frames + * to the STA. + * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power + * will be less than or equal to specified from userspace, whereas if TPC + * %type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power. + * NL80211_TX_POWER_FIXED is not a valid configuration option for + * per peer TPC. + */ +struct ieee80211_sta_txpwr { + s16 power; + enum nl80211_tx_power_setting type; +}; + /** * struct ieee80211_sta - station table entry * @@ -1975,6 +1993,7 @@ struct ieee80211_sta { bool support_p2p_ps; u16 max_rc_amsdu_len; u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; + struct ieee80211_sta_txpwr txpwr; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; @@ -3800,6 +3819,9 @@ struct ieee80211_ops { #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); + int (*sta_set_txpwr)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta); int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, -- cgit v1.2.3 From 5809a5d54bb9eda3a388b5a712657970c2cb9f8e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 11 Apr 2019 11:59:50 +0300 Subject: cfg80211: don't pass pointer to pointer unnecessarily The cfg80211_merge_profile() and ieee802_11_find_bssid_profile() are a bit cleaner if we just pass the merged_ie pointer instead of a pointer to the pointer. This isn't a functional change, it's just a clean up. Signed-off-by: Dan Carpenter Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2ea04e94b522..944de1802210 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5578,7 +5578,7 @@ bool cfg80211_is_element_inherited(const struct element *element, size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem, - u8 **merged_ie, size_t max_copy_len); + u8 *merged_ie, size_t max_copy_len); /** * enum cfg80211_bss_frame_type - frame type that the BSS data came from -- cgit v1.2.3 From 5ab92e7fe49ad74293b50fb9e6f25be5521e2f68 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 11 Apr 2019 13:47:24 -0700 Subject: cfg80211: add support to probe unexercised mesh link Adding support to allow mesh HWMP to measure link metrics on unexercised direct mesh path by sending some data frames to other mesh points which are not currently selected as a primary traffic path but only 1 hop away. The absence of the primary path to the chosen node makes it necessary to apply some form of marking on a chosen packet stream so that the packets can be properly steered to the selected node for testing, and not by the regular mesh path lookup. Tested-by: Pradeep Kumar Chitrapu Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 944de1802210..298301525f9f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3501,6 +3501,9 @@ struct cfg80211_update_owe_info { * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME * but offloading OWE processing to the user space will get the updated * DH IE through this interface. + * + * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame + * and overrule HWMP path selection algorithm. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3817,6 +3820,8 @@ struct cfg80211_ops { struct cfg80211_pmsr_request *request); int (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_update_owe_info *owe_info); + int (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len); }; /* -- cgit v1.2.3 From 8828f81ad4a2f4e89ebe6e7793c06ed767c31d53 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 11 Apr 2019 13:47:26 -0700 Subject: mac80211: probe unexercised mesh links The requirement for mesh link metric refreshing, is that from one mesh point we be able to send some data frames to other mesh points which are not currently selected as a primary traffic path, but which are only 1 hop away. The absence of the primary path to the chosen node makes it necessary to apply some form of marking on a chosen packet stream so that the packets can be properly steered to the selected node for testing, and not by the regular mesh path lookup. Tested-by: Pradeep Kumar Chitrapu Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d66fbfe8d55d..76a443f32fc8 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -807,6 +807,7 @@ enum mac80211_tx_info_flags { * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path + * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup * * These flags are used in tx_info->control.flags. */ @@ -816,6 +817,7 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_RATE_INJECT = BIT(2), IEEE80211_TX_CTRL_AMSDU = BIT(3), IEEE80211_TX_CTRL_FAST_XMIT = BIT(4), + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP = BIT(5), }; /* -- cgit v1.2.3 From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 26 Apr 2019 16:39:39 -0700 Subject: bpf: Introduce bpf sk local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After allowing a bpf prog to - directly read the skb->sk ptr - get the fullsock bpf_sock by "bpf_sk_fullsock()" - get the bpf_tcp_sock by "bpf_tcp_sock()" - get the listener sock by "bpf_get_listener_sock()" - avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock" into different bpf running context. this patch is another effort to make bpf's network programming more intuitive to do (together with memory and performance benefit). When bpf prog needs to store data for a sk, the current practice is to define a map with the usual 4-tuples (src/dst ip/port) as the key. If multiple bpf progs require to store different sk data, multiple maps have to be defined. Hence, wasting memory to store the duplicated keys (i.e. 4 tuples here) in each of the bpf map. [ The smallest key could be the sk pointer itself which requires some enhancement in the verifier and it is a separate topic. ] Also, the bpf prog needs to clean up the elem when sk is freed. Otherwise, the bpf map will become full and un-usable quickly. The sk-free tracking currently could be done during sk state transition (e.g. BPF_SOCK_OPS_STATE_CB). The size of the map needs to be predefined which then usually ended-up with an over-provisioned map in production. Even the map was re-sizable, while the sk naturally come and go away already, this potential re-size operation is arguably redundant if the data can be directly connected to the sk itself instead of proxy-ing through a bpf map. This patch introduces sk->sk_bpf_storage to provide local storage space at sk for bpf prog to use. The space will be allocated when the first bpf prog has created data for this particular sk. The design optimizes the bpf prog's lookup (and then optionally followed by an inline update). bpf_spin_lock should be used if the inline update needs to be protected. BPF_MAP_TYPE_SK_STORAGE: ----------------------- To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can be created to fit different bpf progs' needs. The map enforces BTF to allow printing the sk-local-storage during a system-wise sk dump (e.g. "ss -ta") in the future. The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete a "sk-local-storage" data from a particular sk. Think of the map as a meta-data (or "type") of a "sk-local-storage". This particular "type" of "sk-local-storage" data can then be stored in any sk. The main purposes of this map are mostly: 1. Define the size of a "sk-local-storage" type. 2. Provide a similar syscall userspace API as the map (e.g. lookup/update, map-id, map-btf...etc.) 3. Keep track of all sk's storages of this "type" and clean them up when the map is freed. sk->sk_bpf_storage: ------------------ The main lookup/update/delete is done on sk->sk_bpf_storage (which is a "struct bpf_sk_storage"). When doing a lookup, the "map" pointer is now used as the "key" to search on the sk_storage->list. The "map" pointer is actually serving as the "type" of the "sk-local-storage" that is being requested. To allow very fast lookup, it should be as fast as looking up an array at a stable-offset. At the same time, it is not ideal to set a hard limit on the number of sk-local-storage "type" that the system can have. Hence, this patch takes a cache approach. The last search result from sk_storage->list is cached in sk_storage->cache[] which is a stable sized array. Each "sk-local-storage" type has a stable offset to the cache[] array. In the future, a map's flag could be introduced to do cache opt-out/enforcement if it became necessary. The cache size is 16 (i.e. 16 types of "sk-local-storage"). Programs can share map. On the program side, having a few bpf_progs running in the networking hotpath is already a lot. The bpf_prog should have already consolidated the existing sock-key-ed map usage to minimize the map lookup penalty. 16 has enough runway to grow. All sk-local-storage data will be removed from sk->sk_bpf_storage during sk destruction. bpf_sk_storage_get() and bpf_sk_storage_delete(): ------------------------------------------------ Instead of using bpf_map_(lookup|update|delete)_elem(), the bpf prog needs to use the new helper bpf_sk_storage_get() and bpf_sk_storage_delete(). The verifier can then enforce the ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to "create" new elem if one does not exist in the sk. It is done by the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE. The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together, it has eliminated the potential use cases for an equivalent bpf_map_update_elem() API (for bpf_prog) in this patch. Misc notes: ---------- 1. map_get_next_key is not supported. From the userspace syscall perspective, the map has the socket fd as the key while the map can be shared by pinned-file or map-id. Since btf is enforced, the existing "ss" could be enhanced to pretty print the local-storage. Supporting a kernel defined btf with 4 tuples as the return key could be explored later also. 2. The sk->sk_lock cannot be acquired. Atomic operations is used instead. e.g. cmpxchg is done on the sk->sk_bpf_storage ptr. Please refer to the source code comments for the details in synchronization cases and considerations. 3. The mem is charged to the sk->sk_omem_alloc as the sk filter does. Benchmark: --------- Here is the benchmark data collected by turning on the "kernel.bpf_stats_enabled" sysctl. Two bpf progs are tested: One bpf prog with the usual bpf hashmap (max_entries = 8192) with the sk ptr as the key. (verifier is modified to support sk ptr as the key That should have shortened the key lookup time.) Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE. Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for each egress skb and then bump the cnt. netperf is used to drive data with 4096 connected UDP sockets. BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run) 27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633 loaded_at 2019-04-15T13:46:39-0700 uid 0 xlated 344B jited 258B memlock 4096B map_ids 16 btf_id 5 BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run) 30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739 loaded_at 2019-04-15T13:47:54-0700 uid 0 xlated 168B jited 156B memlock 4096B map_ids 17 btf_id 6 Here is a high-level picture on how are the objects organized: sk ┌──────┐ │ │ │ │ │ │ │*sk_bpf_storage─────▶ bpf_sk_storage └──────┘ ┌───────┐ ┌───────────┤ list │ │ │ │ │ │ │ │ │ │ │ └───────┘ │ │ elem │ ┌────────┐ ├─▶│ snode │ │ ├────────┤ │ │ data │ bpf_map │ ├────────┤ ┌─────────┐ │ │map_node│◀─┬─────┤ list │ │ └────────┘ │ │ │ │ │ │ │ │ elem │ │ │ │ ┌────────┐ │ └─────────┘ └─▶│ snode │ │ ├────────┤ │ bpf_map │ data │ │ ┌─────────┐ ├────────┤ │ │ list ├───────▶│map_node│ │ │ │ └────────┘ │ │ │ │ │ │ elem │ └─────────┘ ┌────────┐ │ ┌─▶│ snode │ │ │ ├────────┤ │ │ │ data │ │ │ ├────────┤ │ │ │map_node│◀─┘ │ └────────┘ │ │ │ ┌───────┐ sk └──────────│ list │ ┌──────┐ │ │ │ │ │ │ │ │ │ │ │ │ └───────┘ │*sk_bpf_storage───────▶bpf_sk_storage └──────┘ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/net/bpf_sk_storage.h | 13 +++++++++++++ include/net/sock.h | 5 +++++ 2 files changed, 18 insertions(+) create mode 100644 include/net/bpf_sk_storage.h (limited to 'include/net') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h new file mode 100644 index 000000000000..b9dcb02e756b --- /dev/null +++ b/include/net/bpf_sk_storage.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Facebook */ +#ifndef _BPF_SK_STORAGE_H +#define _BPF_SK_STORAGE_H + +struct sock; + +void bpf_sk_storage_free(struct sock *sk); + +extern const struct bpf_func_proto bpf_sk_storage_get_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_proto; + +#endif /* _BPF_SK_STORAGE_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 784cd19d5ff7..4d208c0f9c14 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -236,6 +236,8 @@ struct sock_common { /* public: */ }; +struct bpf_sk_storage; + /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -510,6 +512,9 @@ struct sock { #endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_sk_storage __rcu *sk_bpf_storage; +#endif struct rcu_head sk_rcu; }; -- cgit v1.2.3 From 9e9957973c7785b1f8fa77f099cac661cc5e7e5b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 12:32:02 -0700 Subject: net/tls: remove old exports of sk_destruct functions tls_device_sk_destruct being set on a socket used to indicate that socket is a kTLS device one. That is no longer true - now we use sk_validate_xmit_skb pointer for that purpose. Remove the export. tls_device_attach() needs to be moved. While at it, remove the dead declaration of tls_sk_destruct(). Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index d9d0ac66f040..20196cb31ecc 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -317,7 +317,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -void tls_device_sk_destruct(struct sock *sk); void tls_device_free_resources_tx(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); @@ -336,7 +335,6 @@ static inline u32 tls_record_start_seq(struct tls_record_info *rec) return rec->end_seq - rec->len; } -void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); -- cgit v1.2.3 From da68b4ad02343862fee1e3e8c6315984f16a4778 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 12:32:03 -0700 Subject: net/tls: move definition of tls ops into net/tls.h There seems to be no reason for tls_ops to be defined in netdevice.h which is included in a lot of places. Don't wrap the struct/enum declaration in ifdefs, it trickles down unnecessary ifdefs into driver code. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 20196cb31ecc..41a2ee643fc5 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -277,6 +277,23 @@ struct tls_context { void (*unhash)(struct sock *sk); }; +enum tls_offload_ctx_dir { + TLS_OFFLOAD_CTX_DIR_RX, + TLS_OFFLOAD_CTX_DIR_TX, +}; + +struct tlsdev_ops { + int (*tls_dev_add)(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn); + void (*tls_dev_del)(struct net_device *netdev, + struct tls_context *ctx, + enum tls_offload_ctx_dir direction); + void (*tls_dev_resync_rx)(struct net_device *netdev, + struct sock *sk, u32 seq, u64 rcd_sn); +}; + struct tls_offload_context_rx { /* sw must be the first member of tls_offload_context_rx */ struct tls_sw_context_rx sw; -- cgit v1.2.3 From 63a1c95f3fe48b4e9fe0c261b376e5e527b71b25 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 12:32:04 -0700 Subject: net/tls: byte swap device req TCP seq no upon setting To avoid a sparse warning byteswap the be32 sequence number before it's stored in the atomic value. While at it drop unnecessary brackets and use kernel's u64 type. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 41a2ee643fc5..39ea62f0c1f6 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -562,7 +562,7 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1)); + atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1); } -- cgit v1.2.3 From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 26 Apr 2019 11:13:06 +0200 Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most netlink based interfaces (including recently added ones) are still not setting it in kernel generated messages. Without the flag, message parsers not aware of attribute semantics (e.g. wireshark dissector or libmnl's mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display the structure of their contents. Unfortunately we cannot just add the flag everywhere as there may be userspace applications which check nlattr::nla_type directly rather than through a helper masking out the flags. Therefore the patch renames nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start() as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually are rewritten to use nla_nest_start(). Except for changes in include/net/netlink.h, the patch was generated using this semantic patch: @@ expression E1, E2; @@ -nla_nest_start(E1, E2) +nla_nest_start_noflag(E1, E2) @@ expression E1, E2; @@ -nla_nest_start_noflag(E1, E2 | NLA_F_NESTED) +nla_nest_start(E1, E2) Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/netlink.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 23f27b0b3cef..1f18b47f41b4 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1415,13 +1415,18 @@ static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) } /** - * nla_nest_start - Start a new level of nested attributes + * nla_nest_start_noflag - Start a new level of nested attributes * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * - * Returns the container attribute + * This function exists for backward compatibility to use in APIs which never + * marked their nest attributes with NLA_F_NESTED flag. New APIs should use + * nla_nest_start() which sets the flag. + * + * Returns the container attribute or NULL on error */ -static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) +static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, + int attrtype) { struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb); @@ -1431,6 +1436,21 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) return start; } +/** + * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED + * @skb: socket buffer to add attributes to + * @attrtype: attribute type of container + * + * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED + * flag. This is the preferred function to use in new code. + * + * Returns the container attribute or NULL on error + */ +static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) +{ + return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED); +} + /** * nla_nest_end - Finalize nesting of attributes * @skb: socket buffer the attributes are stored in -- cgit v1.2.3 From 6f455f5f4e9c28aefaefbe18ce7304b499645d75 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:27 +0200 Subject: netlink: add NLA_MIN_LEN Rather than using NLA_UNSPEC for this type of thing, use NLA_MIN_LEN so we can make NLA_UNSPEC be NLA_REJECT under certain conditions for future attributes. While at it, also use NLA_EXACT_LEN for the struct example. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 1f18b47f41b4..c77ed51c18f1 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -183,6 +183,7 @@ enum { NLA_REJECT, NLA_EXACT_LEN, NLA_EXACT_LEN_WARN, + NLA_MIN_LEN, __NLA_TYPE_MAX, }; @@ -212,6 +213,7 @@ enum nla_policy_validation { * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload + * NLA_MIN_LEN Minimum length of attribute payload * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if @@ -230,6 +232,7 @@ enum nla_policy_validation { * it is rejected. * NLA_EXACT_LEN_WARN Attribute should have exactly this length, a warning * is logged if it is longer, shorter is rejected. + * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * * Meaning of `validation_data' field: @@ -281,7 +284,7 @@ enum nla_policy_validation { * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, - * [ATTR_BAZ] = { .len = sizeof(struct mystruct) }, + * [ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) }, * [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, * }; */ @@ -302,6 +305,7 @@ struct nla_policy { #define NLA_POLICY_EXACT_LEN(_len) { .type = NLA_EXACT_LEN, .len = _len } #define NLA_POLICY_EXACT_LEN_WARN(_len) { .type = NLA_EXACT_LEN_WARN, \ .len = _len } +#define NLA_POLICY_MIN_LEN(_len) { .type = NLA_MIN_LEN, .len = _len } #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) -- cgit v1.2.3 From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:28 +0200 Subject: netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 16 ++-- include/net/netlink.h | 238 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 199 insertions(+), 55 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 6850c7b1a3a6..897cdba13569 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -165,7 +165,7 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) } /** - * genlmsg_parse - parse attributes of a genetlink message + * genlmsg_parse_deprecated - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements @@ -173,14 +173,14 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) * @policy: validation policy * @extack: extended ACK report struct */ -static inline int genlmsg_parse(const struct nlmsghdr *nlh, - const struct genl_family *family, - struct nlattr *tb[], int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, + const struct genl_family *family, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, - policy, extack); + return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, + policy, NL_VALIDATE_LIBERAL, extack); } /** diff --git a/include/net/netlink.h b/include/net/netlink.h index c77ed51c18f1..ab26a5e3558b 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -369,21 +369,48 @@ struct nl_info { bool skip_notify; }; +/** + * enum netlink_validation - netlink message/attribute validation levels + * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about + * extra data at the end of the message, attributes being longer than + * they should be, or unknown attributes being present. + * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing. + * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING + * this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict(). + * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy. + * This can safely be set by the kernel when the given policy has no + * NLA_UNSPEC anymore, and can thus be used to ensure policy entries + * are enforced going forward. + * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. + * U8, U16, U32 must have exact size, etc.) + */ +enum netlink_validation { + NL_VALIDATE_LIBERAL = 0, + NL_VALIDATE_TRAILING = BIT(0), + NL_VALIDATE_MAXTYPE = BIT(1), + NL_VALIDATE_UNSPEC = BIT(2), + NL_VALIDATE_STRICT_ATTRS = BIT(3), +}; + +#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ + NL_VALIDATE_MAXTYPE) +#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ + NL_VALIDATE_MAXTYPE |\ + NL_VALIDATE_UNSPEC |\ + NL_VALIDATE_STRICT_ATTRS) + int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); -int nla_validate(const struct nlattr *head, int len, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack); -int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, - int len, const struct nla_policy *policy, - struct netlink_ext_ack *extack); -int nla_parse_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, - int len, const struct nla_policy *policy, - struct netlink_ext_ack *extack); +int __nla_validate(const struct nlattr *head, int len, int maxtype, + const struct nla_policy *policy, unsigned int validate, + struct netlink_ext_ack *extack); +int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, + int len, const struct nla_policy *policy, unsigned int validate, + struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); @@ -512,42 +539,121 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) } /** - * nlmsg_parse - parse attributes of a netlink message + * nla_parse_deprecated - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * @policy: validation policy + * @extack: extended ACK pointer + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessible via the attribute type. Attributes with a type + * exceeding maxtype will be ignored and attributes from the policy are not + * always strictly validated (only for new attributes). + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, + const struct nlattr *head, int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, policy, + NL_VALIDATE_LIBERAL, extack); +} + +/** + * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * @policy: validation policy + * @extack: extended ACK pointer + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessible via the attribute type. Attributes with a type + * exceeding maxtype will be rejected as well as trailing data, but the + * policy is not completely strictly validated (only for new attributes). + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, + const struct nlattr *head, + int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, policy, + NL_VALIDATE_DEPRECATED_STRICT, extack); +} + +/** + * __nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() */ -static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, - struct nlattr *tb[], int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + unsigned int validate, + struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL; } - return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), policy, extack); + return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), policy, validate, + extack); } -static inline int nlmsg_parse_strict(const struct nlmsghdr *nlh, int hdrlen, - struct nlattr *tb[], int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +/** + * nlmsg_parse_deprecated - parse attributes of a netlink message + * @nlh: netlink message header + * @hdrlen: length of family specific header + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @extack: extended ACK report struct + * + * See nla_parse_deprecated() + */ +static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { - NL_SET_ERR_MSG(extack, "Invalid header length"); - return -EINVAL; - } + return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, + NL_VALIDATE_LIBERAL, extack); +} - return nla_parse_strict(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), policy, extack); +/** + * nlmsg_parse_deprecated_strict - parse attributes of a netlink message + * @nlh: netlink message header + * @hdrlen: length of family specific header + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @extack: extended ACK report struct + * + * See nla_parse_deprecated_strict() + */ +static inline int +nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, + NL_VALIDATE_DEPRECATED_STRICT, extack); } /** @@ -566,26 +672,53 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, } /** - * nlmsg_validate - validate a netlink message including attributes + * nla_validate_deprecated - Validate a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * @validate: validation strictness + * @extack: extended ACK report struct + * + * Validates all attributes in the specified attribute stream against the + * specified policy. Validation is done in liberal mode. + * See documenation of struct nla_policy for more details. + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_validate_deprecated(const struct nlattr *head, int len, + int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL, + extack); +} + + +/** + * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header * @hdrlen: length of familiy specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ -static inline int nlmsg_validate(const struct nlmsghdr *nlh, - int hdrlen, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, + int hdrlen, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; - return nla_validate(nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), maxtype, policy, - extack); + return __nla_validate(nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), maxtype, + policy, NL_VALIDATE_LIBERAL, extack); } + + /** * nlmsg_report - need to report back to application? * @nlh: netlink message header @@ -899,22 +1032,22 @@ nla_find_nested(const struct nlattr *nla, int attrtype) } /** - * nla_parse_nested - parse nested attributes + * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * - * See nla_parse() + * See nla_parse_deprecated() */ -static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, - const struct nlattr *nla, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype, + const struct nlattr *nla, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, - extack); + return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, + NL_VALIDATE_LIBERAL, extack); } /** @@ -1489,6 +1622,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the @@ -1497,12 +1631,22 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * * Returns 0 on success or a negative error code. */ -static inline int nla_validate_nested(const struct nlattr *start, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, + const struct nla_policy *policy, + unsigned int validate, + struct netlink_ext_ack *extack) +{ + return __nla_validate(nla_data(start), nla_len(start), maxtype, policy, + validate, extack); +} + +static inline int +nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nla_validate(nla_data(start), nla_len(start), maxtype, policy, - extack); + return __nla_validate_nested(start, maxtype, policy, + NL_VALIDATE_LIBERAL, extack); } /** -- cgit v1.2.3 From 3de644035446567017e952f16da2594d6bd195fc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:29 +0200 Subject: netlink: re-add parse/validate functions in strict mode This re-adds the parse and validate functions like nla_parse() that are now actually strict after the previous rename and were just split out to make sure everything is converted (and if not compilation of the previous patch would fail.) Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 19 +++++++++++ include/net/netlink.h | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 897cdba13569..68de579cfe5e 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -183,6 +183,25 @@ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, policy, NL_VALIDATE_LIBERAL, extack); } +/** + * genlmsg_parse - parse attributes of a genetlink message + * @nlh: netlink message header + * @family: genetlink message family + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * @extack: extended ACK report struct + */ +static inline int genlmsg_parse(const struct nlmsghdr *nlh, + const struct genl_family *family, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, + policy, NL_VALIDATE_STRICT, extack); +} + /** * genl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number diff --git a/include/net/netlink.h b/include/net/netlink.h index ab26a5e3558b..e4dd874412bf 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -538,6 +538,31 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); } +/** + * nla_parse - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * @policy: validation policy + * @extack: extended ACK pointer + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessible via the attribute type. Attributes with a type + * exceeding maxtype will be rejected, policy must be specified, attributes + * will be validated in the strictest way possible. + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_parse(struct nlattr **tb, int maxtype, + const struct nlattr *head, int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, policy, + NL_VALIDATE_STRICT, extack); +} + /** * nla_parse_deprecated - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements @@ -617,6 +642,27 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, extack); } +/** + * nlmsg_parse - parse attributes of a netlink message + * @nlh: netlink message header + * @hdrlen: length of family specific header + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @validate: validation strictness + * @extack: extended ACK report struct + * + * See nla_parse() + */ +static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), policy, + NL_VALIDATE_STRICT, extack); +} + /** * nlmsg_parse_deprecated - parse attributes of a netlink message * @nlh: netlink message header @@ -695,6 +741,28 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, extack); } +/** + * nla_validate - Validate a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * @validate: validation strictness + * @extack: extended ACK report struct + * + * Validates all attributes in the specified attribute stream against the + * specified policy. Validation is done in strict mode. + * See documenation of struct nla_policy for more details. + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_validate(const struct nlattr *head, int len, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT, + extack); +} /** * nlmsg_validate_deprecated - validate a netlink message including attributes @@ -1031,6 +1099,25 @@ nla_find_nested(const struct nlattr *nla, int attrtype) return nla_find(nla_data(nla), nla_len(nla), attrtype); } +/** + * nla_parse_nested - parse nested attributes + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @nla: attribute containing the nested attributes + * @policy: validation policy + * @extack: extended ACK report struct + * + * See nla_parse() + */ +static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, + const struct nlattr *nla, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, + NL_VALIDATE_STRICT, extack); +} + /** * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements -- cgit v1.2.3 From 56738f460841761abc70347c919d5c45f6f05a42 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:30 +0200 Subject: netlink: add strict parsing for future attributes Unfortunately, we cannot add strict parsing for all attributes, as that would break existing userspace. We currently warn about it, but that's about all we can do. For new attributes, however, the story is better: nobody is using them, so we can reject bad sizes. Also, for new attributes, we need not accept them when the policy doesn't declare their usage. David Ahern and I went back and forth on how to best encode this, and the best way we found was to have a "boundary type", from which point on new attributes have all possible validation applied, and NLA_UNSPEC is rejected. As we didn't want to add another argument to all functions that get a netlink policy, the workaround is to encode that boundary in the first entry of the policy array (which is for type 0 and thus probably not really valid anyway). I put it into the validation union for the rare possibility that somebody is actually using attribute 0, which would continue to work fine unless they tried to use the extended validation, which isn't likely. We also didn't find any in-tree users with type 0. The reason for setting the "start strict here" attribute is that we never really need to start strict from 0, which is invalid anyway (or in legacy families where that isn't true, it cannot be set to strict), so we can thus reserve the value 0 for "don't do this check" and don't have to add the tag to all policies right now. Thus, policies can now opt in to this validation, which we should do for all existing policies, at least when adding new attributes. Note that entirely *new* policies won't need to set it, as the use of that should be using nla_parse()/nlmsg_parse() etc. which anyway do fully strict validation now, regardless of this. So in effect, this patch only covers the "existing command with new attribute" case. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index e4dd874412bf..679f649748d4 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -299,6 +299,24 @@ struct nla_policy { }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); + /* This entry is special, and used for the attribute at index 0 + * only, and specifies special data about the policy, namely it + * specifies the "boundary type" where strict length validation + * starts for any attribute types >= this value, also, strict + * nesting validation starts here. + * + * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT + * for any types >= this, so need to use NLA_MIN_LEN to get the + * previous pure { .len = xyz } behaviour. The advantage of this + * is that types not specified in the policy will be rejected. + * + * For completely new families it should be set to 1 so that the + * validation is enforced for all attributes. For existing ones + * it should be set at least when new attributes are added to + * the enum used by the policy, and be set to the new value that + * was added to enforce strict validation from thereon. + */ + u16 strict_start_type; }; }; -- cgit v1.2.3 From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:31 +0200 Subject: genetlink: optionally validate strictly/dumps Add options to strictly validate messages and dump messages, sometimes perhaps validating dump messages non-strictly may be required, so add an option for that as well. Since none of this can really be applied to existing commands, set the options everwhere using the following spatch: @@ identifier ops; expression X; @@ struct genl_ops ops[] = { ..., { .cmd = X, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ... }, ... }; For new commands one should just not copy the .validate 'opt-out' flags and thus get strict validation. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 68de579cfe5e..9292f1c588b7 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -121,6 +121,12 @@ static inline int genl_err_attr(struct genl_info *info, int err, return err; } +enum genl_validate_flags { + GENL_DONT_VALIDATE_STRICT = BIT(0), + GENL_DONT_VALIDATE_DUMP = BIT(1), + GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), +}; + /** * struct genl_ops - generic netlink operations * @cmd: command identifier @@ -141,6 +147,7 @@ struct genl_ops { u8 cmd; u8 internal_flags; u8 flags; + u8 validate; }; int genl_register_family(struct genl_family *family); -- cgit v1.2.3 From 875138f81d71af3cfa80df57e32fe9efbc4f95bc Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:11 +0200 Subject: dsa: Move tagger name into its ops structure Rather than keep a list to map a tagger ops to a name, place the name into the ops structure. This removes the hard coded list, a step towards making the taggers more dynamic. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli v2: Move name to end of structure, keeping the hot entries at the beginning. Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0cfc2f828b87..801346e31e9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -56,6 +56,7 @@ struct dsa_device_ops { int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); unsigned int overhead; + const char *name; }; struct dsa_switch_tree { -- cgit v1.2.3 From 0b42f03363706609d621c31324fae5c1250f579f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:12 +0200 Subject: dsa: Add MODULE_ALIAS to taggers in preparation to become modules When the tag drivers become modules, we will need to dynamically load them based on what the switch drivers need. Add aliases to map between the TAG protocol and the driver. In order to do this, we need the tag protocol number as something which the C pre-processor can stringinfy. Only the compiler knows the value of an enum, CPP cannot use them. So add #defines. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 801346e31e9b..8f3d5e0825a2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -30,20 +30,33 @@ struct phy_device; struct fixed_phy_status; struct phylink_link_state; +#define DSA_TAG_PROTO_NONE_VALUE 0 +#define DSA_TAG_PROTO_BRCM_VALUE 1 +#define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 +#define DSA_TAG_PROTO_DSA_VALUE 3 +#define DSA_TAG_PROTO_EDSA_VALUE 4 +#define DSA_TAG_PROTO_GSWIP_VALUE 5 +#define DSA_TAG_PROTO_KSZ9477_VALUE 6 +#define DSA_TAG_PROTO_KSZ9893_VALUE 7 +#define DSA_TAG_PROTO_LAN9303_VALUE 8 +#define DSA_TAG_PROTO_MTK_VALUE 9 +#define DSA_TAG_PROTO_QCA_VALUE 10 +#define DSA_TAG_PROTO_TRAILER_VALUE 11 + enum dsa_tag_protocol { - DSA_TAG_PROTO_NONE = 0, - DSA_TAG_PROTO_BRCM, - DSA_TAG_PROTO_BRCM_PREPEND, - DSA_TAG_PROTO_DSA, - DSA_TAG_PROTO_EDSA, - DSA_TAG_PROTO_GSWIP, - DSA_TAG_PROTO_KSZ9477, - DSA_TAG_PROTO_KSZ9893, - DSA_TAG_PROTO_LAN9303, - DSA_TAG_PROTO_MTK, - DSA_TAG_PROTO_QCA, - DSA_TAG_PROTO_TRAILER, - DSA_TAG_LAST, /* MUST BE LAST */ + DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, + DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, + DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, + DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, + DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, + DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, + DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, + DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, + DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, + DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, + DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, + DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, + DSA_TAG_LAST, /* MUST BE LAST */ }; struct packet_type; @@ -59,6 +72,10 @@ struct dsa_device_ops { const char *name; }; +#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" +#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) + struct dsa_switch_tree { struct list_head list; -- cgit v1.2.3 From 056eed2fb071c11535527fc792bdfb985a9a3e26 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:14 +0200 Subject: dsa: Add TAG protocol to tag ops In order that we can match the tagging protocol a switch driver request to the tagger, we need to know what protocol the tagger supports. Add this information to the ops structure. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli v2 More tag protocol to end of structure to keep hot members at the beginning. Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8f3d5e0825a2..720036f48fb3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -70,6 +70,7 @@ struct dsa_device_ops { int *offset); unsigned int overhead; const char *name; + enum dsa_tag_protocol proto; }; #define DSA_TAG_DRIVER_ALIAS "dsa_tag-" -- cgit v1.2.3 From d3b8c04988ca1685700e345a82a1396df79e6291 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:15 +0200 Subject: dsa: Add boilerplate helper to register DSA tag driver modules A DSA tag driver module will need to register the tag protocols it implements with the DSA core. Add macros containing this boiler plate. The registration/unregistration code is currently just a stub. A Later patch will add the real implementation. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli v2 Fix indent of #endif Rewrite to move list pointer into a new structure v3 Move kdoc next to macro Fix THIS_MODULE indentation Signed-off-by: David S. Miller --- include/net/dsa.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 720036f48fb3..08ac05c014e3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -594,4 +594,70 @@ int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); int dsa_port_get_phy_sset_count(struct dsa_port *dp); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); +struct dsa_tag_driver { + const struct dsa_device_ops *ops; + struct list_head list; + struct module *owner; +}; + +void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count, + struct module *owner); +void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count); + +#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ +static int __init dsa_tag_driver_module_init(void) \ +{ \ + dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ + THIS_MODULE); \ + return 0; \ +} \ +module_init(dsa_tag_driver_module_init); \ + \ +static void __exit dsa_tag_driver_module_exit(void) \ +{ \ + dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ +} \ +module_exit(dsa_tag_driver_module_exit) + +/** + * module_dsa_tag_drivers() - Helper macro for registering DSA tag + * drivers + * @__ops_array: Array of tag driver strucutres + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_drivers(__ops_array) \ +dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) + +#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops + +/* Create a static structure we can build a linked list of dsa_tag + * drivers + */ +#define DSA_TAG_DRIVER(__ops) \ +static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ + .ops = &__ops, \ +} + +/** + * module_dsa_tag_driver() - Helper macro for registering a single DSA tag + * driver + * @__ops: Single tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_driver(__ops) \ +DSA_TAG_DRIVER(__ops); \ + \ +static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ + &DSA_TAG_DRIVER_NAME(__ops) \ +}; \ +module_dsa_tag_drivers(dsa_tag_driver_array) #endif + -- cgit v1.2.3 From f81a43e8da07ccd91c4d923a44ffffaeee39dcc8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:21 +0200 Subject: dsa: Cleanup unneeded table and make tag structures static Now that tag drivers dynamically register, we don't need the static table. Remove it. This also means the tag driver structures can be made static. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 08ac05c014e3..b550f7bb5314 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -56,7 +56,6 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, - DSA_TAG_LAST, /* MUST BE LAST */ }; struct packet_type; -- cgit v1.2.3 From f1f86d09ca7e35fb161a47bc54ec9cb2f4fe42d8 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 15 Apr 2019 16:43:14 -0400 Subject: netfilter: nf_tables: relocate header content to consumer The nf_tables.h header is used in a lot of files, but it turns out that there is only one actual user of nft_expr_clone(). Hence we relocate that function to be with the one consumer of it and avoid having to process it with CPP for all the other files. This will also enable a reduction in the other headers that the nf_tables.h itself has to include just to be stand-alone, hence a pending further significant reduction in the CPP content that needs to get processed for each netfilter file. Note that the explicit "inline" has been dropped as part of this relocation. In similar changes to this, I believe Dave has asked this be done, so we free up gcc to make the choice of whether to inline or not. Signed-off-by: Paul Gortmaker Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2d5a0a1a87b8..706f744f7308 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -806,23 +806,6 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr); -static inline int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) -{ - int err; - - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } - - __module_get(src->ops->type->owner); - return 0; -} - /** * struct nft_rule - nf_tables rule * -- cgit v1.2.3 From a4cb98f32c9046fea28bcb4979182f2ff731a27a Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 15 Apr 2019 16:43:16 -0400 Subject: netfilter: nf_tables: drop include of module.h from nf_tables.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. Since nf_tables.h is not going into a module struct looking for specific fields, we can just let it know that module is a struct, just like about 60 other include/linux headers already do. Signed-off-by: Paul Gortmaker Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 706f744f7308..5b8624ae4a27 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -2,7 +2,6 @@ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H -#include #include #include #include @@ -13,6 +12,8 @@ #include #include +struct module; + #define NFT_JUMP_STACK_SIZE 16 struct nft_pktinfo { -- cgit v1.2.3 From 8f14c99c7edaaba9c0bb1727d44db6ebf157cc61 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sun, 7 Apr 2019 08:14:20 -0700 Subject: netfilter: conntrack: limit sysctl setting for boolean options We use the zero and one to limit the boolean options setting. After this patch we only set 0 or 1 to boolean options for nf conntrack sysctl. Signed-off-by: Tonghao Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index f19b53130bf7..806454e767bf 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -24,9 +24,9 @@ struct nf_generic_net { struct nf_tcp_net { unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX]; - unsigned int tcp_loose; - unsigned int tcp_be_liberal; - unsigned int tcp_max_retrans; + int tcp_loose; + int tcp_be_liberal; + int tcp_max_retrans; }; enum udp_conntrack { -- cgit v1.2.3 From e1f172e162c0a11721f1188f12e5b4c3f9f80de6 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 17 Apr 2019 11:46:14 -0300 Subject: netfilter: use macros to create module aliases. Each NAT helper creates a module alias which follows a pattern. Use macros for consistency. Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index ec52a8dc32fd..28bd4569aa64 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -15,6 +15,10 @@ #include #include +#define NF_NAT_HELPER_NAME(name) "ip_nat_" name +#define MODULE_ALIAS_NF_NAT_HELPER(name) \ + MODULE_ALIAS(NF_NAT_HELPER_NAME(name)) + struct module; enum nf_ct_helper_flags { -- cgit v1.2.3 From 08010a21602678932894c5e87014a282af0079cf Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 17 Apr 2019 11:46:15 -0300 Subject: netfilter: add API to manage NAT helpers. The API allows a conntrack helper to indicate its corresponding NAT helper which then can be loaded and reference counted. Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 28bd4569aa64..44b5a00a9c64 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -15,7 +15,8 @@ #include #include -#define NF_NAT_HELPER_NAME(name) "ip_nat_" name +#define NF_NAT_HELPER_PREFIX "ip_nat_" +#define NF_NAT_HELPER_NAME(name) NF_NAT_HELPER_PREFIX name #define MODULE_ALIAS_NF_NAT_HELPER(name) \ MODULE_ALIAS(NF_NAT_HELPER_NAME(name)) @@ -58,6 +59,8 @@ struct nf_conntrack_helper { unsigned int queue_num; /* length of userspace private data stored in nf_conn_help->data */ u16 data_len; + /* name of NAT helper module */ + char nat_mod_name[NF_CT_HELPER_NAME_LEN]; }; /* Must be kept in sync with the classes defined by helpers */ @@ -157,4 +160,21 @@ nf_ct_helper_expectfn_find_by_symbol(const void *symbol); extern struct hlist_head *nf_ct_helper_hash; extern unsigned int nf_ct_helper_hsize; +struct nf_conntrack_nat_helper { + struct list_head list; + char mod_name[NF_CT_HELPER_NAME_LEN]; /* module name */ + struct module *module; /* pointer to self */ +}; + +#define NF_CT_NAT_HELPER_INIT(name) \ + { \ + .mod_name = NF_NAT_HELPER_NAME(name), \ + .module = THIS_MODULE \ + } + +void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat); +void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); +int nf_nat_helper_try_module_get(const char *name, u16 l3num, + u8 protonum); +void nf_nat_helper_put(struct nf_conntrack_helper *helper); #endif /*_NF_CONNTRACK_HELPER_H*/ -- cgit v1.2.3 From 33162e9a0590f16e1b21be764caae517e2bb310c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:43 +0300 Subject: net: dsa: Store vlan_filtering as a property of dsa_port This allows drivers to query the VLAN setting imposed by the bridge driver directly from DSA, instead of keeping their own state based on the .port_vlan_filtering callback. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index b550f7bb5314..79a87913126c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -161,6 +161,7 @@ struct dsa_port { const char *mac; struct device_node *dn; unsigned int ageing_time; + bool vlan_filtering; u8 stp_state; struct net_device *bridge_dev; struct devlink_port devlink_port; -- cgit v1.2.3 From 8f5d16f638b9a1adf544a7f8cfd11ac1c01c6e25 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:44 +0300 Subject: net: dsa: Be aware of switches where VLAN filtering is a global setting On some switches, the action of whether to parse VLAN frame headers and use that information for ingress admission is configurable, but not per port. Such is the case for the Broadcom BCM53xx and the NXP SJA1105 families, for example. In that case, DSA can prevent the bridge core from trying to apply different VLAN filtering settings on net devices that belong to the same switch. Signed-off-by: Vladimir Oltean Suggested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 79a87913126c..aab3c2029edd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -228,6 +228,11 @@ struct dsa_switch { /* Number of switch port queues */ unsigned int num_tx_queues; + /* Disallow bridge core from requesting different VLAN awareness + * settings on ports if not hardware-supported + */ + bool vlan_filtering_is_global; + unsigned long *bitmap; unsigned long _bitmap; -- cgit v1.2.3 From 145746765f06a3dbc7869c81d0165b3ab96f935a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:48 +0300 Subject: net: dsa: Keep the vlan_filtering setting in dsa_switch if it's global The current behavior is not as obvious as one would assume (which is that, if the driver set vlan_filtering_is_global = 1, then checking any dp->vlan_filtering would yield the same result). Only the ports which are actively enslaved into a bridge would have vlan_filtering set. This makes it tricky for drivers to check what the global state is. So fix this and make the struct dsa_switch hold this global setting. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index aab3c2029edd..4e0f7e9c5aa1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -233,6 +233,11 @@ struct dsa_switch { */ bool vlan_filtering_is_global; + /* In case vlan_filtering_is_global is set, the VLAN awareness state + * should be retrieved from here and not from the per-port settings. + */ + bool vlan_filtering; + unsigned long *bitmap; unsigned long _bitmap; -- cgit v1.2.3 From cf2d45f5ba9a730df6ec190e0345cecde80b1d8b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:49 +0300 Subject: net: dsa: Add helper function to retrieve VLAN awareness setting Since different types of hardware may or may not support this setting per-port, DSA keeps it either in dsa_switch or in dsa_port. While drivers may know the characteristics of their hardware and retrieve it from the correct place without the need of helpers, it is cumbersone to find out an unambigous answer from generic DSA code. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4e0f7e9c5aa1..1e6b4efc80b9 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -305,6 +305,16 @@ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } +static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) +{ + const struct dsa_switch *ds = dp->ds; + + if (ds->vlan_filtering_is_global) + return ds->vlan_filtering; + else + return dp->vlan_filtering; +} + typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { -- cgit v1.2.3 From 93e86b3bc842c159a60e6987444bf3952adcd4db Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 02:56:23 +0200 Subject: net: dsa: Remove legacy probing support Now that all drivers can be probed using more traditional methods, remove the legacy probe code. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1e6b4efc80b9..18db7b8e7a8e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -318,15 +318,6 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { -#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) - /* - * Legacy probing. - */ - const char *(*probe)(struct device *dsa_dev, - struct device *host_dev, int sw_addr, - void **priv); -#endif - enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port); @@ -516,20 +507,6 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; -#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) -/* Legacy driver registration */ -void register_switch_driver(struct dsa_switch_driver *type); -void unregister_switch_driver(struct dsa_switch_driver *type); -struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); - -#else -static inline void register_switch_driver(struct dsa_switch_driver *type) { } -static inline void unregister_switch_driver(struct dsa_switch_driver *type) { } -static inline struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) -{ - return NULL; -} -#endif struct net_device *dsa_dev_to_net_device(struct device *dev); /* Keep inline for faster access in hot path */ -- cgit v1.2.3 From b587bdaf5f820cf7dac2c1b351db97bf98e1f427 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 29 Apr 2019 12:41:45 +0300 Subject: devlink: Change devlink health locking mechanism The devlink health reporters create/destroy and user commands currently use the devlink->lock as a locking mechanism. Different reporters have different rules in the driver and are being created/destroyed during different stages of driver load/unload/running. So during execution of a reporter recover the flow can go through another reporter's destroy and create. Such flow leads to deadlock trying to lock a mutex already held. With the new locking mechanism the different reporters share mutex lock only to protect access to shared reporters list. Added refcount per reporter, to protect the reporters from destroy while being used. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4f5e41613503..1c4adfb4195a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -32,6 +32,7 @@ struct devlink { struct list_head region_list; u32 snapshot_id; struct list_head reporter_list; + struct mutex reporters_lock; /* protects reporter_list */ struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; -- cgit v1.2.3 From b424e432e770d6dd572765459d5b6a96a19c5286 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 2 May 2019 16:15:10 +0200 Subject: netlink: add validation of NLA_F_NESTED flag Add new validation flag NL_VALIDATE_NESTED which adds three consistency checks of NLA_F_NESTED_FLAG: - the flag is set on attributes with NLA_NESTED{,_ARRAY} policy - the flag is not set on attributes with other policies except NLA_UNSPEC - the flag is set on attribute passed to nla_parse_nested() Signed-off-by: Michal Kubecek v2: change error messages to mention NLA_F_NESTED explicitly Reviewed-by: Johannes Berg Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/netlink.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 679f649748d4..395b4406f4b0 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -401,6 +401,8 @@ struct nl_info { * are enforced going forward. * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. * U8, U16, U32 must have exact size, etc.) + * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY) + * and unset for other policies. */ enum netlink_validation { NL_VALIDATE_LIBERAL = 0, @@ -408,6 +410,7 @@ enum netlink_validation { NL_VALIDATE_MAXTYPE = BIT(1), NL_VALIDATE_UNSPEC = BIT(2), NL_VALIDATE_STRICT_ATTRS = BIT(3), + NL_VALIDATE_NESTED = BIT(4), }; #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ @@ -415,7 +418,8 @@ enum netlink_validation { #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE |\ NL_VALIDATE_UNSPEC |\ - NL_VALIDATE_STRICT_ATTRS) + NL_VALIDATE_STRICT_ATTRS |\ + NL_VALIDATE_NESTED) int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, @@ -1132,6 +1136,11 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { + if (!(nla->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing"); + return -EINVAL; + } + return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_STRICT, extack); } -- cgit v1.2.3 From 0f457a36626fa94026e483836fbf29e451434567 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Apr 2019 07:45:48 -0700 Subject: ipv4: Move cached routes to fib_nh_common While the cached routes, nh_pcpu_rth_output and nh_rth_input, are IPv4 specific, a later patch wants to make them accessible for IPv6 nexthops with IPv4 routes using a fib6_nh. Move the cached routes from fib_nh to fib_nh_common and update references. Initialization of the cached entries is moved to fib_nh_common_init, and free is moved to fib_nh_common_release. Change in location only, from fib_nh up to fib_nh_common; no functional change intended. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 772a9e61bd84..659c5081c40b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -96,6 +96,10 @@ struct fib_nh_common { int nhc_weight; atomic_t nhc_upper_bound; + + /* v4 specific, but allows fib6_nh with v4 routes */ + struct rtable __rcu * __percpu *nhc_pcpu_rth_output; + struct rtable __rcu *nhc_rth_input; }; struct fib_nh { @@ -107,8 +111,6 @@ struct fib_nh { #endif __be32 nh_saddr; int nh_saddr_genid; - struct rtable __rcu * __percpu *nh_pcpu_rth_output; - struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev -- cgit v1.2.3 From a5995e7107eb3d9c44744d3cf47d49fabfef01f5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Apr 2019 07:45:50 -0700 Subject: ipv4: Move exception bucket to nh_common Similar to the cached routes, make IPv4 exceptions accessible when using an IPv6 nexthop struct with IPv4 routes. Simplify the exception functions by passing in fib_nh_common since that is all it needs, and then cleanup the call sites that have extraneous fib_nh conversions. As with the cached routes this is a change in location only, from fib_nh up to fib_nh_common; no functional change intended. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 659c5081c40b..d0e28f4ab099 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -100,6 +100,7 @@ struct fib_nh_common { /* v4 specific, but allows fib6_nh with v4 routes */ struct rtable __rcu * __percpu *nhc_pcpu_rth_output; struct rtable __rcu *nhc_rth_input; + struct fnhe_hash_bucket __rcu *nhc_exceptions; }; struct fib_nh { @@ -111,7 +112,6 @@ struct fib_nh { #endif __be32 nh_saddr; int nh_saddr_genid; - struct fnhe_hash_bucket __rcu *nh_exceptions; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev #define fib_nh_oif nh_common.nhc_oif -- cgit v1.2.3 From f80c5dad7b6467b884c445ffea45985793b4b2d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Paulo=20Rechi=20Vita?= Date: Thu, 2 May 2019 10:01:52 +0800 Subject: Bluetooth: Ignore CC events not matching the last HCI command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes the kernel not send the next queued HCI command until a command complete arrives for the last HCI command sent to the controller. This change avoids a problem with some buggy controllers (seen on two SKUs of QCA9377) that send an extra command complete event for the previous command after the kernel had already sent a new HCI command to the controller. The problem was reproduced when starting an active scanning procedure, where an extra command complete event arrives for the LE_SET_RANDOM_ADDR command. When this happends the kernel ends up not processing the command complete for the following commmand, LE_SET_SCAN_PARAM, and ultimately behaving as if a passive scanning procedure was being performed, when in fact controller is performing an active scanning procedure. This makes it impossible to discover BLE devices as no device found events are sent to userspace. This problem is reproducible on 100% of the attempts on the affected controllers. The extra command complete event can be seen at timestamp 27.420131 on the btmon logs bellow. Bluetooth monitor ver 5.50 = Note: Linux version 5.0.0+ (x86_64) 0.352340 = Note: Bluetooth subsystem version 2.22 0.352343 = New Index: 80:C5:F2:8F:87:84 (Primary,USB,hci0) [hci0] 0.352344 = Open Index: 80:C5:F2:8F:87:84 [hci0] 0.352345 = Index Info: 80:C5:F2:8F:87:84 (Qualcomm) [hci0] 0.352346 @ MGMT Open: bluetoothd (privileged) version 1.14 {0x0001} 0.352347 @ MGMT Open: btmon (privileged) version 1.14 {0x0002} 0.352366 @ MGMT Open: btmgmt (privileged) version 1.14 {0x0003} 27.302164 @ MGMT Command: Start Discovery (0x0023) plen 1 {0x0003} [hci0] 27.302310 Address type: 0x06 LE Public LE Random < HCI Command: LE Set Random Address (0x08|0x0005) plen 6 #1 [hci0] 27.302496 Address: 15:60:F2:91:B2:24 (Non-Resolvable) > HCI Event: Command Complete (0x0e) plen 4 #2 [hci0] 27.419117 LE Set Random Address (0x08|0x0005) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Scan Parameters (0x08|0x000b) plen 7 #3 [hci0] 27.419244 Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) > HCI Event: Command Complete (0x0e) plen 4 #4 [hci0] 27.420131 LE Set Random Address (0x08|0x0005) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2 #5 [hci0] 27.420259 Scanning: Enabled (0x01) Filter duplicates: Enabled (0x01) > HCI Event: Command Complete (0x0e) plen 4 #6 [hci0] 27.420969 LE Set Scan Parameters (0x08|0x000b) ncmd 1 Status: Success (0x00) > HCI Event: Command Complete (0x0e) plen 4 #7 [hci0] 27.421983 LE Set Scan Enable (0x08|0x000c) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 4 {0x0003} [hci0] 27.422059 Start Discovery (0x0023) plen 1 Status: Success (0x00) Address type: 0x06 LE Public LE Random @ MGMT Event: Discovering (0x0013) plen 2 {0x0003} [hci0] 27.422067 Address type: 0x06 LE Public LE Random Discovery: Enabled (0x01) @ MGMT Event: Discovering (0x0013) plen 2 {0x0002} [hci0] 27.422067 Address type: 0x06 LE Public LE Random Discovery: Enabled (0x01) @ MGMT Event: Discovering (0x0013) plen 2 {0x0001} [hci0] 27.422067 Address type: 0x06 LE Public LE Random Discovery: Enabled (0x01) Signed-off-by: João Paulo Rechi Vita Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fbba43e9bef5..9a5330eed794 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -282,6 +282,7 @@ enum { HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, + HCI_CMD_PENDING, __HCI_NUM_FLAGS, }; -- cgit v1.2.3 From 47d3d7fdb10a21c223036b58bd70ffdc24a472c4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 May 2019 08:24:44 -0700 Subject: ip6: fix skb leak in ip6frag_expire_frag_queue() Since ip6frag_expire_frag_queue() now pulls the head skb from frag queue, we should no longer use skb_get(), since this leads to an skb leak. Stefan Bader initially reported a problem in 4.4.stable [1] caused by the skb_get(), so this patch should also fix this issue. 296583.091021] kernel BUG at /build/linux-6VmqmP/linux-4.4.0/net/core/skbuff.c:1207! [296583.091734] Call Trace: [296583.091749] [] __pskb_pull_tail+0x50/0x350 [296583.091764] [] _decode_session6+0x26a/0x400 [296583.091779] [] __xfrm_decode_session+0x39/0x50 [296583.091795] [] icmpv6_route_lookup+0xf0/0x1c0 [296583.091809] [] icmp6_send+0x5e1/0x940 [296583.091823] [] ? __netif_receive_skb+0x18/0x60 [296583.091838] [] ? netif_receive_skb_internal+0x32/0xa0 [296583.091858] [] ? ixgbe_clean_rx_irq+0x594/0xac0 [ixgbe] [296583.091876] [] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6] [296583.091893] [] icmpv6_send+0x21/0x30 [296583.091906] [] ip6_expire_frag_queue+0xe0/0x120 [296583.091921] [] nf_ct_frag6_expire+0x1f/0x30 [nf_defrag_ipv6] [296583.091938] [] call_timer_fn+0x37/0x140 [296583.091951] [] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6] [296583.091968] [] run_timer_softirq+0x234/0x330 [296583.091982] [] __do_softirq+0x109/0x2b0 Fixes: d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6 defrag") Signed-off-by: Eric Dumazet Reported-by: Stefan Bader Cc: Peter Oskolkov Cc: Florian Westphal Signed-off-by: David S. Miller --- include/net/ipv6_frag.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 28aa9b30aece..1f77fb4dc79d 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -94,7 +94,6 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) goto out; head->dev = dev; - skb_get(head); spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); -- cgit v1.2.3 From 9b3040a6aafd7898ece7fc7efcbca71e42aa8069 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 5 May 2019 11:16:20 -0700 Subject: ipv4: Define __ipv4_neigh_lookup_noref when CONFIG_INET is disabled Define __ipv4_neigh_lookup_noref to return NULL when CONFIG_INET is disabled. Fixes: 4b2a2bfeb3f0 ("neighbor: Call __ipv4_neigh_lookup_noref in neigh_xmit") Reported-by: kbuild test robot Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/arp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/arp.h b/include/net/arp.h index 977aabfcdc03..c8f580a0e6b1 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -18,6 +18,7 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 return val * hash_rnd[0]; } +#ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) @@ -25,6 +26,13 @@ static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } +#else +static inline +struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) +{ + return NULL; +} +#endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { -- cgit v1.2.3 From a7a7be6087b07563490725f61f4dbf4826f099e2 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:16 -0700 Subject: net/sched: add sample action to the hardware intermediate representation Add sample action to the hardware intermediate representation model which would subsequently allow it to be used by drivers for offload. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_offload.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index d035183c8d03..9a6c89b2c2bb 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -118,6 +118,7 @@ enum flow_action_id { FLOW_ACTION_MARK, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, + FLOW_ACTION_SAMPLE, }; /* This is mirroring enum pedit_header_type definition for easy mapping between @@ -157,6 +158,12 @@ struct flow_action_entry { u32 index; u8 vf; } queue; + struct { /* FLOW_ACTION_SAMPLE */ + struct psample_group *psample_group; + u32 rate; + u32 trunc_size; + bool truncate; + } sample; }; }; -- cgit v1.2.3 From f00cbf1968145afbae385a867a66c69845e30711 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:17 -0700 Subject: net/sched: use the hardware intermediate representation for matchall Extends matchall offload to make use of the hardware intermediate representation. More specifically, this patch moves the native TC actions in cls_matchall offload to the newer flow_action representation. This ultimately allows us to avoid a direct dependency on native TC actions for matchall. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d5e7a1af346f..c852ed502cc6 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -789,6 +789,7 @@ enum tc_matchall_command { struct tc_cls_matchall_offload { struct tc_cls_common_offload common; enum tc_matchall_command command; + struct flow_rule *rule; struct tcf_exts *exts; unsigned long cookie; }; -- cgit v1.2.3 From ab79af32b0a5606324ce04c0f04a0d2f90b94464 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:18 -0700 Subject: mlxsw: use intermediate representation for matchall offload Updates the Mellanox spectrum driver to use the newer intermediate representation for flow actions in matchall offloads. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_offload.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 9a6c89b2c2bb..3bf67dd64be5 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -177,6 +177,17 @@ static inline bool flow_action_has_entries(const struct flow_action *action) return action->num_entries; } +/** + * flow_action_has_one_action() - check if exactly one action is present + * @action: tc filter flow offload action + * + * Returns true if exactly one action is present. + */ +static inline bool flow_offload_has_one_action(const struct flow_action *action) +{ + return action->num_entries == 1; +} + #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; __i < (__actions)->num_entries; __act = &(__actions)->entries[++__i]) -- cgit v1.2.3 From dfcb19f0fae3d07f9c56f6efe2c9bbebef6826c9 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:20 -0700 Subject: net/sched: remove unused functions for matchall offload Cleanup unused functions and variables after porting to the newer intermediate representation. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c852ed502cc6..2d0470661277 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -371,30 +371,6 @@ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) #endif } -/** - * tcf_exts_has_one_action - check if exactly one action is present - * @exts: tc filter extensions handle - * - * Returns true if exactly one action is present. - */ -static inline bool tcf_exts_has_one_action(struct tcf_exts *exts) -{ -#ifdef CONFIG_NET_CLS_ACT - return exts->nr_actions == 1; -#else - return false; -#endif -} - -static inline struct tc_action *tcf_exts_first_action(struct tcf_exts *exts) -{ -#ifdef CONFIG_NET_CLS_ACT - return exts->actions[0]; -#else - return NULL; -#endif -} - /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer @@ -790,7 +766,6 @@ struct tc_cls_matchall_offload { struct tc_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; - struct tcf_exts *exts; unsigned long cookie; }; -- cgit v1.2.3 From fa762da94d9860f584c909621d1f8ccbe24c5d5e Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:21 -0700 Subject: net/sched: move police action structures to header Move tcf_police_params, tcf_police and tc_police_compat structures to a header. Making them usable to other code for example drivers that would offload police actions to hardware. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tc_act/tc_police.h | 70 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 include/net/tc_act/tc_police.h (limited to 'include/net') diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h new file mode 100644 index 000000000000..8b9ef3664262 --- /dev/null +++ b/include/net/tc_act/tc_police.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_POLICE_H +#define __NET_TC_POLICE_H + +#include + +struct tcf_police_params { + int tcfp_result; + u32 tcfp_ewma_rate; + s64 tcfp_burst; + u32 tcfp_mtu; + s64 tcfp_mtu_ptoks; + struct psched_ratecfg rate; + bool rate_present; + struct psched_ratecfg peak; + bool peak_present; + struct rcu_head rcu; +}; + +struct tcf_police { + struct tc_action common; + struct tcf_police_params __rcu *params; + + spinlock_t tcfp_lock ____cacheline_aligned_in_smp; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_t_c; +}; + +#define to_police(pc) ((struct tcf_police *)pc) + +/* old policer structure from before tc actions */ +struct tc_police_compat { + u32 index; + int action; + u32 limit; + u32 burst; + u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; +}; + +static inline bool is_tcf_police(const struct tc_action *act) +{ +#ifdef CONFIG_NET_CLS_ACT + if (act->ops && act->ops->id == TCA_ID_POLICE) + return true; +#endif + return false; +} + +static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_bh(police->params); + return params->rate.rate_bytes_ps; +} + +static inline s64 tcf_police_tcfp_burst(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_bh(police->params); + return params->tcfp_burst; +} + +#endif /* __NET_TC_POLICE_H */ -- cgit v1.2.3 From 8c8cfc6ed274e6fb86f00b53f3e7811afce29043 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:22 -0700 Subject: net/sched: add police action to the hardware intermediate representation Add police action to the hardware intermediate representation which would subsequently allow it to be used by drivers for offload. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_offload.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3bf67dd64be5..6200900434e1 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -119,6 +119,7 @@ enum flow_action_id { FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, + FLOW_ACTION_POLICE, }; /* This is mirroring enum pedit_header_type definition for easy mapping between @@ -164,6 +165,10 @@ struct flow_action_entry { u32 trunc_size; bool truncate; } sample; + struct { /* FLOW_ACTION_POLICE */ + s64 burst; + u64 rate_bytes_ps; + } police; }; }; -- cgit v1.2.3 From b7fe4ab8a6013c3c721bed91f73e76eec8fb5d89 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:23 -0700 Subject: net/sched: extend matchall offload for hardware statistics Introduce a new command for matchall classifiers that allows hardware to update statistics. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2d0470661277..161fcf8516ac 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -760,12 +760,14 @@ tc_cls_flower_offload_flow_rule(struct tc_cls_flower_offload *tc_flow_cmd) enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, + TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct tc_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; + struct flow_stats stats; unsigned long cookie; }; -- cgit v1.2.3 From 88c44a5200849c8182eaf36535b4ceae6b90b19d Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:25 -0700 Subject: net/sched: add block pointer to tc_cls_common_offload structure Some actions like the police action are stateful and could share state between devices. This is incompatible with offloading to multiple devices and drivers might want to test for shared blocks when offloading. Store a pointer to the tcf_block structure in the tc_cls_common_offload structure to allow drivers to determine when offloads apply to a shared block. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 161fcf8516ac..eed98f8fcb5e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -100,6 +100,11 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); #else +static inline bool tcf_block_shared(struct tcf_block *block) +{ + return false; +} + static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, @@ -624,6 +629,7 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + struct tcf_block *block; struct netlink_ext_ack *extack; }; @@ -725,11 +731,13 @@ static inline bool tc_in_hw(u32 flags) static inline void tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, + struct tcf_block *block, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; + cls_common->block = block; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } -- cgit v1.2.3 From f9bbe4477c30ece44296437ee26142b42ef8070b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:22 +0300 Subject: net: dsa: Optional VLAN-based port separation for switches without tagging This patch provides generic DSA code for using VLAN (802.1Q) tags for the same purpose as a dedicated switch tag for injection/extraction. It is based on the discussions and interest that has been so far expressed in https://www.spinics.net/lists/netdev/msg556125.html. Unlike all other DSA-supported tagging protocols, CONFIG_NET_DSA_TAG_8021Q does not offer a complete solution for drivers (nor can it). Instead, it provides generic code that driver can opt into calling: - dsa_8021q_xmit: Inserts a VLAN header with the specified contents. Can be called from another tagging protocol's xmit function. Currently the LAN9303 driver is inserting headers that are simply 802.1Q with custom fields, so this is an opportunity for code reuse. - dsa_8021q_rcv: Retrieves the TPID and TCI from a VLAN-tagged skb. Removing the VLAN header is left as a decision for the caller to make. - dsa_port_setup_8021q_tagging: For each user port, installs an Rx VID and a Tx VID, for proper untagged traffic identification on ingress and steering on egress. Also sets up the VLAN trunk on the upstream (CPU or DSA) port. Drivers are intentionally left to call this function explicitly, depending on the context and hardware support. The expected switch behavior and VLAN semantics should not be violated under any conditions. That is, after calling dsa_port_setup_8021q_tagging, the hardware should still pass all ingress traffic, be it tagged or untagged. For uniformity with the other tagging protocols, a module for the dsa_8021q_netdev_ops structure is registered, but the typical usage is to set up another tagging protocol which selects CONFIG_NET_DSA_TAG_8021Q, and calls the API from tag_8021q.h. Null function definitions are also provided so that a "depends on" is not forced in the Kconfig. This tagging protocol only works when switch ports are standalone, or when they are added to a VLAN-unaware bridge. It will probably remain this way for the reasons below. When added to a bridge that has vlan_filtering 1, the bridge core will install its own VLANs and reset the pvids through switchdev. For the bridge core, switchdev is a write-only pipe. All VLAN-related state is kept in the bridge core and nothing is read from DSA/switchdev or from the driver. So the bridge core will break this port separation because it will install the vlan_default_pvid into all switchdev ports. Even if we could teach the bridge driver about switchdev preference of a certain vlan_default_pvid (task difficult in itself since the current setting is per-bridge but we would need it per-port), there would still exist many other challenges. Firstly, in the DSA rcv callback, a driver would have to perform an iterative reverse lookup to find the correct switch port. That is because the port is a bridge slave, so its Rx VID (port PVID) is subject to user configuration. How would we ensure that the user doesn't reset the pvid to a different value (which would make an O(1) translation impossible), or to a non-unique value within this DSA switch tree (which would make any translation impossible)? Finally, not all switch ports are equal in DSA, and that makes it difficult for the bridge to be completely aware of this anyway. The CPU port needs to transmit tagged packets (VLAN trunk) in order for the DSA rcv code to be able to decode source information. But the bridge code has absolutely no idea which switch port is the CPU port, if nothing else then just because there is no netdevice registered by DSA for the CPU port. Also DSA does not currently allow the user to specify that they want the CPU port to do VLAN trunking anyway. VLANs are added to the CPU port using the same flags as they were added on the user port. So the VLANs installed by dsa_port_setup_8021q_tagging per driver request should remain private from the bridge's and user's perspective, and should not alter the VLAN semantics observed by the user. In the current implementation a VLAN range ending at 4095 (VLAN_N_VID) is reserved for this purpose. Each port receives a unique Rx VLAN and a unique Tx VLAN. Separate VLANs are needed for Rx and Tx because they serve different purposes: on Rx the switch must process traffic as untagged and process it with a port-based VLAN, but with care not to hinder bridging. On the other hand, the Tx VLAN is where the reachability restrictions are imposed, since by tagging frames in the xmit callback we are telling the switch onto which port to steer the frame. Some general guidance on how this support might be employed for real-life hardware (some comments made by Florian Fainelli): - If the hardware supports VLAN tag stacking, it should somehow back up its private VLAN settings when the bridge tries to override them. Then the driver could re-apply them as outer tags. Dedicating an outer tag per bridge device would allow identical inner tag VID numbers to co-exist, yet preserve broadcast domain isolation. - If the switch cannot handle VLAN tag stacking, it should disable this port separation when added as slave to a vlan_filtering bridge, in that case having reduced functionality. - Drivers for old switches that don't support the entire VLAN_N_VID range will need to rework the current range selection mechanism. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 18db7b8e7a8e..69f3714f42ba 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -42,6 +42,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 +#define DSA_TAG_PROTO_8021Q_VALUE 12 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -56,6 +57,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, + DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, }; struct packet_type; -- cgit v1.2.3 From cc1939e4b3aaf534fb2f3706820012036825731c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:23 +0300 Subject: net: dsa: Allow drivers to filter packets they can decode source port from Frames get processed by DSA and redirected to switch port net devices based on the ETH_P_XDSA multiplexed packet_type handler found by the network stack when calling eth_type_trans(). The running assumption is that once the DSA .rcv function is called, DSA is always able to decode the switch tag in order to change the skb->dev from its master. However there are tagging protocols (such as the new DSA_TAG_PROTO_SJA1105, user of DSA_TAG_PROTO_8021Q) where this assumption is not completely true, since switch tagging piggybacks on the absence of a vlan_filtering bridge. Moreover, management traffic (BPDU, PTP) for this switch doesn't rely on switch tagging, but on a different mechanism. So it would make sense to at least be able to terminate that. Having DSA receive traffic it can't decode would put it in an impossible situation: the eth_type_trans() function would invoke the DSA .rcv(), which could not change skb->dev, then eth_type_trans() would be invoked again, which again would call the DSA .rcv, and the packet would never be able to exit the DSA filter and would spiral in a loop until the whole system dies. This happens because eth_type_trans() doesn't actually look at the skb (so as to identify a potential tag) when it deems it as being ETH_P_XDSA. It just checks whether skb->dev has a DSA private pointer installed (therefore it's a DSA master) and that there exists a .rcv callback (everybody except DSA_TAG_PROTO_NONE has that). This is understandable as there are many switch tags out there, and exhaustively checking for all of them is far from ideal. The solution lies in introducing a filtering function for each tagging protocol. In the absence of a filtering function, all traffic is passed to the .rcv DSA callback. The tagging protocol should see the filtering function as a pre-validation that it can decode the incoming skb. The traffic that doesn't match the filter will bypass the DSA .rcv callback and be left on the master netdevice, which wasn't previously possible. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 69f3714f42ba..c90ceeec7d1f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -69,6 +69,11 @@ struct dsa_device_ops { struct packet_type *pt); int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); + /* Used to determine which traffic should match the DSA filter in + * eth_type_trans, and which, if any, should bypass it and be processed + * as regular on the master net device. + */ + bool (*filter)(const struct sk_buff *skb, struct net_device *dev); unsigned int overhead; const char *name; enum dsa_tag_protocol proto; @@ -148,6 +153,7 @@ struct dsa_port { struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + bool (*filter)(const struct sk_buff *skb, struct net_device *dev); enum { DSA_PORT_TYPE_UNUSED = 0, @@ -520,6 +526,15 @@ static inline bool netdev_uses_dsa(struct net_device *dev) return false; } +static inline bool dsa_can_decode(const struct sk_buff *skb, + struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_NET_DSA) + return !dev->dsa_ptr->filter || dev->dsa_ptr->filter(skb, dev); +#endif + return false; +} + struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); -- cgit v1.2.3 From b68b0dd0fb2d91056d5241a19960cf47d4a80f05 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:24 +0300 Subject: net: dsa: Keep private info in the skb->cb Map a DSA structure over the 48-byte control block that will hold skb info on transmit and receive. This is only for use within the DSA processing layer (e.g. communicating between DSA core and tagger) and not for passing info around with other layers such as the master net device. Also add a DSA_SKB_CB_PRIV() macro which retrieves a pointer to the space up to 48 bytes that the DSA structure does not use. This space can be used for drivers to add their own private info. One use is for the PTP timestamping code path. When cloning a skb, annotate the original with a pointer to the clone, which the driver can then find easily and place the timestamp to. This avoids the need of a separate queue to hold clones and a way to match an original to a cloned skb. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index c90ceeec7d1f..d628587e0bde 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -83,6 +83,37 @@ struct dsa_device_ops { #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) +struct dsa_skb_cb { + struct sk_buff *clone; +}; + +struct __dsa_skb_cb { + struct dsa_skb_cb cb; + u8 priv[48 - sizeof(struct dsa_skb_cb)]; +}; + +#define __DSA_SKB_CB(skb) ((struct __dsa_skb_cb *)((skb)->cb)) + +#define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb)) + +#define DSA_SKB_CB_COPY(nskb, skb) \ + { *__DSA_SKB_CB(nskb) = *__DSA_SKB_CB(skb); } + +#define DSA_SKB_CB_ZERO(skb) \ + { *__DSA_SKB_CB(skb) = (struct __dsa_skb_cb) {0}; } + +#define DSA_SKB_CB_PRIV(skb) \ + ((void *)(skb)->cb + offsetof(struct __dsa_skb_cb, priv)) + +#define DSA_SKB_CB_CLONE(_clone, _skb) \ + { \ + struct sk_buff *clone = _clone; \ + struct sk_buff *skb = _skb; \ + \ + DSA_SKB_CB_COPY(clone, skb); \ + DSA_SKB_CB(skb)->clone = clone; \ + } + struct dsa_switch_tree { struct list_head list; -- cgit v1.2.3 From 97a69a0dea9a048c6769249f1552de5f56731524 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:25 +0300 Subject: net: dsa: Add support for deferred xmit Some hardware needs to take work to get convinced to receive frames on the CPU port (such as the sja1105 which takes temporary L2 forwarding rules over SPI that last for a single frame). Such work needs a sleepable context, and because the regular .ndo_start_xmit is atomic, this cannot be done in the tagger. So introduce a generic DSA mechanism that sets up a transmit skb queue and a workqueue for deferred transmission. The new driver callback (.port_deferred_xmit) is in dsa_switch and not in the tagger because the operations that require sleeping typically also involve interacting with the hardware, and not simply skb manipulations. Therefore having it there simplifies the structure a bit and makes it unnecessary to export functions from the driver to the tagger. The driver is responsible of calling dsa_enqueue_skb which transfers it to the master netdevice. This is so that it has a chance of performing some more work afterwards, such as cleanup or TX timestamping. To tell DSA that skb xmit deferral is required, I have thought about changing the return type of the tagger .xmit from struct sk_buff * into a enum dsa_tx_t that could potentially encode a DSA_XMIT_DEFER value. But the trailer tagger is reallocating every skb on xmit and therefore making a valid use of the pointer return value. So instead of reworking the API in complicated ways, right now a boolean property in the newly introduced DSA_SKB_CB is set. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d628587e0bde..0260b73938e2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -85,6 +85,7 @@ struct dsa_device_ops { struct dsa_skb_cb { struct sk_buff *clone; + bool deferred_xmit; }; struct __dsa_skb_cb { @@ -205,6 +206,10 @@ struct dsa_port { struct net_device *bridge_dev; struct devlink_port devlink_port; struct phylink *pl; + + struct work_struct xmit_work; + struct sk_buff_head xmit_queue; + /* * Original copy of the master netdev ethtool_ops */ @@ -539,6 +544,12 @@ struct dsa_switch_ops { struct sk_buff *clone, unsigned int type); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); + + /* + * Deferred frame Tx + */ + netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port, + struct sk_buff *skb); }; struct dsa_switch_driver { @@ -634,6 +645,7 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, #define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) +netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); int dsa_port_get_phy_sset_count(struct dsa_port *dp); -- cgit v1.2.3 From c362beb072e14b929eb657dc174d83ccdd9b0eed Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:26 +0300 Subject: net: dsa: Add a private structure pointer to dsa_port This is supposed to share information between the driver and the tagger, or used by the tagger to keep some state. Its use is optional. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0260b73938e2..e20be1ceb306 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -210,6 +210,12 @@ struct dsa_port { struct work_struct xmit_work; struct sk_buff_head xmit_queue; + /* + * Give the switch driver somewhere to hang its per-port private data + * structures (accessible from the tagger). + */ + void *priv; + /* * Original copy of the master netdev ethtool_ops */ -- cgit v1.2.3 From 227d07a07ef126272ea2eed97fd136cd7a803d81 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:27 +0300 Subject: net: dsa: sja1105: Add support for traffic through standalone ports In order to support this, we are creating a make-shift switch tag out of a VLAN trunk configured on the CPU port. Termination of normal traffic on switch ports only works when not under a vlan_filtering bridge. Termination of management (PTP, BPDU) traffic works under all circumstances because it uses a different tagging mechanism (incl_srcpt). We are making use of the generic CONFIG_NET_DSA_TAG_8021Q code and leveraging it from our own CONFIG_NET_DSA_TAG_SJA1105. There are two types of traffic: regular and link-local. The link-local traffic received on the CPU port is trapped from the switch's regular forwarding decisions because it matched one of the two DMAC filters for management traffic. On transmission, the switch requires special massaging for these link-local frames. Due to a weird implementation of the switching IP, by default it drops link-local frames that originate on the CPU port. It needs to be told where to forward them to, through an SPI command ("management route") that is valid for only a single frame. So when we're sending link-local traffic, we are using the dsa_defer_xmit mechanism. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index e20be1ceb306..6aaaadd6a413 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -43,6 +43,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 +#define DSA_TAG_PROTO_SJA1105_VALUE 13 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -58,6 +59,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, + DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, }; struct packet_type; -- cgit v1.2.3 From d6787147e15dffa7b7f3116a5bc3cbe0670bd74f Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Mon, 6 May 2019 17:24:21 -0700 Subject: net/sched: remove block pointer from common offload structure Based on feedback from Jiri avoid carrying a pointer to the tcf_block structure in the tc_cls_common_offload structure. Instead store a flag in driver private data which indicates if offloads apply to a shared block at block binding time. Suggested-by: Jiri Pirko Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index eed98f8fcb5e..514e3c80ecc1 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -629,7 +629,6 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; - struct tcf_block *block; struct netlink_ext_ack *extack; }; @@ -731,13 +730,11 @@ static inline bool tc_in_hw(u32 flags) static inline void tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, - struct tcf_block *block, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; - cls_common->block = block; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } -- cgit v1.2.3