From 9403cf2302588022d06f1878b072d3f6933021f0 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Tue, 19 Mar 2019 16:05:44 +0100
Subject: tcp: free request sock directly upon TFO or syncookies error

Since the request socket is created locally, it'd make more sense to
use reqsk_free() instead of reqsk_put() in TFO and syncookies' error
path.

However, tcp_get_cookie_sock() may set ->rsk_refcnt before freeing the
socket; tcp_conn_request() may also have non-null ->rsk_refcnt because
of tcp_try_fastopen(). In both cases 'req' hasn't been exposed
to the outside world and is safe to free immediately, but that'd
trigger the WARN_ON_ONCE in reqsk_free().

Define __reqsk_free() for these situations where we know nobody's
referencing the socket, even though ->rsk_refcnt might be non-null.
Now we can consolidate the error path of tcp_get_cookie_sock() and
tcp_conn_request().

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/request_sock.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 21a5243fecd1..9dfd7960d90a 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -106,10 +106,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 	return req;
 }
 
-static inline void reqsk_free(struct request_sock *req)
+static inline void __reqsk_free(struct request_sock *req)
 {
-	WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0);
-
 	req->rsk_ops->destructor(req);
 	if (req->rsk_listener)
 		sock_put(req->rsk_listener);
@@ -117,6 +115,12 @@ static inline void reqsk_free(struct request_sock *req)
 	kmem_cache_free(req->rsk_ops->slab, req);
 }
 
+static inline void reqsk_free(struct request_sock *req)
+{
+	WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0);
+	__reqsk_free(req);
+}
+
 static inline void reqsk_put(struct request_sock *req)
 {
 	if (refcount_dec_and_test(&req->rsk_refcnt))
-- 
cgit v1.2.3


From 03f1eccc7a69c965351e6bee41c62afa2844752f Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Tue, 19 Mar 2019 12:37:12 -0400
Subject: ipv6: Add icmp_echo_ignore_multicast support for ICMPv6

IPv4 has icmp_echo_ignore_broadcast to prevent responding to broadcast pings.
IPv6 needs a similar mechanism.

v1->v2:
- Remove NET_IPV6_ICMP_ECHO_IGNORE_MULTICAST.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index b028a1dc150d..e29aff15acc9 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -33,6 +33,7 @@ struct netns_sysctl_ipv6 {
 	int auto_flowlabels;
 	int icmpv6_time;
 	int icmpv6_echo_ignore_all;
+	int icmpv6_echo_ignore_multicast;
 	int anycast_src_echo_reply;
 	int ip_nonlocal_bind;
 	int fwmark_reflect;
-- 
cgit v1.2.3


From f295b3ae9f5927e084bd5decdff82390e3471801 Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Wed, 20 Mar 2019 02:03:36 +0000
Subject: net/tls: Add support of AES128-CCM based ciphers

Added support for AES128-CCM based record encryption. AES128-CCM is
similar to AES128-GCM. Both of them have same salt/iv/mac size. The
notable difference between the two is that while invoking AES128-CCM
operation, the salt||nonce (which is passed as IV) has to be prefixed
with a hardcoded value '2'. Further, CCM implementation in kernel
requires IV passed in crypto_aead_request() to be full '16' bytes.
Therefore, the record structure 'struct tls_rec' has been modified to
reserve '16' bytes for IV. This works for both GCM and CCM based cipher.

Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index a5a938583295..3ce71d78414c 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -60,6 +60,17 @@
 #define TLS_AAD_SPACE_SIZE		13
 #define TLS_DEVICE_NAME_MAX		32
 
+#define MAX_IV_SIZE			16
+
+/* For AES-CCM, the full 16-bytes of IV is made of '4' fields of given sizes.
+ *
+ * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3]
+ *
+ * The field 'length' is encoded in field 'b0' as '(length width - 1)'.
+ * Hence b0 contains (3 - 1) = 2.
+ */
+#define TLS_AES_CCM_IV_B0_BYTE		2
+
 /*
  * This structure defines the routines for Inline TLS driver.
  * The following routines are optional and filled with a
@@ -123,8 +134,7 @@ struct tls_rec {
 	struct scatterlist sg_content_type;
 
 	char aad_space[TLS_AAD_SPACE_SIZE];
-	u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE +
-		   TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+	u8 iv_data[MAX_IV_SIZE];
 	struct aead_request aead_req;
 	u8 aead_req_ctx[];
 };
@@ -219,6 +229,7 @@ struct tls_prot_info {
 	u16 tag_size;
 	u16 overhead_size;
 	u16 iv_size;
+	u16 salt_size;
 	u16 rec_seq_size;
 	u16 aad_size;
 	u16 tail_size;
-- 
cgit v1.2.3


From 0b03a5ca8b14321366eec4a903922d2b46d585ff Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Wed, 20 Mar 2019 10:29:27 -0400
Subject: ipv6: Add icmp_echo_ignore_anycast for ICMPv6

In addition to icmp_echo_ignore_multicast, there is a need to also
prevent responding to pings to anycast addresses for security.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index e29aff15acc9..64e29b58bb5e 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -34,6 +34,7 @@ struct netns_sysctl_ipv6 {
 	int icmpv6_time;
 	int icmpv6_echo_ignore_all;
 	int icmpv6_echo_ignore_multicast;
+	int icmpv6_echo_ignore_anycast;
 	int anycast_src_echo_reply;
 	int ip_nonlocal_bind;
 	int fwmark_reflect;
-- 
cgit v1.2.3


From c7a1ce397adacaf5d4bb2eab0a738b5f80dc3e43 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 21 Mar 2019 05:21:35 -0700
Subject: ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create

Change addrconf_f6i_alloc to generate a fib6_config and call
ip6_route_info_create. addrconf_f6i_alloc is the last caller to
fib6_info_alloc besides ip6_route_info_create, and there is no
reason for it to do its own initialization on a fib6_info.

Host routes need to be created even if the device is down, so add a
new flag, fc_ignore_dev_down, to fib6_config and update fib6_nh_init
to not error out if device is not up.

Notes on the conversion:
- ip_fib_metrics_init is the same as fib6_config has fc_mx set to NULL
  and fc_mx_len set to 0
- dst_nocount is handled by the RTF_ADDRCONF flag
- dst_host is handled by fc_dst_len = 128

nh_gw does not get set after the conversion to ip6_route_info_create
but it should not be set in addrconf_f6i_alloc since this is a host
route not a gateway route.

Everything else is a straight forward map between fib6_info and
fib6_config.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 84097010237c..2acb78a762ee 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -50,7 +50,8 @@ struct fib6_config {
 	u32		fc_protocol;
 	u16		fc_type;        /* only 8 bits are used */
 	u16		fc_delete_all_nh : 1,
-			__unused : 15;
+			fc_ignore_dev_down:1,
+			__unused : 14;
 
 	struct in6_addr	fc_dst;
 	struct in6_addr	fc_src;
-- 
cgit v1.2.3


From 9ab948a91b2c2abc8e82845c0e61f4b1683e3a4f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 20 Mar 2019 09:18:59 -0700
Subject: ipv4: Allow amount of dirty memory from fib resizing to be
 controllable

fib_trie implementation calls synchronize_rcu when a certain amount of
pages are dirty from freed entries. The number of pages was determined
experimentally in 2009 (commit c3059477fce2d).

At the current setting, synchronize_rcu is called often -- 51 times in a
second in one test with an average of an 8 msec delay adding a fib entry.
The total impact is a lot of slow down modifying the fib. This is seen
in the output of 'time' - the difference between real time and sys+user.
For example, using 720,022 single path routes and 'ip -batch'[1]:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m14.214s
    user    0m2.513s
    sys     0m6.783s

So roughly 35% of the actual time to install the routes is from the ip
command getting scheduled out, most notably due to synchronize_rcu (this
is observed using 'perf sched timehist').

This patch makes the amount of dirty memory configurable between 64k where
the synchronize_rcu is called often (small, low end systems that are memory
sensitive) to 64M where synchronize_rcu is called rarely during a large
FIB change (for high end systems with lots of memory). The default is 512kB
which corresponds to the current setting of 128 pages with a 4kB page size.

As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu
in a second blocking for up to 30 msec in a single instance, and a total
of almost 100 msec across the 4 calls in the second. The trade off is
allowing FIB entries to consume more memory in a given time window but
but with much better fib insertion rates (~30% increase in prefixes/sec).
With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch
file runs in:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m9.692s
    user    0m2.491s
    sys     0m6.769s

So the dead time is reduced to about 1/2 second or <5% of the real time.

[1] 'ip' modified to not request ACK messages which improves route
    insertion times by about 20%

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index be3cad9c2e4c..aa09ae5f01a5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -38,6 +38,10 @@
 #define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
 #define IPV4_MIN_MTU		68			/* RFC 791 */
 
+extern unsigned int sysctl_fib_sync_mem;
+extern unsigned int sysctl_fib_sync_mem_min;
+extern unsigned int sysctl_fib_sync_mem_max;
+
 struct sock;
 
 struct inet_skb_parm {
-- 
cgit v1.2.3


From 02afc7ad45bd6cfc9fd51fdbc132455371b63469 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Wed, 20 Mar 2019 20:02:56 +0100
Subject: net: dst: remove gc leftovers

Get rid of some obsolete gc-related documentation and macros that were
missed in commit 5b7c9a8ff828 ("net: remove dst gc related code").

CC: Wei Wang <weiwan@google.com>
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Acked-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst.h b/include/net/dst.h
index 6cf0870414c7..12b31c602cb0 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -19,17 +19,6 @@
 #include <net/neighbour.h>
 #include <asm/processor.h>
 
-#define DST_GC_MIN	(HZ/10)
-#define DST_GC_INC	(HZ/2)
-#define DST_GC_MAX	(120*HZ)
-
-/* Each dst_entry has reference count and sits in some parent list(s).
- * When it is removed from parent list, it is "freed" (dst_free).
- * After this it enters dead state (dst->obsolete > 0) and if its refcnt
- * is zero, it can be destroyed immediately, otherwise it is added
- * to gc list and garbage collector periodically checks the refcnt.
- */
-
 struct sk_buff;
 
 struct dst_entry {
-- 
cgit v1.2.3


From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 21 Mar 2019 22:51:02 +0100
Subject: genetlink: make policy common to family

Since maxattr is common, the policy can't really differ sanely,
so make it common as well.

The only user that did in fact manage to make a non-common policy
is taskstats, which has to be really careful about it (since it's
still using a common maxattr!). This is no longer supported, but
we can fake it using pre_doit.

This reduces the size of e.g. nl80211.o (which has lots of commands):

   text	   data	    bss	    dec	    hex	filename
 398745	  14323	   2240	 415308	  6564c	net/wireless/nl80211.o (before)
 397913	  14331	   2240	 414484	  65314	net/wireless/nl80211.o (after)
--------------------------------
   -832      +8       0    -824

Which is obviously just 8 bytes for each command, and an added 8
bytes for the new policy pointer. I'm not sure why the ops list is
counted as .text though.

Most of the code transformations were done using the following spatch:
    @ops@
    identifier OPS;
    expression POLICY;
    @@
    struct genl_ops OPS[] = {
    ...,
     {
    -	.policy = POLICY,
     },
    ...
    };

    @@
    identifier ops.OPS;
    expression ops.POLICY;
    identifier fam;
    expression M;
    @@
    struct genl_family fam = {
            .ops = OPS,
            .maxattr = M,
    +       .policy = POLICY,
            ...
    };

This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing
the cb->data as ops, which we want to change in a later genl patch.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index aa2e5888f18d..6850c7b1a3a6 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -26,6 +26,7 @@ struct genl_info;
  * @name: name of family
  * @version: protocol version
  * @maxattr: maximum number of attributes supported
+ * @policy: netlink policy
  * @netnsok: set to true if the family can handle network
  *	namespaces and should be presented in all of them
  * @parallel_ops: operations can be called in parallel and aren't
@@ -56,6 +57,7 @@ struct genl_family {
 	unsigned int		maxattr;
 	bool			netnsok;
 	bool			parallel_ops;
+	const struct nla_policy *policy;
 	int			(*pre_doit)(const struct genl_ops *ops,
 					    struct sk_buff *skb,
 					    struct genl_info *info);
@@ -124,14 +126,12 @@ static inline int genl_err_attr(struct genl_info *info, int err,
  * @cmd: command identifier
  * @internal_flags: flags used by the family
  * @flags: flags
- * @policy: attribute validation policy
  * @doit: standard command callback
  * @start: start callback for dumps
  * @dumpit: callback for dumpers
  * @done: completion callback for dumps
  */
 struct genl_ops {
-	const struct nla_policy	*policy;
 	int		       (*doit)(struct sk_buff *skb,
 				       struct genl_info *info);
 	int		       (*start)(struct netlink_callback *cb);
-- 
cgit v1.2.3


From 974eff2b5793eeaa2eb433bca7eba9640d890c4a Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Thu, 21 Mar 2019 15:51:36 -0700
Subject: net: Move the definition of the default Geneve udp port to public
 header file

Move the definition of the default Geneve udp port from the geneve
source to the header file, so we can re-use it from drivers.
Modify existing drivers to use it.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Cc: John Hurley <john.hurley@netronome.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/net/geneve.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/geneve.h b/include/net/geneve.h
index fc6a7e0a874a..bced0b1d9fe4 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -4,6 +4,8 @@
 
 #include <net/udp_tunnel.h>
 
+#define GENEVE_UDP_PORT		6081
+
 /* Geneve Header:
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *  |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
-- 
cgit v1.2.3


From bea964107fa78ffe484ef8659ecc26f9ae2bcd2f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Thu, 21 Mar 2019 15:51:39 -0700
Subject: net: Add IANA_VXLAN_UDP_PORT definition to vxlan header file

Added IANA_VXLAN_UDP_PORT (4789) definition to vxlan header file so it
can be used by drivers instead of local definition.
Updated drivers which locally defined it as 4789 to use it.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Cc: John Hurley <john.hurley@netronome.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Cc: Peng Li <lipeng321@huawei.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/net/vxlan.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 00254a58824b..83b5999a2587 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -8,6 +8,8 @@
 #include <net/rtnetlink.h>
 #include <net/switchdev.h>
 
+#define IANA_VXLAN_UDP_PORT     4789
+
 /* VXLAN protocol (RFC 7348) header:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |R|R|R|R|I|R|R|R|               Reserved                        |
-- 
cgit v1.2.3


From 28cff537ef2eed9307bc7e4e40745075637bec56 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 22 Mar 2019 16:01:55 +0100
Subject: net: sched: add empty status flag for NOLOCK qdisc

The queue is marked not empty after acquiring the seqlock,
and it's up to the NOLOCK qdisc clearing such flag on dequeue.
Since the empty status lays on the same cache-line of the
seqlock, it's always hot on cache during the updates.

This makes the empty flag update a little bit loosy. Given
the lack of synchronization between enqueue and dequeue, this
is unavoidable.

v2 -> v3:
 - qdisc_is_empty() has a const argument (Eric)

v1 -> v2:
 - use really an 'empty' flag instead of 'not_empty', as
   suggested by Eric

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 31284c078d06..e227475e78ca 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -113,6 +113,9 @@ struct Qdisc {
 
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
+
+	/* for NOLOCK qdisc, true if there are no enqueued skbs */
+	bool			empty;
 	struct rcu_head		rcu;
 };
 
@@ -143,11 +146,19 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
+static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
+{
+	if (qdisc->flags & TCQ_F_NOLOCK)
+		return qdisc->empty;
+	return !qdisc->q.qlen;
+}
+
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK) {
 		if (!spin_trylock(&qdisc->seqlock))
 			return false;
+		qdisc->empty = false;
 	} else if (qdisc_is_running(qdisc)) {
 		return false;
 	}
-- 
cgit v1.2.3


From dc05360fee660a9dbe59824b3f7896534210432b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:38 -0700
Subject: net: convert rps_needed and rfs_needed to new static branch api

We prefer static_branch_unlikely() over static_key_false() these days.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 8de5ee258b93..fecdf639225c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -966,7 +966,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	if (static_key_false(&rfs_needed)) {
+	if (static_branch_unlikely(&rfs_needed)) {
 		/* Reading sk->sk_rxhash might incur an expensive cache line
 		 * miss.
 		 *
-- 
cgit v1.2.3


From 472c2e07eef045145bc1493cc94a01c87140780a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:39 -0700
Subject: tcp: add one skb cache for tx

On hosts with a lot of cores, RPC workloads suffer from heavy contention on slab spinlocks.

    20.69%  [kernel]       [k] queued_spin_lock_slowpath
     5.64%  [kernel]       [k] _raw_spin_lock
     3.83%  [kernel]       [k] syscall_return_via_sysret
     3.48%  [kernel]       [k] __entry_text_start
     1.76%  [kernel]       [k] __netif_receive_skb_core
     1.64%  [kernel]       [k] __fget

For each sendmsg(), we allocate one skb, and free it at the time ACK packet comes.

In many cases, ACK packets are handled by another cpus, and this unfortunately
incurs heavy costs for slab layer.

This patch uses an extra pointer in socket structure, so that we try to reuse
the same skb and avoid these expensive costs.

We cache at most one skb per socket so this should be safe as far as
memory pressure is concerned.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index fecdf639225c..314c47a8f5d1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -414,6 +414,7 @@ struct sock {
 		struct sk_buff	*sk_send_head;
 		struct rb_root	tcp_rtx_queue;
 	};
+	struct sk_buff		*sk_tx_skb_cache;
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;
@@ -1463,6 +1464,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	if (!sk->sk_tx_skb_cache) {
+		sk->sk_tx_skb_cache = skb;
+		return;
+	}
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
-- 
cgit v1.2.3


From 8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:40 -0700
Subject: tcp: add one skb cache for rx

Often times, recvmsg() system calls and BH handling for a particular
TCP socket are done on different cpus.

This means the incoming skb had to be allocated on a cpu,
but freed on another.

This incurs a high spinlock contention in slab layer for small rpc,
but also a high number of cache line ping pongs for larger packets.

A full size GRO packet might use 45 page fragments, meaning
that up to 45 put_page() can be involved.

More over performing the __kfree_skb() in the recvmsg() context
adds a latency for user applications, and increase probability
of trapping them in backlog processing, since the BH handler
might found the socket owned by the user.

This patch, combined with the prior one increases the rpc
performance by about 10 % on servers with large number of cores.

(tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps
 instead of 8 Mpps)

This also increases single bulk flow performance on 40Gbit+ links,
since in this case there are often two cpus working in tandem :

 - CPU handling the NIC rx interrupts, feeding the receive queue,
  and (after this patch) freeing the skbs that were consumed.

 - CPU in recvmsg() system call, essentially 100 % busy copying out
  data to user space.

Having at most one skb in a per-socket cache has very little risk
of memory exhaustion, and since it is protected by socket lock,
its management is essentially free.

Note that if rps/rfs is used, we do not enable this feature, because
there is high chance that the same cpu is handling both the recvmsg()
system call and the TCP rx path, but that another cpu did the skb
allocations in the device driver right before the RPS/RFS logic.

To properly handle this case, it seems we would need to record
on which cpu skb was allocated, and use a different channel
to give skbs back to this cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 314c47a8f5d1..577d91fb5626 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -368,6 +368,7 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -2438,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
 	__kfree_skb(skb);
 }
 
-- 
cgit v1.2.3


From b8f975545cdbcc316cf20e827e7966d4410b5c5a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 24 Mar 2019 11:14:37 +0100
Subject: net: devlink: add port type spinlock

Add spinlock to protect port type and type_dev pointer consistency.
Without that, userspace may see inconsistent type and type_dev
combinations.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
v1->v2:
- rebased
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 63de99e09f04..cb9b060033e1 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -16,6 +16,7 @@
 #include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
+#include <linux/spinlock.h>
 #include <net/net_namespace.h>
 #include <uapi/linux/devlink.h>
 
@@ -53,6 +54,9 @@ struct devlink_port {
 	struct devlink *devlink;
 	unsigned index;
 	bool registered;
+	spinlock_t type_lock; /* Protects type and type_dev
+			       * pointer consistency.
+			       */
 	enum devlink_port_type type;
 	enum devlink_port_type desired_type;
 	void *type_dev;
-- 
cgit v1.2.3


From f6b19b354d50c5ae46ad66b5273f92e563fbc847 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 24 Mar 2019 11:14:38 +0100
Subject: net: devlink: select NET_DEVLINK from drivers

Some drivers are becoming more dependent on NET_DEVLINK being selected
in configuration. With upcoming compat functions, the behavior would be
wrong in case devlink was not compiled in. So make the drivers select
NET_DEVLINK and rely on the functions being there, not just stubs.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 495 +-------------------------------------------------
 1 file changed, 3 insertions(+), 492 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index cb9b060033e1..03fb16f4fb6c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -549,17 +549,13 @@ static inline struct devlink *priv_to_devlink(void *priv)
 
 static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 {
-#if IS_ENABLED(CONFIG_NET_DEVLINK)
 	if (dev->netdev_ops->ndo_get_devlink)
 		return dev->netdev_ops->ndo_get_devlink(dev);
-#endif
 	return NULL;
 }
 
 struct ib_device;
 
-#if IS_ENABLED(CONFIG_NET_DEVLINK)
-
 struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
 int devlink_register(struct devlink *devlink, struct device *dev);
 void devlink_unregister(struct devlink *devlink);
@@ -728,500 +724,14 @@ void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 				     enum devlink_health_reporter_state state);
 
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
 void devlink_compat_running_version(struct net_device *dev,
 				    char *buf, size_t len);
 int devlink_compat_flash_update(struct net_device *dev, const char *file_name);
 
 #else
 
-static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
-					    size_t priv_size)
-{
-	return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL);
-}
-
-static inline int devlink_register(struct devlink *devlink, struct device *dev)
-{
-	return 0;
-}
-
-static inline void devlink_unregister(struct devlink *devlink)
-{
-}
-
-static inline void devlink_params_publish(struct devlink *devlink)
-{
-}
-
-static inline void devlink_params_unpublish(struct devlink *devlink)
-{
-}
-
-static inline void devlink_free(struct devlink *devlink)
-{
-	kfree(devlink);
-}
-
-static inline int devlink_port_register(struct devlink *devlink,
-					struct devlink_port *devlink_port,
-					unsigned int port_index)
-{
-	return 0;
-}
-
-static inline void devlink_port_unregister(struct devlink_port *devlink_port)
-{
-}
-
-static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port,
-					     struct net_device *netdev)
-{
-}
-
-static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port,
-					    struct ib_device *ibdev)
-{
-}
-
-static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
-{
-}
-
-static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
-					  enum devlink_port_flavour flavour,
-					  u32 port_number, bool split,
-					  u32 split_subport_number)
-{
-}
-
-static inline int
-devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
-				char *name, size_t len)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int devlink_sb_register(struct devlink *devlink,
-				      unsigned int sb_index, u32 size,
-				      u16 ingress_pools_count,
-				      u16 egress_pools_count,
-				      u16 ingress_tc_count,
-				      u16 egress_tc_count)
-{
-	return 0;
-}
-
-static inline void devlink_sb_unregister(struct devlink *devlink,
-					 unsigned int sb_index)
-{
-}
-
-static inline int
-devlink_dpipe_table_register(struct devlink *devlink,
-			     const char *table_name,
-			     struct devlink_dpipe_table_ops *table_ops,
-			     void *priv, bool counter_control_extern)
-{
-	return 0;
-}
-
-static inline void devlink_dpipe_table_unregister(struct devlink *devlink,
-						  const char *table_name)
-{
-}
-
-static inline int devlink_dpipe_headers_register(struct devlink *devlink,
-						 struct devlink_dpipe_headers *
-						 dpipe_headers)
-{
-	return 0;
-}
-
-static inline void devlink_dpipe_headers_unregister(struct devlink *devlink)
-{
-}
-
-static inline bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
-						       const char *table_name)
-{
-	return false;
-}
-
-static inline int
-devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
-{
-	return 0;
-}
-
-static inline int
-devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx,
-			       struct devlink_dpipe_entry *entry)
-{
-	return 0;
-}
-
-static inline int
-devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx)
-{
-	return 0;
-}
-
-static inline void
-devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
-{
-}
-
-static inline int
-devlink_dpipe_action_put(struct sk_buff *skb,
-			 struct devlink_dpipe_action *action)
-{
-	return 0;
-}
-
-static inline int
-devlink_dpipe_match_put(struct sk_buff *skb,
-			struct devlink_dpipe_match *match)
-{
-	return 0;
-}
-
-static inline int
-devlink_resource_register(struct devlink *devlink,
-			  const char *resource_name,
-			  u64 resource_size,
-			  u64 resource_id,
-			  u64 parent_resource_id,
-			  const struct devlink_resource_size_params *size_params)
-{
-	return 0;
-}
-
-static inline void
-devlink_resources_unregister(struct devlink *devlink,
-			     struct devlink_resource *resource)
-{
-}
-
-static inline int
-devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
-			  u64 *p_resource_size)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_dpipe_table_resource_set(struct devlink *devlink,
-				 const char *table_name, u64 resource_id,
-				 u64 resource_units)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void
-devlink_resource_occ_get_register(struct devlink *devlink,
-				  u64 resource_id,
-				  devlink_resource_occ_get_t *occ_get,
-				  void *occ_get_priv)
-{
-}
-
-static inline void
-devlink_resource_occ_get_unregister(struct devlink *devlink,
-				    u64 resource_id)
-{
-}
-
-static inline int
-devlink_params_register(struct devlink *devlink,
-			const struct devlink_param *params,
-			size_t params_count)
-{
-	return 0;
-}
-
-static inline void
-devlink_params_unregister(struct devlink *devlink,
-			  const struct devlink_param *params,
-			  size_t params_count)
-{
-
-}
-
-static inline int
-devlink_port_params_register(struct devlink_port *devlink_port,
-			     const struct devlink_param *params,
-			     size_t params_count)
-{
-	return 0;
-}
-
-static inline void
-devlink_port_params_unregister(struct devlink_port *devlink_port,
-			       const struct devlink_param *params,
-			       size_t params_count)
-{
-}
-
-static inline int
-devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
-				   union devlink_param_value *init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
-				   union devlink_param_value init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port,
-					u32 param_id,
-					union devlink_param_value *init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port,
-					u32 param_id,
-					union devlink_param_value init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void
-devlink_param_value_changed(struct devlink *devlink, u32 param_id)
-{
-}
-
-static inline void
-devlink_port_param_value_changed(struct devlink_port *devlink_port,
-				 u32 param_id)
-{
-}
-
-static inline void
-devlink_param_value_str_fill(union devlink_param_value *dst_val,
-			     const char *src)
-{
-}
-
-static inline struct devlink_region *
-devlink_region_create(struct devlink *devlink,
-		      const char *region_name,
-		      u32 region_max_snapshots,
-		      u64 region_size)
-{
-	return NULL;
-}
-
-static inline void
-devlink_region_destroy(struct devlink_region *region)
-{
-}
-
-static inline u32
-devlink_region_shapshot_id_get(struct devlink *devlink)
-{
-	return 0;
-}
-
-static inline int
-devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
-			       u8 *data, u32 snapshot_id,
-			       devlink_snapshot_data_dest_t *data_destructor)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_driver_name_put(struct devlink_info_req *req, const char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_version_fixed_put(struct devlink_info_req *req,
-			       const char *version_name,
-			       const char *version_value)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_version_stored_put(struct devlink_info_req *req,
-				const char *version_name,
-				const char *version_value)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_version_running_put(struct devlink_info_req *req,
-				 const char *version_name,
-				 const char *version_value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
-				 const char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
-			u16 value_len)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			   bool value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			 u8 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			  u32 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			  u64 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			     const char *value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			     const void *value, u16 value_len)
-{
-	return 0;
-}
-
-static inline struct devlink_health_reporter *
-devlink_health_reporter_create(struct devlink *devlink,
-			       const struct devlink_health_reporter_ops *ops,
-			       u64 graceful_period, bool auto_recover,
-			       void *priv)
-{
-	return NULL;
-}
-
-static inline void
-devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
-{
-}
-
-static inline void *
-devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
-{
-	return NULL;
-}
-
-static inline int
-devlink_health_report(struct devlink_health_reporter *reporter,
-		      const char *msg, void *priv_ctx)
-{
-	return 0;
-}
-
-static inline void
-devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
-				     enum devlink_health_reporter_state state)
-{
-}
-
 static inline void
 devlink_compat_running_version(struct net_device *dev, char *buf, size_t len)
 {
@@ -1232,6 +742,7 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name)
 {
 	return -EOPNOTSUPP;
 }
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
-- 
cgit v1.2.3


From 4f661542a40217713f2cee0bb6678fbb30d9d367 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Mar 2019 08:34:55 -0700
Subject: tcp: fix zerocopy and notsent_lowat issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

My recent patch had at least three problems :

1) TX zerocopy wants notification when skb is acknowledged,
   thus we need to call skb_zcopy_clear() if the skb is
   cached into sk->sk_tx_skb_cache

2) Some applications might expect precise EPOLLOUT
   notifications, so we need to update sk->sk_wmem_queued
   and call sk_mem_uncharge() from sk_wmem_free_skb()
   in all cases. The SOCK_QUEUE_SHRUNK flag must also be set.

3) Reuse of saved skb should have used skb_cloned() instead
  of simply checking if the fast clone has been freed.

Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 577d91fb5626..7fa223278522 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1465,13 +1465,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+	sk->sk_wmem_queued -= skb->truesize;
+	sk_mem_uncharge(sk, skb->truesize);
 	if (!sk->sk_tx_skb_cache) {
+		skb_zcopy_clear(skb, true);
 		sk->sk_tx_skb_cache = skb;
 		return;
 	}
-	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
-	sk->sk_wmem_queued -= skb->truesize;
-	sk_mem_uncharge(sk, skb->truesize);
 	__kfree_skb(skb);
 }
 
-- 
cgit v1.2.3


From df453700e8d81b1bdafdf684365ee2b9431fb702 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Mar 2019 12:40:33 -0700
Subject: inet: switch IP ID generator to siphash

According to Amit Klein and Benny Pinkas, IP ID generation is too weak
and might be used by attackers.

Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix())
having 64bit key and Jenkins hash is risky.

It is time to switch to siphash and its 128bit keys.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Amit Klein <aksecurity@gmail.com>
Reported-by: Benny Pinkas <benny@pinkas.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 104a6669e344..7698460a3dd1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -9,6 +9,7 @@
 #include <linux/uidgid.h>
 #include <net/inet_frag.h>
 #include <linux/rcupdate.h>
+#include <linux/siphash.h>
 
 struct tcpm_hash_bucket;
 struct ctl_table_header;
@@ -217,5 +218,6 @@ struct netns_ipv4 {
 	unsigned int	ipmr_seq;	/* protected by rtnl_mutex */
 
 	atomic_t	rt_genid;
+	siphash_key_t	ip_id_key;
 };
 #endif
-- 
cgit v1.2.3


From 5dc37bb9b03586e8fdeb47d25e8d2a0399984936 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 28 Mar 2019 13:56:36 +0100
Subject: net: replace ndo_get_devlink with ndo_get_devlink_port

Follow-up patch is going to need a devlink port instance according to
a netdev. Devlink port instance should be always available when devlink
is used. So change the recently introduced ndo_get_devlink to
ndo_get_devlink_port. With that, adjust the wrapper for the only
user to get devlink pointer.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 03fb16f4fb6c..81b5ed04a341 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -547,10 +547,20 @@ static inline struct devlink *priv_to_devlink(void *priv)
 	return container_of(priv, struct devlink, priv);
 }
 
+static inline struct devlink_port *
+netdev_to_devlink_port(struct net_device *dev)
+{
+	if (dev->netdev_ops->ndo_get_devlink_port)
+		return dev->netdev_ops->ndo_get_devlink_port(dev);
+	return NULL;
+}
+
 static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 {
-	if (dev->netdev_ops->ndo_get_devlink)
-		return dev->netdev_ops->ndo_get_devlink(dev);
+	struct devlink_port *devlink_port = netdev_to_devlink_port(dev);
+
+	if (devlink_port)
+		return devlink_port->devlink;
 	return NULL;
 }
 
-- 
cgit v1.2.3


From af3836df9a59e7339d60c9c46729a7d9094d0582 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 28 Mar 2019 13:56:37 +0100
Subject: net: devlink: introduce devlink_compat_phys_port_name_get()

Introduce devlink_compat_phys_port_name_get() helper that
gets the physical port name for specified netdevice
according to devlink port attributes.
Call this helper from dev_get_phys_port_name()
in case ndo_get_phys_port_name is not defined.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 81b5ed04a341..85e577d6ec3b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -739,6 +739,8 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 void devlink_compat_running_version(struct net_device *dev,
 				    char *buf, size_t len);
 int devlink_compat_flash_update(struct net_device *dev, const char *file_name);
+int devlink_compat_phys_port_name_get(struct net_device *dev,
+				      char *name, size_t len);
 
 #else
 
@@ -753,6 +755,13 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name)
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_compat_phys_port_name_get(struct net_device *dev,
+				  char *name, size_t len)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
-- 
cgit v1.2.3


From 14c03ac4c100e4b81ec4747f5ec861701ff52de2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 28 Mar 2019 13:56:40 +0100
Subject: net: devlink: remove unused devlink_port_get_phys_port_name()
 function

Now it is unused, remove it.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 85e577d6ec3b..31d5cec4d06b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -583,8 +583,6 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number);
-int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
-				    char *name, size_t len);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3


From 717700d183d65bd2e6511566aa6d32395419d158 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Tue, 26 Mar 2019 11:31:13 -0700
Subject: netfilter: Export nf_ct_{set,destroy}_timeout()

This patch exports nf_ct_set_timeout() and nf_ct_destroy_timeout().
The two functions are derived from xt_ct_destroy_timeout() and
xt_ct_set_timeout() in xt_CT.c, and moved to nf_conntrack_timeout.c
without any functional change.
It would be useful for other users (i.e. OVS) that utilizes the
finer-grain conntrack timeout feature.

CC: Pablo Neira Ayuso <pablo@netfilter.org>
CC: Pravin Shelar <pshelar@ovn.org>
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_conntrack_timeout.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 3394d75e1c80..00a8fbb2d735 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -88,6 +88,9 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct)
 int nf_conntrack_timeout_init(void);
 void nf_conntrack_timeout_fini(void);
 void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout);
+int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num,
+		      const char *timeout_name);
+void nf_ct_destroy_timeout(struct nf_conn *ct);
 #else
 static inline int nf_conntrack_timeout_init(void)
 {
@@ -98,6 +101,18 @@ static inline void nf_conntrack_timeout_fini(void)
 {
         return;
 }
+
+static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
+				    u8 l3num, u8 l4num,
+				    const char *timeout_name)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void nf_ct_destroy_timeout(struct nf_conn *ct)
+{
+	return;
+}
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-- 
cgit v1.2.3


From e4516ef65490ef29d48a98ad4d7c90dccf39068f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:48 -0700
Subject: ipv4: Create init helper for fib_nh

Consolidate the fib_nh initialization which is duplicated between
fib_create_info for single path and fib_get_nhs for multipath.
Export the helper to allow for use with nexthop objects in the
future.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9c8214d2116d..1af1f552644a 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -416,6 +416,10 @@ void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
 		     struct flowi4 *fl4, const struct sk_buff *skb);
 
+int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
+		struct fib_config *cfg, int nh_weight,
+		struct netlink_ext_ack *extack);
+
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
 struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
-- 
cgit v1.2.3


From faa041a40b9fa64913789fcc0161c7c73161f57e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:49 -0700
Subject: ipv4: Create cleanup helper for fib_nh

Move the fib_nh cleanup code from free_fib_info_rcu into a new helper,
fib_nh_release. Move classid accounting into fib_nh_release which is
called per fib_nh to make accounting symmetrical with fib_nh_init.
Export the helper to allow for use with nexthop objects in the
future.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1af1f552644a..5a4df0ba175e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -419,6 +419,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
 int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
 		struct fib_config *cfg, int nh_weight,
 		struct netlink_ext_ack *extack);
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
-- 
cgit v1.2.3


From 83c442515917812d4ff643e90cd456c630b7e762 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:50 -0700
Subject: ipv6: Create init helper for fib6_nh

Similar to IPv4, consolidate the fib6_nh initialization into a helper.
As a new standalone function, add a cleanup path to put lwtstate on
error.

To avoid modifying fib6_config flags, move the reject check to a helper
that is invoked once by fib6_nh_init to reset the device and then
again in ip6_route_info_create to set the fib6_flags.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2acb78a762ee..ce1f81345c8e 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -444,6 +444,10 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 	return f6i->fib6_nh.nh_dev;
 }
 
+int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+		 struct fib6_config *cfg, gfp_t gfp_flags,
+		 struct netlink_ext_ack *extack);
+
 static inline
 struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
 {
-- 
cgit v1.2.3


From dac7d0f27075ce54017a7efdd6ae0a55352a0f80 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:51 -0700
Subject: ipv6: Create cleanup helper for fib6_nh

Move the fib6_nh cleanup code to a new helper, fib6_nh_release.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index ce1f81345c8e..2d2a468b3d6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -447,6 +447,7 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 		 struct fib6_config *cfg, gfp_t gfp_flags,
 		 struct netlink_ext_ack *extack);
+void fib6_nh_release(struct fib6_nh *fib6_nh);
 
 static inline
 struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
-- 
cgit v1.2.3


From 2b2450ca4a2d9d772dc45e1220c04cb3ba761843 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:52 -0700
Subject: ipv6: Move gateway checks to a fib6_nh setting

The gateway setting is not per fib6_info entry but per-fib6_nh. Add a new
fib_nh_has_gw flag to fib6_nh and convert references to RTF_GATEWAY to
the new flag. For IPv6 address the flag is cheaper than checking that
nh_gw is non-0 like IPv4 does.

While this increases fib6_nh by 8-bytes, the effective allocation size of
a fib6_info is unchanged. The 8 bytes is recovered later with a
fib_nh_common change.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   | 1 +
 include/net/ip6_route.h | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2d2a468b3d6d..3b04b318cf13 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -126,6 +126,7 @@ struct rt6_exception {
 
 struct fib6_nh {
 	struct in6_addr		nh_gw;
+	bool			fib_nh_has_gw;
 	struct net_device	*nh_dev;
 	struct lwtunnel_state	*nh_lwtstate;
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 7ab119936e69..95cd8a2f6284 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,8 +68,8 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
-	return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-	       RTF_GATEWAY;
+	return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) &&
+		f6i->fib6_nh.fib_nh_has_gw;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3


From 6d3d07b45c86f984424ccbad110ca500397fd18c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:53 -0700
Subject: ipv6: Refactor fib6_ignore_linkdown

fib6_ignore_linkdown takes a fib6_info but only looks at the net_device
and its IPv6 config. Change it to take a net_device over a fib6_info as
its input argument.

In addition, move it to a header file to make the check inline and usable
later with IPv4 code without going through the ipv6 stub, and rename to
ip6_ignore_linkdown since it is only checking the setting based on the
ipv6 struct on a device.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 269ec27385e9..ec8e6784a6f7 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -425,6 +425,14 @@ static inline void in6_dev_hold(struct inet6_dev *idev)
 	refcount_inc(&idev->refcnt);
 }
 
+/* called with rcu_read_lock held */
+static inline bool ip6_ignore_linkdown(const struct net_device *dev)
+{
+	const struct inet6_dev *idev = __in6_dev_get(dev);
+
+	return !!idev->cnf.ignore_routes_with_linkdown;
+}
+
 void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);
 
 static inline void in6_ifa_put(struct inet6_ifaddr *ifp)
-- 
cgit v1.2.3


From b75ed8b1aa9c3a99702159c3be8b0c1d54972ae5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:55 -0700
Subject: ipv4: Rename fib_nh entries

Rename fib_nh entries that will be moved to a fib_nh_common struct.
Specifically, the device, oif, gateway, flags, scope, lwtstate,
nh_weight and nh_upper_bound are common with all nexthop definitions.
In the process shorten fib_nh_lwtstate to fib_nh_lws to avoid really
long lines.

Rename only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5a4df0ba175e..029acd333d29 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -77,26 +77,26 @@ struct fnhe_hash_bucket {
 #define FNHE_RECLAIM_DEPTH	5
 
 struct fib_nh {
-	struct net_device	*nh_dev;
+	struct net_device	*fib_nh_dev;
 	struct hlist_node	nh_hash;
 	struct fib_info		*nh_parent;
-	unsigned int		nh_flags;
-	unsigned char		nh_scope;
+	unsigned int		fib_nh_flags;
+	unsigned char		fib_nh_scope;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	int			nh_weight;
-	atomic_t		nh_upper_bound;
+	int			fib_nh_weight;
+	atomic_t		fib_nh_upper_bound;
 #endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	__u32			nh_tclassid;
 #endif
-	int			nh_oif;
-	__be32			nh_gw;
+	int			fib_nh_oif;
+	__be32			fib_nh_gw4;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
 	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
-	struct lwtunnel_state	*nh_lwtstate;
+	struct lwtunnel_state	*fib_nh_lws;
 };
 
 /*
@@ -125,7 +125,7 @@ struct fib_info {
 	int			fib_nhs;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
-#define fib_dev		fib_nh[0].nh_dev
+#define fib_dev		fib_nh[0].fib_nh_dev
 };
 
 
@@ -180,9 +180,9 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
 	  atomic_read(&(net)->ipv4.dev_addr_genid)) ?	\
 	 FIB_RES_NH(res).nh_saddr :			\
 	 fib_info_update_nh_saddr((net), &FIB_RES_NH(res)))
-#define FIB_RES_GW(res)			(FIB_RES_NH(res).nh_gw)
-#define FIB_RES_DEV(res)		(FIB_RES_NH(res).nh_dev)
-#define FIB_RES_OIF(res)		(FIB_RES_NH(res).nh_oif)
+#define FIB_RES_GW(res)			(FIB_RES_NH(res).fib_nh_gw4)
+#define FIB_RES_DEV(res)		(FIB_RES_NH(res).fib_nh_dev)
+#define FIB_RES_OIF(res)		(FIB_RES_NH(res).fib_nh_oif)
 
 #define FIB_RES_PREFSRC(net, res)	((res).fi->fib_prefsrc ? : \
 					 FIB_RES_SADDR(net, res))
-- 
cgit v1.2.3


From ad1601ae0260551f85691ca1ac814773fdcec239 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:56 -0700
Subject: ipv6: Rename fib6_nh entries

Rename fib6_nh entries that will be moved to a fib_nh_common struct.
Specifically, the device, gateway, flags, and lwtstate are common
with all nexthop definitions. In some places new temporary variables
are declared or local variables renamed to maintain line lengths.

Rename only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   | 16 ++++++++--------
 include/net/ip6_route.h |  8 +++++---
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3b04b318cf13..aff8570725c8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -125,14 +125,14 @@ struct rt6_exception {
 #define FIB6_MAX_DEPTH 5
 
 struct fib6_nh {
-	struct in6_addr		nh_gw;
+	struct in6_addr		fib_nh_gw6;
 	bool			fib_nh_has_gw;
-	struct net_device	*nh_dev;
-	struct lwtunnel_state	*nh_lwtstate;
+	struct net_device	*fib_nh_dev;
+	struct lwtunnel_state	*fib_nh_lws;
 
-	unsigned int		nh_flags;
-	atomic_t		nh_upper_bound;
-	int			nh_weight;
+	unsigned int		fib_nh_flags;
+	atomic_t		fib_nh_upper_bound;
+	int			fib_nh_weight;
 };
 
 struct fib6_info {
@@ -442,7 +442,7 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
 
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.nh_dev;
+	return f6i->fib6_nh.fib_nh_dev;
 }
 
 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
@@ -453,7 +453,7 @@ void fib6_nh_release(struct fib6_nh *fib6_nh);
 static inline
 struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.nh_lwtstate;
+	return f6i->fib6_nh.fib_nh_lws;
 }
 
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 95cd8a2f6284..342180a7285c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -274,9 +274,11 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
-	return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev &&
-	       ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) &&
-	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
+	struct fib6_nh *nha = &a->fib6_nh, *nhb = &b->fib6_nh;
+
+	return nha->fib_nh_dev == nhb->fib_nh_dev &&
+	       ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
+	       !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
 }
 
 static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
-- 
cgit v1.2.3


From f1741730dd18828fe3ea5fa91c22f41cf001c625 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:57 -0700
Subject: net: Add fib_nh_common and update fib_nh and fib6_nh

Add fib_nh_common struct with common nexthop attributes. Convert
fib_nh and fib6_nh to use it. Use macros to move existing
fib_nh_* references to the new nh_common.nhc_*.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 10 ++--------
 include/net/ip_fib.h  | 41 +++++++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index aff8570725c8..58dbb4e82908 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -19,6 +19,7 @@
 #include <linux/notifier.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#include <net/ip_fib.h>
 #include <net/netlink.h>
 #include <net/inetpeer.h>
 #include <net/fib_notifier.h>
@@ -125,14 +126,7 @@ struct rt6_exception {
 #define FIB6_MAX_DEPTH 5
 
 struct fib6_nh {
-	struct in6_addr		fib_nh_gw6;
-	bool			fib_nh_has_gw;
-	struct net_device	*fib_nh_dev;
-	struct lwtunnel_state	*fib_nh_lws;
-
-	unsigned int		fib_nh_flags;
-	atomic_t		fib_nh_upper_bound;
-	int			fib_nh_weight;
+	struct fib_nh_common	nh_common;
 };
 
 struct fib6_info {
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 029acd333d29..70548b1a6322 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -76,27 +76,48 @@ struct fnhe_hash_bucket {
 #define FNHE_HASH_SIZE		(1 << FNHE_HASH_SHIFT)
 #define FNHE_RECLAIM_DEPTH	5
 
+struct fib_nh_common {
+	struct net_device	*nhc_dev;
+	int			nhc_oif;
+	unsigned int		nhc_flags;
+	struct lwtunnel_state	*nhc_lwtstate;
+	unsigned char		nhc_scope;
+	u8			nhc_family;
+	u8			nhc_has_gw:1,
+				unused:7;
+	union {
+		__be32          ipv4;
+		struct in6_addr ipv6;
+	} nhc_gw;
+
+	int			nhc_weight;
+	atomic_t		nhc_upper_bound;
+};
+
 struct fib_nh {
-	struct net_device	*fib_nh_dev;
+	struct fib_nh_common	nh_common;
 	struct hlist_node	nh_hash;
 	struct fib_info		*nh_parent;
-	unsigned int		fib_nh_flags;
-	unsigned char		fib_nh_scope;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	int			fib_nh_weight;
-	atomic_t		fib_nh_upper_bound;
-#endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	__u32			nh_tclassid;
 #endif
-	int			fib_nh_oif;
-	__be32			fib_nh_gw4;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
 	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
-	struct lwtunnel_state	*fib_nh_lws;
+#define fib_nh_family		nh_common.nhc_family
+#define fib_nh_dev		nh_common.nhc_dev
+#define fib_nh_oif		nh_common.nhc_oif
+#define fib_nh_flags		nh_common.nhc_flags
+#define fib_nh_lws		nh_common.nhc_lwtstate
+#define fib_nh_scope		nh_common.nhc_scope
+#define fib_nh_family		nh_common.nhc_family
+#define fib_nh_has_gw		nh_common.nhc_has_gw
+#define fib_nh_gw4		nh_common.nhc_gw.ipv4
+#define fib_nh_gw6		nh_common.nhc_gw.ipv6
+#define fib_nh_weight		nh_common.nhc_weight
+#define fib_nh_upper_bound	nh_common.nhc_upper_bound
 };
 
 /*
-- 
cgit v1.2.3


From 979e276ebebd537782797c439c9cb42b6d3aba27 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:58 -0700
Subject: net: Use common nexthop init and release helpers

With fib_nh_common in place, move common initialization and release
code into helpers used by both ipv4 and ipv6. For the moment, the init
is just the lwt encap and the release is both the netdev reference and
the the lwt state reference. More will be added later.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 70548b1a6322..12a6d759cf57 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -441,6 +441,10 @@ int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
 		struct fib_config *cfg, int nh_weight,
 		struct netlink_ext_ack *extack);
 void fib_nh_release(struct net *net, struct fib_nh *fib_nh);
+int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *fc_encap,
+		       u16 fc_encap_type, void *cfg, gfp_t gfp_flags,
+		       struct netlink_ext_ack *extack);
+void fib_nh_common_release(struct fib_nh_common *nhc);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
-- 
cgit v1.2.3


From 3616d08bcbb564c7765187cd45ad392e49bad73a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 22 Mar 2019 06:06:09 -0700
Subject: ipv6: Move ipv6 stubs to a separate header file

The number of stubs is growing and has nothing to do with addrconf.
Move the definition of the stubs to a separate header file and update
users. In the move, drop the vxlan specific comment before ipv6_stub.

Code move only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h   | 47 ------------------------------------
 include/net/ipv6_stubs.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/net/udp_tunnel.h |  2 +-
 3 files changed, 64 insertions(+), 48 deletions(-)
 create mode 100644 include/net/ipv6_stubs.h

(limited to 'include/net')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index ec8e6784a6f7..2f67ae854ff0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -238,53 +238,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 
 void ipv6_mc_dad_complete(struct inet6_dev *idev);
 
-/* A stub used by vxlan module. This is ugly, ideally these
- * symbols should be built into the core kernel.
- */
-struct ipv6_stub {
-	int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
-				 const struct in6_addr *addr);
-	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
-				 const struct in6_addr *addr);
-	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
-			       struct dst_entry **dst, struct flowi6 *fl6);
-	int (*ipv6_route_input)(struct sk_buff *skb);
-
-	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
-	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
-					 struct flowi6 *fl6, int flags);
-	struct fib6_info *(*fib6_table_lookup)(struct net *net,
-					      struct fib6_table *table,
-					      int oif, struct flowi6 *fl6,
-					      int flags);
-	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
-						   struct fib6_info *f6i,
-						   struct flowi6 *fl6, int oif,
-						   const struct sk_buff *skb,
-						   int strict);
-	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
-				 struct in6_addr *saddr);
-
-	void (*udpv6_encap_enable)(void);
-	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
-			      const struct in6_addr *solicited_addr,
-			      bool router, bool solicited, bool override, bool inc_opt);
-	struct neigh_table *nd_tbl;
-};
-extern const struct ipv6_stub *ipv6_stub __read_mostly;
-
-/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
-struct ipv6_bpf_stub {
-	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-			  bool force_bind_address_no_port, bool with_lock);
-	struct sock *(*udp6_lib_lookup)(struct net *net,
-					const struct in6_addr *saddr, __be16 sport,
-					const struct in6_addr *daddr, __be16 dport,
-					int dif, int sdif, struct udp_table *tbl,
-					struct sk_buff *skb);
-};
-extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
-
 /*
  * identify MLD packets for MLD filter exceptions
  */
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
new file mode 100644
index 000000000000..d8d9c0b0e8c0
--- /dev/null
+++ b/include/net/ipv6_stubs.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IPV6_STUBS_H
+#define _IPV6_STUBS_H
+
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/neighbour.h>
+#include <net/sock.h>
+
+/* structs from net/ip6_fib.h */
+struct fib6_info;
+
+/* This is ugly, ideally these symbols should be built
+ * into the core kernel.
+ */
+struct ipv6_stub {
+	int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
+			       struct dst_entry **dst, struct flowi6 *fl6);
+	int (*ipv6_route_input)(struct sk_buff *skb);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
+				 struct in6_addr *saddr);
+
+	void (*udpv6_encap_enable)(void);
+	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
+			      const struct in6_addr *solicited_addr,
+			      bool router, bool solicited, bool override, bool inc_opt);
+	struct neigh_table *nd_tbl;
+};
+extern const struct ipv6_stub *ipv6_stub __read_mostly;
+
+/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
+struct ipv6_bpf_stub {
+	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+			  bool force_bind_address_no_port, bool with_lock);
+	struct sock *(*udp6_lib_lookup)(struct net *net,
+				     const struct in6_addr *saddr, __be16 sport,
+				     const struct in6_addr *daddr, __be16 dport,
+				     int dif, int sdif, struct udp_table *tbl,
+				     struct sk_buff *skb);
+};
+extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
+
+#endif
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index b8137953fea3..4b1f95e08307 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -7,7 +7,7 @@
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #endif
 
 struct udp_port_cfg {
-- 
cgit v1.2.3


From a2c7023f7075ca9b80f944d3f20f60e6574538e2 Mon Sep 17 00:00:00 2001
From: Xiaofei Shen <xiaofeis@codeaurora.org>
Date: Fri, 29 Mar 2019 11:04:58 +0530
Subject: net: dsa: read mac address from DT for slave device

Before creating a slave netdevice, get the mac address from DTS and
apply in case it is valid.

Signed-off-by: Xiaofei Shen <xiaofeis@codeaurora.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index ae480bba11f5..0cfc2f828b87 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -140,6 +140,7 @@ struct dsa_port {
 	unsigned int		index;
 	const char		*name;
 	const struct dsa_port	*cpu_dp;
+	const char		*mac;
 	struct device_node	*dn;
 	unsigned int		ageing_time;
 	u8			stp_state;
-- 
cgit v1.2.3


From eba618abacade71669eb67c3360eecfee810cc88 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:55 -0700
Subject: ipv4: Add fib_nh_common to fib_result

Most of the ipv4 code only needs data from fib_nh_common. Add
fib_nh_common selection to fib_result and update users to use it.

Right now, fib_nh_common in fib_result will point to a fib_nh struct
that is embedded within a fib_info:

        fib_info  --> fib_nh
                      fib_nh
                      ...
                      fib_nh
                        ^
    fib_result->nhc ----+

Later, nhc can point to a fib_nh within a nexthop struct:

        fib_info --> nexthop --> fib_nh
                                   ^
    fib_result->nhc ---------------+

or for a nexthop group:

        fib_info --> nexthop --> nexthop --> fib_nh
                                 nexthop --> fib_nh
                                 ...
                                 nexthop --> fib_nh
                                               ^
    fib_result->nhc ---------------------------+

In all cases nhsel within fib_result will point to which leg in the
multipath route is used.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 47 +++++++++++++++++++++--------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 12a6d759cf57..1f4a3b8bf584 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -156,15 +156,16 @@ struct fib_rule;
 
 struct fib_table;
 struct fib_result {
-	__be32		prefix;
-	unsigned char	prefixlen;
-	unsigned char	nh_sel;
-	unsigned char	type;
-	unsigned char	scope;
-	u32		tclassid;
-	struct fib_info *fi;
-	struct fib_table *table;
-	struct hlist_head *fa_head;
+	__be32			prefix;
+	unsigned char		prefixlen;
+	unsigned char		nh_sel;
+	unsigned char		type;
+	unsigned char		scope;
+	u32			tclassid;
+	struct fib_nh_common	*nhc;
+	struct fib_info		*fi;
+	struct fib_table	*table;
+	struct hlist_head	*fa_head;
 };
 
 struct fib_result_nl {
@@ -182,11 +183,10 @@ struct fib_result_nl {
 	int             err;
 };
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-#define FIB_RES_NH(res)		((res).fi->fib_nh[(res).nh_sel])
-#else /* CONFIG_IP_ROUTE_MULTIPATH */
-#define FIB_RES_NH(res)		((res).fi->fib_nh[0])
-#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
+{
+	return &fi->fib_nh[nhsel].nh_common;
+}
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 #define FIB_TABLE_HASHSZ 256
@@ -195,18 +195,11 @@ struct fib_result_nl {
 #endif
 
 __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
+__be32 fib_result_prefsrc(struct net *net, struct fib_result *res);
 
-#define FIB_RES_SADDR(net, res)				\
-	((FIB_RES_NH(res).nh_saddr_genid ==		\
-	  atomic_read(&(net)->ipv4.dev_addr_genid)) ?	\
-	 FIB_RES_NH(res).nh_saddr :			\
-	 fib_info_update_nh_saddr((net), &FIB_RES_NH(res)))
-#define FIB_RES_GW(res)			(FIB_RES_NH(res).fib_nh_gw4)
-#define FIB_RES_DEV(res)		(FIB_RES_NH(res).fib_nh_dev)
-#define FIB_RES_OIF(res)		(FIB_RES_NH(res).fib_nh_oif)
-
-#define FIB_RES_PREFSRC(net, res)	((res).fi->fib_prefsrc ? : \
-					 FIB_RES_SADDR(net, res))
+#define FIB_RES_NHC(res)		((res).nhc)
+#define FIB_RES_DEV(res)	(FIB_RES_NHC(res)->nhc_dev)
+#define FIB_RES_OIF(res)	(FIB_RES_NHC(res)->nhc_oif)
 
 struct fib_entry_notifier_info {
 	struct fib_notifier_info info; /* must be first */
@@ -453,10 +446,12 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
 static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 {
 #ifdef CONFIG_IP_ROUTE_CLASSID
+	struct fib_nh_common *nhc = res->nhc;
+	struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	u32 rtag;
 #endif
-	*itag = FIB_RES_NH(*res).nh_tclassid<<16;
+	*itag = nh->nh_tclassid << 16;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	rtag = res->tclassid;
 	if (*itag == 0)
-- 
cgit v1.2.3


From c0a720770c01e67374b15f348f17a52409f6545c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:58 -0700
Subject: ipv6: Flip to fib_nexthop_info

Export fib_nexthop_info and fib_add_nexthop for use by IPv6 code.
Remove rt6_nexthop_info and rt6_add_nexthop in favor of the IPv4
versions. Update fib_nexthop_info for IPv6 linkdown check and
RTA_GATEWAY for AF_INET6.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1f4a3b8bf584..3ce07841dc3b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -492,4 +492,9 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct fib_dump_filter *filter,
 			  struct netlink_callback *cb);
+
+int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh,
+		     unsigned int *flags, bool skip_oif);
+int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh,
+		    int nh_weight);
 #endif  /* _NET_FIB_H */
-- 
cgit v1.2.3


From 407dd706fb5245c138f3a972f8aaa1c8a09a574c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 3 Apr 2019 14:24:15 +0200
Subject: net: devlink: convert devlink_port_attrs bools to bits

In order to save space in the struct, convert bools to bits.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 31d5cec4d06b..4a1e3452a4ce 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -41,10 +41,10 @@ struct devlink {
 };
 
 struct devlink_port_attrs {
-	bool set;
+	u8 set:1,
+	   split:1;
 	enum devlink_port_flavour flavour;
 	u32 port_number; /* same value as "split group" */
-	bool split;
 	u32 split_subport_number;
 };
 
-- 
cgit v1.2.3


From bec5267cded268acdf679b651778c300d204e9f2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 3 Apr 2019 14:24:16 +0200
Subject: net: devlink: extend port attrs for switch ID

Extend devlink_port_attrs_set() to pass switch ID for ports which are
part of switch and store it in port attrs. For other ports, this is
NULL.

Note that this allows the driver to group devlink ports into one or more
switches according to the actual topology.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4a1e3452a4ce..0f7968761204 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -42,10 +42,12 @@ struct devlink {
 
 struct devlink_port_attrs {
 	u8 set:1,
-	   split:1;
+	   split:1,
+	   switch_port:1;
 	enum devlink_port_flavour flavour;
 	u32 port_number; /* same value as "split group" */
 	u32 split_subport_number;
+	struct netdev_phys_item_id switch_id;
 };
 
 struct devlink_port {
@@ -582,7 +584,9 @@ void devlink_port_type_clear(struct devlink_port *devlink_port);
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
-			    u32 split_subport_number);
+			    u32 split_subport_number,
+			    const unsigned char *switch_id,
+			    unsigned char switch_id_len);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3


From 7e1146e8c10c00f859843817da8ecc5d902ea409 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 3 Apr 2019 14:24:17 +0200
Subject: net: devlink: introduce devlink_compat_switch_id_get() helper

Introduce devlink_compat_switch_id_get() helper which fills up switch_id
according to passed netdev pointer. Call it directly from
dev_get_port_parent_id() as a fallback when ndo_get_port_parent_id
is not defined for given netdev.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0f7968761204..70c7d1ac8344 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -743,6 +743,8 @@ void devlink_compat_running_version(struct net_device *dev,
 int devlink_compat_flash_update(struct net_device *dev, const char *file_name);
 int devlink_compat_phys_port_name_get(struct net_device *dev,
 				      char *name, size_t len);
+int devlink_compat_switch_id_get(struct net_device *dev,
+				 struct netdev_phys_item_id *ppid);
 
 #else
 
@@ -764,6 +766,13 @@ devlink_compat_phys_port_name_get(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_compat_switch_id_get(struct net_device *dev,
+			     struct netdev_phys_item_id *ppid)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
-- 
cgit v1.2.3


From b262a69582a4676c7378a73077b7bb186c7c5b2a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:22 +0100
Subject: xfrm: place af number into xfrm_mode struct

This will be useful to know if we're supposed to decode ipv4 or ipv6.

While at it, make the unregister function return void, all module_exit
functions did just BUG(); there is never a point in doing error checks
if there is no way to handle such error.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 85386becbaea..9a155063c25f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -482,8 +482,9 @@ struct xfrm_mode {
 
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
-	unsigned int encap;
-	int flags;
+	u8 encap;
+	u8 family;
+	u8 flags;
 };
 
 /* Flags for xfrm_mode. */
@@ -491,8 +492,8 @@ enum {
 	XFRM_MODE_FLAG_TUNNEL = 1,
 };
 
-int xfrm_register_mode(struct xfrm_mode *mode, int family);
-int xfrm_unregister_mode(struct xfrm_mode *mode, int family);
+int xfrm_register_mode(struct xfrm_mode *mode);
+void xfrm_unregister_mode(struct xfrm_mode *mode);
 
 static inline int xfrm_af2proto(unsigned int family)
 {
-- 
cgit v1.2.3


From c2d305e51038167dd9de8d476c72f667d84cad8b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:24 +0100
Subject: xfrm: remove input indirection from xfrm_mode

No need for any indirection or abstraction here, both functions
are pretty much the same and quite small, they also have no external
dependencies.

xfrm_prepare_input can then be made static.

With allmodconfig build, size increase of vmlinux is 25 byte:

Before:
   text   data     bss     dec      filename
15730207  6936924 4046908 26714039  vmlinux

After:
15730208  6936948 4046908 26714064 vmlinux

v2: Fix INET_XFRM_MODE_TRANSPORT name in is-enabled test (Sabrina Dubroca)
    change copied comment to refer to transport and network header,
    not skb->{h,nh}, which don't exist anymore. (Sabrina)
    make xfrm_prepare_input static (Eyal Birger)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9a155063c25f..2c5fc9cc367d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -436,16 +436,6 @@ struct xfrm_mode {
 	 */
 	int (*input2)(struct xfrm_state *x, struct sk_buff *skb);
 
-	/*
-	 * This is the actual input entry point.
-	 *
-	 * For transport mode and equivalent this would be identical to
-	 * input2 (which does not need to be set).  While tunnel mode
-	 * and equivalent would set this to the tunnel encapsulation function
-	 * xfrm4_prepare_input that would in turn call input2.
-	 */
-	int (*input)(struct xfrm_state *x, struct sk_buff *skb);
-
 	/*
 	 * Add encapsulation header.
 	 *
@@ -1606,7 +1596,6 @@ int xfrm_init_replay(struct xfrm_state *x);
 int xfrm_state_mtu(struct xfrm_state *x, int mtu);
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload);
 int xfrm_init_state(struct xfrm_state *x);
-int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
 int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
 int xfrm_trans_queue(struct sk_buff *skb,
-- 
cgit v1.2.3


From 0c620e97b3490890facbbe06d5deed9b024de255 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:25 +0100
Subject: xfrm: remove output indirection from xfrm_mode

Same is input indirection.  Only exception: we need to export
xfrm_outer_mode_output for pktgen.

Increases size of vmlinux by about 163 byte:
Before:
   text    data     bss     dec      filename
15730208  6936948 4046908 26714064   vmlinux

After:
15730311  6937008 4046908 26714227   vmlinux

xfrm_inner_extract_output has no more external callers, make it static.

v2: add IS_ENABLED(IPV6) guard in xfrm6_prepare_output
    add two missing breaks in xfrm_outer_mode_output (Sabrina Dubroca)
    add WARN_ON_ONCE for 'call AF_INET6 related output function, but
    CONFIG_IPV6=n' case.
    make xfrm_inner_extract_output static

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 2c5fc9cc367d..01e7e9c0e8a9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -449,17 +449,6 @@ struct xfrm_mode {
 	 */
 	int (*output2)(struct xfrm_state *x,struct sk_buff *skb);
 
-	/*
-	 * This is the actual output entry point.
-	 *
-	 * For transport mode and equivalent this would be identical to
-	 * output2 (which does not need to be set).  While tunnel mode
-	 * and equivalent would set this to a tunnel encapsulation function
-	 * (xfrm4_prepare_output or xfrm6_prepare_output) that would in turn
-	 * call output2.
-	 */
-	int (*output)(struct xfrm_state *x, struct sk_buff *skb);
-
 	/*
 	 * Adjust pointers into the packet and do GSO segmentation.
 	 */
@@ -1603,7 +1592,11 @@ int xfrm_trans_queue(struct sk_buff *skb,
 				   struct sk_buff *));
 int xfrm_output_resume(struct sk_buff *skb, int err);
 int xfrm_output(struct sock *sk, struct sk_buff *skb);
-int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb);
+
+#if IS_ENABLED(CONFIG_NET_PKTGEN)
+int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb);
+#endif
+
 void xfrm_local_error(struct sk_buff *skb, int mtu);
 int xfrm4_extract_header(struct sk_buff *skb);
 int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb);
@@ -1622,7 +1615,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 }
 
 int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb);
-int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb);
 int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err);
@@ -1649,7 +1641,6 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
 __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr);
 __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr);
 int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb);
-int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb);
 int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
-- 
cgit v1.2.3


From 303c5fab1272888b22088fbdd08cb770205ccb7a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:26 +0100
Subject: xfrm: remove xmit indirection from xfrm_mode

There are only two versions (tunnel and transport). The ip/ipv6 versions
are only differ in sizeof(iphdr) vs ipv6hdr.

Place this in the core and use x->outer_mode->encap type to call the
correct adjustment helper.

Before:
   text   data    bss     dec      filename
15730311  6937008 4046908 26714227 vmlinux

After:
15730428  6937008 4046908 26714344 vmlinux

(about 117 byte increase)

v2: use family from x->outer_mode, not inner

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 01e7e9c0e8a9..07966a27e4a4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -454,11 +454,6 @@ struct xfrm_mode {
 	 */
 	struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features);
 
-	/*
-	 * Adjust pointers into the packet when IPsec is done at layer2.
-	 */
-	void (*xmit)(struct xfrm_state *x, struct sk_buff *skb);
-
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
-- 
cgit v1.2.3


From 7613b92b1ae37141704948b77e8762c5de896510 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:27 +0100
Subject: xfrm: remove gso_segment indirection from xfrm_mode

These functions are small and we only have versions for tunnel
and transport mode for ipv4 and ipv6 respectively.

Just place the 'transport or tunnel' conditional in the protocol
specific function instead of using an indirection.

Before:
    3226       12       0     3238   net/ipv4/esp4_offload.o
    7004      492       0     7496   net/ipv4/ip_vti.o
    3339       12       0     3351   net/ipv6/esp6_offload.o
   11294      460       0    11754   net/ipv6/ip6_vti.o
    1180       72       0     1252   net/ipv4/xfrm4_mode_beet.o
     428       48       0      476   net/ipv4/xfrm4_mode_transport.o
    1271       48       0     1319   net/ipv4/xfrm4_mode_tunnel.o
    1083       60       0     1143   net/ipv6/xfrm6_mode_beet.o
     172       48       0      220   net/ipv6/xfrm6_mode_ro.o
     429       48       0      477   net/ipv6/xfrm6_mode_transport.o
    1164       48       0     1212   net/ipv6/xfrm6_mode_tunnel.o
15730428  6937008 4046908 26714344   vmlinux

After:
    3461       12       0     3473   net/ipv4/esp4_offload.o
    7000      492       0     7492   net/ipv4/ip_vti.o
    3574       12       0     3586   net/ipv6/esp6_offload.o
   11295      460       0    11755   net/ipv6/ip6_vti.o
    1180       64       0     1244   net/ipv4/xfrm4_mode_beet.o
     171       40       0      211   net/ipv4/xfrm4_mode_transport.o
    1163       40       0     1203   net/ipv4/xfrm4_mode_tunnel.o
    1083       52       0     1135   net/ipv6/xfrm6_mode_beet.o
     172       40       0      212   net/ipv6/xfrm6_mode_ro.o
     172       40       0      212   net/ipv6/xfrm6_mode_transport.o
    1056       40       0     1096   net/ipv6/xfrm6_mode_tunnel.o
15730424  6937008 4046908 26714340   vmlinux

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 07966a27e4a4..de103a6d1ef8 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -449,11 +449,6 @@ struct xfrm_mode {
 	 */
 	int (*output2)(struct xfrm_state *x,struct sk_buff *skb);
 
-	/*
-	 * Adjust pointers into the packet and do GSO segmentation.
-	 */
-	struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features);
-
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
-- 
cgit v1.2.3


From b3284df1c86f7ac078dcb8fb250fe3d6437e740c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:28 +0100
Subject: xfrm: remove input2 indirection from xfrm_mode

No external dependencies on any module, place this in the core.
Increase is about 1800 byte for xfrm_input.o.

The beet helpers get added to internal header, as they can be reused
from xfrm_output.c in the next patch (kernel contains several
copies of them in the xfrm{4,6}_mode_beet.c files).

Before:
   text    data     bss     dec filename
   5578     176    2364    8118 net/xfrm/xfrm_input.o
   1180      64       0    1244 net/ipv4/xfrm4_mode_beet.o
    171      40       0     211 net/ipv4/xfrm4_mode_transport.o
   1163      40       0    1203 net/ipv4/xfrm4_mode_tunnel.o
   1083      52       0    1135 net/ipv6/xfrm6_mode_beet.o
    172      40       0     212 net/ipv6/xfrm6_mode_ro.o
    172      40       0     212 net/ipv6/xfrm6_mode_transport.o
   1056      40       0    1096 net/ipv6/xfrm6_mode_tunnel.o

After:
   text    data     bss     dec filename
   7373     200    2364    9937 net/xfrm/xfrm_input.o
    587      44       0     631 net/ipv4/xfrm4_mode_beet.o
    171      32       0     203 net/ipv4/xfrm4_mode_transport.o
    649      32       0     681 net/ipv4/xfrm4_mode_tunnel.o
    625      44       0     669 net/ipv6/xfrm6_mode_beet.o
    172      32       0     204 net/ipv6/xfrm6_mode_ro.o
    172      32       0     204 net/ipv6/xfrm6_mode_transport.o
    599      32       0     631 net/ipv6/xfrm6_mode_tunnel.o

v2: pass inner_mode to xfrm_inner_mode_encap_remove to fix
    AF_UNSPEC selector breakage (bisected by Benedict Wong)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index de103a6d1ef8..bdda545cf740 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	/*
-	 * Remove encapsulation header.
-	 *
-	 * The IP header will be moved over the top of the encapsulation
-	 * header.
-	 *
-	 * On entry, the transport header shall point to where the IP header
-	 * should be and the network header shall be set to where the IP
-	 * header currently is.  skb->data shall point to the start of the
-	 * payload.
-	 */
-	int (*input2)(struct xfrm_state *x, struct sk_buff *skb);
-
 	/*
 	 * Add encapsulation header.
 	 *
-- 
cgit v1.2.3


From 1de70830066b72b6a8e259e5363f6c0bc4ba7bbc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:29 +0100
Subject: xfrm: remove output2 indirection from xfrm_mode

similar to previous patch: no external module dependencies,
so we can avoid the indirection by placing this in the core.

This change removes the last indirection from xfrm_mode and the
xfrm4|6_mode_{beet,tunnel}.c modules contain (almost) no code anymore.

Before:
   text    data     bss     dec     hex filename
   3957     136       0    4093     ffd net/xfrm/xfrm_output.o
    587      44       0     631     277 net/ipv4/xfrm4_mode_beet.o
    649      32       0     681     2a9 net/ipv4/xfrm4_mode_tunnel.o
    625      44       0     669     29d net/ipv6/xfrm6_mode_beet.o
    599      32       0     631     277 net/ipv6/xfrm6_mode_tunnel.o
After:
   text    data     bss     dec     hex filename
   5359     184       0    5543    15a7 net/xfrm/xfrm_output.o
    171      24       0     195      c3 net/ipv4/xfrm4_mode_beet.o
    171      24       0     195      c3 net/ipv4/xfrm4_mode_tunnel.o
    172      24       0     196      c4 net/ipv6/xfrm6_mode_beet.o
    172      24       0     196      c4 net/ipv6/xfrm6_mode_tunnel.o

v2: fold the *encap_add functions into xfrm*_prepare_output
    preserve (move) output2 comment (Sabrina)
    use x->outer_mode->encap, not inner
    fix a build breakage on ppc (kbuild robot)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index bdda545cf740..4351444c10fc 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	/*
-	 * Add encapsulation header.
-	 *
-	 * On exit, the transport header will be set to the start of the
-	 * encapsulation header to be filled in by x->type->output and
-	 * the mac header will be set to the nextheader (protocol for
-	 * IPv4) field of the extension header directly preceding the
-	 * encapsulation header, or in its absence, that of the top IP
-	 * header.  The value of the network header will always point
-	 * to the top IP header while skb->data will point to the payload.
-	 */
-	int (*output2)(struct xfrm_state *x,struct sk_buff *skb);
-
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
-- 
cgit v1.2.3


From 733a5fac2f15b55b9059230d098ed04341d2d884 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:30 +0100
Subject: xfrm: remove afinfo pointer from xfrm_mode

Adds an EXPORT_SYMBOL for afinfo_get_rcu, as it will now be called from
ipv6 in case of CONFIG_IPV6=m.

This change has virtually no effect on vmlinux size, but it reduces
afinfo size and allows followup patch to make xfrm modes const.

v2: mark if (afinfo) tests as likely (Sabrina)
    re-fetch afinfo according to inner_mode in xfrm_prepare_input().

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4351444c10fc..8d1c9506bcf6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -423,7 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
 	u8 family;
-- 
cgit v1.2.3


From 4c145dce26013763490df88f2473714f5bc7857d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:31 +0100
Subject: xfrm: make xfrm modes builtin

after previous changes, xfrm_mode contains no function pointers anymore
and all modules defining such struct contain no code except an init/exit
functions to register the xfrm_mode struct with the xfrm core.

Just place the xfrm modes core and remove the modules,
the run-time xfrm_mode register/unregister functionality is removed.

Before:

    text    data     bss      dec filename
    7523     200    2364    10087 net/xfrm/xfrm_input.o
   40003     628     440    41071 net/xfrm/xfrm_state.o
15730338 6937080 4046908 26714326 vmlinux

    7389     200    2364    9953  net/xfrm/xfrm_input.o
   40574     656     440   41670  net/xfrm/xfrm_state.o
15730084 6937068 4046908 26714060 vmlinux

The xfrm*_mode_{transport,tunnel,beet} modules are gone.

v2: replace CONFIG_INET6_XFRM_MODE_* IS_ENABLED guards with CONFIG_IPV6
    ones rather than removing them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 8d1c9506bcf6..4ca79cdc3460 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -234,9 +234,9 @@ struct xfrm_state {
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	const struct xfrm_type	*type;
-	struct xfrm_mode	*inner_mode;
-	struct xfrm_mode	*inner_mode_iaf;
-	struct xfrm_mode	*outer_mode;
+	const struct xfrm_mode	*inner_mode;
+	const struct xfrm_mode	*inner_mode_iaf;
+	const struct xfrm_mode	*outer_mode;
 
 	const struct xfrm_type_offload	*type_offload;
 
@@ -347,7 +347,6 @@ struct xfrm_state_afinfo {
 	struct module			*owner;
 	const struct xfrm_type		*type_map[IPPROTO_MAX];
 	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
-	struct xfrm_mode		*mode_map[XFRM_MODE_MAX];
 
 	int			(*init_flags)(struct xfrm_state *x);
 	void			(*init_tempsel)(struct xfrm_selector *sel,
@@ -423,7 +422,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	struct module *owner;
 	u8 encap;
 	u8 family;
 	u8 flags;
@@ -434,9 +432,6 @@ enum {
 	XFRM_MODE_FLAG_TUNNEL = 1,
 };
 
-int xfrm_register_mode(struct xfrm_mode *mode);
-void xfrm_unregister_mode(struct xfrm_mode *mode);
-
 static inline int xfrm_af2proto(unsigned int family)
 {
 	switch(family) {
@@ -449,7 +444,7 @@ static inline int xfrm_af2proto(unsigned int family)
 	}
 }
 
-static inline struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
+static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
 {
 	if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
 	    (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
-- 
cgit v1.2.3


From c9500d7b7de8ff6ac88ee3e38b782889f1616593 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:32 +0100
Subject: xfrm: store xfrm_mode directly, not its address

This structure is now only 4 bytes, so its more efficient
to cache a copy rather than its address.

No significant size difference in allmodconfig vmlinux.

With non-modular kernel that has all XFRM options enabled, this
series reduces vmlinux image size by ~11kb. All xfrm_mode
indirections are gone and all modes are built-in.

before (ipsec-next master):
    text      data      bss         dec   filename
21071494   7233140 11104324    39408958   vmlinux.master

after this series:
21066448   7226772 11104324    39397544   vmlinux.patched

With allmodconfig kernel, the size increase is only 362 bytes,
even all the xfrm config options removed in this series are
modular.

before:
    text      data     bss      dec   filename
15731286   6936912 4046908 26715106   vmlinux.master

after this series:
15731492   6937068  4046908  26715468 vmlinux

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4ca79cdc3460..77eb578a0384 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -132,6 +132,17 @@ struct xfrm_state_offload {
 	u8			flags;
 };
 
+struct xfrm_mode {
+	u8 encap;
+	u8 family;
+	u8 flags;
+};
+
+/* Flags for xfrm_mode. */
+enum {
+	XFRM_MODE_FLAG_TUNNEL = 1,
+};
+
 /* Full description of state of transformer. */
 struct xfrm_state {
 	possible_net_t		xs_net;
@@ -234,9 +245,9 @@ struct xfrm_state {
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	const struct xfrm_type	*type;
-	const struct xfrm_mode	*inner_mode;
-	const struct xfrm_mode	*inner_mode_iaf;
-	const struct xfrm_mode	*outer_mode;
+	struct xfrm_mode	inner_mode;
+	struct xfrm_mode	inner_mode_iaf;
+	struct xfrm_mode	outer_mode;
 
 	const struct xfrm_type_offload	*type_offload;
 
@@ -421,17 +432,6 @@ struct xfrm_type_offload {
 int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
-struct xfrm_mode {
-	u8 encap;
-	u8 family;
-	u8 flags;
-};
-
-/* Flags for xfrm_mode. */
-enum {
-	XFRM_MODE_FLAG_TUNNEL = 1,
-};
-
 static inline int xfrm_af2proto(unsigned int family)
 {
 	switch(family) {
@@ -448,9 +448,9 @@ static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, i
 {
 	if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
 	    (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
-		return x->inner_mode;
+		return &x->inner_mode;
 	else
-		return x->inner_mode_iaf;
+		return &x->inner_mode_iaf;
 }
 
 struct xfrm_tmpl {
@@ -1990,7 +1990,7 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
 			tunnel = true;
 		break;
 	}
-	if (tunnel && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL))
+	if (tunnel && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL))
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.2.3


From 1e1b11b6a1111cd9e8af1fd6ccda270a9fa3eacf Mon Sep 17 00:00:00 2001
From: vamsi krishna <vamsin@codeaurora.org>
Date: Fri, 1 Feb 2019 18:34:51 +0530
Subject: nl80211/cfg80211: Specify band specific min RSSI thresholds with
 sched scan

This commit adds the support to specify the RSSI thresholds per
band for each match set. This enhances the current behavior which
specifies a single rssi_threshold across all the bands by
introducing the rssi_threshold_per_band. These per band rssi
thresholds are referred through NL80211_BAND_* (enum nl80211_band)
variables  as attribute types. Such attributes/values per each
band are nested through NL80211_ATTR_SCHED_SCAN_MIN_RSSI.
These band specific rssi thresholds shall take precedence over
the current rssi_thold per match set.
Drivers indicate this support through
%NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD.
These per band rssi attributes/values does not specify
"default RSSI filter" as done by
NL80211_SCHED_SCAN_MATCH_ATTR_RSSI to stay backward compatible.
That said, these per band rssi values have to be specified for
the corresponding matchset.

Signed-off-by: vamsi krishna <vamsin@codeaurora.org>
Signed-off-by: Srinivas Dasari <dasaris@codeaurora.org>
[rebase on refactoring, add policy]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index bb307a11ee63..b13234a486e7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1832,11 +1832,19 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
  * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match
  *	or no match (RSSI only)
  * @rssi_thold: don't report scan results below this threshold (in s32 dBm)
+ * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied
+ *	for filtering out scan results received. Drivers advertize this support
+ *	of band specific rssi based filtering through the feature capability
+ *	%NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band
+ *	specific rssi thresholds take precedence over rssi_thold, if specified.
+ *	If not specified for any band, it will be assigned with rssi_thold of
+ *	corresponding matchset.
  */
 struct cfg80211_match_set {
 	struct cfg80211_ssid ssid;
 	u8 bssid[ETH_ALEN];
 	s32 rssi_thold;
+	s32 per_band_rssi_thold[NUM_NL80211_BANDS];
 };
 
 /**
-- 
cgit v1.2.3


From ab60633c7136c300f15a390f3469d7c4be15a055 Mon Sep 17 00:00:00 2001
From: Narayanraddi Masti <team.nmasti@gmail.com>
Date: Thu, 7 Feb 2019 12:16:05 -0800
Subject: mac80211: Add support for NL80211_STA_INFO_AIRTIME_LINK_METRIC

Add support for mesh airtime link metric attribute
NL80211_STA_INFO_AIRTIME_LINK_METRIC.

Signed-off-by: Narayanraddi Masti <team.nmasti@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b13234a486e7..5859a5e02454 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1327,6 +1327,7 @@ struct cfg80211_tid_stats {
  * @fcs_err_count: number of packets (MPDUs) received from this station with
  *	an FCS error. This counter should be incremented only when TA of the
  *	received packet with an FCS error matches the peer MAC address.
+ * @airtime_link_metric: mesh airtime link metric.
  */
 struct station_info {
 	u64 filled;
@@ -1381,6 +1382,8 @@ struct station_info {
 
 	u32 rx_mpdu_count;
 	u32 fcs_err_count;
+
+	u32 airtime_link_metric;
 };
 
 #if IS_ENABLED(CONFIG_CFG80211)
-- 
cgit v1.2.3


From cb74e9775871f8c82a1297cf76209f10ab5bbe3d Mon Sep 17 00:00:00 2001
From: Sunil Dutt <usdutt@codeaurora.org>
Date: Wed, 20 Feb 2019 16:18:07 +0530
Subject: cfg80211/nl80211: Offload OWE processing to user space in AP mode

This interface allows the host driver to offload OWE processing
to user space. This intends to support OWE (Opportunistic Wireless
Encryption) AKM by the drivers that implement SME but rely on the
user space for the cryptographic/OWE processing in AP mode. Such
drivers are not capable of processing/deriving the DH IE.

A new NL80211 command - NL80211_CMD_UPDATE_OWE_INFO is introduced
to send the request/event between the host driver and user space.

Driver shall provide the OWE info (MAC address and DH IE) of
the peer to user space for cryptographic processing of the DH IE
through the event. Accordingly, the user space shall update the
OWE info/DH IE to the driver.

Following is the sequence in AP mode for OWE authentication.

Driver passes the OWE info obtained from the peer in the
Association Request to the user space through the event
cfg80211_update_owe_info_event. User space shall process the
OWE info received and generate new OWE info. This OWE info is
passed to the driver through NL80211_CMD_UPDATE_OWE_INFO
request. Driver eventually uses this OWE info to send the
Association Response to the peer.

This OWE info in the command interface carries the IEs that include
PMKID of the peer if the PMKSA is still valid or an updated DH IE
for generating a new PMKSA with the peer.

Signed-off-by: Liangwei Dong <liangwei@codeaurora.org>
Signed-off-by: Sunil Dutt <usdutt@codeaurora.org>
Signed-off-by: Srinivas Dasari <dasaris@codeaurora.org>
[remove policy initialization - no longer exists]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5859a5e02454..70432fd638af 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3110,6 +3110,32 @@ struct cfg80211_pmsr_request {
 	struct cfg80211_pmsr_request_peer peers[];
 };
 
+/**
+ * struct cfg80211_update_owe_info - OWE Information
+ *
+ * This structure provides information needed for the drivers to offload OWE
+ * (Opportunistic Wireless Encryption) processing to the user space.
+ *
+ * Commonly used across update_owe_info request and event interfaces.
+ *
+ * @peer: MAC address of the peer device for which the OWE processing
+ *	has to be done.
+ * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info
+ *	processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space
+ *	cannot give you the real status code for failures. Used only for
+ *	OWE update request command interface (user space to driver).
+ * @ie: IEs obtained from the peer or constructed by the user space. These are
+ *	the IEs of the remote peer in the event from the host driver and
+ *	the constructed IEs by the user space in the request interface.
+ * @ie_len: Length of IEs in octets.
+ */
+struct cfg80211_update_owe_info {
+	u8 peer[ETH_ALEN] __aligned(2);
+	u16 status;
+	const u8 *ie;
+	size_t ie_len;
+};
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -3447,6 +3473,10 @@ struct cfg80211_pmsr_request {
  *	Statistics should be cumulative, currently no way to reset is provided.
  * @start_pmsr: start peer measurement (e.g. FTM)
  * @abort_pmsr: abort peer measurement
+ *
+ * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME
+ *	but offloading OWE processing to the user space will get the updated
+ *	DH IE through this interface.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3761,6 +3791,8 @@ struct cfg80211_ops {
 			      struct cfg80211_pmsr_request *request);
 	void	(*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
 			      struct cfg80211_pmsr_request *request);
+	int	(*update_owe_info)(struct wiphy *wiphy, struct net_device *dev,
+				   struct cfg80211_update_owe_info *owe_info);
 };
 
 /*
@@ -7219,4 +7251,14 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev,
 #define wiphy_WARN(wiphy, format, args...)			\
 	WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args);
 
+/**
+ * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space
+ * @netdev: network device
+ * @owe_info: peer's owe info
+ * @gfp: allocation flags
+ */
+void cfg80211_update_owe_info_event(struct net_device *netdev,
+				    struct cfg80211_update_owe_info *owe_info,
+				    gfp_t gfp);
+
 #endif /* __NET_CFG80211_H */
-- 
cgit v1.2.3


From fd69c399c7d6262086b6b820757c6aeaa71feeba Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 8 Apr 2019 10:15:59 +0200
Subject: datagram: remove rendundant 'peeked' argument

After commit a297569fe00a ("net/udp: do not touch skb->peeked unless
really needed") the 'peeked' argument of __skb_try_recv_datagram()
and friends is always equal to !!'flags & MSG_PEEK'.

Since such argument is really a boolean info, and the callers have
already 'flags & MSG_PEEK' handy, we can remove it and clean-up the
code a bit.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/udp.h b/include/net/udp.h
index fd6d948755c8..d8ce937bc395 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -269,13 +269,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
 struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
-			       int noblock, int *peeked, int *off, int *err);
+			       int noblock, int *off, int *err);
 static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
 					   int noblock, int *err)
 {
-	int peeked, off = 0;
+	int off = 0;
 
-	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
+	return __skb_recv_udp(sk, flags, noblock, &off, err);
 }
 
 int udp_v4_early_demux(struct sk_buff *skb);
-- 
cgit v1.2.3


From 84c0d5e96f3ae20344fb3a79161eab18905dae56 Mon Sep 17 00:00:00 2001
From: Jacky Hu <hengqing.hu@gmail.com>
Date: Tue, 26 Mar 2019 18:31:21 +0800
Subject: ipvs: allow tunneling with gue encapsulation

ipip packets are blocked in some public cloud environments, this patch
allows gue encapsulation with the tunneling method, which would make
tunneling working in those environments.

Signed-off-by: Jacky Hu <hengqing.hu@gmail.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 047f9a5ccaad..2ac40135b576 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -600,6 +600,9 @@ struct ip_vs_dest_user_kern {
 
 	/* Address family of addr */
 	u16			af;
+
+	u16			tun_type;	/* tunnel type */
+	__be16			tun_port;	/* tunnel port */
 };
 
 
@@ -660,6 +663,8 @@ struct ip_vs_dest {
 	atomic_t		conn_flags;	/* flags to copy to conn */
 	atomic_t		weight;		/* server weight */
 	atomic_t		last_weight;	/* server latest weight */
+	__u16			tun_type;	/* tunnel type */
+	__be16			tun_port;	/* tunnel port */
 
 	refcount_t		refcnt;		/* reference counter */
 	struct ip_vs_stats      stats;          /* statistics */
-- 
cgit v1.2.3


From d164385ec572cbe3335a635ac308760e126d4ec0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Mar 2019 09:22:24 +0100
Subject: netfilter: nat: add inet family nat support

We need minimal support from the nat core for this, as we do not
want to register additional base hooks.

When an inet hook is registered, interally register ipv4 and ipv6
hooks for them and unregister those when inet hooks are removed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index cf332c4e0b32..423cda2c6542 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -69,9 +69,9 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum,
 #endif
 }
 
-int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		       const struct nf_hook_ops *nat_ops, unsigned int ops_count);
-void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
+void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 			  unsigned int ops_count);
 
 unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
@@ -98,6 +98,9 @@ void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
 int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops);
 void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
 
+int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops);
+void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
+
 unsigned int
 nf_nat_inet_fn(void *priv, struct sk_buff *skb,
 	       const struct nf_hook_state *state);
-- 
cgit v1.2.3


From c1deb065cf3b5bcd483e3f03479f930edb151b99 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Mar 2019 09:22:25 +0100
Subject: netfilter: nf_tables: merge route type into core

very little code, so it really doesn't make sense to have extra
modules or even a kconfig knob for this.

Merge them and make functionality available unconditionally.
The merge makes inet family route support trivial, so add it
as well here.

Before:
   text	   data	    bss	    dec	    hex	filename
    835	    832	      0	   1667	    683 nft_chain_route_ipv4.ko
    870	    832	      0	   1702	    6a6	nft_chain_route_ipv6.ko
 111568	   2556	    529	 114653	  1bfdd	nf_tables.ko

After:
   text	   data	    bss	    dec	    hex	filename
 113133	   2556	    529	 116218	  1c5fa	nf_tables.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3e9ab643eedf..55dff3ab44c7 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1411,4 +1411,6 @@ struct nft_trans_flowtable {
 int __init nft_chain_filter_init(void);
 void nft_chain_filter_fini(void);
 
+void __init nft_chain_route_init(void);
+void nft_chain_route_fini(void);
 #endif /* _NET_NF_TABLES_H */
-- 
cgit v1.2.3


From 4806e975729f99c7908d1688a143f1e16d464e6c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Mar 2019 09:22:26 +0100
Subject: netfilter: replace NF_NAT_NEEDED with IS_ENABLED(CONFIG_NF_NAT)

NF_NAT_NEEDED is true whenever nat support for either ipv4 or ipv6 is
enabled.  Now that the af-specific nat configuration switches have been
removed, IS_ENABLED(CONFIG_NF_NAT) has the same effect.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_expect.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index 006e430d1cdf..93ce6b0daaba 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -48,7 +48,7 @@ struct nf_conntrack_expect {
 	/* Expectation class */
 	unsigned int class;
 
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
 	union nf_inet_addr saved_addr;
 	/* This is the original per-proto part, used to map the
 	 * expected connection the way the recipient expects. */
-- 
cgit v1.2.3


From 3b0a081db1f730373993c7a27936778402a3322c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 4 Apr 2019 10:58:20 +0200
Subject: netfilter: make two functions static

They have no external callers anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 55dff3ab44c7..2d5a0a1a87b8 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -475,8 +475,6 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
 			      enum nft_trans_phase phase);
 int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		       struct nft_set_binding *binding);
-void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
-			  struct nft_set_binding *binding, bool commit);
 void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);
 
 /**
-- 
cgit v1.2.3


From 1aefd3de7bc667115bb77cb0bc21e874c7e190fc Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:24 -0700
Subject: ipv6: Add fib6_nh_init and release to stubs

Add fib6_nh_init and fib6_nh_release to ipv6_stubs. If fib6_nh_init fails,
callers should not invoke fib6_nh_release, so there is no reason to have
a dummy stub for the IPv6 is not enabled case.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6_stubs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index d8d9c0b0e8c0..453b55bf6723 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -12,6 +12,8 @@
 
 /* structs from net/ip6_fib.h */
 struct fib6_info;
+struct fib6_nh;
+struct fib6_config;
 
 /* This is ugly, ideally these symbols should be built
  * into the core kernel.
@@ -40,6 +42,10 @@ struct ipv6_stub {
 	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
 				 struct in6_addr *saddr);
 
+	int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh,
+			    struct fib6_config *cfg, gfp_t gfp_flags,
+			    struct netlink_ext_ack *extack);
+	void (*fib6_nh_release)(struct fib6_nh *fib6_nh);
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
-- 
cgit v1.2.3


From 71df5777aaaeff673c242a49b945b1b96fe81718 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:25 -0700
Subject: ipv6: Add neighbor helpers that use the ipv6 stub

Add ipv6 helpers to handle ndisc references via the stub. Update
bpf_ipv6_fib_lookup to use __ipv6_neigh_lookup_noref_stub instead of
the open code ___neigh_lookup_noref with the stub.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ndisc.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index ddfbb591e2c5..366150053043 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -2,6 +2,8 @@
 #ifndef _NDISC_H
 #define _NDISC_H
 
+#include <net/ipv6_stubs.h>
+
 /*
  *	ICMP codes for neighbour discovery messages
  */
@@ -379,6 +381,14 @@ static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev
 	return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev);
 }
 
+static inline
+struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev,
+						 const void *pkey)
+{
+	return ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				     ndisc_hashfn, pkey, dev);
+}
+
 static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey)
 {
 	struct neighbour *n;
@@ -409,6 +419,36 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev,
 	rcu_read_unlock_bh();
 }
 
+static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
+					     const void *pkey)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __ipv6_neigh_lookup_noref_stub(dev, pkey);
+	if (n) {
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+	}
+	rcu_read_unlock_bh();
+}
+
+/* uses ipv6_stub and is meant for use outside of IPv6 core */
+static inline struct neighbour *ip_neigh_gw6(struct net_device *dev,
+					     const void *addr)
+{
+	struct neighbour *neigh;
+
+	neigh = __ipv6_neigh_lookup_noref_stub(dev, addr);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(ipv6_stub->nd_tbl, addr, dev, false);
+
+	return neigh;
+}
+
 int ndisc_init(void);
 int ndisc_late_init(void);
 
-- 
cgit v1.2.3


From bdf004677107e3b847c5db09c9fbf8edefa24996 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:26 -0700
Subject: net: Replace nhc_has_gw with nhc_gw_family

Allow the gateway in a fib_nh_common to be from a different address
family than the outer fib{6}_nh. To that end, replace nhc_has_gw with
nhc_gw_family and update users of nhc_has_gw to check nhc_gw_family.
Now nhc_family is used to know if the nh_common is part of a fib_nh
or fib6_nh (used for container_of to get to route family specific data),
and nhc_gw_family represents the address family for the gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 include/net/ip_fib.h    | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 342180a7285c..5909fc421305 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -69,7 +69,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
 	return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) &&
-		f6i->fib6_nh.fib_nh_has_gw;
+		f6i->fib6_nh.fib_nh_gw_family;
 }
 
 void ip6_route_input(struct sk_buff *skb);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 3ce07841dc3b..c68a40435ee0 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -83,8 +83,8 @@ struct fib_nh_common {
 	struct lwtunnel_state	*nhc_lwtstate;
 	unsigned char		nhc_scope;
 	u8			nhc_family;
-	u8			nhc_has_gw:1,
-				unused:7;
+	u8			nhc_gw_family;
+
 	union {
 		__be32          ipv4;
 		struct in6_addr ipv6;
@@ -112,8 +112,7 @@ struct fib_nh {
 #define fib_nh_flags		nh_common.nhc_flags
 #define fib_nh_lws		nh_common.nhc_lwtstate
 #define fib_nh_scope		nh_common.nhc_scope
-#define fib_nh_family		nh_common.nhc_family
-#define fib_nh_has_gw		nh_common.nhc_has_gw
+#define fib_nh_gw_family	nh_common.nhc_gw_family
 #define fib_nh_gw4		nh_common.nhc_gw.ipv4
 #define fib_nh_gw6		nh_common.nhc_gw.ipv6
 #define fib_nh_weight		nh_common.nhc_weight
-- 
cgit v1.2.3


From 1550c171935d264f522581fd037db5e64a716bb6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:27 -0700
Subject: ipv4: Prepare rtable for IPv6 gateway

To allow the gateway to be either an IPv4 or IPv6 address, remove
rt_uses_gateway from rtable and replace with rt_gw_family. If
rt_gw_family is set it implies rt_uses_gateway. Rename rt_gateway
to rt_gw4 to represent the IPv4 version.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index 9883dc82f723..96912b099c08 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -55,12 +55,12 @@ struct rtable {
 	unsigned int		rt_flags;
 	__u16			rt_type;
 	__u8			rt_is_input;
-	__u8			rt_uses_gateway;
+	u8			rt_gw_family;
 
 	int			rt_iif;
 
 	/* Info on neighbour */
-	__be32			rt_gateway;
+	__be32			rt_gw4;
 
 	/* Miscellaneous cached information */
 	u32			rt_mtu_locked:1,
@@ -82,8 +82,8 @@ static inline bool rt_is_output_route(const struct rtable *rt)
 
 static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
 {
-	if (rt->rt_gateway)
-		return rt->rt_gateway;
+	if (rt->rt_gw_family == AF_INET)
+		return rt->rt_gw4;
 	return daddr;
 }
 
-- 
cgit v1.2.3


From f35b794b3b405e2478654ea875bc0b29fe1a1bc5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:28 -0700
Subject: ipv4: Prepare fib_config for IPv6 gateway

Similar to rtable, fib_config needs to allow the gateway to be either an
IPv4 or an IPv6 address. To that end, rename fc_gw to fc_gw4 to mean an
IPv4 address and add fc_gw_family. Checks on 'is a gateway set' are changed
to see if fc_gw_family is set. In the process prepare the code for a
fc_gw_family == AF_INET6.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index c68a40435ee0..1f72ad553c31 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -32,10 +32,11 @@ struct fib_config {
 	u8			fc_protocol;
 	u8			fc_scope;
 	u8			fc_type;
-	/* 3 bytes unused */
+	u8			fc_gw_family;
+	/* 2 bytes unused */
 	u32			fc_table;
 	__be32			fc_dst;
-	__be32			fc_gw;
+	__be32			fc_gw4;
 	int			fc_oif;
 	u32			fc_flags;
 	u32			fc_priority;
-- 
cgit v1.2.3


From 0f5f7d7bf6e6bda4dffe7b42812a16ada6ea9816 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:29 -0700
Subject: ipv4: Add support to rtable for ipv6 gateway

Add support for an IPv6 gateway to rtable. Since a gateway is either
IPv4 or IPv6, make it a union with rt_gw4 where rt_gw_family decides
which address is in use.

When dumping the route data, encode an ipv6 nexthop using RTA_VIA.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index 96912b099c08..5d28a2509b58 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -60,7 +60,10 @@ struct rtable {
 	int			rt_iif;
 
 	/* Info on neighbour */
-	__be32			rt_gw4;
+	union {
+		__be32		rt_gw4;
+		struct in6_addr	rt_gw6;
+	};
 
 	/* Miscellaneous cached information */
 	u32			rt_mtu_locked:1,
-- 
cgit v1.2.3


From a4ea5d43c807be28545625c1e0641905022fa0d1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:30 -0700
Subject: ipv4: Add support to fib_config for IPv6 gateway

Add support for an IPv6 gateway to fib_config. Since a gateway is either
IPv4 or IPv6, make it a union with fc_gw4 where fc_gw_family decides
which address is in use. Update current checks on family and gw4 to
handle ipv6 as well.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1f72ad553c31..f1c452f618a9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -36,7 +36,10 @@ struct fib_config {
 	/* 2 bytes unused */
 	u32			fc_table;
 	__be32			fc_dst;
-	__be32			fc_gw4;
+	union {
+		__be32		fc_gw4;
+		struct in6_addr	fc_gw6;
+	};
 	int			fc_oif;
 	u32			fc_flags;
 	u32			fc_priority;
-- 
cgit v1.2.3


From 0353f28231c79416191326810e7fe656b69c63b7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:33 -0700
Subject: neighbor: Add skip_cache argument to neigh_output

A later patch allows an IPv6 gateway with an IPv4 route. The neighbor
entry will exist in the v6 ndisc table and the cached header will contain
the ipv6 protocol which is wrong for an IPv4 packet. For an IPv4 packet to
use the v6 neighbor entry, neigh_output needs to skip the cached header
and just use the output callback for the neigh entry.

A future patchset can look at expanding the hh_cache to handle 2
protocols. For now, IPv6 gateways with an IPv4 route will take the
extra overhead of generating the header.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 7c1ab9edba03..3e5438bd0101 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -498,11 +498,12 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb
 	return dev_queue_xmit(skb);
 }
 
-static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
+static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
+			       bool skip_cache)
 {
 	const struct hh_cache *hh = &n->hh;
 
-	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
+	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
 		return neigh_hh_output(hh, skb);
 	else
 		return n->output(n, skb);
-- 
cgit v1.2.3


From 5c9f7c1dfc2e0776551ef1ceb335187c6698d1ff Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:34 -0700
Subject: ipv4: Add helpers for neigh lookup for nexthop

A common theme in the output path is looking up a neigh entry for a
nexthop, either the gateway in an rtable or a fallback to the daddr
in the skb:

        nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr);
        neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);

To allow the nexthop to be an IPv6 address we need to consider the
family of the nexthop and then call __ipv{4,6}_neigh_lookup_noref based
on it.

To make this simpler, add a ip_neigh_gw4 helper similar to ip_neigh_gw6
added in an earlier patch which handles:

        neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);

And then add a second one, ip_neigh_for_gw, that calls either
ip_neigh_gw4 or ip_neigh_gw6 based on the address family of the gateway.

Update the output paths in the VRF driver and core v4 code to use
ip_neigh_for_gw simplifying the family based lookup and making both
ready for a v6 nexthop.

ipv4_neigh_lookup has a different need - the potential to resolve a
passed in address in addition to any gateway in the rtable or skb. Since
this is a one-off, add ip_neigh_gw4 and ip_neigh_gw6 diectly. The
difference between __neigh_create used by the helpers and neigh_create
called by ipv4_neigh_lookup is taking a refcount, so add rcu_read_lock_bh
and bump the refcnt on the neigh entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/net')

diff --git a/include/net/route.h b/include/net/route.h
index 5d28a2509b58..96f6c9ae33c2 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -29,6 +29,8 @@
 #include <net/flow.h>
 #include <net/inet_sock.h>
 #include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
 #include <linux/in_route.h>
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
@@ -350,4 +352,34 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 	return hoplimit;
 }
 
+static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
+					     __be32 daddr)
+{
+	struct neighbour *neigh;
+
+	neigh = __ipv4_neigh_lookup_noref(dev, daddr);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&arp_tbl, &daddr, dev, false);
+
+	return neigh;
+}
+
+static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
+						struct sk_buff *skb,
+						bool *is_v6gw)
+{
+	struct net_device *dev = rt->dst.dev;
+	struct neighbour *neigh;
+
+	if (likely(rt->rt_gw_family == AF_INET)) {
+		neigh = ip_neigh_gw4(dev, rt->rt_gw4);
+	} else if (rt->rt_gw_family == AF_INET6) {
+		neigh = ip_neigh_gw6(dev, &rt->rt_gw6);
+		*is_v6gw = true;
+	} else {
+		neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr);
+	}
+	return neigh;
+}
+
 #endif	/* _ROUTE_H */
-- 
cgit v1.2.3


From 19a9d136f198cd7c4e26ea6897a0cf067d3f7ecb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:39 -0700
Subject: ipv4: Flag fib_info with a fib_nh using IPv6 gateway

Until support is added to the offload drivers, they need to be able to
reject routes with an IPv6 gateway. To that end add a flag to fib_info
that indicates if any fib_nh has a v6 gateway. The flag allows the drivers
to efficiently know the use of a v6 gateway without walking all fib_nh
tied to a fib_info each time a route is added.

Update mlxsw and rocker to reject the routes with extack message as to why.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f1c452f618a9..337106469ec5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -147,6 +147,7 @@ struct fib_info {
 #define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
 #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
 	int			fib_nhs;
+	bool			fib_nh_is_v6;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].fib_nh_dev
-- 
cgit v1.2.3


From d15662682db232da77136cd348f4c9df312ca6f9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:40 -0700
Subject: ipv4: Allow ipv6 gateway with ipv4 routes

Add support for RTA_VIA and allow an IPv6 nexthop for v4 routes:
   $ ip ro add 172.16.1.0/24 via inet6 2001:db8::1 dev eth0
   $ ip ro ls
   ...
   172.16.1.0/24 via inet6 2001:db8::1 dev eth0

For convenience and simplicity, userspace can use RTA_VIA to specify
AF_INET or AF_INET6 gateway.

The common fib_nexthop_info dump function compares the gateway address
family to the nh_common family to know if the gateway should be encoded
as RTA_VIA or RTA_GATEWAY.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 337106469ec5..d8195c77e247 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -401,6 +401,8 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net,
 /* Exported by fib_frontend.c */
 extern const struct nla_policy rtm_ipv4_policy[];
 void ip_fib_init(void);
+int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
+		    struct netlink_ext_ack *extack);
 __be32 fib_compute_spec_dst(struct sk_buff *skb);
 bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev);
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
-- 
cgit v1.2.3


From 1f5e6fdd6aec7929e67afad1e42e35d894a119ae Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:38 +0200
Subject: net: sched: prefer qdisc_is_empty() over direct qlen access

When checking for root qdisc queue length, do not access directly q.qlen.
In the following patches we will move back qlen accounting to per CPU
values for NOLOCK qdiscs.

Instead, prefer the qdisc_is_empty() helper usage.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 0aea0e262452..7ecb6127e980 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -747,7 +747,7 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev)
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 		const struct Qdisc *q = rcu_dereference(txq->qdisc);
 
-		if (q->q.qlen) {
+		if (!qdisc_is_empty(q)) {
 			rcu_read_unlock();
 			return false;
 		}
-- 
cgit v1.2.3


From 9c01c9f1f2a3ddbddbf3b233cc6bfa86f5a59af0 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:39 +0200
Subject: net: sched: always do stats accounting according to TCQ_F_CPUSTATS

The core sched implementation checks independently for NOLOCK flag
to acquire/release the root spin lock and for qdisc_is_percpu_stats()
to account per CPU values in many places.

This change update the last few places checking the TCQ_F_NOLOCK to
do per CPU stats accounting according to qdisc_is_percpu_stats()
value.

The above allows to clean dev_requeue_skb() implementation a bit
and makes stats update always consistent with a single flag.

v1 -> v2:
 - do not move qdisc_is_empty definition, fix build issue

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 7ecb6127e980..ed56474cfe3b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -146,9 +146,14 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
+static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
+{
+	return q->flags & TCQ_F_CPUSTATS;
+}
+
 static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
 {
-	if (qdisc->flags & TCQ_F_NOLOCK)
+	if (qdisc_is_percpu_stats(qdisc))
 		return qdisc->empty;
 	return !qdisc->q.qlen;
 }
@@ -490,7 +495,7 @@ static inline u32 qdisc_qlen_sum(const struct Qdisc *q)
 {
 	u32 qlen = q->qstats.qlen;
 
-	if (q->flags & TCQ_F_NOLOCK)
+	if (qdisc_is_percpu_stats(q))
 		qlen += atomic_read(&q->q.atomic_qlen);
 	else
 		qlen += q->q.qlen;
@@ -817,11 +822,6 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	return sch->enqueue(skb, sch, to_free);
 }
 
-static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
-{
-	return q->flags & TCQ_F_CPUSTATS;
-}
-
 static inline void _bstats_update(struct gnet_stats_basic_packed *bstats,
 				  __u64 bytes, __u32 packets)
 {
@@ -1113,8 +1113,13 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 
 	if (skb) {
 		skb = __skb_dequeue(&sch->gso_skb);
-		qdisc_qstats_backlog_dec(sch, skb);
-		sch->q.qlen--;
+		if (qdisc_is_percpu_stats(sch)) {
+			qdisc_qstats_cpu_backlog_dec(sch, skb);
+			qdisc_qstats_atomic_qlen_dec(sch);
+		} else {
+			qdisc_qstats_backlog_dec(sch, skb);
+			sch->q.qlen--;
+		}
 	} else {
 		skb = sch->dequeue(sch);
 	}
-- 
cgit v1.2.3


From 8a53e616de294873fec1a75ddb77ecb3d225cee0 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:40 +0200
Subject: net: sched: when clearing NOLOCK, clear TCQ_F_CPUSTATS, too

Since stats updating is always consistent with TCQ_F_CPUSTATS flag,
we can disable it at qdisc creation time flipping such bit.

In my experiments, if the NOLOCK flag is cleared, per CPU stats
accounting does not give any measurable performance gain, but it
waste some memory.

Let's clear TCQ_F_CPUSTATS together with NOLOCK, when enslaving
a NOLOCK qdisc to 'lock' one.

Use stats update helper inside pfifo_fast, to cope correctly with
TCQ_F_CPUSTATS flag change.

As a side effect, q.qlen value for any child qdiscs is always
consistent for all lock classfull qdiscs.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ed56474cfe3b..f069011524ba 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1106,6 +1106,32 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
 	return skb;
 }
 
+static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
+						 struct sk_buff *skb)
+{
+	if (qdisc_is_percpu_stats(sch)) {
+		qdisc_qstats_cpu_backlog_dec(sch, skb);
+		qdisc_bstats_cpu_update(sch, skb);
+		qdisc_qstats_atomic_qlen_dec(sch);
+	} else {
+		qdisc_qstats_backlog_dec(sch, skb);
+		qdisc_bstats_update(sch, skb);
+		sch->q.qlen--;
+	}
+}
+
+static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
+						 unsigned int pkt_len)
+{
+	if (qdisc_is_percpu_stats(sch)) {
+		qdisc_qstats_atomic_qlen_inc(sch);
+		this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
+	} else {
+		sch->qstats.backlog += pkt_len;
+		sch->q.qlen++;
+	}
+}
+
 /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
 static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 {
-- 
cgit v1.2.3


From 73eb628ddfd3884d1e58a8022de2e78de7807fc6 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:41 +0200
Subject: Revert: "net: sched: put back q.qlen into a single location"

This revert commit 46b1c18f9deb ("net: sched: put back q.qlen into
a single location").
After the previous patch, when a NOLOCK qdisc is enslaved to a
locking qdisc it switches to global stats accounting. As a consequence,
when a classful qdisc accesses directly a child qdisc's qlen, such
qdisc is not doing per CPU accounting and qlen value is consistent.

In the control path nobody uses directly qlen since commit
e5f0e8f8e45 ("net: sched: introduce and use qdisc tree flush/purge
helpers"), so we can remove the contented atomic ops from the
datapath.

v1 -> v2:
 - complete the qdisc_qstats_atomic_qlen_dec() ->
   qdisc_qstats_cpu_qlen_dec() replacement, fix build issue
 - more descriptive commit message

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f069011524ba..e8f85cd2afce 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -52,10 +52,7 @@ struct qdisc_size_table {
 struct qdisc_skb_head {
 	struct sk_buff	*head;
 	struct sk_buff	*tail;
-	union {
-		u32		qlen;
-		atomic_t	atomic_qlen;
-	};
+	__u32		qlen;
 	spinlock_t	lock;
 };
 
@@ -486,19 +483,27 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 	BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
+static inline int qdisc_qlen_cpu(const struct Qdisc *q)
+{
+	return this_cpu_ptr(q->cpu_qstats)->qlen;
+}
+
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
 	return q->q.qlen;
 }
 
-static inline u32 qdisc_qlen_sum(const struct Qdisc *q)
+static inline int qdisc_qlen_sum(const struct Qdisc *q)
 {
-	u32 qlen = q->qstats.qlen;
+	__u32 qlen = q->qstats.qlen;
+	int i;
 
-	if (qdisc_is_percpu_stats(q))
-		qlen += atomic_read(&q->q.atomic_qlen);
-	else
+	if (qdisc_is_percpu_stats(q)) {
+		for_each_possible_cpu(i)
+			qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
+	} else {
 		qlen += q->q.qlen;
+	}
 
 	return qlen;
 }
@@ -889,14 +894,14 @@ static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
 	this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
 }
 
-static inline void qdisc_qstats_atomic_qlen_inc(struct Qdisc *sch)
+static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
 {
-	atomic_inc(&sch->q.atomic_qlen);
+	this_cpu_inc(sch->cpu_qstats->qlen);
 }
 
-static inline void qdisc_qstats_atomic_qlen_dec(struct Qdisc *sch)
+static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
 {
-	atomic_dec(&sch->q.atomic_qlen);
+	this_cpu_dec(sch->cpu_qstats->qlen);
 }
 
 static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
@@ -1112,7 +1117,7 @@ static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
 	if (qdisc_is_percpu_stats(sch)) {
 		qdisc_qstats_cpu_backlog_dec(sch, skb);
 		qdisc_bstats_cpu_update(sch, skb);
-		qdisc_qstats_atomic_qlen_dec(sch);
+		qdisc_qstats_cpu_qlen_dec(sch);
 	} else {
 		qdisc_qstats_backlog_dec(sch, skb);
 		qdisc_bstats_update(sch, skb);
@@ -1124,7 +1129,7 @@ static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
 						 unsigned int pkt_len)
 {
 	if (qdisc_is_percpu_stats(sch)) {
-		qdisc_qstats_atomic_qlen_inc(sch);
+		qdisc_qstats_cpu_qlen_inc(sch);
 		this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
 	} else {
 		sch->qstats.backlog += pkt_len;
@@ -1141,7 +1146,7 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 		skb = __skb_dequeue(&sch->gso_skb);
 		if (qdisc_is_percpu_stats(sch)) {
 			qdisc_qstats_cpu_backlog_dec(sch, skb);
-			qdisc_qstats_atomic_qlen_dec(sch);
+			qdisc_qstats_cpu_qlen_dec(sch);
 		} else {
 			qdisc_qstats_backlog_dec(sch, skb);
 			sch->q.qlen--;
-- 
cgit v1.2.3


From bf8981a2aa082d9d64771b47c8a1c9c388d8cd40 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 9 Apr 2019 10:44:06 +0200
Subject: netfilter: nf_nat: merge ip/ip6 masquerade headers

Both are now implemented by nf_nat_masquerade.c, so no need to keep
different headers.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_nat_masquerade.h | 15 ---------------
 include/net/netfilter/ipv6/nf_nat_masquerade.h | 11 -----------
 include/net/netfilter/nf_nat_masquerade.h      | 21 +++++++++++++++++++++
 3 files changed, 21 insertions(+), 26 deletions(-)
 delete mode 100644 include/net/netfilter/ipv4/nf_nat_masquerade.h
 delete mode 100644 include/net/netfilter/ipv6/nf_nat_masquerade.h
 create mode 100644 include/net/netfilter/nf_nat_masquerade.h

(limited to 'include/net')

diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h
deleted file mode 100644
index 13d55206bb9f..000000000000
--- a/include/net/netfilter/ipv4/nf_nat_masquerade.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NF_NAT_MASQUERADE_IPV4_H_
-#define _NF_NAT_MASQUERADE_IPV4_H_
-
-#include <net/netfilter/nf_nat.h>
-
-unsigned int
-nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
-		       const struct nf_nat_range2 *range,
-		       const struct net_device *out);
-
-int nf_nat_masquerade_ipv4_register_notifier(void);
-void nf_nat_masquerade_ipv4_unregister_notifier(void);
-
-#endif /*_NF_NAT_MASQUERADE_IPV4_H_ */
diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h
deleted file mode 100644
index 2917bf95c437..000000000000
--- a/include/net/netfilter/ipv6/nf_nat_masquerade.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NF_NAT_MASQUERADE_IPV6_H_
-#define _NF_NAT_MASQUERADE_IPV6_H_
-
-unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
-		       const struct net_device *out);
-int nf_nat_masquerade_ipv6_register_notifier(void);
-void nf_nat_masquerade_ipv6_unregister_notifier(void);
-
-#endif /* _NF_NAT_MASQUERADE_IPV6_H_ */
diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h
new file mode 100644
index 000000000000..cafe71822a53
--- /dev/null
+++ b/include/net/netfilter/nf_nat_masquerade.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NF_NAT_MASQUERADE_H_
+#define _NF_NAT_MASQUERADE_H_
+
+#include <net/netfilter/nf_nat.h>
+
+unsigned int
+nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
+		       const struct nf_nat_range2 *range,
+		       const struct net_device *out);
+
+int nf_nat_masquerade_ipv4_register_notifier(void);
+void nf_nat_masquerade_ipv4_unregister_notifier(void);
+
+unsigned int
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+		       const struct net_device *out);
+int nf_nat_masquerade_ipv6_register_notifier(void);
+void nf_nat_masquerade_ipv6_unregister_notifier(void);
+
+#endif /*_NF_NAT_MASQUERADE_H_ */
-- 
cgit v1.2.3


From 610a43149cabd0c7aa7bed19cbcf05a0249ab32a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 9 Apr 2019 10:44:08 +0200
Subject: netfilter: nf_nat_masquerade: unify ipv4/6 notifier registration

Only reason for having two different register functions was because of
ipt_MASQUERADE and ip6t_MASQUERADE being two different modules.

Previous patch merged those into xt_MASQUERADE, so we can merge this too.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_masquerade.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h
index cafe71822a53..54a14d643c34 100644
--- a/include/net/netfilter/nf_nat_masquerade.h
+++ b/include/net/netfilter/nf_nat_masquerade.h
@@ -9,13 +9,11 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
 		       const struct nf_nat_range2 *range,
 		       const struct net_device *out);
 
-int nf_nat_masquerade_ipv4_register_notifier(void);
-void nf_nat_masquerade_ipv4_unregister_notifier(void);
+int nf_nat_masquerade_inet_register_notifiers(void);
+void nf_nat_masquerade_inet_unregister_notifiers(void);
 
 unsigned int
 nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		       const struct net_device *out);
-int nf_nat_masquerade_ipv6_register_notifier(void);
-void nf_nat_masquerade_ipv6_unregister_notifier(void);
 
 #endif /*_NF_NAT_MASQUERADE_H_ */
-- 
cgit v1.2.3


From cc3a86c802f0ba9a2627aef256d95ae3b3fa6e91 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 9 Apr 2019 14:41:12 -0700
Subject: ipv6: Change rt6_probe to take a fib6_nh

rt6_probe sends probes for gateways in a nexthop. As such it really
depends on a fib6_nh, not a fib entry. Move last_probe to fib6_nh and
update rt6_probe to a fib6_nh struct.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 58dbb4e82908..2e9235adfa0d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -127,6 +127,10 @@ struct rt6_exception {
 
 struct fib6_nh {
 	struct fib_nh_common	nh_common;
+
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	unsigned long		last_probe;
+#endif
 };
 
 struct fib6_info {
@@ -155,10 +159,6 @@ struct fib6_info {
 	struct rt6_info * __percpu	*rt6i_pcpu;
 	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
-#ifdef CONFIG_IPV6_ROUTER_PREF
-	unsigned long			last_probe;
-#endif
-
 	u32				fib6_metric;
 	u8				fib6_protocol;
 	u8				fib6_type;
-- 
cgit v1.2.3


From 971502d77faa50a37c89bc6d172450294ad9a5fd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 11 Apr 2019 16:36:41 +0200
Subject: bridge: netfilter: unroll NF_HOOK helper in bridge input path

Replace NF_HOOK() based invocation of the netfilter hooks with a private
copy of nf_hook_slow().

This copy has one difference: it can return the rx handler value expected
by the stack, i.e. RX_HANDLER_CONSUMED or RX_HANDLER_PASS.

This is needed by the next patch to invoke the ebtables
"broute" table via the standard netfilter hooks rather than the custom
"br_should_route_hook" indirection that is used now.

When the skb is to be "brouted", we must return RX_HANDLER_PASS from the
bridge rx input handler, but there is no way to indicate this via
NF_HOOK(), unless perhaps by some hack such as exposing bridge_cb in the
netfilter core or a percpu flag.

  text    data     bss     dec   filename
  3369      56       0    3425   net/bridge/br_input.o.before
  3458      40       0    3498   net/bridge/br_input.o.after

This allows removal of the "br_should_route_hook" in the next patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index a50a69f5334c..7239105d9d2e 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -119,4 +119,7 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family,
 	return queue;
 }
 
+int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
+	     const struct nf_hook_entries *entries, unsigned int index,
+	     unsigned int verdict);
 #endif /* _NF_QUEUE_H */
-- 
cgit v1.2.3


From 013b96ec64616b57fc631b304dfcecc5bc288f90 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Thu, 11 Apr 2019 15:02:07 -0700
Subject: sctp: Pass sk_buff_head explicitly to sctp_ulpq_tail_event().

Now the SKB list implementation assumption can be removed.

And now that we know that the list head is always non-NULL
we can remove the code blocks dealing with that as well.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/ulpqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index bb0ecba3db2b..f4ac7117ff29 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -59,7 +59,7 @@ void sctp_ulpq_free(struct sctp_ulpq *);
 int sctp_ulpq_tail_data(struct sctp_ulpq *, struct sctp_chunk *, gfp_t);
 
 /* Add a new event for propagation to the ULP. */
-int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sctp_ulpevent *ev);
+int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sk_buff_head *skb_list);
 
 /* Renege previously received chunks.  */
 void sctp_ulpq_renege(struct sctp_ulpq *, struct sctp_chunk *, gfp_t);
-- 
cgit v1.2.3


From 9dde27de3e5efa0d032f3c891a0ca833a0d31911 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Apr 2019 17:15:07 +0800
Subject: sctp: implement memory accounting on rx path

sk_forward_alloc's updating is also done on rx path, but to be consistent
we change to use sk_mem_charge() in sctp_skb_set_owner_r().

In sctp_eat_data(), it's not enough to check sctp_memory_pressure only,
which doesn't work for mem_cgroup_sockets_enabled, so we change to use
sk_under_memory_pressure().

When it's under memory pressure, sk_mem_reclaim() and sk_rmem_schedule()
should be called on both RENEGE or CHUNK DELIVERY path exit the memory
pressure status as soon as possible.

Note that sk_rmem_schedule() is using datalen to make things easy there.

Reported-by: Matteo Croce <mcroce@redhat.com>
Tested-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 1d13ec3f2707..eefdfa5abf6e 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -421,7 +421,7 @@ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 	/*
 	 * This mimics the behavior of skb_set_owner_r
 	 */
-	sk->sk_forward_alloc -= event->rmem_len;
+	sk_mem_charge(sk, event->rmem_len);
 }
 
 /* Tests if the list has one and only one entry. */
-- 
cgit v1.2.3


From b1d40991506aa9f1de310a2e74ef8e3bec6ba215 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:35:59 -0700
Subject: ipv6: Rename fib6_multipath_select and pass fib6_result

Add 'struct fib6_result' to hold the fib entry and fib6_nh from a fib
lookup as separate entries, similar to what IPv4 now has with fib_result.

Rename fib6_multipath_select to fib6_select_path, pass fib6_result to
it, and set f6i and nh in the result once a path selection is done.
Call fib6_select_path unconditionally for path selection which means
moving the sibling and oif check to fib6_select_path. To handle the two
different call paths (2 only call multipath_select if flowi6_oif == 0 and
the other always calls it), add a new have_oif_match that controls the
sibling walk if relevant.

Update callers of fib6_multipath_select accordingly and have them use the
fib6_info and fib6_nh from the result.

This is needed for multipath nexthop objects where a single f6i can
point to multiple fib6_nh (similar to IPv4).

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h    | 13 ++++++++-----
 include/net/ipv6_stubs.h |  9 ++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2e9235adfa0d..c4d818041663 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -190,6 +190,11 @@ struct rt6_info {
 	unsigned short			rt6i_nfheader_len;
 };
 
+struct fib6_result {
+	struct fib6_nh		*nh;
+	struct fib6_info	*f6i;
+};
+
 #define for_each_fib6_node_rt_rcu(fn)					\
 	for (rt = rcu_dereference((fn)->leaf); rt;			\
 	     rt = rcu_dereference(rt->fib6_next))
@@ -391,11 +396,9 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
 
-struct fib6_info *fib6_multipath_select(const struct net *net,
-					struct fib6_info *match,
-					struct flowi6 *fl6, int oif,
-					const struct sk_buff *skb, int strict);
-
+void fib6_select_path(const struct net *net, struct fib6_result *res,
+		      struct flowi6 *fl6, int oif, bool have_oif_match,
+		      const struct sk_buff *skb, int strict);
 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr);
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 453b55bf6723..5df36d6a2613 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -14,6 +14,7 @@
 struct fib6_info;
 struct fib6_nh;
 struct fib6_config;
+struct fib6_result;
 
 /* This is ugly, ideally these symbols should be built
  * into the core kernel.
@@ -34,11 +35,9 @@ struct ipv6_stub {
 					      struct fib6_table *table,
 					      int oif, struct flowi6 *fl6,
 					      int flags);
-	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
-						   struct fib6_info *f6i,
-						   struct flowi6 *fl6, int oif,
-						   const struct sk_buff *skb,
-						   int strict);
+	void (*fib6_select_path)(const struct net *net, struct fib6_result *res,
+				 struct flowi6 *fl6, int oif, bool oif_match,
+				 const struct sk_buff *skb, int strict);
 	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
 				 struct in6_addr *saddr);
 
-- 
cgit v1.2.3


From b748f26092626332f73e71d75e4390de6b8bdf9b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:36:06 -0700
Subject: ipv6: Pass fib6_result to ip6_mtu_from_fib6 and fib6_mtu

Change ip6_mtu_from_fib6 and fib6_mtu to take a fib6_result over a
fib6_info. Update both to use the fib6_nh from fib6_result.

Since the signature of ip6_mtu_from_fib6 is already changing, add const
to daddr and saddr.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h  | 5 +++--
 include/net/ipv6_stubs.h | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 5909fc421305..46bbd8ff9cc6 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -302,8 +302,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 	return mtu;
 }
 
-u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
-		      struct in6_addr *saddr);
+u32 ip6_mtu_from_fib6(const struct fib6_result *res,
+		      const struct in6_addr *daddr,
+		      const struct in6_addr *saddr);
 
 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 				   struct net_device *dev, struct sk_buff *skb,
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 5df36d6a2613..0d16b9ec0485 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -38,8 +38,9 @@ struct ipv6_stub {
 	void (*fib6_select_path)(const struct net *net, struct fib6_result *res,
 				 struct flowi6 *fl6, int oif, bool oif_match,
 				 const struct sk_buff *skb, int strict);
-	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
-				 struct in6_addr *saddr);
+	u32 (*ip6_mtu_from_fib6)(const struct fib6_result *res,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *saddr);
 
 	int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh,
 			    struct fib6_config *cfg, gfp_t gfp_flags,
-- 
cgit v1.2.3


From effda4dd97e878ab83336bec7411cc41b5cc6d37 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:36:10 -0700
Subject: ipv6: Pass fib6_result to fib lookups

Change fib6_lookup and fib6_table_lookup to take a fib6_result and set
f6i and nh rather than returning a fib6_info. For now both always
return 0.

A later patch set can make these more like the IPv4 counterparts and
return EINVAL, EACCESS, etc based on fib6_type.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h    |  9 +++++----
 include/net/ipv6_stubs.h | 11 +++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c4d818041663..cb3277cd1413 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -389,12 +389,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 /* called with rcu lock held; can return error pointer
  * caller needs to select path
  */
-struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
-			      int flags);
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+		struct fib6_result *res, int flags);
 
 /* called with rcu lock held; caller needs to select path */
-struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
-				    int oif, struct flowi6 *fl6, int strict);
+int fib6_table_lookup(struct net *net, struct fib6_table *table,
+		      int oif, struct flowi6 *fl6, struct fib6_result *res,
+		      int strict);
 
 void fib6_select_path(const struct net *net, struct fib6_result *res,
 		      struct flowi6 *fl6, int oif, bool have_oif_match,
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 0d16b9ec0485..6c0c4fde16f8 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -29,12 +29,11 @@ struct ipv6_stub {
 	int (*ipv6_route_input)(struct sk_buff *skb);
 
 	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
-	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
-					 struct flowi6 *fl6, int flags);
-	struct fib6_info *(*fib6_table_lookup)(struct net *net,
-					      struct fib6_table *table,
-					      int oif, struct flowi6 *fl6,
-					      int flags);
+	int (*fib6_lookup)(struct net *net, int oif, struct flowi6 *fl6,
+			   struct fib6_result *res, int flags);
+	int (*fib6_table_lookup)(struct net *net, struct fib6_table *table,
+				 int oif, struct flowi6 *fl6,
+				 struct fib6_result *res, int flags);
 	void (*fib6_select_path)(const struct net *net, struct fib6_result *res,
 				 struct flowi6 *fl6, int oif, bool oif_match,
 				 const struct sk_buff *skb, int strict);
-- 
cgit v1.2.3


From 7d21fec90438941b44b699ae73673d2f8a3a9d76 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:36:11 -0700
Subject: ipv6: Add fib6_type and fib6_flags to fib6_result

Add the fib6_flags and fib6_type to fib6_result. Update the lookup helpers
to set them and update post fib lookup users to use the version from the
result.

This allows nexthop objects to have blackhole nexthop.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index cb3277cd1413..6b7557b71c8c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -193,6 +193,8 @@ struct rt6_info {
 struct fib6_result {
 	struct fib6_nh		*nh;
 	struct fib6_info	*f6i;
+	u32			fib6_flags;
+	u8			fib6_type;
 };
 
 #define for_each_fib6_node_rt_rcu(fn)					\
-- 
cgit v1.2.3


From b8fb1ab46169ac016a8552a6455bb0bfc401f8e2 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 17:31:43 -0700
Subject: net ipv6: Prevent neighbor add if protocol is disabled on device

Disabling IPv6 on an interface removes existing entries but nothing prevents
new entries from being manually added. To that end, add a new neigh_table
operation, allow_add, that is called on RTM_NEWNEIGH to see if neighbor
entries are allowed on a given device. If IPv6 is disabled on the device,
allow_add returns false and passes a message back to the user via extack.

  $ echo 1 > /proc/sys/net/ipv6/conf/eth1/disable_ipv6
  $ ip -6 neigh add fe80::4c88:bff:fe21:2704 dev eth1 lladdr de:ad:be:ef:01:01
  Error: IPv6 is disabled on this device.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 3e5438bd0101..50a67bd6a434 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -205,6 +205,8 @@ struct neigh_table {
 	int			(*pconstructor)(struct pneigh_entry *);
 	void			(*pdestructor)(struct pneigh_entry *);
 	void			(*proxy_redo)(struct sk_buff *skb);
+	bool			(*allow_add)(const struct net_device *dev,
+					     struct netlink_ext_ack *extack);
 	char			*id;
 	struct neigh_parms	parms;
 	struct list_head	parms_list;
-- 
cgit v1.2.3


From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Wed, 17 Apr 2019 16:35:49 -0400
Subject: ipv6: Add rate limit mask for ICMPv6 messages

To make ICMPv6 closer to ICMPv4, add ratemask parameter. Since the ICMP
message types use larger numeric values, a simple bitmask doesn't fit.
I use large bitmap. The input and output are the in form of list of
ranges. Set the default to rate limit all error messages but Packet Too
Big. For Packet Too Big, use ratemask instead of hard-coded.

There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow()
aren't called. This patch only adds them to icmpv6_echo_reply().

Rate limiting error messages is mandated by RFC 4443 but RFC 4890 says
that it is also acceptable to rate limit informational messages. Thus,
I removed the current hard-coded behavior of icmpv6_mask_allow() that
doesn't rate limit informational messages.

v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL
    isn't defined, expand the description in ip-sysctl.txt and remove
    unnecessary conditional before kfree().
v3: Inline the bitmap instead of dynamically allocated. Still is a
    pointer to it is needed because of the way proc_do_large_bitmap work.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 64e29b58bb5e..5e61b5a8635d 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -8,6 +8,7 @@
 #ifndef __NETNS_IPV6_H__
 #define __NETNS_IPV6_H__
 #include <net/dst_ops.h>
+#include <uapi/linux/icmpv6.h>
 
 struct ctl_table_header;
 
@@ -35,6 +36,8 @@ struct netns_sysctl_ipv6 {
 	int icmpv6_echo_ignore_all;
 	int icmpv6_echo_ignore_multicast;
 	int icmpv6_echo_ignore_anycast;
+	DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1);
+	unsigned long *icmpv6_ratemask_ptr;
 	int anycast_src_echo_reply;
 	int ip_nonlocal_bind;
 	int fwmark_reflect;
-- 
cgit v1.2.3


From c7cbdbf29f488a19982cd9f4a109887f18028bbb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 17 Apr 2019 22:51:48 +0200
Subject: net: rework SIOCGSTAMP ioctl handling

The SIOCGSTAMP/SIOCGSTAMPNS ioctl commands are implemented by many
socket protocol handlers, and all of those end up calling the same
sock_get_timestamp()/sock_get_timestampns() helper functions, which
results in a lot of duplicate code.

With the introduction of 64-bit time_t on 32-bit architectures, this
gets worse, as we then need four different ioctl commands in each
socket protocol implementation.

To simplify that, let's add a new .gettstamp() operation in
struct proto_ops, and move ioctl implementation into the common
sock_ioctl()/compat_sock_ioctl_trans() functions that these all go
through.

We can reuse the sock_get_timestamp() implementation, but generalize
it so it can deal with both native and compat mode, as well as
timeval and timespec structures.

Acked-by: Stefan Schmidt <stefan@datenfreihafen.org>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/lkml/CAK8P3a038aDQQotzua_QtKGhq8O9n+rdiz2=WDCp82ys8eUT+A@mail.gmail.com/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/compat.h | 3 ---
 include/net/sock.h   | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/compat.h b/include/net/compat.h
index 4c6d75612b6c..f277653c7e17 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -30,9 +30,6 @@ struct compat_cmsghdr {
 	compat_int_t	cmsg_type;
 };
 
-int compat_sock_get_timestamp(struct sock *, struct timeval __user *);
-int compat_sock_get_timestampns(struct sock *, struct timespec __user *);
-
 #else /* defined(CONFIG_COMPAT) */
 /*
  * To avoid compiler warnings:
diff --git a/include/net/sock.h b/include/net/sock.h
index bdd77bbce7d8..784cd19d5ff7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1614,6 +1614,8 @@ int sock_setsockopt(struct socket *sock, int level, int op,
 
 int sock_getsockopt(struct socket *sock, int level, int op,
 		    char __user *optval, int __user *optlen);
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+		   bool timeval, bool time32);
 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 				    int noblock, int *errcode);
 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
@@ -2503,8 +2505,6 @@ static inline bool sk_listener(const struct sock *sk)
 }
 
 void sock_enable_timestamp(struct sock *sk, int flag);
-int sock_get_timestamp(struct sock *, struct timeval __user *);
-int sock_get_timestampns(struct sock *, struct timespec __user *);
 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
 		       int type);
 
-- 
cgit v1.2.3


From 4e54507ab1a9da05238b986292f6cb702e6696c7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 21 Apr 2019 08:49:01 -0700
Subject: ipv6: Simplify rt6_qualify_for_ecmp

After commit c7a1ce397ada ("ipv6: Change addrconf_f6i_alloc to use
ip6_route_info_create"), the gateway is no longer filled in for fib6_nh
structs in a prefix route. Accordingly, the RTF_ADDRCONF flag check can
be dropped from the 'rt6_qualify_for_ecmp'.

Further, RTF_DYNAMIC is only set in rt6_info instances, so it can be
removed from the check as well.

This reduces rt6_qualify_for_ecmp and the mlxsw version to just checking
if the nexthop has a gateway which is the real indication of whether
entries can be coalesced into a multipath route.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 46bbd8ff9cc6..518d97fbe074 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
-	return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) &&
-		f6i->fib6_nh.fib_nh_gw_family;
+	return f6i->fib6_nh.fib_nh_gw_family;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3


From be659b8d3c79afc54e087ebf8d849685d7b0d395 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 21 Apr 2019 17:39:18 -0700
Subject: ipv6: Restore RTF_ADDRCONF check in rt6_qualify_for_ecmp

The RTF_ADDRCONF flag filters out routes added by RA's in determining
which routes can be appended to an existing one to create a multipath
route. Restore the flag check and add a comment to document the RA piece.

Fixes: 4e54507ab1a9 ("ipv6: Simplify rt6_qualify_for_ecmp")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 518d97fbe074..df9cebc2b20c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,7 +68,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.fib_nh_gw_family;
+	/* the RTF_ADDRCONF flag filters out RA's */
+	return !(f6i->fib6_flags & RTF_ADDRCONF) &&
+		f6i->fib6_nh.fib_nh_gw_family;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3


From 0b13c9bb96f6edf03018030f07929a16b4dd77a6 Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee" <danieltimlee@gmail.com>
Date: Sun, 21 Apr 2019 00:50:42 +0900
Subject: include/net/tcp.h: whitespace cleanup at tcp_v4_check

This patch makes trivial whitespace fix to the function
tcp_v4_check at include/net/tcp.h file.

It has stylistic issue, which is "space required after that ','"
and it can be confirmed with ./scripts/checkpatch.pl tool.

    ERROR: space required after that ',' (ctx:VxV)
    #29: FILE: include/net/tcp.h:1317:
    +	        return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
         	                              ^

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 68ee02523b87..7cf1181630a3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1314,7 +1314,7 @@ static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq)
 static inline __sum16 tcp_v4_check(int len, __be32 saddr,
 				   __be32 daddr, __wsum base)
 {
-	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
+	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
 }
 
 static inline bool tcp_checksum_complete(struct sk_buff *skb)
-- 
cgit v1.2.3


From 7e5f4cdb284be5ff862f84ccda084e2847f73fbb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 20 Apr 2019 09:27:27 -0700
Subject: ipv6: Remove fib6_info_nh_lwt

fib6_info_nh_lwt is no longer used; remove it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 6b7557b71c8c..352f767bea81 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -450,12 +450,6 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 		 struct netlink_ext_ack *extack);
 void fib6_nh_release(struct fib6_nh *fib6_nh);
 
-static inline
-struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
-{
-	return f6i->fib6_nh.fib_nh_lws;
-}
-
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int flags);
 
-- 
cgit v1.2.3


From 3c618c1dbb8859625c643121ac80af9a6723533f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 20 Apr 2019 09:28:20 -0700
Subject: net: Rename net/nexthop.h net/rtnh.h

The header contains rtnh_ macros so rename the file accordingly.
Allows a later patch to use the nexthop.h name for the new
nexthop code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h | 34 ----------------------------------
 include/net/rtnh.h    | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 34 deletions(-)
 delete mode 100644 include/net/nexthop.h
 create mode 100644 include/net/rtnh.h

(limited to 'include/net')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
deleted file mode 100644
index 902ff382a6dc..000000000000
--- a/include/net/nexthop.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __NET_NEXTHOP_H
-#define __NET_NEXTHOP_H
-
-#include <linux/rtnetlink.h>
-#include <net/netlink.h>
-
-static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining)
-{
-	return remaining >= (int)sizeof(*rtnh) &&
-	       rtnh->rtnh_len >= sizeof(*rtnh) &&
-	       rtnh->rtnh_len <= remaining;
-}
-
-static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh,
-                                         int *remaining)
-{
-	int totlen = NLA_ALIGN(rtnh->rtnh_len);
-
-	*remaining -= totlen;
-	return (struct rtnexthop *) ((char *) rtnh + totlen);
-}
-
-static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh)
-{
-	return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh)));
-}
-
-static inline int rtnh_attrlen(const struct rtnexthop *rtnh)
-{
-	return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh));
-}
-
-#endif
diff --git a/include/net/rtnh.h b/include/net/rtnh.h
new file mode 100644
index 000000000000..aa2cfc508f7c
--- /dev/null
+++ b/include/net/rtnh.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_RTNH_H
+#define __NET_RTNH_H
+
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+
+static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining)
+{
+	return remaining >= (int)sizeof(*rtnh) &&
+	       rtnh->rtnh_len >= sizeof(*rtnh) &&
+	       rtnh->rtnh_len <= remaining;
+}
+
+static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh,
+                                         int *remaining)
+{
+	int totlen = NLA_ALIGN(rtnh->rtnh_len);
+
+	*remaining -= totlen;
+	return (struct rtnexthop *) ((char *) rtnh + totlen);
+}
+
+static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh)
+{
+	return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh)));
+}
+
+static inline int rtnh_attrlen(const struct rtnexthop *rtnh)
+{
+	return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh));
+}
+
+#endif
-- 
cgit v1.2.3


From a79eda3aaf30bc73752487fff5160cf67e99e313 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:42 -0400
Subject: net: psample: drop include of module.h from psample.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

There doesn't appear to be anything in psample.h that is module
related, and build coverage doesn't appear to show any other
files/drivers relying implicitly on getting it from here.

So it appears we are simply free to just remove it in this case.

Cc: Yotam Gigi <yotam.gi@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/psample.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/psample.h b/include/net/psample.h
index 9b80f814ab04..37a4df2325b2 100644
--- a/include/net/psample.h
+++ b/include/net/psample.h
@@ -3,7 +3,6 @@
 #define __NET_PSAMPLE_H
 
 #include <uapi/linux/psample.h>
-#include <linux/module.h>
 #include <linux/list.h>
 
 struct psample_group {
-- 
cgit v1.2.3


From c517796ea91d11dd3f0ae7ff61a12fe5d4941eb0 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:43 -0400
Subject: net: ife: drop include of module.h from net/ife.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

There doesn't appear to be anything in net/ife.h that is module
related, and build coverage doesn't appear to show any other
files/drivers relying implicitly on getting it from here.

So it appears we are simply free to just remove it in this case.

Cc: Yotam Gigi <yotam.gi@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ife.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ife.h b/include/net/ife.h
index e117617e3c34..7e2538d8585b 100644
--- a/include/net/ife.h
+++ b/include/net/ife.h
@@ -4,7 +4,6 @@
 
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
-#include <linux/module.h>
 #include <uapi/linux/ife.h>
 
 #if IS_ENABLED(CONFIG_NET_IFE)
-- 
cgit v1.2.3


From 113e63286697893127c3ee83471b45ad0cf8d75f Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:44 -0400
Subject: net: fib: drop include of module.h from fib_notifier.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

Since fib_notifier.h is not going into a module struct looking for
specific fields, we can just let it know that module is a struct,
just like about 60 other include/linux headers already do.

Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_notifier.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h
index c91ec732afd6..c49d7bfb5c30 100644
--- a/include/net/fib_notifier.h
+++ b/include/net/fib_notifier.h
@@ -2,10 +2,11 @@
 #define __NET_FIB_NOTIFIER_H
 
 #include <linux/types.h>
-#include <linux/module.h>
 #include <linux/notifier.h>
 #include <net/net_namespace.h>
 
+struct module;
+
 struct fib_notifier_info {
 	struct net *net;
 	int family;
-- 
cgit v1.2.3


From a130f9b27545f56880034c345da9a4efc2ba2b24 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:45 -0400
Subject: net: tc_act: drop include of module.h from tc_ife.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

Since tc_ife.h is not going into a module struct looking for
specific fields, we can just let it know that module is a struct,
just like about 60 other include/linux headers already do.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index 86d13b01b39d..c7f24a2da1ca 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -5,7 +5,8 @@
 #include <net/act_api.h>
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
-#include <linux/module.h>
+
+struct module;
 
 struct tcf_ife_params {
 	u8 eth_dst[ETH_ALEN];
-- 
cgit v1.2.3


From f2ad1a522e9817fba7799008e0a8dc6f8a32bf7d Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 22 Apr 2019 12:08:39 +0000
Subject: net: devlink: Add extack to shared buffer operations

Add extack to shared buffer set operations, so that meaningful error
messages could be propagated to the user.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 70c7d1ac8344..4f5e41613503 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -491,13 +491,14 @@ struct devlink_ops {
 			   struct devlink_sb_pool_info *pool_info);
 	int (*sb_pool_set)(struct devlink *devlink, unsigned int sb_index,
 			   u16 pool_index, u32 size,
-			   enum devlink_sb_threshold_type threshold_type);
+			   enum devlink_sb_threshold_type threshold_type,
+			   struct netlink_ext_ack *extack);
 	int (*sb_port_pool_get)(struct devlink_port *devlink_port,
 				unsigned int sb_index, u16 pool_index,
 				u32 *p_threshold);
 	int (*sb_port_pool_set)(struct devlink_port *devlink_port,
 				unsigned int sb_index, u16 pool_index,
-				u32 threshold);
+				u32 threshold, struct netlink_ext_ack *extack);
 	int (*sb_tc_pool_bind_get)(struct devlink_port *devlink_port,
 				   unsigned int sb_index,
 				   u16 tc_index,
@@ -507,7 +508,8 @@ struct devlink_ops {
 				   unsigned int sb_index,
 				   u16 tc_index,
 				   enum devlink_sb_pool_type pool_type,
-				   u16 pool_index, u32 threshold);
+				   u16 pool_index, u32 threshold,
+				   struct netlink_ext_ack *extack);
 	int (*sb_occ_snapshot)(struct devlink *devlink,
 			       unsigned int sb_index);
 	int (*sb_occ_max_clear)(struct devlink *devlink,
-- 
cgit v1.2.3


From f24ea52873c726bf7b54318f00ec45050222b367 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:37 +0200
Subject: xfrm: remove tos indirection from afinfo_policy

Only used by ipv4, we can read the fl4 tos value directly instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 77eb578a0384..652da5861772 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -329,7 +329,6 @@ struct xfrm_policy_afinfo {
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
-	int			(*get_tos)(const struct flowi *fl);
 	int			(*init_path)(struct xfrm_dst *path,
 					     struct dst_entry *dst,
 					     int nfheader_len);
-- 
cgit v1.2.3


From 2e8b4aa816eaaf480fe68b1086614259caf1bf3c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:38 +0200
Subject: xfrm: remove init_path indirection from afinfo_policy

handle this directly, its only used by ipv6.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 652da5861772..b8de1622141a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -329,9 +329,6 @@ struct xfrm_policy_afinfo {
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
-	int			(*init_path)(struct xfrm_dst *path,
-					     struct dst_entry *dst,
-					     int nfheader_len);
 	int			(*fill_dst)(struct xfrm_dst *xdst,
 					    struct net_device *dev,
 					    const struct flowi *fl);
-- 
cgit v1.2.3


From c53ac41e3720926301c623d6682bb87ce992a3b3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:39 +0200
Subject: xfrm: remove decode_session indirection from afinfo_policy

No external dependencies, might as well handle this directly.
xfrm_afinfo_policy is now 40 bytes on x86_64.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index b8de1622141a..18d6b33501b9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -326,9 +326,6 @@ struct xfrm_policy_afinfo {
 					     xfrm_address_t *saddr,
 					     xfrm_address_t *daddr,
 					     u32 mark);
-	void			(*decode_session)(struct sk_buff *skb,
-						  struct flowi *fl,
-						  int reverse);
 	int			(*fill_dst)(struct xfrm_dst *xdst,
 					    struct net_device *dev,
 					    const struct flowi *fl);
-- 
cgit v1.2.3


From bb9cd077e216b886438c5698e1cd75f762ecd3c9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 17 Apr 2019 11:45:13 +0200
Subject: xfrm: remove unneeded export_symbols

None of them have any external callers, make them static.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 18d6b33501b9..eb5018b1cf9c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1568,7 +1568,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb);
-int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err);
 int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol);
 int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol);
 int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family);
@@ -1584,7 +1583,6 @@ int xfrm6_rcv(struct sk_buff *skb);
 int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 		     xfrm_address_t *saddr, u8 proto);
 void xfrm6_local_error(struct sk_buff *skb, u32 mtu);
-int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err);
 int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol);
 int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol);
 int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family);
-- 
cgit v1.2.3


From 089b19a9204fc090793d389a265f54124eacb05d Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 22 Apr 2019 08:55:44 -0700
Subject: flow_dissector: switch kernel context to struct bpf_flow_dissector

struct bpf_flow_dissector has a small subset of sk_buff fields that
flow dissector BPF program is allowed to access and an optional
pointer to real skb. Real skb is used only in bpf_skb_load_bytes
helper to read non-linear data.

The real motivation for this is to be able to call flow dissector
from eth_get_headlen context where we don't have an skb and need
to dissect raw bytes.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/flow_dissector.h |  7 +++++++
 include/net/sch_generic.h    | 11 ++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 2b26979efb48..7c5a8d9a8d2a 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -305,4 +305,11 @@ static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissec
 	return ((char *)target_container) + flow_dissector->offset[key_id];
 }
 
+struct bpf_flow_dissector {
+	struct bpf_flow_keys	*flow_keys;
+	const struct sk_buff	*skb;
+	void			*data;
+	void			*data_end;
+};
+
 #endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e8f85cd2afce..21f434f3ac9e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -364,13 +364,10 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
-	union {
-		struct {
-			unsigned int		pkt_len;
-			u16			slave_dev_queue_mapping;
-			u16			tc_classid;
-		};
-		struct bpf_flow_keys *flow_keys;
+	struct {
+		unsigned int		pkt_len;
+		u16			slave_dev_queue_mapping;
+		u16			tc_classid;
 	};
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
-- 
cgit v1.2.3


From f05713e0916ca46f127641b6afa74bd1a0772423 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Apr 2019 18:35:03 -0700
Subject: ipv6: convert fib6_ref to refcount_t

We suspect some issues involving fib6_ref 0 -> 1 transitions might
cause strange syzbot reports.

Lets convert fib6_ref to refcount_t to catch them earlier.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 352f767bea81..5a4a67b38712 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -146,7 +146,7 @@ struct fib6_info {
 	struct list_head		fib6_siblings;
 	unsigned int			fib6_nsiblings;
 
-	atomic_t			fib6_ref;
+	refcount_t			fib6_ref;
 	unsigned long			expires;
 	struct dst_metrics		*fib6_metrics;
 #define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
@@ -284,17 +284,17 @@ void fib6_info_destroy_rcu(struct rcu_head *head);
 
 static inline void fib6_info_hold(struct fib6_info *f6i)
 {
-	atomic_inc(&f6i->fib6_ref);
+	refcount_inc(&f6i->fib6_ref);
 }
 
 static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
 {
-	return atomic_inc_not_zero(&f6i->fib6_ref);
+	return refcount_inc_not_zero(&f6i->fib6_ref);
 }
 
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
-	if (f6i && atomic_dec_and_test(&f6i->fib6_ref))
+	if (f6i && refcount_dec_and_test(&f6i->fib6_ref))
 		call_rcu(&f6i->rcu, fib6_info_destroy_rcu);
 }
 
-- 
cgit v1.2.3


From ffa8ce54be3aaf6b15abae3bbd08282b867d3a4f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 08:23:41 -0700
Subject: lwtunnel: Pass encap and encap type attributes to lwtunnel_fill_encap

Currently, lwtunnel_fill_encap hardcodes the encap and encap type
attributes as RTA_ENCAP and RTA_ENCAP_TYPE, respectively. The nexthop
objects want to re-use this code but the encap attributes passed to
userspace as NHA_ENCAP and NHA_ENCAP_TYPE. Since that is the only
difference, change lwtunnel_fill_encap to take the attribute type as
an input.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 671113bcb2cc..5d6c5b1fc695 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -118,8 +118,8 @@ int lwtunnel_build_state(u16 encap_type,
 			 unsigned int family, const void *cfg,
 			 struct lwtunnel_state **lws,
 			 struct netlink_ext_ack *extack);
-int lwtunnel_fill_encap(struct sk_buff *skb,
-			struct lwtunnel_state *lwtstate);
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate,
+			int encap_attr, int encap_type_attr);
 int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
 struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
@@ -219,7 +219,8 @@ static inline int lwtunnel_build_state(u16 encap_type,
 }
 
 static inline int lwtunnel_fill_encap(struct sk_buff *skb,
-				      struct lwtunnel_state *lwtstate)
+				      struct lwtunnel_state *lwtstate,
+				      int encap_attr, int encap_type_attr)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From ecc5663cce8c7d7e4eba32af4e1e3cab296c64b9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 08:48:09 -0700
Subject: net: Change nhc_flags to unsigned char

nhc_flags holds the RTNH_F flags for a given nexthop (fib{6}_nh).
All of the RTNH_F_ flags fit in an unsigned char, and since the API to
userspace (rtnh_flags and lower byte of rtm_flags) is 1 byte it can not
grow. Make nhc_flags in fib_nh_common an unsigned char and shrink the
size of the struct by 8, from 56 to 48 bytes.

Update the flags arguments for up netdevice events and fib_nexthop_info
which determines the RTNH_F flags to return on a dump/event. The RTNH_F
flags are passed in the lower byte of rtm_flags which is an unsigned int
so use a temp variable for the flags to fib_nexthop_info and combine
with rtm_flags in the caller.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 include/net/ip_fib.h    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index df9cebc2b20c..4790beaa86e0 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -182,7 +182,7 @@ int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
-void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
+void rt6_sync_up(struct net_device *dev, unsigned char nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
 void rt6_multipath_rebalance(struct fib6_info *f6i);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index d8195c77e247..772a9e61bd84 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -83,11 +83,11 @@ struct fnhe_hash_bucket {
 struct fib_nh_common {
 	struct net_device	*nhc_dev;
 	int			nhc_oif;
-	unsigned int		nhc_flags;
-	struct lwtunnel_state	*nhc_lwtstate;
 	unsigned char		nhc_scope;
 	u8			nhc_family;
 	u8			nhc_gw_family;
+	unsigned char		nhc_flags;
+	struct lwtunnel_state	*nhc_lwtstate;
 
 	union {
 		__be32          ipv4;
@@ -425,7 +425,7 @@ int fib_unmerge(struct net *net);
 int ip_fib_check_default(__be32 gw, struct net_device *dev);
 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
-int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
+int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -500,7 +500,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct netlink_callback *cb);
 
 int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh,
-		     unsigned int *flags, bool skip_oif);
+		     unsigned char *flags, bool skip_oif);
 int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh,
 		    int nh_weight);
 #endif  /* _NET_FIB_H */
-- 
cgit v1.2.3


From a65120bae4b7425a39c5783aa3d4fc29677eef0e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 18:05:33 -0700
Subject: ipv6: Use result arg in fib_lookup_arg consistently

arg.result is sometimes used as fib6_result and sometimes used to
hold the rt6_info. Add rt6_info to fib6_result and make the use
of arg.result consistent through ipv6 rules.

The rt6 entry is filled in for lookups returning a dst_entry, but not
for direct fib_lookups that just want a fib6_info.

Fixes: effda4dd97e8 ("ipv6: Pass fib6_result to fib lookups")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5a4a67b38712..40105738e2f6 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -195,6 +195,7 @@ struct fib6_result {
 	struct fib6_info	*f6i;
 	u32			fib6_flags;
 	u8			fib6_type;
+	struct rt6_info		*rt6;
 };
 
 #define for_each_fib6_node_rt_rcu(fn)					\
-- 
cgit v1.2.3


From d5bb334a8e171b262e48f378bd2096c0ea458265 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Wed, 24 Apr 2019 22:19:17 +0200
Subject: Bluetooth: Align minimum encryption key size for LE and BR/EDR
 connections

The minimum encryption key size for LE connections is 56 bits and to
align LE with BR/EDR, enforce 56 bits of minimum encryption key size for
BR/EDR connections as well.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Cc: stable@vger.kernel.org
---
 include/net/bluetooth/hci_core.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 094e61e07030..05b1b96f4d9e 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -190,6 +190,9 @@ struct adv_info {
 
 #define HCI_MAX_SHORT_NAME_LENGTH	10
 
+/* Min encryption key size to match with SMP */
+#define HCI_MIN_ENC_KEY_SIZE		7
+
 /* Default LE RPA expiry time, 15 minutes */
 #define HCI_DEFAULT_RPA_TIMEOUT		(15 * 60)
 
-- 
cgit v1.2.3


From f7dacfb11475ba777e1e84ccec2e14b0ba5a17a3 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Fri, 15 Mar 2019 17:39:03 +0200
Subject: cfg80211: support non-inheritance element

Subelement profile may specify element IDs it doesn't inherit
from the management frame. Support it.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 70432fd638af..777c4f021610 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5534,6 +5534,14 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
 	u64_to_ether_addr(new_bssid_u64, new_bssid);
 }
 
+/**
+ * cfg80211_is_element_inherited - returns if element ID should be inherited
+ * @element: element to check
+ * @non_inherit_element: non inheritance element
+ */
+bool cfg80211_is_element_inherited(const struct element *element,
+				   const struct element *non_inherit_element);
+
 /**
  * enum cfg80211_bss_frame_type - frame type that the BSS data came from
  * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is
-- 
cgit v1.2.3


From fe806e4992c9047affd263bcc13b2c047029a726 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Fri, 15 Mar 2019 17:39:05 +0200
Subject: cfg80211: support profile split between elements

Since an element is limited to 255 octets, a profile may be split
split to several elements. Support the split as defined in the 11ax
draft 3. Detect legacy split and print a net-rate limited warning,
since there is no ROI in supporting this probably non-existent
split.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 777c4f021610..f6665f8eba5a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5542,6 +5542,20 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
 bool cfg80211_is_element_inherited(const struct element *element,
 				   const struct element *non_inherit_element);
 
+/**
+ * cfg80211_merge_profile - merges a MBSSID profile if it is split between IEs
+ * @ie: ies
+ * @ielen: length of IEs
+ * @mbssid_elem: current MBSSID element
+ * @sub_elem: current MBSSID subelement (profile)
+ * @merged_ie: location of the merged profile
+ * @max_copy_len: max merged profile length
+ */
+size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
+			      const struct element *mbssid_elem,
+			      const struct element *sub_elem,
+			      u8 **merged_ie, size_t max_copy_len);
+
 /**
  * enum cfg80211_bss_frame_type - frame type that the BSS data came from
  * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is
-- 
cgit v1.2.3


From f2af2df800d3648b1d68e02d5b8a5d77cfee8970 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 16 Mar 2019 18:06:32 +0100
Subject: mac80211: calculate hash for fq without holding fq->lock in itxq
 enqueue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reduces lock contention on enqueue/dequeue of iTXQ packets

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/fq_impl.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index be7c0fab3478..2caa86660ab0 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -107,21 +107,23 @@ begin:
 	return skb;
 }
 
+static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb)
+{
+	u32 hash = skb_get_hash_perturb(skb, fq->perturbation);
+
+	return reciprocal_scale(hash, fq->flows_cnt);
+}
+
 static struct fq_flow *fq_flow_classify(struct fq *fq,
-					struct fq_tin *tin,
+					struct fq_tin *tin, u32 idx,
 					struct sk_buff *skb,
 					fq_flow_get_default_t get_default_func)
 {
 	struct fq_flow *flow;
-	u32 hash;
-	u32 idx;
 
 	lockdep_assert_held(&fq->lock);
 
-	hash = skb_get_hash_perturb(skb, fq->perturbation);
-	idx = reciprocal_scale(hash, fq->flows_cnt);
 	flow = &fq->flows[idx];
-
 	if (flow->tin && flow->tin != tin) {
 		flow = get_default_func(fq, tin, idx, skb);
 		tin->collisions++;
@@ -153,7 +155,7 @@ static void fq_recalc_backlog(struct fq *fq,
 }
 
 static void fq_tin_enqueue(struct fq *fq,
-			   struct fq_tin *tin,
+			   struct fq_tin *tin, u32 idx,
 			   struct sk_buff *skb,
 			   fq_skb_free_t free_func,
 			   fq_flow_get_default_t get_default_func)
@@ -163,7 +165,7 @@ static void fq_tin_enqueue(struct fq *fq,
 
 	lockdep_assert_held(&fq->lock);
 
-	flow = fq_flow_classify(fq, tin, skb, get_default_func);
+	flow = fq_flow_classify(fq, tin, idx, skb, get_default_func);
 
 	flow->tin = tin;
 	flow->backlog += skb->len;
-- 
cgit v1.2.3


From 6cdd3979a2bdc16116c5b2eb09475abf54ba9e70 Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Tue, 19 Mar 2019 21:34:07 +0100
Subject: nl80211/cfg80211: Extended Key ID support

Add support for IEEE 802.11-2016 "Extended Key ID for Individually
Addressed Frames".

Extend cfg80211 and nl80211 to allow pairwise keys to be installed for
Rx only, enable Tx separately and allow Key ID 1 for pairwise keys.

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
[use NLA_POLICY_RANGE() for NL80211_KEY_MODE]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f6665f8eba5a..2b039802ae2e 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -485,6 +485,7 @@ struct vif_params {
  *	with the get_key() callback, must be in little endian,
  *	length given by @seq_len.
  * @seq_len: length of @seq.
+ * @mode: key install mode (RX_TX, NO_TX or SET_TX)
  */
 struct key_params {
 	const u8 *key;
@@ -492,6 +493,7 @@ struct key_params {
 	int key_len;
 	int seq_len;
 	u32 cipher;
+	enum nl80211_key_mode mode;
 };
 
 /**
-- 
cgit v1.2.3


From 96fc6efb9ad9d0cd8cbb4462f0eb2a07092649e6 Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Tue, 19 Mar 2019 21:34:08 +0100
Subject: mac80211: IEEE 802.11 Extended Key ID support

Add support for Extended Key ID as defined in IEEE 802.11-2016.

 - Implement the nl80211 API for Extended Key ID
 - Extend mac80211 API to allow drivers to support Extended Key ID
 - Enable Extended Key ID by default for drivers only supporting SW
   crypto (e.g. mac80211_hwsim)
 - Allow unicast Tx usage to be supressed (IEEE80211_KEY_FLAG_NO_AUTO_TX)
 - Select the decryption key based on the MPDU keyid
 - Enforce existing assumptions in the code that rekeys don't change the
   cipher

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
[remove module parameter]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ac2ed8ec662b..c10abca55fde 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1697,6 +1697,7 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);
  * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for
  *	a TKIP key if it only requires MIC space. Do not set together with
  *	@IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key.
+ * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation.
  */
 enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_GENERATE_IV_MGMT	= BIT(0),
@@ -1708,6 +1709,7 @@ enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_RX_MGMT		= BIT(6),
 	IEEE80211_KEY_FLAG_RESERVE_TAILROOM	= BIT(7),
 	IEEE80211_KEY_FLAG_PUT_MIC_SPACE	= BIT(8),
+	IEEE80211_KEY_FLAG_NO_AUTO_TX		= BIT(9),
 };
 
 /**
@@ -2243,6 +2245,9 @@ struct ieee80211_txq {
  * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID
  *	only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set.
  *
+ * @IEEE80211_HW_EXT_KEY_ID_NATIVE: Driver and hardware are supporting Extended
+ *	Key ID and can handle two unicast keys per station for Rx and Tx.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2294,6 +2299,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN,
 	IEEE80211_HW_SUPPORTS_MULTI_BSSID,
 	IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID,
+	IEEE80211_HW_EXT_KEY_ID_NATIVE,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
-- 
cgit v1.2.3


From e96d1cd2635c05efdd01b4eafcfc50c22c40751f Mon Sep 17 00:00:00 2001
From: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Date: Fri, 29 Mar 2019 16:18:21 +0530
Subject: cfg80211: Add support to set tx power for a station associated

This patch adds support to set transmit power setting type and transmit
power level attributes to NL80211_CMD_SET_STATION in order to facilitate
adjusting the transmit power level of a station associated to the AP.

The added attributes allow selection of automatic and limited transmit
power level, with the level defined in dBm format.

Co-developed-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2b039802ae2e..2ea04e94b522 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -975,6 +975,27 @@ enum station_parameters_apply_mask {
 	STATION_PARAM_APPLY_UAPSD = BIT(0),
 	STATION_PARAM_APPLY_CAPABILITY = BIT(1),
 	STATION_PARAM_APPLY_PLINK_STATE = BIT(2),
+	STATION_PARAM_APPLY_STA_TXPOWER = BIT(3),
+};
+
+/**
+ * struct sta_txpwr - station txpower configuration
+ *
+ * Used to configure txpower for station.
+ *
+ * @power: tx power (in dBm) to be used for sending data traffic. If tx power
+ *	is not provided, the default per-interface tx power setting will be
+ *	overriding. Driver should be picking up the lowest tx power, either tx
+ *	power per-interface or per-station.
+ * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power
+ *	will be less than or equal to specified from userspace, whereas if TPC
+ *	%type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power.
+ *	NL80211_TX_POWER_FIXED is not a valid configuration option for
+ *	per peer TPC.
+ */
+struct sta_txpwr {
+	s16 power;
+	enum nl80211_tx_power_setting type;
 };
 
 /**
@@ -1049,6 +1070,7 @@ struct station_parameters {
 	const struct ieee80211_he_cap_elem *he_capa;
 	u8 he_capa_len;
 	u16 airtime_weight;
+	struct sta_txpwr txpwr;
 };
 
 /**
-- 
cgit v1.2.3


From ba905bf432f662cb907fd692a4f160e612c0408b Mon Sep 17 00:00:00 2001
From: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Date: Fri, 29 Mar 2019 16:19:09 +0530
Subject: mac80211: store tx power value from user to station

This patch introduce a new driver callback drv_sta_set_txpwr. This API will
copy the transmit power value passed from user space and call the driver
callback to set the tx power for the station.

Co-developed-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c10abca55fde..d66fbfe8d55d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1889,6 +1889,24 @@ struct ieee80211_sta_rates {
 	} rate[IEEE80211_TX_RATE_TABLE_SIZE];
 };
 
+/**
+ * struct ieee80211_sta_txpwr - station txpower configuration
+ *
+ * Used to configure txpower for station.
+ *
+ * @power: indicates the tx power, in dBm, to be used when sending data frames
+ *	to the STA.
+ * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power
+ *	will be less than or equal to specified from userspace, whereas if TPC
+ *	%type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power.
+ *	NL80211_TX_POWER_FIXED is not a valid configuration option for
+ *	per peer TPC.
+ */
+struct ieee80211_sta_txpwr {
+	s16 power;
+	enum nl80211_tx_power_setting type;
+};
+
 /**
  * struct ieee80211_sta - station table entry
  *
@@ -1975,6 +1993,7 @@ struct ieee80211_sta {
 	bool support_p2p_ps;
 	u16 max_rc_amsdu_len;
 	u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS];
+	struct ieee80211_sta_txpwr txpwr;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];
 
@@ -3800,6 +3819,9 @@ struct ieee80211_ops {
 #endif
 	void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			enum sta_notify_cmd, struct ieee80211_sta *sta);
+	int (*sta_set_txpwr)(struct ieee80211_hw *hw,
+			     struct ieee80211_vif *vif,
+			     struct ieee80211_sta *sta);
 	int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			 struct ieee80211_sta *sta,
 			 enum ieee80211_sta_state old_state,
-- 
cgit v1.2.3


From 5809a5d54bb9eda3a388b5a712657970c2cb9f8e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 11 Apr 2019 11:59:50 +0300
Subject: cfg80211: don't pass pointer to pointer unnecessarily

The cfg80211_merge_profile() and ieee802_11_find_bssid_profile() are
a bit cleaner if we just pass the merged_ie pointer instead of a pointer
to the pointer.

This isn't a functional change, it's just a clean up.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2ea04e94b522..944de1802210 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5578,7 +5578,7 @@ bool cfg80211_is_element_inherited(const struct element *element,
 size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
 			      const struct element *mbssid_elem,
 			      const struct element *sub_elem,
-			      u8 **merged_ie, size_t max_copy_len);
+			      u8 *merged_ie, size_t max_copy_len);
 
 /**
  * enum cfg80211_bss_frame_type - frame type that the BSS data came from
-- 
cgit v1.2.3


From 5ab92e7fe49ad74293b50fb9e6f25be5521e2f68 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanohar@codeaurora.org>
Date: Thu, 11 Apr 2019 13:47:24 -0700
Subject: cfg80211: add support to probe unexercised mesh link

Adding support to allow mesh HWMP to measure link metrics on unexercised
direct mesh path by sending some data frames to other mesh points which
are not currently selected as a primary traffic path but only 1 hop away.
The absence of the primary path to the chosen node makes it necessary to
apply some form of marking on a chosen packet stream so that the packets
can be properly steered to the selected node for testing, and not by the
regular mesh path lookup.

Tested-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Rajkumar Manoharan <rmanohar@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 944de1802210..298301525f9f 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3501,6 +3501,9 @@ struct cfg80211_update_owe_info {
  * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME
  *	but offloading OWE processing to the user space will get the updated
  *	DH IE through this interface.
+ *
+ * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame
+ *	and overrule HWMP path selection algorithm.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3817,6 +3820,8 @@ struct cfg80211_ops {
 			      struct cfg80211_pmsr_request *request);
 	int	(*update_owe_info)(struct wiphy *wiphy, struct net_device *dev,
 				   struct cfg80211_update_owe_info *owe_info);
+	int	(*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev,
+				   const u8 *buf, size_t len);
 };
 
 /*
-- 
cgit v1.2.3


From 8828f81ad4a2f4e89ebe6e7793c06ed767c31d53 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanohar@codeaurora.org>
Date: Thu, 11 Apr 2019 13:47:26 -0700
Subject: mac80211: probe unexercised mesh links

The requirement for mesh link metric refreshing, is that from one
mesh point we be able to send some data frames to other mesh points
which are not currently selected as a primary traffic path, but which
are only 1 hop away. The absence of the primary path to the chosen node
makes it necessary to apply some form of marking on a chosen packet
stream so that the packets can be properly steered to the selected node
for testing, and not by the regular mesh path lookup.

Tested-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Rajkumar Manoharan <rmanohar@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d66fbfe8d55d..76a443f32fc8 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -807,6 +807,7 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
  * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
+ * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -816,6 +817,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
 	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
+	IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP	= BIT(5),
 };
 
 /*
-- 
cgit v1.2.3


From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 26 Apr 2019 16:39:39 -0700
Subject: bpf: Introduce bpf sk local storage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After allowing a bpf prog to
- directly read the skb->sk ptr
- get the fullsock bpf_sock by "bpf_sk_fullsock()"
- get the bpf_tcp_sock by "bpf_tcp_sock()"
- get the listener sock by "bpf_get_listener_sock()"
- avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock"
  into different bpf running context.

this patch is another effort to make bpf's network programming
more intuitive to do (together with memory and performance benefit).

When bpf prog needs to store data for a sk, the current practice is to
define a map with the usual 4-tuples (src/dst ip/port) as the key.
If multiple bpf progs require to store different sk data, multiple maps
have to be defined.  Hence, wasting memory to store the duplicated
keys (i.e. 4 tuples here) in each of the bpf map.
[ The smallest key could be the sk pointer itself which requires
  some enhancement in the verifier and it is a separate topic. ]

Also, the bpf prog needs to clean up the elem when sk is freed.
Otherwise, the bpf map will become full and un-usable quickly.
The sk-free tracking currently could be done during sk state
transition (e.g. BPF_SOCK_OPS_STATE_CB).

The size of the map needs to be predefined which then usually ended-up
with an over-provisioned map in production.  Even the map was re-sizable,
while the sk naturally come and go away already, this potential re-size
operation is arguably redundant if the data can be directly connected
to the sk itself instead of proxy-ing through a bpf map.

This patch introduces sk->sk_bpf_storage to provide local storage space
at sk for bpf prog to use.  The space will be allocated when the first bpf
prog has created data for this particular sk.

The design optimizes the bpf prog's lookup (and then optionally followed by
an inline update).  bpf_spin_lock should be used if the inline update needs
to be protected.

BPF_MAP_TYPE_SK_STORAGE:
-----------------------
To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in
this patch) needs to be created.  Multiple BPF_MAP_TYPE_SK_STORAGE maps can
be created to fit different bpf progs' needs.  The map enforces
BTF to allow printing the sk-local-storage during a system-wise
sk dump (e.g. "ss -ta") in the future.

The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete
a "sk-local-storage" data from a particular sk.
Think of the map as a meta-data (or "type") of a "sk-local-storage".  This
particular "type" of "sk-local-storage" data can then be stored in any sk.

The main purposes of this map are mostly:
1. Define the size of a "sk-local-storage" type.
2. Provide a similar syscall userspace API as the map (e.g. lookup/update,
   map-id, map-btf...etc.)
3. Keep track of all sk's storages of this "type" and clean them up
   when the map is freed.

sk->sk_bpf_storage:
------------------
The main lookup/update/delete is done on sk->sk_bpf_storage (which
is a "struct bpf_sk_storage").  When doing a lookup,
the "map" pointer is now used as the "key" to search on the
sk_storage->list.  The "map" pointer is actually serving
as the "type" of the "sk-local-storage" that is being
requested.

To allow very fast lookup, it should be as fast as looking up an
array at a stable-offset.  At the same time, it is not ideal to
set a hard limit on the number of sk-local-storage "type" that the
system can have.  Hence, this patch takes a cache approach.
The last search result from sk_storage->list is cached in
sk_storage->cache[] which is a stable sized array.  Each
"sk-local-storage" type has a stable offset to the cache[] array.
In the future, a map's flag could be introduced to do cache
opt-out/enforcement if it became necessary.

The cache size is 16 (i.e. 16 types of "sk-local-storage").
Programs can share map.  On the program side, having a few bpf_progs
running in the networking hotpath is already a lot.  The bpf_prog
should have already consolidated the existing sock-key-ed map usage
to minimize the map lookup penalty.  16 has enough runway to grow.

All sk-local-storage data will be removed from sk->sk_bpf_storage
during sk destruction.

bpf_sk_storage_get() and bpf_sk_storage_delete():
------------------------------------------------
Instead of using bpf_map_(lookup|update|delete)_elem(),
the bpf prog needs to use the new helper bpf_sk_storage_get() and
bpf_sk_storage_delete().  The verifier can then enforce the
ARG_PTR_TO_SOCKET argument.  The bpf_sk_storage_get() also allows to
"create" new elem if one does not exist in the sk.  It is done by
the new BPF_SK_STORAGE_GET_F_CREATE flag.  An optional value can also be
provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE.
The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock.  Together,
it has eliminated the potential use cases for an equivalent
bpf_map_update_elem() API (for bpf_prog) in this patch.

Misc notes:
----------
1. map_get_next_key is not supported.  From the userspace syscall
   perspective,  the map has the socket fd as the key while the map
   can be shared by pinned-file or map-id.

   Since btf is enforced, the existing "ss" could be enhanced to pretty
   print the local-storage.

   Supporting a kernel defined btf with 4 tuples as the return key could
   be explored later also.

2. The sk->sk_lock cannot be acquired.  Atomic operations is used instead.
   e.g. cmpxchg is done on the sk->sk_bpf_storage ptr.
   Please refer to the source code comments for the details in
   synchronization cases and considerations.

3. The mem is charged to the sk->sk_omem_alloc as the sk filter does.

Benchmark:
---------
Here is the benchmark data collected by turning on
the "kernel.bpf_stats_enabled" sysctl.
Two bpf progs are tested:

One bpf prog with the usual bpf hashmap (max_entries = 8192) with the
sk ptr as the key. (verifier is modified to support sk ptr as the key
That should have shortened the key lookup time.)

Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE.

Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for
each egress skb and then bump the cnt.  netperf is used to drive
data with 4096 connected UDP sockets.

BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run)
27: cgroup_skb  name egress_sk_map  tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633
    loaded_at 2019-04-15T13:46:39-0700  uid 0
    xlated 344B  jited 258B  memlock 4096B  map_ids 16
    btf_id 5

BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run)
30: cgroup_skb  name egress_sk_stora  tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739
    loaded_at 2019-04-15T13:47:54-0700  uid 0
    xlated 168B  jited 156B  memlock 4096B  map_ids 17
    btf_id 6

Here is a high-level picture on how are the objects organized:

       sk
    ┌──────┐
    │      │
    │      │
    │      │
    │*sk_bpf_storage─────▶ bpf_sk_storage
    └──────┘                 ┌───────┐
                 ┌───────────┤ list  │
                 │           │       │
                 │           │       │
                 │           │       │
                 │           └───────┘
                 │
                 │     elem
                 │  ┌────────┐
                 ├─▶│ snode  │
                 │  ├────────┤
                 │  │  data  │          bpf_map
                 │  ├────────┤        ┌─────────┐
                 │  │map_node│◀─┬─────┤  list   │
                 │  └────────┘  │     │         │
                 │              │     │         │
                 │     elem     │     │         │
                 │  ┌────────┐  │     └─────────┘
                 └─▶│ snode  │  │
                    ├────────┤  │
   bpf_map          │  data  │  │
 ┌─────────┐        ├────────┤  │
 │  list   ├───────▶│map_node│  │
 │         │        └────────┘  │
 │         │                    │
 │         │           elem     │
 └─────────┘        ┌────────┐  │
                 ┌─▶│ snode  │  │
                 │  ├────────┤  │
                 │  │  data  │  │
                 │  ├────────┤  │
                 │  │map_node│◀─┘
                 │  └────────┘
                 │
                 │
                 │          ┌───────┐
     sk          └──────────│ list  │
  ┌──────┐                  │       │
  │      │                  │       │
  │      │                  │       │
  │      │                  └───────┘
  │*sk_bpf_storage───────▶bpf_sk_storage
  └──────┘

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/bpf_sk_storage.h | 13 +++++++++++++
 include/net/sock.h           |  5 +++++
 2 files changed, 18 insertions(+)
 create mode 100644 include/net/bpf_sk_storage.h

(limited to 'include/net')

diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
new file mode 100644
index 000000000000..b9dcb02e756b
--- /dev/null
+++ b/include/net/bpf_sk_storage.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2019 Facebook */
+#ifndef _BPF_SK_STORAGE_H
+#define _BPF_SK_STORAGE_H
+
+struct sock;
+
+void bpf_sk_storage_free(struct sock *sk);
+
+extern const struct bpf_func_proto bpf_sk_storage_get_proto;
+extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
+
+#endif /* _BPF_SK_STORAGE_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 784cd19d5ff7..4d208c0f9c14 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -236,6 +236,8 @@ struct sock_common {
 	/* public: */
 };
 
+struct bpf_sk_storage;
+
 /**
   *	struct sock - network layer representation of sockets
   *	@__sk_common: shared layout with inet_timewait_sock
@@ -510,6 +512,9 @@ struct sock {
 #endif
 	void                    (*sk_destruct)(struct sock *sk);
 	struct sock_reuseport __rcu	*sk_reuseport_cb;
+#ifdef CONFIG_BPF_SYSCALL
+	struct bpf_sk_storage __rcu	*sk_bpf_storage;
+#endif
 	struct rcu_head		sk_rcu;
 };
 
-- 
cgit v1.2.3


From 9e9957973c7785b1f8fa77f099cac661cc5e7e5b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 25 Apr 2019 12:32:02 -0700
Subject: net/tls: remove old exports of sk_destruct functions

tls_device_sk_destruct being set on a socket used to indicate
that socket is a kTLS device one.  That is no longer true -
now we use sk_validate_xmit_skb pointer for that purpose.
Remove the export.  tls_device_attach() needs to be moved.

While at it, remove the dead declaration of tls_sk_destruct().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index d9d0ac66f040..20196cb31ecc 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -317,7 +317,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
 int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_device_sendpage(struct sock *sk, struct page *page,
 			int offset, size_t size, int flags);
-void tls_device_sk_destruct(struct sock *sk);
 void tls_device_free_resources_tx(struct sock *sk);
 void tls_device_init(void);
 void tls_device_cleanup(void);
@@ -336,7 +335,6 @@ static inline u32 tls_record_start_seq(struct tls_record_info *rec)
 	return rec->end_seq - rec->len;
 }
 
-void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 int tls_push_sg(struct sock *sk, struct tls_context *ctx,
 		struct scatterlist *sg, u16 first_offset,
 		int flags);
-- 
cgit v1.2.3


From da68b4ad02343862fee1e3e8c6315984f16a4778 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 25 Apr 2019 12:32:03 -0700
Subject: net/tls: move definition of tls ops into net/tls.h

There seems to be no reason for tls_ops to be defined in netdevice.h
which is included in a lot of places.  Don't wrap the struct/enum
declaration in ifdefs, it trickles down unnecessary ifdefs into
driver code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index 20196cb31ecc..41a2ee643fc5 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -277,6 +277,23 @@ struct tls_context {
 	void (*unhash)(struct sock *sk);
 };
 
+enum tls_offload_ctx_dir {
+	TLS_OFFLOAD_CTX_DIR_RX,
+	TLS_OFFLOAD_CTX_DIR_TX,
+};
+
+struct tlsdev_ops {
+	int (*tls_dev_add)(struct net_device *netdev, struct sock *sk,
+			   enum tls_offload_ctx_dir direction,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn);
+	void (*tls_dev_del)(struct net_device *netdev,
+			    struct tls_context *ctx,
+			    enum tls_offload_ctx_dir direction);
+	void (*tls_dev_resync_rx)(struct net_device *netdev,
+				  struct sock *sk, u32 seq, u64 rcd_sn);
+};
+
 struct tls_offload_context_rx {
 	/* sw must be the first member of tls_offload_context_rx */
 	struct tls_sw_context_rx sw;
-- 
cgit v1.2.3


From 63a1c95f3fe48b4e9fe0c261b376e5e527b71b25 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 25 Apr 2019 12:32:04 -0700
Subject: net/tls: byte swap device req TCP seq no upon setting

To avoid a sparse warning byteswap the be32 sequence number
before it's stored in the atomic value.  While at it drop
unnecessary brackets and use kernel's u64 type.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tls.h b/include/net/tls.h
index 41a2ee643fc5..39ea62f0c1f6 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -562,7 +562,7 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq)
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
 
-	atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1));
+	atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1);
 }
 
 
-- 
cgit v1.2.3


From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Fri, 26 Apr 2019 11:13:06 +0200
Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag

Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most
netlink based interfaces (including recently added ones) are still not
setting it in kernel generated messages. Without the flag, message parsers
not aware of attribute semantics (e.g. wireshark dissector or libmnl's
mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display
the structure of their contents.

Unfortunately we cannot just add the flag everywhere as there may be
userspace applications which check nlattr::nla_type directly rather than
through a helper masking out the flags. Therefore the patch renames
nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start()
as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually
are rewritten to use nla_nest_start().

Except for changes in include/net/netlink.h, the patch was generated using
this semantic patch:

@@ expression E1, E2; @@
-nla_nest_start(E1, E2)
+nla_nest_start_noflag(E1, E2)

@@ expression E1, E2; @@
-nla_nest_start_noflag(E1, E2 | NLA_F_NESTED)
+nla_nest_start(E1, E2)

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 23f27b0b3cef..1f18b47f41b4 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1415,13 +1415,18 @@ static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
 }
 
 /**
- * nla_nest_start - Start a new level of nested attributes
+ * nla_nest_start_noflag - Start a new level of nested attributes
  * @skb: socket buffer to add attributes to
  * @attrtype: attribute type of container
  *
- * Returns the container attribute
+ * This function exists for backward compatibility to use in APIs which never
+ * marked their nest attributes with NLA_F_NESTED flag. New APIs should use
+ * nla_nest_start() which sets the flag.
+ *
+ * Returns the container attribute or NULL on error
  */
-static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
+static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
+						   int attrtype)
 {
 	struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb);
 
@@ -1431,6 +1436,21 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
 	return start;
 }
 
+/**
+ * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED
+ * @skb: socket buffer to add attributes to
+ * @attrtype: attribute type of container
+ *
+ * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED
+ * flag. This is the preferred function to use in new code.
+ *
+ * Returns the container attribute or NULL on error
+ */
+static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
+{
+	return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED);
+}
+
 /**
  * nla_nest_end - Finalize nesting of attributes
  * @skb: socket buffer the attributes are stored in
-- 
cgit v1.2.3


From 6f455f5f4e9c28aefaefbe18ce7304b499645d75 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:27 +0200
Subject: netlink: add NLA_MIN_LEN

Rather than using NLA_UNSPEC for this type of thing, use NLA_MIN_LEN
so we can make NLA_UNSPEC be NLA_REJECT under certain conditions for
future attributes.

While at it, also use NLA_EXACT_LEN for the struct example.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 1f18b47f41b4..c77ed51c18f1 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -183,6 +183,7 @@ enum {
 	NLA_REJECT,
 	NLA_EXACT_LEN,
 	NLA_EXACT_LEN_WARN,
+	NLA_MIN_LEN,
 	__NLA_TYPE_MAX,
 };
 
@@ -212,6 +213,7 @@ enum nla_policy_validation {
  *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
  *    NLA_FLAG             Unused
  *    NLA_BINARY           Maximum length of attribute payload
+ *    NLA_MIN_LEN          Minimum length of attribute payload
  *    NLA_NESTED,
  *    NLA_NESTED_ARRAY     Length verification is done by checking len of
  *                         nested header (or empty); len field is used if
@@ -230,6 +232,7 @@ enum nla_policy_validation {
  *                         it is rejected.
  *    NLA_EXACT_LEN_WARN   Attribute should have exactly this length, a warning
  *                         is logged if it is longer, shorter is rejected.
+ *    NLA_MIN_LEN          Minimum length of attribute payload
  *    All other            Minimum length of attribute payload
  *
  * Meaning of `validation_data' field:
@@ -281,7 +284,7 @@ enum nla_policy_validation {
  * static const struct nla_policy my_policy[ATTR_MAX+1] = {
  * 	[ATTR_FOO] = { .type = NLA_U16 },
  *	[ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
- *	[ATTR_BAZ] = { .len = sizeof(struct mystruct) },
+ *	[ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) },
  *	[ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags },
  * };
  */
@@ -302,6 +305,7 @@ struct nla_policy {
 #define NLA_POLICY_EXACT_LEN(_len)	{ .type = NLA_EXACT_LEN, .len = _len }
 #define NLA_POLICY_EXACT_LEN_WARN(_len)	{ .type = NLA_EXACT_LEN_WARN, \
 					  .len = _len }
+#define NLA_POLICY_MIN_LEN(_len)	{ .type = NLA_MIN_LEN, .len = _len }
 
 #define NLA_POLICY_ETH_ADDR		NLA_POLICY_EXACT_LEN(ETH_ALEN)
 #define NLA_POLICY_ETH_ADDR_COMPAT	NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)
-- 
cgit v1.2.3


From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:28 +0200
Subject: netlink: make validation more configurable for future strictness

We currently have two levels of strict validation:

 1) liberal (default)
     - undefined (type >= max) & NLA_UNSPEC attributes accepted
     - attribute length >= expected accepted
     - garbage at end of message accepted
 2) strict (opt-in)
     - NLA_UNSPEC attributes accepted
     - attribute length >= expected accepted

Split out parsing strictness into four different options:
 * TRAILING     - check that there's no trailing data after parsing
                  attributes (in message or nested)
 * MAXTYPE      - reject attrs > max known type
 * UNSPEC       - reject attributes with NLA_UNSPEC policy entries
 * STRICT_ATTRS - strictly validate attribute size

The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().

Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.

We end up with the following renames:
 * nla_parse           -> nla_parse_deprecated
 * nla_parse_strict    -> nla_parse_deprecated_strict
 * nlmsg_parse         -> nlmsg_parse_deprecated
 * nlmsg_parse_strict  -> nlmsg_parse_deprecated_strict
 * nla_parse_nested    -> nla_parse_nested_deprecated
 * nla_validate_nested -> nla_validate_nested_deprecated

Using spatch, of course:
    @@
    expression TB, MAX, HEAD, LEN, POL, EXT;
    @@
    -nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
    +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)

    @@
    expression NLH, HDRLEN, TB, MAX, POL, EXT;
    @@
    -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
    +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)

    @@
    expression NLH, HDRLEN, TB, MAX, POL, EXT;
    @@
    -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
    +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)

    @@
    expression TB, MAX, NLA, POL, EXT;
    @@
    -nla_parse_nested(TB, MAX, NLA, POL, EXT)
    +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)

    @@
    expression START, MAX, POL, EXT;
    @@
    -nla_validate_nested(START, MAX, POL, EXT)
    +nla_validate_nested_deprecated(START, MAX, POL, EXT)

    @@
    expression NLH, HDRLEN, MAX, POL, EXT;
    @@
    -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
    +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)

For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.

Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.

Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.

In effect then, this adds fully strict validation for any new command.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h |  16 ++--
 include/net/netlink.h   | 238 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 199 insertions(+), 55 deletions(-)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 6850c7b1a3a6..897cdba13569 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -165,7 +165,7 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
 }
 
 /**
- * genlmsg_parse - parse attributes of a genetlink message
+ * genlmsg_parse_deprecated - parse attributes of a genetlink message
  * @nlh: netlink message header
  * @family: genetlink message family
  * @tb: destination array with maxtype+1 elements
@@ -173,14 +173,14 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
  * @policy: validation policy
  * @extack: extended ACK report struct
  */
-static inline int genlmsg_parse(const struct nlmsghdr *nlh,
-				const struct genl_family *family,
-				struct nlattr *tb[], int maxtype,
-				const struct nla_policy *policy,
-				struct netlink_ext_ack *extack)
+static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh,
+					   const struct genl_family *family,
+					   struct nlattr *tb[], int maxtype,
+					   const struct nla_policy *policy,
+					   struct netlink_ext_ack *extack)
 {
-	return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
-			   policy, extack);
+	return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
+			     policy, NL_VALIDATE_LIBERAL, extack);
 }
 
 /**
diff --git a/include/net/netlink.h b/include/net/netlink.h
index c77ed51c18f1..ab26a5e3558b 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -369,21 +369,48 @@ struct nl_info {
 	bool			skip_notify;
 };
 
+/**
+ * enum netlink_validation - netlink message/attribute validation levels
+ * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
+ *	extra data at the end of the message, attributes being longer than
+ *	they should be, or unknown attributes being present.
+ * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
+ * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
+ *	this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
+ * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
+ *	This can safely be set by the kernel when the given policy has no
+ *	NLA_UNSPEC anymore, and can thus be used to ensure policy entries
+ *	are enforced going forward.
+ * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
+ *	U8, U16, U32 must have exact size, etc.)
+ */
+enum netlink_validation {
+	NL_VALIDATE_LIBERAL = 0,
+	NL_VALIDATE_TRAILING = BIT(0),
+	NL_VALIDATE_MAXTYPE = BIT(1),
+	NL_VALIDATE_UNSPEC = BIT(2),
+	NL_VALIDATE_STRICT_ATTRS = BIT(3),
+};
+
+#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
+				       NL_VALIDATE_MAXTYPE)
+#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
+			    NL_VALIDATE_MAXTYPE |\
+			    NL_VALIDATE_UNSPEC |\
+			    NL_VALIDATE_STRICT_ATTRS)
+
 int netlink_rcv_skb(struct sk_buff *skb,
 		    int (*cb)(struct sk_buff *, struct nlmsghdr *,
 			      struct netlink_ext_ack *));
 int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
 		 unsigned int group, int report, gfp_t flags);
 
-int nla_validate(const struct nlattr *head, int len, int maxtype,
-		 const struct nla_policy *policy,
-		 struct netlink_ext_ack *extack);
-int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
-	      int len, const struct nla_policy *policy,
-	      struct netlink_ext_ack *extack);
-int nla_parse_strict(struct nlattr **tb, int maxtype, const struct nlattr *head,
-		     int len, const struct nla_policy *policy,
-		     struct netlink_ext_ack *extack);
+int __nla_validate(const struct nlattr *head, int len, int maxtype,
+		   const struct nla_policy *policy, unsigned int validate,
+		   struct netlink_ext_ack *extack);
+int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
+		int len, const struct nla_policy *policy, unsigned int validate,
+		struct netlink_ext_ack *extack);
 int nla_policy_len(const struct nla_policy *, int);
 struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
 size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize);
@@ -512,42 +539,121 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
 }
 
 /**
- * nlmsg_parse - parse attributes of a netlink message
+ * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @policy: validation policy
+ * @extack: extended ACK pointer
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessible via the attribute type. Attributes with a type
+ * exceeding maxtype will be ignored and attributes from the policy are not
+ * always strictly validated (only for new attributes).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
+				       const struct nlattr *head, int len,
+				       const struct nla_policy *policy,
+				       struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, policy,
+			   NL_VALIDATE_LIBERAL, extack);
+}
+
+/**
+ * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @policy: validation policy
+ * @extack: extended ACK pointer
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessible via the attribute type. Attributes with a type
+ * exceeding maxtype will be rejected as well as trailing data, but the
+ * policy is not completely strictly validated (only for new attributes).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
+					      const struct nlattr *head,
+					      int len,
+					      const struct nla_policy *policy,
+					      struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, policy,
+			   NL_VALIDATE_DEPRECATED_STRICT, extack);
+}
+
+/**
+ * __nlmsg_parse - parse attributes of a netlink message
  * @nlh: netlink message header
  * @hdrlen: length of family specific header
  * @tb: destination array with maxtype+1 elements
  * @maxtype: maximum attribute type to be expected
  * @policy: validation policy
+ * @validate: validation strictness
  * @extack: extended ACK report struct
  *
  * See nla_parse()
  */
-static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
-			      struct nlattr *tb[], int maxtype,
-			      const struct nla_policy *policy,
-			      struct netlink_ext_ack *extack)
+static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
+				struct nlattr *tb[], int maxtype,
+				const struct nla_policy *policy,
+				unsigned int validate,
+				struct netlink_ext_ack *extack)
 {
 	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
 		NL_SET_ERR_MSG(extack, "Invalid header length");
 		return -EINVAL;
 	}
 
-	return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
-			 nlmsg_attrlen(nlh, hdrlen), policy, extack);
+	return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
+			   nlmsg_attrlen(nlh, hdrlen), policy, validate,
+			   extack);
 }
 
-static inline int nlmsg_parse_strict(const struct nlmsghdr *nlh, int hdrlen,
-				     struct nlattr *tb[], int maxtype,
-				     const struct nla_policy *policy,
-				     struct netlink_ext_ack *extack)
+/**
+ * nlmsg_parse_deprecated - parse attributes of a netlink message
+ * @nlh: netlink message header
+ * @hdrlen: length of family specific header
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse_deprecated()
+ */
+static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
+					 struct nlattr *tb[], int maxtype,
+					 const struct nla_policy *policy,
+					 struct netlink_ext_ack *extack)
 {
-	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
-		NL_SET_ERR_MSG(extack, "Invalid header length");
-		return -EINVAL;
-	}
+	return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
+			     NL_VALIDATE_LIBERAL, extack);
+}
 
-	return nla_parse_strict(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
-				nlmsg_attrlen(nlh, hdrlen), policy, extack);
+/**
+ * nlmsg_parse_deprecated_strict - parse attributes of a netlink message
+ * @nlh: netlink message header
+ * @hdrlen: length of family specific header
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse_deprecated_strict()
+ */
+static inline int
+nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
+			      struct nlattr *tb[], int maxtype,
+			      const struct nla_policy *policy,
+			      struct netlink_ext_ack *extack)
+{
+	return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
+			     NL_VALIDATE_DEPRECATED_STRICT, extack);
 }
 
 /**
@@ -566,26 +672,53 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
 }
 
 /**
- * nlmsg_validate - validate a netlink message including attributes
+ * nla_validate_deprecated - Validate a stream of attributes
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ * @validate: validation strictness
+ * @extack: extended ACK report struct
+ *
+ * Validates all attributes in the specified attribute stream against the
+ * specified policy. Validation is done in liberal mode.
+ * See documenation of struct nla_policy for more details.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_validate_deprecated(const struct nlattr *head, int len,
+					  int maxtype,
+					  const struct nla_policy *policy,
+					  struct netlink_ext_ack *extack)
+{
+	return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL,
+			      extack);
+}
+
+
+/**
+ * nlmsg_validate_deprecated - validate a netlink message including attributes
  * @nlh: netlinket message header
  * @hdrlen: length of familiy specific header
  * @maxtype: maximum attribute type to be expected
  * @policy: validation policy
  * @extack: extended ACK report struct
  */
-static inline int nlmsg_validate(const struct nlmsghdr *nlh,
-				 int hdrlen, int maxtype,
-				 const struct nla_policy *policy,
-				 struct netlink_ext_ack *extack)
+static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
+					    int hdrlen, int maxtype,
+					    const struct nla_policy *policy,
+					    struct netlink_ext_ack *extack)
 {
 	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
 		return -EINVAL;
 
-	return nla_validate(nlmsg_attrdata(nlh, hdrlen),
-			    nlmsg_attrlen(nlh, hdrlen), maxtype, policy,
-			    extack);
+	return __nla_validate(nlmsg_attrdata(nlh, hdrlen),
+			      nlmsg_attrlen(nlh, hdrlen), maxtype,
+			      policy, NL_VALIDATE_LIBERAL, extack);
 }
 
+
+
 /**
  * nlmsg_report - need to report back to application?
  * @nlh: netlink message header
@@ -899,22 +1032,22 @@ nla_find_nested(const struct nlattr *nla, int attrtype)
 }
 
 /**
- * nla_parse_nested - parse nested attributes
+ * nla_parse_nested_deprecated - parse nested attributes
  * @tb: destination array with maxtype+1 elements
  * @maxtype: maximum attribute type to be expected
  * @nla: attribute containing the nested attributes
  * @policy: validation policy
  * @extack: extended ACK report struct
  *
- * See nla_parse()
+ * See nla_parse_deprecated()
  */
-static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
-				   const struct nlattr *nla,
-				   const struct nla_policy *policy,
-				   struct netlink_ext_ack *extack)
+static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
+					      const struct nlattr *nla,
+					      const struct nla_policy *policy,
+					      struct netlink_ext_ack *extack)
 {
-	return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
-			 extack);
+	return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
+			   NL_VALIDATE_LIBERAL, extack);
 }
 
 /**
@@ -1489,6 +1622,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
  * @start: container attribute
  * @maxtype: maximum attribute type to be expected
  * @policy: validation policy
+ * @validate: validation strictness
  * @extack: extended ACK report struct
  *
  * Validates all attributes in the nested attribute stream against the
@@ -1497,12 +1631,22 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
  *
  * Returns 0 on success or a negative error code.
  */
-static inline int nla_validate_nested(const struct nlattr *start, int maxtype,
-				      const struct nla_policy *policy,
-				      struct netlink_ext_ack *extack)
+static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
+					const struct nla_policy *policy,
+					unsigned int validate,
+					struct netlink_ext_ack *extack)
+{
+	return __nla_validate(nla_data(start), nla_len(start), maxtype, policy,
+			      validate, extack);
+}
+
+static inline int
+nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
+			       const struct nla_policy *policy,
+			       struct netlink_ext_ack *extack)
 {
-	return nla_validate(nla_data(start), nla_len(start), maxtype, policy,
-			    extack);
+	return __nla_validate_nested(start, maxtype, policy,
+				     NL_VALIDATE_LIBERAL, extack);
 }
 
 /**
-- 
cgit v1.2.3


From 3de644035446567017e952f16da2594d6bd195fc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:29 +0200
Subject: netlink: re-add parse/validate functions in strict mode

This re-adds the parse and validate functions like nla_parse()
that are now actually strict after the previous rename and were
just split out to make sure everything is converted (and if not
compilation of the previous patch would fail.)

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 19 +++++++++++
 include/net/netlink.h   | 87 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 897cdba13569..68de579cfe5e 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -183,6 +183,25 @@ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh,
 			     policy, NL_VALIDATE_LIBERAL, extack);
 }
 
+/**
+ * genlmsg_parse - parse attributes of a genetlink message
+ * @nlh: netlink message header
+ * @family: genetlink message family
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ * @extack: extended ACK report struct
+ */
+static inline int genlmsg_parse(const struct nlmsghdr *nlh,
+				const struct genl_family *family,
+				struct nlattr *tb[], int maxtype,
+				const struct nla_policy *policy,
+				struct netlink_ext_ack *extack)
+{
+	return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
+			     policy, NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * genl_dump_check_consistent - check if sequence is consistent and advertise if not
  * @cb: netlink callback structure that stores the sequence number
diff --git a/include/net/netlink.h b/include/net/netlink.h
index ab26a5e3558b..e4dd874412bf 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -538,6 +538,31 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
 	return (struct nlmsghdr *) ((unsigned char *) nlh + totlen);
 }
 
+/**
+ * nla_parse - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @policy: validation policy
+ * @extack: extended ACK pointer
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessible via the attribute type. Attributes with a type
+ * exceeding maxtype will be rejected, policy must be specified, attributes
+ * will be validated in the strictest way possible.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_parse(struct nlattr **tb, int maxtype,
+			    const struct nlattr *head, int len,
+			    const struct nla_policy *policy,
+			    struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, policy,
+			   NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
  * @tb: destination array with maxtype+1 elements
@@ -617,6 +642,27 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
 			   extack);
 }
 
+/**
+ * nlmsg_parse - parse attributes of a netlink message
+ * @nlh: netlink message header
+ * @hdrlen: length of family specific header
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @validate: validation strictness
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse()
+ */
+static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
+			      struct nlattr *tb[], int maxtype,
+			      const struct nla_policy *policy,
+			      struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
+			   nlmsg_attrlen(nlh, hdrlen), policy,
+			   NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * nlmsg_parse_deprecated - parse attributes of a netlink message
  * @nlh: netlink message header
@@ -695,6 +741,28 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len,
 			      extack);
 }
 
+/**
+ * nla_validate - Validate a stream of attributes
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ * @validate: validation strictness
+ * @extack: extended ACK report struct
+ *
+ * Validates all attributes in the specified attribute stream against the
+ * specified policy. Validation is done in strict mode.
+ * See documenation of struct nla_policy for more details.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
+			       const struct nla_policy *policy,
+			       struct netlink_ext_ack *extack)
+{
+	return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT,
+			      extack);
+}
 
 /**
  * nlmsg_validate_deprecated - validate a netlink message including attributes
@@ -1031,6 +1099,25 @@ nla_find_nested(const struct nlattr *nla, int attrtype)
 	return nla_find(nla_data(nla), nla_len(nla), attrtype);
 }
 
+/**
+ * nla_parse_nested - parse nested attributes
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @nla: attribute containing the nested attributes
+ * @policy: validation policy
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse()
+ */
+static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
+				   const struct nlattr *nla,
+				   const struct nla_policy *policy,
+				   struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
+			   NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * nla_parse_nested_deprecated - parse nested attributes
  * @tb: destination array with maxtype+1 elements
-- 
cgit v1.2.3


From 56738f460841761abc70347c919d5c45f6f05a42 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:30 +0200
Subject: netlink: add strict parsing for future attributes

Unfortunately, we cannot add strict parsing for all attributes, as
that would break existing userspace. We currently warn about it, but
that's about all we can do.

For new attributes, however, the story is better: nobody is using
them, so we can reject bad sizes.

Also, for new attributes, we need not accept them when the policy
doesn't declare their usage.

David Ahern and I went back and forth on how to best encode this, and
the best way we found was to have a "boundary type", from which point
on new attributes have all possible validation applied, and NLA_UNSPEC
is rejected.

As we didn't want to add another argument to all functions that get a
netlink policy, the workaround is to encode that boundary in the first
entry of the policy array (which is for type 0 and thus probably not
really valid anyway). I put it into the validation union for the rare
possibility that somebody is actually using attribute 0, which would
continue to work fine unless they tried to use the extended validation,
which isn't likely. We also didn't find any in-tree users with type 0.

The reason for setting the "start strict here" attribute is that we
never really need to start strict from 0, which is invalid anyway (or
in legacy families where that isn't true, it cannot be set to strict),
so we can thus reserve the value 0 for "don't do this check" and don't
have to add the tag to all policies right now.

Thus, policies can now opt in to this validation, which we should do
for all existing policies, at least when adding new attributes.

Note that entirely *new* policies won't need to set it, as the use
of that should be using nla_parse()/nlmsg_parse() etc. which anyway
do fully strict validation now, regardless of this.

So in effect, this patch only covers the "existing command with new
attribute" case.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index e4dd874412bf..679f649748d4 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -299,6 +299,24 @@ struct nla_policy {
 		};
 		int (*validate)(const struct nlattr *attr,
 				struct netlink_ext_ack *extack);
+		/* This entry is special, and used for the attribute at index 0
+		 * only, and specifies special data about the policy, namely it
+		 * specifies the "boundary type" where strict length validation
+		 * starts for any attribute types >= this value, also, strict
+		 * nesting validation starts here.
+		 *
+		 * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
+		 * for any types >= this, so need to use NLA_MIN_LEN to get the
+		 * previous pure { .len = xyz } behaviour. The advantage of this
+		 * is that types not specified in the policy will be rejected.
+		 *
+		 * For completely new families it should be set to 1 so that the
+		 * validation is enforced for all attributes. For existing ones
+		 * it should be set at least when new attributes are added to
+		 * the enum used by the policy, and be set to the new value that
+		 * was added to enforce strict validation from thereon.
+		 */
+		u16 strict_start_type;
 	};
 };
 
-- 
cgit v1.2.3


From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:31 +0200
Subject: genetlink: optionally validate strictly/dumps

Add options to strictly validate messages and dump messages,
sometimes perhaps validating dump messages non-strictly may
be required, so add an option for that as well.

Since none of this can really be applied to existing commands,
set the options everwhere using the following spatch:

    @@
    identifier ops;
    expression X;
    @@
    struct genl_ops ops[] = {
    ...,
     {
            .cmd = X,
    +       .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
            ...
     },
    ...
    };

For new commands one should just not copy the .validate 'opt-out'
flags and thus get strict validation.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 68de579cfe5e..9292f1c588b7 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -121,6 +121,12 @@ static inline int genl_err_attr(struct genl_info *info, int err,
 	return err;
 }
 
+enum genl_validate_flags {
+	GENL_DONT_VALIDATE_STRICT		= BIT(0),
+	GENL_DONT_VALIDATE_DUMP			= BIT(1),
+	GENL_DONT_VALIDATE_DUMP_STRICT		= BIT(2),
+};
+
 /**
  * struct genl_ops - generic netlink operations
  * @cmd: command identifier
@@ -141,6 +147,7 @@ struct genl_ops {
 	u8			cmd;
 	u8			internal_flags;
 	u8			flags;
+	u8			validate;
 };
 
 int genl_register_family(struct genl_family *family);
-- 
cgit v1.2.3


From 875138f81d71af3cfa80df57e32fe9efbc4f95bc Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:11 +0200
Subject: dsa: Move tagger name into its ops structure

Rather than keep a list to map a tagger ops to a name, place the name
into the ops structure. This removes the hard coded list, a step
towards making the taggers more dynamic.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

v2:
Move name to end of structure, keeping the hot entries at the beginning.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0cfc2f828b87..801346e31e9b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -56,6 +56,7 @@ struct dsa_device_ops {
 	int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			    int *offset);
 	unsigned int overhead;
+	const char *name;
 };
 
 struct dsa_switch_tree {
-- 
cgit v1.2.3


From 0b42f03363706609d621c31324fae5c1250f579f Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:12 +0200
Subject: dsa: Add MODULE_ALIAS to taggers in preparation to become modules

When the tag drivers become modules, we will need to dynamically load
them based on what the switch drivers need. Add aliases to map between
the TAG protocol and the driver.

In order to do this, we need the tag protocol number as something
which the C pre-processor can stringinfy. Only the compiler knows the
value of an enum, CPP cannot use them. So add #defines.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 801346e31e9b..8f3d5e0825a2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -30,20 +30,33 @@ struct phy_device;
 struct fixed_phy_status;
 struct phylink_link_state;
 
+#define DSA_TAG_PROTO_NONE_VALUE		0
+#define DSA_TAG_PROTO_BRCM_VALUE		1
+#define DSA_TAG_PROTO_BRCM_PREPEND_VALUE	2
+#define DSA_TAG_PROTO_DSA_VALUE			3
+#define DSA_TAG_PROTO_EDSA_VALUE		4
+#define DSA_TAG_PROTO_GSWIP_VALUE		5
+#define DSA_TAG_PROTO_KSZ9477_VALUE		6
+#define DSA_TAG_PROTO_KSZ9893_VALUE		7
+#define DSA_TAG_PROTO_LAN9303_VALUE		8
+#define DSA_TAG_PROTO_MTK_VALUE			9
+#define DSA_TAG_PROTO_QCA_VALUE			10
+#define DSA_TAG_PROTO_TRAILER_VALUE		11
+
 enum dsa_tag_protocol {
-	DSA_TAG_PROTO_NONE = 0,
-	DSA_TAG_PROTO_BRCM,
-	DSA_TAG_PROTO_BRCM_PREPEND,
-	DSA_TAG_PROTO_DSA,
-	DSA_TAG_PROTO_EDSA,
-	DSA_TAG_PROTO_GSWIP,
-	DSA_TAG_PROTO_KSZ9477,
-	DSA_TAG_PROTO_KSZ9893,
-	DSA_TAG_PROTO_LAN9303,
-	DSA_TAG_PROTO_MTK,
-	DSA_TAG_PROTO_QCA,
-	DSA_TAG_PROTO_TRAILER,
-	DSA_TAG_LAST,		/* MUST BE LAST */
+	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
+	DSA_TAG_PROTO_BRCM		= DSA_TAG_PROTO_BRCM_VALUE,
+	DSA_TAG_PROTO_BRCM_PREPEND	= DSA_TAG_PROTO_BRCM_PREPEND_VALUE,
+	DSA_TAG_PROTO_DSA		= DSA_TAG_PROTO_DSA_VALUE,
+	DSA_TAG_PROTO_EDSA		= DSA_TAG_PROTO_EDSA_VALUE,
+	DSA_TAG_PROTO_GSWIP		= DSA_TAG_PROTO_GSWIP_VALUE,
+	DSA_TAG_PROTO_KSZ9477		= DSA_TAG_PROTO_KSZ9477_VALUE,
+	DSA_TAG_PROTO_KSZ9893		= DSA_TAG_PROTO_KSZ9893_VALUE,
+	DSA_TAG_PROTO_LAN9303		= DSA_TAG_PROTO_LAN9303_VALUE,
+	DSA_TAG_PROTO_MTK		= DSA_TAG_PROTO_MTK_VALUE,
+	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
+	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
+	DSA_TAG_LAST,			/* MUST BE LAST */
 };
 
 struct packet_type;
@@ -59,6 +72,10 @@ struct dsa_device_ops {
 	const char *name;
 };
 
+#define DSA_TAG_DRIVER_ALIAS "dsa_tag-"
+#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto)				\
+	MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE))
+
 struct dsa_switch_tree {
 	struct list_head	list;
 
-- 
cgit v1.2.3


From 056eed2fb071c11535527fc792bdfb985a9a3e26 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:14 +0200
Subject: dsa: Add TAG protocol to tag ops

In order that we can match the tagging protocol a switch driver
request to the tagger, we need to know what protocol the tagger
supports. Add this information to the ops structure.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

v2
More tag protocol to end of structure to keep hot members at the beginning.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 8f3d5e0825a2..720036f48fb3 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -70,6 +70,7 @@ struct dsa_device_ops {
 			    int *offset);
 	unsigned int overhead;
 	const char *name;
+	enum dsa_tag_protocol proto;
 };
 
 #define DSA_TAG_DRIVER_ALIAS "dsa_tag-"
-- 
cgit v1.2.3


From d3b8c04988ca1685700e345a82a1396df79e6291 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:15 +0200
Subject: dsa: Add boilerplate helper to register DSA tag driver modules

A DSA tag driver module will need to register the tag protocols it
implements with the DSA core. Add macros containing this boiler plate.

The registration/unregistration code is currently just a stub. A Later
patch will add the real implementation.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

v2
Fix indent of #endif
Rewrite to move list pointer into a new structure
v3
Move kdoc next to macro
Fix THIS_MODULE indentation

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 720036f48fb3..08ac05c014e3 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -594,4 +594,70 @@ int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
 int dsa_port_get_phy_sset_count(struct dsa_port *dp);
 void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up);
 
+struct dsa_tag_driver {
+	const struct dsa_device_ops *ops;
+	struct list_head list;
+	struct module *owner;
+};
+
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+			      unsigned int count,
+			      struct module *owner);
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+				unsigned int count);
+
+#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count)	\
+static int __init dsa_tag_driver_module_init(void)			\
+{									\
+	dsa_tag_drivers_register(__dsa_tag_drivers_array, __count,	\
+				 THIS_MODULE);				\
+	return 0;							\
+}									\
+module_init(dsa_tag_driver_module_init);				\
+									\
+static void __exit dsa_tag_driver_module_exit(void)			\
+{									\
+	dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count);	\
+}									\
+module_exit(dsa_tag_driver_module_exit)
+
+/**
+ * module_dsa_tag_drivers() - Helper macro for registering DSA tag
+ * drivers
+ * @__ops_array: Array of tag driver strucutres
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_drivers(__ops_array)				\
+dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array))
+
+#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops
+
+/* Create a static structure we can build a linked list of dsa_tag
+ * drivers
+ */
+#define DSA_TAG_DRIVER(__ops)						\
+static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = {		\
+	.ops = &__ops,							\
+}
+
+/**
+ * module_dsa_tag_driver() - Helper macro for registering a single DSA tag
+ * driver
+ * @__ops: Single tag driver structures
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_driver(__ops)					\
+DSA_TAG_DRIVER(__ops);							\
+									\
+static struct dsa_tag_driver *dsa_tag_driver_array[] =	{		\
+	&DSA_TAG_DRIVER_NAME(__ops)					\
+};									\
+module_dsa_tag_drivers(dsa_tag_driver_array)
 #endif
+
-- 
cgit v1.2.3


From f81a43e8da07ccd91c4d923a44ffffaeee39dcc8 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:21 +0200
Subject: dsa: Cleanup unneeded table and make tag structures static

Now that tag drivers dynamically register, we don't need the static
table. Remove it. This also means the tag driver structures can be
made static.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 08ac05c014e3..b550f7bb5314 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -56,7 +56,6 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_MTK		= DSA_TAG_PROTO_MTK_VALUE,
 	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
 	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
-	DSA_TAG_LAST,			/* MUST BE LAST */
 };
 
 struct packet_type;
-- 
cgit v1.2.3


From f1f86d09ca7e35fb161a47bc54ec9cb2f4fe42d8 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 15 Apr 2019 16:43:14 -0400
Subject: netfilter: nf_tables: relocate header content to consumer

The nf_tables.h header is used in a lot of files, but it turns out
that there is only one actual user of nft_expr_clone().

Hence we relocate that function to be with the one consumer of it
and avoid having to process it with CPP for all the other files.

This will also enable a reduction in the other headers that the
nf_tables.h itself has to include just to be stand-alone, hence
a pending further significant reduction in the CPP content that
needs to get processed for each netfilter file.

Note that the explicit "inline" has been dropped as part of this
relocation.  In similar changes to this, I believe Dave has asked
this be done, so we free up gcc to make the choice of whether to
inline or not.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2d5a0a1a87b8..706f744f7308 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -806,23 +806,6 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
 int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
 		  const struct nft_expr *expr);
 
-static inline int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
-{
-	int err;
-
-	if (src->ops->clone) {
-		dst->ops = src->ops;
-		err = src->ops->clone(dst, src);
-		if (err < 0)
-			return err;
-	} else {
-		memcpy(dst, src, src->ops->size);
-	}
-
-	__module_get(src->ops->type->owner);
-	return 0;
-}
-
 /**
  *	struct nft_rule - nf_tables rule
  *
-- 
cgit v1.2.3


From a4cb98f32c9046fea28bcb4979182f2ff731a27a Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 15 Apr 2019 16:43:16 -0400
Subject: netfilter: nf_tables: drop include of module.h from nf_tables.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

Since nf_tables.h is not going into a module struct looking for
specific fields, we can just let it know that module is a struct,
just like about 60 other include/linux headers already do.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 706f744f7308..5b8624ae4a27 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -2,7 +2,6 @@
 #ifndef _NET_NF_TABLES_H
 #define _NET_NF_TABLES_H
 
-#include <linux/module.h>
 #include <linux/list.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nfnetlink.h>
@@ -13,6 +12,8 @@
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netlink.h>
 
+struct module;
+
 #define NFT_JUMP_STACK_SIZE	16
 
 struct nft_pktinfo {
-- 
cgit v1.2.3


From 8f14c99c7edaaba9c0bb1727d44db6ebf157cc61 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Sun, 7 Apr 2019 08:14:20 -0700
Subject: netfilter: conntrack: limit sysctl setting for boolean options

We use the zero and one to limit the boolean options setting.
After this patch we only set 0 or 1 to boolean options for nf
conntrack sysctl.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index f19b53130bf7..806454e767bf 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -24,9 +24,9 @@ struct nf_generic_net {
 
 struct nf_tcp_net {
 	unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX];
-	unsigned int tcp_loose;
-	unsigned int tcp_be_liberal;
-	unsigned int tcp_max_retrans;
+	int tcp_loose;
+	int tcp_be_liberal;
+	int tcp_max_retrans;
 };
 
 enum udp_conntrack {
-- 
cgit v1.2.3


From e1f172e162c0a11721f1188f12e5b4c3f9f80de6 Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 17 Apr 2019 11:46:14 -0300
Subject: netfilter: use macros to create module aliases.

Each NAT helper creates a module alias which follows a pattern.
Use macros for consistency.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ec52a8dc32fd..28bd4569aa64 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -15,6 +15,10 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 
+#define NF_NAT_HELPER_NAME(name)	"ip_nat_" name
+#define MODULE_ALIAS_NF_NAT_HELPER(name) \
+	MODULE_ALIAS(NF_NAT_HELPER_NAME(name))
+
 struct module;
 
 enum nf_ct_helper_flags {
-- 
cgit v1.2.3


From 08010a21602678932894c5e87014a282af0079cf Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 17 Apr 2019 11:46:15 -0300
Subject: netfilter: add API to manage NAT helpers.

The API allows a conntrack helper to indicate its corresponding
NAT helper which then can be loaded and reference counted.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 28bd4569aa64..44b5a00a9c64 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -15,7 +15,8 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 
-#define NF_NAT_HELPER_NAME(name)	"ip_nat_" name
+#define NF_NAT_HELPER_PREFIX		"ip_nat_"
+#define NF_NAT_HELPER_NAME(name)	NF_NAT_HELPER_PREFIX name
 #define MODULE_ALIAS_NF_NAT_HELPER(name) \
 	MODULE_ALIAS(NF_NAT_HELPER_NAME(name))
 
@@ -58,6 +59,8 @@ struct nf_conntrack_helper {
 	unsigned int queue_num;
 	/* length of userspace private data stored in nf_conn_help->data */
 	u16 data_len;
+	/* name of NAT helper module */
+	char nat_mod_name[NF_CT_HELPER_NAME_LEN];
 };
 
 /* Must be kept in sync with the classes defined by helpers */
@@ -157,4 +160,21 @@ nf_ct_helper_expectfn_find_by_symbol(const void *symbol);
 extern struct hlist_head *nf_ct_helper_hash;
 extern unsigned int nf_ct_helper_hsize;
 
+struct nf_conntrack_nat_helper {
+	struct list_head list;
+	char mod_name[NF_CT_HELPER_NAME_LEN];	/* module name */
+	struct module *module;			/* pointer to self */
+};
+
+#define NF_CT_NAT_HELPER_INIT(name) \
+	{ \
+	.mod_name = NF_NAT_HELPER_NAME(name), \
+	.module = THIS_MODULE \
+	}
+
+void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat);
+void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat);
+int nf_nat_helper_try_module_get(const char *name, u16 l3num,
+				 u8 protonum);
+void nf_nat_helper_put(struct nf_conntrack_helper *helper);
 #endif /*_NF_CONNTRACK_HELPER_H*/
-- 
cgit v1.2.3


From 33162e9a0590f16e1b21be764caae517e2bb310c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:43 +0300
Subject: net: dsa: Store vlan_filtering as a property of dsa_port

This allows drivers to query the VLAN setting imposed by the bridge
driver directly from DSA, instead of keeping their own state based on
the .port_vlan_filtering callback.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index b550f7bb5314..79a87913126c 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -161,6 +161,7 @@ struct dsa_port {
 	const char		*mac;
 	struct device_node	*dn;
 	unsigned int		ageing_time;
+	bool			vlan_filtering;
 	u8			stp_state;
 	struct net_device	*bridge_dev;
 	struct devlink_port	devlink_port;
-- 
cgit v1.2.3


From 8f5d16f638b9a1adf544a7f8cfd11ac1c01c6e25 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:44 +0300
Subject: net: dsa: Be aware of switches where VLAN filtering is a global
 setting

On some switches, the action of whether to parse VLAN frame headers and use
that information for ingress admission is configurable, but not per
port. Such is the case for the Broadcom BCM53xx and the NXP SJA1105
families, for example. In that case, DSA can prevent the bridge core
from trying to apply different VLAN filtering settings on net devices
that belong to the same switch.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Suggested-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 79a87913126c..aab3c2029edd 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -228,6 +228,11 @@ struct dsa_switch {
 	/* Number of switch port queues */
 	unsigned int		num_tx_queues;
 
+	/* Disallow bridge core from requesting different VLAN awareness
+	 * settings on ports if not hardware-supported
+	 */
+	bool			vlan_filtering_is_global;
+
 	unsigned long		*bitmap;
 	unsigned long		_bitmap;
 
-- 
cgit v1.2.3


From 145746765f06a3dbc7869c81d0165b3ab96f935a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:48 +0300
Subject: net: dsa: Keep the vlan_filtering setting in dsa_switch if it's
 global

The current behavior is not as obvious as one would assume (which is
that, if the driver set vlan_filtering_is_global = 1, then checking any
dp->vlan_filtering would yield the same result). Only the ports which
are actively enslaved into a bridge would have vlan_filtering set.

This makes it tricky for drivers to check what the global state is.
So fix this and make the struct dsa_switch hold this global setting.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index aab3c2029edd..4e0f7e9c5aa1 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -233,6 +233,11 @@ struct dsa_switch {
 	 */
 	bool			vlan_filtering_is_global;
 
+	/* In case vlan_filtering_is_global is set, the VLAN awareness state
+	 * should be retrieved from here and not from the per-port settings.
+	 */
+	bool			vlan_filtering;
+
 	unsigned long		*bitmap;
 	unsigned long		_bitmap;
 
-- 
cgit v1.2.3


From cf2d45f5ba9a730df6ec190e0345cecde80b1d8b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:49 +0300
Subject: net: dsa: Add helper function to retrieve VLAN awareness setting

Since different types of hardware may or may not support this setting
per-port, DSA keeps it either in dsa_switch or in dsa_port.

While drivers may know the characteristics of their hardware and
retrieve it from the correct place without the need of helpers, it is
cumbersone to find out an unambigous answer from generic DSA code.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4e0f7e9c5aa1..1e6b4efc80b9 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -305,6 +305,16 @@ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port)
 	return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index);
 }
 
+static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp)
+{
+	const struct dsa_switch *ds = dp->ds;
+
+	if (ds->vlan_filtering_is_global)
+		return ds->vlan_filtering;
+	else
+		return dp->vlan_filtering;
+}
+
 typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
 			      bool is_static, void *data);
 struct dsa_switch_ops {
-- 
cgit v1.2.3


From 93e86b3bc842c159a60e6987444bf3952adcd4db Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 02:56:23 +0200
Subject: net: dsa: Remove legacy probing support

Now that all drivers can be probed using more traditional methods,
remove the legacy probe code.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1e6b4efc80b9..18db7b8e7a8e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -318,15 +318,6 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp)
 typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
 			      bool is_static, void *data);
 struct dsa_switch_ops {
-#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
-	/*
-	 * Legacy probing.
-	 */
-	const char	*(*probe)(struct device *dsa_dev,
-				  struct device *host_dev, int sw_addr,
-				  void **priv);
-#endif
-
 	enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds,
 						  int port);
 
@@ -516,20 +507,6 @@ struct dsa_switch_driver {
 	const struct dsa_switch_ops *ops;
 };
 
-#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
-/* Legacy driver registration */
-void register_switch_driver(struct dsa_switch_driver *type);
-void unregister_switch_driver(struct dsa_switch_driver *type);
-struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev);
-
-#else
-static inline void register_switch_driver(struct dsa_switch_driver *type) { }
-static inline void unregister_switch_driver(struct dsa_switch_driver *type) { }
-static inline struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
-{
-	return NULL;
-}
-#endif
 struct net_device *dsa_dev_to_net_device(struct device *dev);
 
 /* Keep inline for faster access in hot path */
-- 
cgit v1.2.3


From b587bdaf5f820cf7dac2c1b351db97bf98e1f427 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Mon, 29 Apr 2019 12:41:45 +0300
Subject: devlink: Change devlink health locking mechanism

The devlink health reporters create/destroy and user commands currently
use the devlink->lock as a locking mechanism. Different reporters have
different rules in the driver and are being created/destroyed during
different stages of driver load/unload/running. So during execution of a
reporter recover the flow can go through another reporter's destroy and
create. Such flow leads to deadlock trying to lock a mutex already
held.

With the new locking mechanism the different reporters share mutex lock
only to protect access to shared reporters list.
Added refcount per reporter, to protect the reporters from destroy while
being used.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4f5e41613503..1c4adfb4195a 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -32,6 +32,7 @@ struct devlink {
 	struct list_head region_list;
 	u32 snapshot_id;
 	struct list_head reporter_list;
+	struct mutex reporters_lock; /* protects reporter_list */
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
-- 
cgit v1.2.3


From b424e432e770d6dd572765459d5b6a96a19c5286 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Thu, 2 May 2019 16:15:10 +0200
Subject: netlink: add validation of NLA_F_NESTED flag

Add new validation flag NL_VALIDATE_NESTED which adds three consistency
checks of NLA_F_NESTED_FLAG:

  - the flag is set on attributes with NLA_NESTED{,_ARRAY} policy
  - the flag is not set on attributes with other policies except NLA_UNSPEC
  - the flag is set on attribute passed to nla_parse_nested()

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>

v2: change error messages to mention NLA_F_NESTED explicitly
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 679f649748d4..395b4406f4b0 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -401,6 +401,8 @@ struct nl_info {
  *	are enforced going forward.
  * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
  *	U8, U16, U32 must have exact size, etc.)
+ * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
+ *	and unset for other policies.
  */
 enum netlink_validation {
 	NL_VALIDATE_LIBERAL = 0,
@@ -408,6 +410,7 @@ enum netlink_validation {
 	NL_VALIDATE_MAXTYPE = BIT(1),
 	NL_VALIDATE_UNSPEC = BIT(2),
 	NL_VALIDATE_STRICT_ATTRS = BIT(3),
+	NL_VALIDATE_NESTED = BIT(4),
 };
 
 #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
@@ -415,7 +418,8 @@ enum netlink_validation {
 #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
 			    NL_VALIDATE_MAXTYPE |\
 			    NL_VALIDATE_UNSPEC |\
-			    NL_VALIDATE_STRICT_ATTRS)
+			    NL_VALIDATE_STRICT_ATTRS |\
+			    NL_VALIDATE_NESTED)
 
 int netlink_rcv_skb(struct sk_buff *skb,
 		    int (*cb)(struct sk_buff *, struct nlmsghdr *,
@@ -1132,6 +1136,11 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
 				   const struct nla_policy *policy,
 				   struct netlink_ext_ack *extack)
 {
+	if (!(nla->nla_type & NLA_F_NESTED)) {
+		NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
+		return -EINVAL;
+	}
+
 	return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
 			   NL_VALIDATE_STRICT, extack);
 }
-- 
cgit v1.2.3


From 0f457a36626fa94026e483836fbf29e451434567 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 30 Apr 2019 07:45:48 -0700
Subject: ipv4: Move cached routes to fib_nh_common

While the cached routes, nh_pcpu_rth_output and nh_rth_input, are IPv4
specific, a later patch wants to make them accessible for IPv6 nexthops
with IPv4 routes using a fib6_nh. Move the cached routes from fib_nh to
fib_nh_common and update references.

Initialization of the cached entries is moved to fib_nh_common_init,
and free is moved to fib_nh_common_release.

Change in location only, from fib_nh up to fib_nh_common; no functional
change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 772a9e61bd84..659c5081c40b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -96,6 +96,10 @@ struct fib_nh_common {
 
 	int			nhc_weight;
 	atomic_t		nhc_upper_bound;
+
+	/* v4 specific, but allows fib6_nh with v4 routes */
+	struct rtable __rcu * __percpu *nhc_pcpu_rth_output;
+	struct rtable __rcu     *nhc_rth_input;
 };
 
 struct fib_nh {
@@ -107,8 +111,6 @@ struct fib_nh {
 #endif
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
-	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
-	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
 #define fib_nh_family		nh_common.nhc_family
 #define fib_nh_dev		nh_common.nhc_dev
-- 
cgit v1.2.3


From a5995e7107eb3d9c44744d3cf47d49fabfef01f5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 30 Apr 2019 07:45:50 -0700
Subject: ipv4: Move exception bucket to nh_common

Similar to the cached routes, make IPv4 exceptions accessible when
using an IPv6 nexthop struct with IPv4 routes. Simplify the exception
functions by passing in fib_nh_common since that is all it needs,
and then cleanup the call sites that have extraneous fib_nh conversions.

As with the cached routes this is a change in location only, from fib_nh
up to fib_nh_common; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 659c5081c40b..d0e28f4ab099 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -100,6 +100,7 @@ struct fib_nh_common {
 	/* v4 specific, but allows fib6_nh with v4 routes */
 	struct rtable __rcu * __percpu *nhc_pcpu_rth_output;
 	struct rtable __rcu     *nhc_rth_input;
+	struct fnhe_hash_bucket	__rcu *nhc_exceptions;
 };
 
 struct fib_nh {
@@ -111,7 +112,6 @@ struct fib_nh {
 #endif
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
-	struct fnhe_hash_bucket	__rcu *nh_exceptions;
 #define fib_nh_family		nh_common.nhc_family
 #define fib_nh_dev		nh_common.nhc_dev
 #define fib_nh_oif		nh_common.nhc_oif
-- 
cgit v1.2.3


From f80c5dad7b6467b884c445ffea45985793b4b2d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Paulo=20Rechi=20Vita?= <jprvita@gmail.com>
Date: Thu, 2 May 2019 10:01:52 +0800
Subject: Bluetooth: Ignore CC events not matching the last HCI command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit makes the kernel not send the next queued HCI command until
a command complete arrives for the last HCI command sent to the
controller. This change avoids a problem with some buggy controllers
(seen on two SKUs of QCA9377) that send an extra command complete event
for the previous command after the kernel had already sent a new HCI
command to the controller.

The problem was reproduced when starting an active scanning procedure,
where an extra command complete event arrives for the LE_SET_RANDOM_ADDR
command. When this happends the kernel ends up not processing the
command complete for the following commmand, LE_SET_SCAN_PARAM, and
ultimately behaving as if a passive scanning procedure was being
performed, when in fact controller is performing an active scanning
procedure. This makes it impossible to discover BLE devices as no device
found events are sent to userspace.

This problem is reproducible on 100% of the attempts on the affected
controllers. The extra command complete event can be seen at timestamp
27.420131 on the btmon logs bellow.

Bluetooth monitor ver 5.50
= Note: Linux version 5.0.0+ (x86_64)                                  0.352340
= Note: Bluetooth subsystem version 2.22                               0.352343
= New Index: 80:C5:F2:8F:87:84 (Primary,USB,hci0)               [hci0] 0.352344
= Open Index: 80:C5:F2:8F:87:84                                 [hci0] 0.352345
= Index Info: 80:C5:F2:8F:87:84 (Qualcomm)                      [hci0] 0.352346
@ MGMT Open: bluetoothd (privileged) version 1.14             {0x0001} 0.352347
@ MGMT Open: btmon (privileged) version 1.14                  {0x0002} 0.352366
@ MGMT Open: btmgmt (privileged) version 1.14                {0x0003} 27.302164
@ MGMT Command: Start Discovery (0x0023) plen 1       {0x0003} [hci0] 27.302310
        Address type: 0x06
          LE Public
          LE Random
< HCI Command: LE Set Random Address (0x08|0x0005) plen 6   #1 [hci0] 27.302496
        Address: 15:60:F2:91:B2:24 (Non-Resolvable)
> HCI Event: Command Complete (0x0e) plen 4                 #2 [hci0] 27.419117
      LE Set Random Address (0x08|0x0005) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Scan Parameters (0x08|0x000b) plen 7  #3 [hci0] 27.419244
        Type: Active (0x01)
        Interval: 11.250 msec (0x0012)
        Window: 11.250 msec (0x0012)
        Own address type: Random (0x01)
        Filter policy: Accept all advertisement (0x00)
> HCI Event: Command Complete (0x0e) plen 4                 #4 [hci0] 27.420131
      LE Set Random Address (0x08|0x0005) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2      #5 [hci0] 27.420259
        Scanning: Enabled (0x01)
        Filter duplicates: Enabled (0x01)
> HCI Event: Command Complete (0x0e) plen 4                 #6 [hci0] 27.420969
      LE Set Scan Parameters (0x08|0x000b) ncmd 1
        Status: Success (0x00)
> HCI Event: Command Complete (0x0e) plen 4                 #7 [hci0] 27.421983
      LE Set Scan Enable (0x08|0x000c) ncmd 1
        Status: Success (0x00)
@ MGMT Event: Command Complete (0x0001) plen 4        {0x0003} [hci0] 27.422059
      Start Discovery (0x0023) plen 1
        Status: Success (0x00)
        Address type: 0x06
          LE Public
          LE Random
@ MGMT Event: Discovering (0x0013) plen 2             {0x0003} [hci0] 27.422067
        Address type: 0x06
          LE Public
          LE Random
        Discovery: Enabled (0x01)
@ MGMT Event: Discovering (0x0013) plen 2             {0x0002} [hci0] 27.422067
        Address type: 0x06
          LE Public
          LE Random
        Discovery: Enabled (0x01)
@ MGMT Event: Discovering (0x0013) plen 2             {0x0001} [hci0] 27.422067
        Address type: 0x06
          LE Public
          LE Random
        Discovery: Enabled (0x01)

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index fbba43e9bef5..9a5330eed794 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -282,6 +282,7 @@ enum {
 	HCI_FORCE_BREDR_SMP,
 	HCI_FORCE_STATIC_ADDR,
 	HCI_LL_RPA_RESOLUTION,
+	HCI_CMD_PENDING,
 
 	__HCI_NUM_FLAGS,
 };
-- 
cgit v1.2.3


From 47d3d7fdb10a21c223036b58bd70ffdc24a472c4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 3 May 2019 08:24:44 -0700
Subject: ip6: fix skb leak in ip6frag_expire_frag_queue()

Since ip6frag_expire_frag_queue() now pulls the head skb
from frag queue, we should no longer use skb_get(), since
this leads to an skb leak.

Stefan Bader initially reported a problem in 4.4.stable [1] caused
by the skb_get(), so this patch should also fix this issue.

296583.091021] kernel BUG at /build/linux-6VmqmP/linux-4.4.0/net/core/skbuff.c:1207!
[296583.091734] Call Trace:
[296583.091749]  [<ffffffff81740e50>] __pskb_pull_tail+0x50/0x350
[296583.091764]  [<ffffffff8183939a>] _decode_session6+0x26a/0x400
[296583.091779]  [<ffffffff817ec719>] __xfrm_decode_session+0x39/0x50
[296583.091795]  [<ffffffff818239d0>] icmpv6_route_lookup+0xf0/0x1c0
[296583.091809]  [<ffffffff81824421>] icmp6_send+0x5e1/0x940
[296583.091823]  [<ffffffff81753238>] ? __netif_receive_skb+0x18/0x60
[296583.091838]  [<ffffffff817532b2>] ? netif_receive_skb_internal+0x32/0xa0
[296583.091858]  [<ffffffffc0199f74>] ? ixgbe_clean_rx_irq+0x594/0xac0 [ixgbe]
[296583.091876]  [<ffffffffc04eb260>] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6]
[296583.091893]  [<ffffffff8183d431>] icmpv6_send+0x21/0x30
[296583.091906]  [<ffffffff8182b500>] ip6_expire_frag_queue+0xe0/0x120
[296583.091921]  [<ffffffffc04eb27f>] nf_ct_frag6_expire+0x1f/0x30 [nf_defrag_ipv6]
[296583.091938]  [<ffffffff810f3b57>] call_timer_fn+0x37/0x140
[296583.091951]  [<ffffffffc04eb260>] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6]
[296583.091968]  [<ffffffff810f5464>] run_timer_softirq+0x234/0x330
[296583.091982]  [<ffffffff8108a339>] __do_softirq+0x109/0x2b0

Fixes: d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6 defrag")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stefan Bader <stefan.bader@canonical.com>
Cc: Peter Oskolkov <posk@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6_frag.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 28aa9b30aece..1f77fb4dc79d 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -94,7 +94,6 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 		goto out;
 
 	head->dev = dev;
-	skb_get(head);
 	spin_unlock(&fq->q.lock);
 
 	icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
-- 
cgit v1.2.3


From 9b3040a6aafd7898ece7fc7efcbca71e42aa8069 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 5 May 2019 11:16:20 -0700
Subject: ipv4: Define __ipv4_neigh_lookup_noref when CONFIG_INET is disabled

Define __ipv4_neigh_lookup_noref to return NULL when CONFIG_INET is disabled.

Fixes: 4b2a2bfeb3f0 ("neighbor: Call __ipv4_neigh_lookup_noref in neigh_xmit")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/arp.h b/include/net/arp.h
index 977aabfcdc03..c8f580a0e6b1 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -18,6 +18,7 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32
 	return val * hash_rnd[0];
 }
 
+#ifdef CONFIG_INET
 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
 {
 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
@@ -25,6 +26,13 @@ static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev
 
 	return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
 }
+#else
+static inline
+struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
+{
+	return NULL;
+}
+#endif
 
 static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
 {
-- 
cgit v1.2.3


From a7a7be6087b07563490725f61f4dbf4826f099e2 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:16 -0700
Subject: net/sched: add sample action to the hardware intermediate
 representation

Add sample action to the hardware intermediate representation model which
would subsequently allow it to be used by drivers for offload.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index d035183c8d03..9a6c89b2c2bb 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -118,6 +118,7 @@ enum flow_action_id {
 	FLOW_ACTION_MARK,
 	FLOW_ACTION_WAKE,
 	FLOW_ACTION_QUEUE,
+	FLOW_ACTION_SAMPLE,
 };
 
 /* This is mirroring enum pedit_header_type definition for easy mapping between
@@ -157,6 +158,12 @@ struct flow_action_entry {
 			u32		index;
 			u8		vf;
 		} queue;
+		struct {				/* FLOW_ACTION_SAMPLE */
+			struct psample_group	*psample_group;
+			u32			rate;
+			u32			trunc_size;
+			bool			truncate;
+		} sample;
 	};
 };
 
-- 
cgit v1.2.3


From f00cbf1968145afbae385a867a66c69845e30711 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:17 -0700
Subject: net/sched: use the hardware intermediate representation for matchall

Extends matchall offload to make use of the hardware intermediate
representation. More specifically, this patch moves the native TC
actions in cls_matchall offload to the newer flow_action
representation. This ultimately allows us to avoid a direct
dependency on native TC actions for matchall.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index d5e7a1af346f..c852ed502cc6 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -789,6 +789,7 @@ enum tc_matchall_command {
 struct tc_cls_matchall_offload {
 	struct tc_cls_common_offload common;
 	enum tc_matchall_command command;
+	struct flow_rule *rule;
 	struct tcf_exts *exts;
 	unsigned long cookie;
 };
-- 
cgit v1.2.3


From ab79af32b0a5606324ce04c0f04a0d2f90b94464 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:18 -0700
Subject: mlxsw: use intermediate representation for matchall offload

Updates the Mellanox spectrum driver to use the newer intermediate
representation for flow actions in matchall offloads.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 9a6c89b2c2bb..3bf67dd64be5 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -177,6 +177,17 @@ static inline bool flow_action_has_entries(const struct flow_action *action)
 	return action->num_entries;
 }
 
+/**
+ * flow_action_has_one_action() - check if exactly one action is present
+ * @action: tc filter flow offload action
+ *
+ * Returns true if exactly one action is present.
+ */
+static inline bool flow_offload_has_one_action(const struct flow_action *action)
+{
+	return action->num_entries == 1;
+}
+
 #define flow_action_for_each(__i, __act, __actions)			\
         for (__i = 0, __act = &(__actions)->entries[0]; __i < (__actions)->num_entries; __act = &(__actions)->entries[++__i])
 
-- 
cgit v1.2.3


From dfcb19f0fae3d07f9c56f6efe2c9bbebef6826c9 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:20 -0700
Subject: net/sched: remove unused functions for matchall offload

Cleanup unused functions and variables after porting to the newer
intermediate representation.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index c852ed502cc6..2d0470661277 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -371,30 +371,6 @@ static inline bool tcf_exts_has_actions(struct tcf_exts *exts)
 #endif
 }
 
-/**
- * tcf_exts_has_one_action - check if exactly one action is present
- * @exts: tc filter extensions handle
- *
- * Returns true if exactly one action is present.
- */
-static inline bool tcf_exts_has_one_action(struct tcf_exts *exts)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	return exts->nr_actions == 1;
-#else
-	return false;
-#endif
-}
-
-static inline struct tc_action *tcf_exts_first_action(struct tcf_exts *exts)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	return exts->actions[0];
-#else
-	return NULL;
-#endif
-}
-
 /**
  * tcf_exts_exec - execute tc filter extensions
  * @skb: socket buffer
@@ -790,7 +766,6 @@ struct tc_cls_matchall_offload {
 	struct tc_cls_common_offload common;
 	enum tc_matchall_command command;
 	struct flow_rule *rule;
-	struct tcf_exts *exts;
 	unsigned long cookie;
 };
 
-- 
cgit v1.2.3


From fa762da94d9860f584c909621d1f8ccbe24c5d5e Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:21 -0700
Subject: net/sched: move police action structures to header

Move tcf_police_params, tcf_police and tc_police_compat structures to a
header. Making them usable to other code for example drivers that would
offload police actions to hardware.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_police.h | 70 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 include/net/tc_act/tc_police.h

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
new file mode 100644
index 000000000000..8b9ef3664262
--- /dev/null
+++ b/include/net/tc_act/tc_police.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_POLICE_H
+#define __NET_TC_POLICE_H
+
+#include <net/act_api.h>
+
+struct tcf_police_params {
+	int			tcfp_result;
+	u32			tcfp_ewma_rate;
+	s64			tcfp_burst;
+	u32			tcfp_mtu;
+	s64			tcfp_mtu_ptoks;
+	struct psched_ratecfg	rate;
+	bool			rate_present;
+	struct psched_ratecfg	peak;
+	bool			peak_present;
+	struct rcu_head rcu;
+};
+
+struct tcf_police {
+	struct tc_action	common;
+	struct tcf_police_params __rcu *params;
+
+	spinlock_t		tcfp_lock ____cacheline_aligned_in_smp;
+	s64			tcfp_toks;
+	s64			tcfp_ptoks;
+	s64			tcfp_t_c;
+};
+
+#define to_police(pc) ((struct tcf_police *)pc)
+
+/* old policer structure from before tc actions */
+struct tc_police_compat {
+	u32			index;
+	int			action;
+	u32			limit;
+	u32			burst;
+	u32			mtu;
+	struct tc_ratespec	rate;
+	struct tc_ratespec	peakrate;
+};
+
+static inline bool is_tcf_police(const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (act->ops && act->ops->id == TCA_ID_POLICE)
+		return true;
+#endif
+	return false;
+}
+
+static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act)
+{
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_bh(police->params);
+	return params->rate.rate_bytes_ps;
+}
+
+static inline s64 tcf_police_tcfp_burst(const struct tc_action *act)
+{
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_bh(police->params);
+	return params->tcfp_burst;
+}
+
+#endif /* __NET_TC_POLICE_H */
-- 
cgit v1.2.3


From 8c8cfc6ed274e6fb86f00b53f3e7811afce29043 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:22 -0700
Subject: net/sched: add police action to the hardware intermediate
 representation

Add police action to the hardware intermediate representation which
would subsequently allow it to be used by drivers for offload.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 3bf67dd64be5..6200900434e1 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -119,6 +119,7 @@ enum flow_action_id {
 	FLOW_ACTION_WAKE,
 	FLOW_ACTION_QUEUE,
 	FLOW_ACTION_SAMPLE,
+	FLOW_ACTION_POLICE,
 };
 
 /* This is mirroring enum pedit_header_type definition for easy mapping between
@@ -164,6 +165,10 @@ struct flow_action_entry {
 			u32			trunc_size;
 			bool			truncate;
 		} sample;
+		struct {				/* FLOW_ACTION_POLICE */
+			s64			burst;
+			u64			rate_bytes_ps;
+		} police;
 	};
 };
 
-- 
cgit v1.2.3


From b7fe4ab8a6013c3c721bed91f73e76eec8fb5d89 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:23 -0700
Subject: net/sched: extend matchall offload for hardware statistics

Introduce a new command for matchall classifiers that allows hardware
to update statistics.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 2d0470661277..161fcf8516ac 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -760,12 +760,14 @@ tc_cls_flower_offload_flow_rule(struct tc_cls_flower_offload *tc_flow_cmd)
 enum tc_matchall_command {
 	TC_CLSMATCHALL_REPLACE,
 	TC_CLSMATCHALL_DESTROY,
+	TC_CLSMATCHALL_STATS,
 };
 
 struct tc_cls_matchall_offload {
 	struct tc_cls_common_offload common;
 	enum tc_matchall_command command;
 	struct flow_rule *rule;
+	struct flow_stats stats;
 	unsigned long cookie;
 };
 
-- 
cgit v1.2.3


From 88c44a5200849c8182eaf36535b4ceae6b90b19d Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:25 -0700
Subject: net/sched: add block pointer to tc_cls_common_offload structure

Some actions like the police action are stateful and could share state
between devices. This is incompatible with offloading to multiple devices
and drivers might want to test for shared blocks when offloading.
Store a pointer to the tcf_block structure in the tc_cls_common_offload
structure to allow drivers to determine when offloads apply to a shared
block.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 161fcf8516ac..eed98f8fcb5e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -100,6 +100,11 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		 struct tcf_result *res, bool compat_mode);
 
 #else
+static inline bool tcf_block_shared(struct tcf_block *block)
+{
+	return false;
+}
+
 static inline
 int tcf_block_get(struct tcf_block **p_block,
 		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
@@ -624,6 +629,7 @@ struct tc_cls_common_offload {
 	u32 chain_index;
 	__be16 protocol;
 	u32 prio;
+	struct tcf_block *block;
 	struct netlink_ext_ack *extack;
 };
 
@@ -725,11 +731,13 @@ static inline bool tc_in_hw(u32 flags)
 static inline void
 tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 			   const struct tcf_proto *tp, u32 flags,
+			   struct tcf_block *block,
 			   struct netlink_ext_ack *extack)
 {
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
+	cls_common->block = block;
 	if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
 		cls_common->extack = extack;
 }
-- 
cgit v1.2.3


From f9bbe4477c30ece44296437ee26142b42ef8070b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:22 +0300
Subject: net: dsa: Optional VLAN-based port separation for switches without
 tagging

This patch provides generic DSA code for using VLAN (802.1Q) tags for
the same purpose as a dedicated switch tag for injection/extraction.
It is based on the discussions and interest that has been so far
expressed in https://www.spinics.net/lists/netdev/msg556125.html.

Unlike all other DSA-supported tagging protocols, CONFIG_NET_DSA_TAG_8021Q
does not offer a complete solution for drivers (nor can it). Instead, it
provides generic code that driver can opt into calling:
- dsa_8021q_xmit: Inserts a VLAN header with the specified contents.
  Can be called from another tagging protocol's xmit function.
  Currently the LAN9303 driver is inserting headers that are simply
  802.1Q with custom fields, so this is an opportunity for code reuse.
- dsa_8021q_rcv: Retrieves the TPID and TCI from a VLAN-tagged skb.
  Removing the VLAN header is left as a decision for the caller to make.
- dsa_port_setup_8021q_tagging: For each user port, installs an Rx VID
  and a Tx VID, for proper untagged traffic identification on ingress
  and steering on egress. Also sets up the VLAN trunk on the upstream
  (CPU or DSA) port. Drivers are intentionally left to call this
  function explicitly, depending on the context and hardware support.
  The expected switch behavior and VLAN semantics should not be violated
  under any conditions. That is, after calling
  dsa_port_setup_8021q_tagging, the hardware should still pass all
  ingress traffic, be it tagged or untagged.

For uniformity with the other tagging protocols, a module for the
dsa_8021q_netdev_ops structure is registered, but the typical usage is
to set up another tagging protocol which selects CONFIG_NET_DSA_TAG_8021Q,
and calls the API from tag_8021q.h. Null function definitions are also
provided so that a "depends on" is not forced in the Kconfig.

This tagging protocol only works when switch ports are standalone, or
when they are added to a VLAN-unaware bridge. It will probably remain
this way for the reasons below.

When added to a bridge that has vlan_filtering 1, the bridge core will
install its own VLANs and reset the pvids through switchdev. For the
bridge core, switchdev is a write-only pipe. All VLAN-related state is
kept in the bridge core and nothing is read from DSA/switchdev or from
the driver. So the bridge core will break this port separation because
it will install the vlan_default_pvid into all switchdev ports.

Even if we could teach the bridge driver about switchdev preference of a
certain vlan_default_pvid (task difficult in itself since the current
setting is per-bridge but we would need it per-port), there would still
exist many other challenges.

Firstly, in the DSA rcv callback, a driver would have to perform an
iterative reverse lookup to find the correct switch port. That is
because the port is a bridge slave, so its Rx VID (port PVID) is subject
to user configuration. How would we ensure that the user doesn't reset
the pvid to a different value (which would make an O(1) translation
impossible), or to a non-unique value within this DSA switch tree (which
would make any translation impossible)?

Finally, not all switch ports are equal in DSA, and that makes it
difficult for the bridge to be completely aware of this anyway.
The CPU port needs to transmit tagged packets (VLAN trunk) in order for
the DSA rcv code to be able to decode source information.
But the bridge code has absolutely no idea which switch port is the CPU
port, if nothing else then just because there is no netdevice registered
by DSA for the CPU port.
Also DSA does not currently allow the user to specify that they want the
CPU port to do VLAN trunking anyway. VLANs are added to the CPU port
using the same flags as they were added on the user port.

So the VLANs installed by dsa_port_setup_8021q_tagging per driver
request should remain private from the bridge's and user's perspective,
and should not alter the VLAN semantics observed by the user.

In the current implementation a VLAN range ending at 4095 (VLAN_N_VID)
is reserved for this purpose. Each port receives a unique Rx VLAN and a
unique Tx VLAN. Separate VLANs are needed for Rx and Tx because they
serve different purposes: on Rx the switch must process traffic as
untagged and process it with a port-based VLAN, but with care not to
hinder bridging. On the other hand, the Tx VLAN is where the
reachability restrictions are imposed, since by tagging frames in the
xmit callback we are telling the switch onto which port to steer the
frame.

Some general guidance on how this support might be employed for
real-life hardware (some comments made by Florian Fainelli):

- If the hardware supports VLAN tag stacking, it should somehow back
  up its private VLAN settings when the bridge tries to override them.
  Then the driver could re-apply them as outer tags. Dedicating an outer
  tag per bridge device would allow identical inner tag VID numbers to
  co-exist, yet preserve broadcast domain isolation.

- If the switch cannot handle VLAN tag stacking, it should disable this
  port separation when added as slave to a vlan_filtering bridge, in
  that case having reduced functionality.

- Drivers for old switches that don't support the entire VLAN_N_VID
  range will need to rework the current range selection mechanism.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 18db7b8e7a8e..69f3714f42ba 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -42,6 +42,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_MTK_VALUE			9
 #define DSA_TAG_PROTO_QCA_VALUE			10
 #define DSA_TAG_PROTO_TRAILER_VALUE		11
+#define DSA_TAG_PROTO_8021Q_VALUE		12
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -56,6 +57,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_MTK		= DSA_TAG_PROTO_MTK_VALUE,
 	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
 	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
+	DSA_TAG_PROTO_8021Q		= DSA_TAG_PROTO_8021Q_VALUE,
 };
 
 struct packet_type;
-- 
cgit v1.2.3


From cc1939e4b3aaf534fb2f3706820012036825731c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:23 +0300
Subject: net: dsa: Allow drivers to filter packets they can decode source port
 from

Frames get processed by DSA and redirected to switch port net devices
based on the ETH_P_XDSA multiplexed packet_type handler found by the
network stack when calling eth_type_trans().

The running assumption is that once the DSA .rcv function is called, DSA
is always able to decode the switch tag in order to change the skb->dev
from its master.

However there are tagging protocols (such as the new DSA_TAG_PROTO_SJA1105,
user of DSA_TAG_PROTO_8021Q) where this assumption is not completely
true, since switch tagging piggybacks on the absence of a vlan_filtering
bridge. Moreover, management traffic (BPDU, PTP) for this switch doesn't
rely on switch tagging, but on a different mechanism. So it would make
sense to at least be able to terminate that.

Having DSA receive traffic it can't decode would put it in an impossible
situation: the eth_type_trans() function would invoke the DSA .rcv(),
which could not change skb->dev, then eth_type_trans() would be invoked
again, which again would call the DSA .rcv, and the packet would never
be able to exit the DSA filter and would spiral in a loop until the
whole system dies.

This happens because eth_type_trans() doesn't actually look at the skb
(so as to identify a potential tag) when it deems it as being
ETH_P_XDSA. It just checks whether skb->dev has a DSA private pointer
installed (therefore it's a DSA master) and that there exists a .rcv
callback (everybody except DSA_TAG_PROTO_NONE has that). This is
understandable as there are many switch tags out there, and exhaustively
checking for all of them is far from ideal.

The solution lies in introducing a filtering function for each tagging
protocol. In the absence of a filtering function, all traffic is passed
to the .rcv DSA callback. The tagging protocol should see the filtering
function as a pre-validation that it can decode the incoming skb. The
traffic that doesn't match the filter will bypass the DSA .rcv callback
and be left on the master netdevice, which wasn't previously possible.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 69f3714f42ba..c90ceeec7d1f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -69,6 +69,11 @@ struct dsa_device_ops {
 			       struct packet_type *pt);
 	int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			    int *offset);
+	/* Used to determine which traffic should match the DSA filter in
+	 * eth_type_trans, and which, if any, should bypass it and be processed
+	 * as regular on the master net device.
+	 */
+	bool (*filter)(const struct sk_buff *skb, struct net_device *dev);
 	unsigned int overhead;
 	const char *name;
 	enum dsa_tag_protocol proto;
@@ -148,6 +153,7 @@ struct dsa_port {
 	struct dsa_switch_tree *dst;
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev,
 			       struct packet_type *pt);
+	bool (*filter)(const struct sk_buff *skb, struct net_device *dev);
 
 	enum {
 		DSA_PORT_TYPE_UNUSED = 0,
@@ -520,6 +526,15 @@ static inline bool netdev_uses_dsa(struct net_device *dev)
 	return false;
 }
 
+static inline bool dsa_can_decode(const struct sk_buff *skb,
+				  struct net_device *dev)
+{
+#if IS_ENABLED(CONFIG_NET_DSA)
+	return !dev->dsa_ptr->filter || dev->dsa_ptr->filter(skb, dev);
+#endif
+	return false;
+}
+
 struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n);
 void dsa_unregister_switch(struct dsa_switch *ds);
 int dsa_register_switch(struct dsa_switch *ds);
-- 
cgit v1.2.3


From b68b0dd0fb2d91056d5241a19960cf47d4a80f05 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:24 +0300
Subject: net: dsa: Keep private info in the skb->cb

Map a DSA structure over the 48-byte control block that will hold
skb info on transmit and receive. This is only for use within the DSA
processing layer (e.g. communicating between DSA core and tagger) and
not for passing info around with other layers such as the master net
device.

Also add a DSA_SKB_CB_PRIV() macro which retrieves a pointer to the
space up to 48 bytes that the DSA structure does not use. This space can
be used for drivers to add their own private info.

One use is for the PTP timestamping code path. When cloning a skb,
annotate the original with a pointer to the clone, which the driver can
then find easily and place the timestamp to. This avoids the need of a
separate queue to hold clones and a way to match an original to a cloned
skb.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index c90ceeec7d1f..d628587e0bde 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -83,6 +83,37 @@ struct dsa_device_ops {
 #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto)				\
 	MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE))
 
+struct dsa_skb_cb {
+	struct sk_buff *clone;
+};
+
+struct __dsa_skb_cb {
+	struct dsa_skb_cb cb;
+	u8 priv[48 - sizeof(struct dsa_skb_cb)];
+};
+
+#define __DSA_SKB_CB(skb) ((struct __dsa_skb_cb *)((skb)->cb))
+
+#define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb))
+
+#define DSA_SKB_CB_COPY(nskb, skb)		\
+	{ *__DSA_SKB_CB(nskb) = *__DSA_SKB_CB(skb); }
+
+#define DSA_SKB_CB_ZERO(skb)			\
+	{ *__DSA_SKB_CB(skb) = (struct __dsa_skb_cb) {0}; }
+
+#define DSA_SKB_CB_PRIV(skb)			\
+	((void *)(skb)->cb + offsetof(struct __dsa_skb_cb, priv))
+
+#define DSA_SKB_CB_CLONE(_clone, _skb)		\
+	{					\
+		struct sk_buff *clone = _clone;	\
+		struct sk_buff *skb = _skb;	\
+						\
+		DSA_SKB_CB_COPY(clone, skb);	\
+		DSA_SKB_CB(skb)->clone = clone; \
+	}
+
 struct dsa_switch_tree {
 	struct list_head	list;
 
-- 
cgit v1.2.3


From 97a69a0dea9a048c6769249f1552de5f56731524 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:25 +0300
Subject: net: dsa: Add support for deferred xmit

Some hardware needs to take work to get convinced to receive frames on
the CPU port (such as the sja1105 which takes temporary L2 forwarding
rules over SPI that last for a single frame). Such work needs a
sleepable context, and because the regular .ndo_start_xmit is atomic,
this cannot be done in the tagger. So introduce a generic DSA mechanism
that sets up a transmit skb queue and a workqueue for deferred
transmission.

The new driver callback (.port_deferred_xmit) is in dsa_switch and not
in the tagger because the operations that require sleeping typically
also involve interacting with the hardware, and not simply skb
manipulations. Therefore having it there simplifies the structure a bit
and makes it unnecessary to export functions from the driver to the
tagger.

The driver is responsible of calling dsa_enqueue_skb which transfers it
to the master netdevice. This is so that it has a chance of performing
some more work afterwards, such as cleanup or TX timestamping.

To tell DSA that skb xmit deferral is required, I have thought about
changing the return type of the tagger .xmit from struct sk_buff * into
a enum dsa_tx_t that could potentially encode a DSA_XMIT_DEFER value.

But the trailer tagger is reallocating every skb on xmit and therefore
making a valid use of the pointer return value. So instead of reworking
the API in complicated ways, right now a boolean property in the newly
introduced DSA_SKB_CB is set.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d628587e0bde..0260b73938e2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -85,6 +85,7 @@ struct dsa_device_ops {
 
 struct dsa_skb_cb {
 	struct sk_buff *clone;
+	bool deferred_xmit;
 };
 
 struct __dsa_skb_cb {
@@ -205,6 +206,10 @@ struct dsa_port {
 	struct net_device	*bridge_dev;
 	struct devlink_port	devlink_port;
 	struct phylink		*pl;
+
+	struct work_struct	xmit_work;
+	struct sk_buff_head	xmit_queue;
+
 	/*
 	 * Original copy of the master netdev ethtool_ops
 	 */
@@ -539,6 +544,12 @@ struct dsa_switch_ops {
 				 struct sk_buff *clone, unsigned int type);
 	bool	(*port_rxtstamp)(struct dsa_switch *ds, int port,
 				 struct sk_buff *skb, unsigned int type);
+
+	/*
+	 * Deferred frame Tx
+	 */
+	netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port,
+					  struct sk_buff *skb);
 };
 
 struct dsa_switch_driver {
@@ -634,6 +645,7 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev,
 #define BRCM_TAG_GET_QUEUE(v)		((v) & 0xff)
 
 
+netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev);
 int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data);
 int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
 int dsa_port_get_phy_sset_count(struct dsa_port *dp);
-- 
cgit v1.2.3


From c362beb072e14b929eb657dc174d83ccdd9b0eed Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:26 +0300
Subject: net: dsa: Add a private structure pointer to dsa_port

This is supposed to share information between the driver and the tagger,
or used by the tagger to keep some state. Its use is optional.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0260b73938e2..e20be1ceb306 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -210,6 +210,12 @@ struct dsa_port {
 	struct work_struct	xmit_work;
 	struct sk_buff_head	xmit_queue;
 
+	/*
+	 * Give the switch driver somewhere to hang its per-port private data
+	 * structures (accessible from the tagger).
+	 */
+	void *priv;
+
 	/*
 	 * Original copy of the master netdev ethtool_ops
 	 */
-- 
cgit v1.2.3


From 227d07a07ef126272ea2eed97fd136cd7a803d81 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:27 +0300
Subject: net: dsa: sja1105: Add support for traffic through standalone ports

In order to support this, we are creating a make-shift switch tag out of
a VLAN trunk configured on the CPU port. Termination of normal traffic
on switch ports only works when not under a vlan_filtering bridge.
Termination of management (PTP, BPDU) traffic works under all
circumstances because it uses a different tagging mechanism
(incl_srcpt). We are making use of the generic CONFIG_NET_DSA_TAG_8021Q
code and leveraging it from our own CONFIG_NET_DSA_TAG_SJA1105.

There are two types of traffic: regular and link-local.

The link-local traffic received on the CPU port is trapped from the
switch's regular forwarding decisions because it matched one of the two
DMAC filters for management traffic.

On transmission, the switch requires special massaging for these
link-local frames. Due to a weird implementation of the switching IP, by
default it drops link-local frames that originate on the CPU port.
It needs to be told where to forward them to, through an SPI command
("management route") that is valid for only a single frame.
So when we're sending link-local traffic, we are using the
dsa_defer_xmit mechanism.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index e20be1ceb306..6aaaadd6a413 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -43,6 +43,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_QCA_VALUE			10
 #define DSA_TAG_PROTO_TRAILER_VALUE		11
 #define DSA_TAG_PROTO_8021Q_VALUE		12
+#define DSA_TAG_PROTO_SJA1105_VALUE		13
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -58,6 +59,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
 	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
 	DSA_TAG_PROTO_8021Q		= DSA_TAG_PROTO_8021Q_VALUE,
+	DSA_TAG_PROTO_SJA1105		= DSA_TAG_PROTO_SJA1105_VALUE,
 };
 
 struct packet_type;
-- 
cgit v1.2.3


From d6787147e15dffa7b7f3116a5bc3cbe0670bd74f Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Mon, 6 May 2019 17:24:21 -0700
Subject: net/sched: remove block pointer from common offload structure

Based on feedback from Jiri avoid carrying a pointer to the tcf_block
structure in the tc_cls_common_offload structure. Instead store
a flag in driver private data which indicates if offloads apply
to a shared block at block binding time.

Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index eed98f8fcb5e..514e3c80ecc1 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -629,7 +629,6 @@ struct tc_cls_common_offload {
 	u32 chain_index;
 	__be16 protocol;
 	u32 prio;
-	struct tcf_block *block;
 	struct netlink_ext_ack *extack;
 };
 
@@ -731,13 +730,11 @@ static inline bool tc_in_hw(u32 flags)
 static inline void
 tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 			   const struct tcf_proto *tp, u32 flags,
-			   struct tcf_block *block,
 			   struct netlink_ext_ack *extack)
 {
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
-	cls_common->block = block;
 	if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
 		cls_common->extack = extack;
 }
-- 
cgit v1.2.3