From 65101aeca52241a05e66f23c96eb896c9412718d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 16 May 2017 11:20:13 +0200
Subject: net/sock: factor out dequeue/peek with offset code

And update __sk_queue_drop_skb() to work on the specified queue.
This will help the udp protocol to use an additional private
rx queue in a later patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index f33e3d134e0b..42264035dec0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2035,8 +2035,8 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
-int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
-			unsigned int flags,
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
+			struct sk_buff *skb, unsigned int flags,
 			void (*destructor)(struct sock *sk,
 					   struct sk_buff *skb));
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-- 
cgit v1.2.3


From 218af599fa635b107cfe10acf3249c4dfe5e4123 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 16 May 2017 04:24:36 -0700
Subject: tcp: internal implementation for pacing

BBR congestion control depends on pacing, and pacing is
currently handled by sch_fq packet scheduler for performance reasons,
and also because implemening pacing with FQ was convenient to truly
avoid bursts.

However there are many cases where this packet scheduler constraint
is not practical.
- Many linux hosts are not focusing on handling thousands of TCP
  flows in the most efficient way.
- Some routers use fq_codel or other AQM, but still would like
  to use BBR for the few TCP flows they initiate/terminate.

This patch implements an automatic fallback to internal pacing.

Pacing is requested either by BBR or use of SO_MAX_PACING_RATE option.

If sch_fq happens to be in the egress path, pacing is delegated to
the qdisc, otherwise pacing is done by TCP itself.

One advantage of pacing from TCP stack is to get more precise rtt
estimations, and less work done from TX completion, since TCP Small
queue limits are not generally hit. Setups with single TX queue but
many cpus might even benefit from this.

Note that unlike sch_fq, we do not take into account header sizes.
Taking care of these headers would add additional complexity for
no practical differences in behavior.

Some performance numbers using 800 TCP_STREAM flows rate limited to
~48 Mbit per second on 40Gbit NIC.

If MQ+pfifo_fast is used on the NIC :

$ sar -n DEV 1 5 | grep eth
14:48:44         eth0 725743.00 2932134.00  46776.76 4335184.68      0.00      0.00      1.00
14:48:45         eth0 725349.00 2932112.00  46751.86 4335158.90      0.00      0.00      0.00
14:48:46         eth0 725101.00 2931153.00  46735.07 4333748.63      0.00      0.00      0.00
14:48:47         eth0 725099.00 2931161.00  46735.11 4333760.44      0.00      0.00      1.00
14:48:48         eth0 725160.00 2931731.00  46738.88 4334606.07      0.00      0.00      0.00
Average:         eth0 725290.40 2931658.20  46747.54 4334491.74      0.00      0.00      0.40
$ vmstat 1 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 4  0      0 259825920  45644 2708324    0    0    21     2  247   98  0  0 100  0  0
 4  0      0 259823744  45644 2708356    0    0     0     0 2400825 159843  0 19 81  0  0
 0  0      0 259824208  45644 2708072    0    0     0     0 2407351 159929  0 19 81  0  0
 1  0      0 259824592  45644 2708128    0    0     0     0 2405183 160386  0 19 80  0  0
 1  0      0 259824272  45644 2707868    0    0     0    32 2396361 158037  0 19 81  0  0

Now use MQ+FQ :

lpaa23:~# echo fq >/proc/sys/net/core/default_qdisc
lpaa23:~# tc qdisc replace dev eth0 root mq

$ sar -n DEV 1 5 | grep eth
14:49:57         eth0 678614.00 2727930.00  43739.13 4033279.14      0.00      0.00      0.00
14:49:58         eth0 677620.00 2723971.00  43674.69 4027429.62      0.00      0.00      1.00
14:49:59         eth0 676396.00 2719050.00  43596.83 4020125.02      0.00      0.00      0.00
14:50:00         eth0 675197.00 2714173.00  43518.62 4012938.90      0.00      0.00      1.00
14:50:01         eth0 676388.00 2719063.00  43595.47 4020171.64      0.00      0.00      0.00
Average:         eth0 676843.00 2720837.40  43624.95 4022788.86      0.00      0.00      0.40
$ vmstat 1 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 2  0      0 259832240  46008 2710912    0    0    21     2  223  192  0  1 99  0  0
 1  0      0 259832896  46008 2710744    0    0     0     0 1702206 198078  0 17 82  0  0
 0  0      0 259830272  46008 2710596    0    0     0     0 1696340 197756  1 17 83  0  0
 4  0      0 259829168  46024 2710584    0    0    16     0 1688472 197158  1 17 82  0  0
 3  0      0 259830224  46024 2710408    0    0     0     0 1692450 197212  0 18 82  0  0

As expected, number of interrupts per second is very different.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Jerry Chu <hkchu@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 42264035dec0..3467d9e89e7d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -253,6 +253,7 @@ struct sock_common {
   *	@sk_ll_usec: usecs to busypoll when there is no data
   *	@sk_allocation: allocation mode
   *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
+  *	@sk_pacing_status: Pacing status (requested, handled by sch_fq)
   *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
   *	@sk_sndbuf: size of send buffer in bytes
   *	@sk_padding: unused element for alignment
@@ -396,7 +397,7 @@ struct sock {
 	__s32			sk_peek_off;
 	int			sk_write_pending;
 	__u32			sk_dst_pending_confirm;
-	/* Note: 32bit hole on 64bit arches */
+	u32			sk_pacing_status; /* see enum sk_pacing */
 	long			sk_sndtimeo;
 	struct timer_list	sk_timer;
 	__u32			sk_priority;
@@ -475,6 +476,12 @@ struct sock {
 	struct rcu_head		sk_rcu;
 };
 
+enum sk_pacing {
+	SK_PACING_NONE		= 0,
+	SK_PACING_NEEDED	= 1,
+	SK_PACING_FQ		= 2,
+};
+
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
 
 #define rcu_dereference_sk_user_data(sk)	rcu_dereference(__sk_user_data((sk)))
-- 
cgit v1.2.3


From 0604475119de5f80dc051a5db055c6a2a75bd542 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Jun 2017 13:29:12 -0700
Subject: tcp: add TCPMemoryPressuresChrono counter

DRAM supply shortage and poor memory pressure tracking in TCP
stack makes any change in SO_SNDBUF/SO_RCVBUF (or equivalent autotuning
limits) and tcp_mem[] quite hazardous.

TCPMemoryPressures SNMP counter is an indication of tcp_mem sysctl
limits being hit, but only tracking number of transitions.

If TCP stack behavior under stress was perfect :
1) It would maintain memory usage close to the limit.
2) Memory pressure state would be entered for short times.

We certainly prefer 100 events lasting 10ms compared to one event
lasting 200 seconds.

This patch adds a new SNMP counter tracking cumulative duration of
memory pressure events, given in ms units.

$ cat /proc/sys/net/ipv4/tcp_mem
3088    4117    6176
$ grep TCP /proc/net/sockstat
TCP: inuse 180 orphan 0 tw 2 alloc 234 mem 4140
$ nstat -n ; sleep 10 ; nstat |grep Pressure
TcpExtTCPMemoryPressures        1700
TcpExtTCPMemoryPressuresChrono  5209

v2: Used EXPORT_SYMBOL_GPL() instead of EXPORT_SYMBOL() as David
instructed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 3467d9e89e7d..858891c36f94 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1080,6 +1080,7 @@ struct proto {
 	bool			(*stream_memory_free)(const struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
+	void			(*leave_memory_pressure)(struct sock *sk);
 	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
@@ -1088,7 +1089,7 @@ struct proto {
 	 * All the __sk_mem_schedule() is of this nature: accounting
 	 * is strict, actions are advisory and have some latency.
 	 */
-	int			*memory_pressure;
+	unsigned long		*memory_pressure;
 	long			*sysctl_mem;
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
@@ -1193,25 +1194,6 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 	return !!*sk->sk_prot->memory_pressure;
 }
 
-static inline void sk_leave_memory_pressure(struct sock *sk)
-{
-	int *memory_pressure = sk->sk_prot->memory_pressure;
-
-	if (!memory_pressure)
-		return;
-
-	if (*memory_pressure)
-		*memory_pressure = 0;
-}
-
-static inline void sk_enter_memory_pressure(struct sock *sk)
-{
-	if (!sk->sk_prot->enter_memory_pressure)
-		return;
-
-	sk->sk_prot->enter_memory_pressure(sk);
-}
-
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
-- 
cgit v1.2.3


From 34cfb542b5b1762c73479a4a61e0a3b77253f876 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 21 Jun 2017 11:45:31 +0200
Subject: sock: avoid dirtying incoming_cpu if not needed

for connected socket, the incoming_cpu field in the sock struct
is not going to change frequently, but we are setting it
unconditionally for each packet.

Since sk_incoming_cpu and sk_flags share the same cacheline,
and the latter is access by udp_recvmsg(), this cause a cache
miss for each packet for UDP connected socket.

With this patch, we set the incoming cpu field only when the
ingress cpu really changes.

This gives a small but measurable performance improvement for
connected UDP socket.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 858891c36f94..00d09140e354 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -907,7 +907,10 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 
 static inline void sk_incoming_cpu_update(struct sock *sk)
 {
-	sk->sk_incoming_cpu = raw_smp_processor_id();
+	int cpu = raw_smp_processor_id();
+
+	if (unlikely(sk->sk_incoming_cpu != cpu))
+		sk->sk_incoming_cpu = cpu;
 }
 
 static inline void sock_rps_record_flow_hash(__u32 hash)
-- 
cgit v1.2.3


From 14afee4b6092fde451ee17604e5f5c89da33e71e Mon Sep 17 00:00:00 2001
From: "Reshetova, Elena" <elena.reshetova@intel.com>
Date: Fri, 30 Jun 2017 13:08:00 +0300
Subject: net: convert sock.sk_wmem_alloc from atomic_t to refcount_t

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 00d09140e354..5284e50fc81a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -390,7 +390,7 @@ struct sock {
 
 	/* ===== cache line for TX ===== */
 	int			sk_wmem_queued;
-	atomic_t		sk_wmem_alloc;
+	refcount_t		sk_wmem_alloc;
 	unsigned long		sk_tsq_flags;
 	struct sk_buff		*sk_send_head;
 	struct sk_buff_head	sk_write_queue;
@@ -1911,7 +1911,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
  */
 static inline int sk_wmem_alloc_get(const struct sock *sk)
 {
-	return atomic_read(&sk->sk_wmem_alloc) - 1;
+	return refcount_read(&sk->sk_wmem_alloc) - 1;
 }
 
 /**
@@ -2055,7 +2055,7 @@ static inline unsigned long sock_wspace(struct sock *sk)
 	int amt = 0;
 
 	if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-		amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
+		amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);
 		if (amt < 0)
 			amt = 0;
 	}
@@ -2136,7 +2136,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
  */
 static inline bool sock_writeable(const struct sock *sk)
 {
-	return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
+	return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
 }
 
 static inline gfp_t gfp_any(void)
-- 
cgit v1.2.3


From 41c6d650f6537e55a1b53438c646fbc3f49176bf Mon Sep 17 00:00:00 2001
From: "Reshetova, Elena" <elena.reshetova@intel.com>
Date: Fri, 30 Jun 2017 13:08:01 +0300
Subject: net: convert sock.sk_refcnt from atomic_t to refcount_t

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

This patch uses refcount_inc_not_zero() instead of
atomic_inc_not_zero_hint() due to absense of a _hint()
version of refcount API. If the hint() version must
be used, we might need to revisit API.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/net/sock.h')

diff --git a/include/net/sock.h b/include/net/sock.h
index 5284e50fc81a..60200f4f4028 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -66,6 +66,7 @@
 #include <linux/poll.h>
 
 #include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <net/dst.h>
 #include <net/checksum.h>
 #include <net/tcp_states.h>
@@ -219,7 +220,7 @@ struct sock_common {
 		u32		skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
 	};
 
-	atomic_t		skc_refcnt;
+	refcount_t		skc_refcnt;
 	/* private: */
 	int                     skc_dontcopy_end[0];
 	union {
@@ -611,7 +612,7 @@ static inline bool __sk_del_node_init(struct sock *sk)
 
 static __always_inline void sock_hold(struct sock *sk)
 {
-	atomic_inc(&sk->sk_refcnt);
+	refcount_inc(&sk->sk_refcnt);
 }
 
 /* Ungrab socket in the context, which assumes that socket refcnt
@@ -619,7 +620,7 @@ static __always_inline void sock_hold(struct sock *sk)
  */
 static __always_inline void __sock_put(struct sock *sk)
 {
-	atomic_dec(&sk->sk_refcnt);
+	refcount_dec(&sk->sk_refcnt);
 }
 
 static inline bool sk_del_node_init(struct sock *sk)
@@ -628,7 +629,7 @@ static inline bool sk_del_node_init(struct sock *sk)
 
 	if (rc) {
 		/* paranoid for a while -acme */
-		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
+		WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
 		__sock_put(sk);
 	}
 	return rc;
@@ -650,7 +651,7 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
 
 	if (rc) {
 		/* paranoid for a while -acme */
-		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
+		WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
 		__sock_put(sk);
 	}
 	return rc;
@@ -1144,9 +1145,9 @@ static inline void sk_refcnt_debug_dec(struct sock *sk)
 
 static inline void sk_refcnt_debug_release(const struct sock *sk)
 {
-	if (atomic_read(&sk->sk_refcnt) != 1)
+	if (refcount_read(&sk->sk_refcnt) != 1)
 		printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
-		       sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt));
+		       sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt));
 }
 #else /* SOCK_REFCNT_DEBUG */
 #define sk_refcnt_debug_inc(sk) do { } while (0)
@@ -1636,7 +1637,7 @@ void sock_init_data(struct socket *sock, struct sock *sk);
 /* Ungrab socket and destroy it, if it was the last reference. */
 static inline void sock_put(struct sock *sk)
 {
-	if (atomic_dec_and_test(&sk->sk_refcnt))
+	if (refcount_dec_and_test(&sk->sk_refcnt))
 		sk_free(sk);
 }
 /* Generic version of sock_put(), dealing with all sockets
-- 
cgit v1.2.3