From 65101aeca52241a05e66f23c96eb896c9412718d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 16 May 2017 11:20:13 +0200 Subject: net/sock: factor out dequeue/peek with offset code And update __sk_queue_drop_skb() to work on the specified queue. This will help the udp protocol to use an additional private rx queue in a later patch. Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index f33e3d134e0b..42264035dec0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2035,8 +2035,8 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer, void sk_stop_timer(struct sock *sk, struct timer_list *timer); -int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, - unsigned int flags, +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, + struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -- cgit v1.2.3 From 218af599fa635b107cfe10acf3249c4dfe5e4123 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 04:24:36 -0700 Subject: tcp: internal implementation for pacing BBR congestion control depends on pacing, and pacing is currently handled by sch_fq packet scheduler for performance reasons, and also because implemening pacing with FQ was convenient to truly avoid bursts. However there are many cases where this packet scheduler constraint is not practical. - Many linux hosts are not focusing on handling thousands of TCP flows in the most efficient way. - Some routers use fq_codel or other AQM, but still would like to use BBR for the few TCP flows they initiate/terminate. This patch implements an automatic fallback to internal pacing. Pacing is requested either by BBR or use of SO_MAX_PACING_RATE option. If sch_fq happens to be in the egress path, pacing is delegated to the qdisc, otherwise pacing is done by TCP itself. One advantage of pacing from TCP stack is to get more precise rtt estimations, and less work done from TX completion, since TCP Small queue limits are not generally hit. Setups with single TX queue but many cpus might even benefit from this. Note that unlike sch_fq, we do not take into account header sizes. Taking care of these headers would add additional complexity for no practical differences in behavior. Some performance numbers using 800 TCP_STREAM flows rate limited to ~48 Mbit per second on 40Gbit NIC. If MQ+pfifo_fast is used on the NIC : $ sar -n DEV 1 5 | grep eth 14:48:44 eth0 725743.00 2932134.00 46776.76 4335184.68 0.00 0.00 1.00 14:48:45 eth0 725349.00 2932112.00 46751.86 4335158.90 0.00 0.00 0.00 14:48:46 eth0 725101.00 2931153.00 46735.07 4333748.63 0.00 0.00 0.00 14:48:47 eth0 725099.00 2931161.00 46735.11 4333760.44 0.00 0.00 1.00 14:48:48 eth0 725160.00 2931731.00 46738.88 4334606.07 0.00 0.00 0.00 Average: eth0 725290.40 2931658.20 46747.54 4334491.74 0.00 0.00 0.40 $ vmstat 1 5 procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 4 0 0 259825920 45644 2708324 0 0 21 2 247 98 0 0 100 0 0 4 0 0 259823744 45644 2708356 0 0 0 0 2400825 159843 0 19 81 0 0 0 0 0 259824208 45644 2708072 0 0 0 0 2407351 159929 0 19 81 0 0 1 0 0 259824592 45644 2708128 0 0 0 0 2405183 160386 0 19 80 0 0 1 0 0 259824272 45644 2707868 0 0 0 32 2396361 158037 0 19 81 0 0 Now use MQ+FQ : lpaa23:~# echo fq >/proc/sys/net/core/default_qdisc lpaa23:~# tc qdisc replace dev eth0 root mq $ sar -n DEV 1 5 | grep eth 14:49:57 eth0 678614.00 2727930.00 43739.13 4033279.14 0.00 0.00 0.00 14:49:58 eth0 677620.00 2723971.00 43674.69 4027429.62 0.00 0.00 1.00 14:49:59 eth0 676396.00 2719050.00 43596.83 4020125.02 0.00 0.00 0.00 14:50:00 eth0 675197.00 2714173.00 43518.62 4012938.90 0.00 0.00 1.00 14:50:01 eth0 676388.00 2719063.00 43595.47 4020171.64 0.00 0.00 0.00 Average: eth0 676843.00 2720837.40 43624.95 4022788.86 0.00 0.00 0.40 $ vmstat 1 5 procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 2 0 0 259832240 46008 2710912 0 0 21 2 223 192 0 1 99 0 0 1 0 0 259832896 46008 2710744 0 0 0 0 1702206 198078 0 17 82 0 0 0 0 0 259830272 46008 2710596 0 0 0 0 1696340 197756 1 17 83 0 0 4 0 0 259829168 46024 2710584 0 0 16 0 1688472 197158 1 17 82 0 0 3 0 0 259830224 46024 2710408 0 0 0 0 1692450 197212 0 18 82 0 0 As expected, number of interrupts per second is very different. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Van Jacobson Cc: Jerry Chu Signed-off-by: David S. Miller --- include/net/sock.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 42264035dec0..3467d9e89e7d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -253,6 +253,7 @@ struct sock_common { * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) + * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes * @sk_padding: unused element for alignment @@ -396,7 +397,7 @@ struct sock { __s32 sk_peek_off; int sk_write_pending; __u32 sk_dst_pending_confirm; - /* Note: 32bit hole on 64bit arches */ + u32 sk_pacing_status; /* see enum sk_pacing */ long sk_sndtimeo; struct timer_list sk_timer; __u32 sk_priority; @@ -475,6 +476,12 @@ struct sock { struct rcu_head sk_rcu; }; +enum sk_pacing { + SK_PACING_NONE = 0, + SK_PACING_NEEDED = 1, + SK_PACING_FQ = 2, +}; + #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) #define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk))) -- cgit v1.2.3 From 0604475119de5f80dc051a5db055c6a2a75bd542 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Jun 2017 13:29:12 -0700 Subject: tcp: add TCPMemoryPressuresChrono counter DRAM supply shortage and poor memory pressure tracking in TCP stack makes any change in SO_SNDBUF/SO_RCVBUF (or equivalent autotuning limits) and tcp_mem[] quite hazardous. TCPMemoryPressures SNMP counter is an indication of tcp_mem sysctl limits being hit, but only tracking number of transitions. If TCP stack behavior under stress was perfect : 1) It would maintain memory usage close to the limit. 2) Memory pressure state would be entered for short times. We certainly prefer 100 events lasting 10ms compared to one event lasting 200 seconds. This patch adds a new SNMP counter tracking cumulative duration of memory pressure events, given in ms units. $ cat /proc/sys/net/ipv4/tcp_mem 3088 4117 6176 $ grep TCP /proc/net/sockstat TCP: inuse 180 orphan 0 tw 2 alloc 234 mem 4140 $ nstat -n ; sleep 10 ; nstat |grep Pressure TcpExtTCPMemoryPressures 1700 TcpExtTCPMemoryPressuresChrono 5209 v2: Used EXPORT_SYMBOL_GPL() instead of EXPORT_SYMBOL() as David instructed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 3467d9e89e7d..858891c36f94 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1080,6 +1080,7 @@ struct proto { bool (*stream_memory_free)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); + void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* @@ -1088,7 +1089,7 @@ struct proto { * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ - int *memory_pressure; + unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; @@ -1193,25 +1194,6 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) return !!*sk->sk_prot->memory_pressure; } -static inline void sk_leave_memory_pressure(struct sock *sk) -{ - int *memory_pressure = sk->sk_prot->memory_pressure; - - if (!memory_pressure) - return; - - if (*memory_pressure) - *memory_pressure = 0; -} - -static inline void sk_enter_memory_pressure(struct sock *sk) -{ - if (!sk->sk_prot->enter_memory_pressure) - return; - - sk->sk_prot->enter_memory_pressure(sk); -} - static inline long sk_memory_allocated(const struct sock *sk) { -- cgit v1.2.3 From 34cfb542b5b1762c73479a4a61e0a3b77253f876 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 21 Jun 2017 11:45:31 +0200 Subject: sock: avoid dirtying incoming_cpu if not needed for connected socket, the incoming_cpu field in the sock struct is not going to change frequently, but we are setting it unconditionally for each packet. Since sk_incoming_cpu and sk_flags share the same cacheline, and the latter is access by udp_recvmsg(), this cause a cache miss for each packet for UDP connected socket. With this patch, we set the incoming cpu field only when the ingress cpu really changes. This gives a small but measurable performance improvement for connected UDP socket. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 858891c36f94..00d09140e354 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -907,7 +907,10 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) static inline void sk_incoming_cpu_update(struct sock *sk) { - sk->sk_incoming_cpu = raw_smp_processor_id(); + int cpu = raw_smp_processor_id(); + + if (unlikely(sk->sk_incoming_cpu != cpu)) + sk->sk_incoming_cpu = cpu; } static inline void sock_rps_record_flow_hash(__u32 hash) -- cgit v1.2.3 From 14afee4b6092fde451ee17604e5f5c89da33e71e Mon Sep 17 00:00:00 2001 From: "Reshetova, Elena" Date: Fri, 30 Jun 2017 13:08:00 +0300 Subject: net: convert sock.sk_wmem_alloc from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David S. Miller --- include/net/sock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 00d09140e354..5284e50fc81a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -390,7 +390,7 @@ struct sock { /* ===== cache line for TX ===== */ int sk_wmem_queued; - atomic_t sk_wmem_alloc; + refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; struct sk_buff *sk_send_head; struct sk_buff_head sk_write_queue; @@ -1911,7 +1911,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro */ static inline int sk_wmem_alloc_get(const struct sock *sk) { - return atomic_read(&sk->sk_wmem_alloc) - 1; + return refcount_read(&sk->sk_wmem_alloc) - 1; } /** @@ -2055,7 +2055,7 @@ static inline unsigned long sock_wspace(struct sock *sk) int amt = 0; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { - amt = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; } @@ -2136,7 +2136,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); */ static inline bool sock_writeable(const struct sock *sk) { - return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); + return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); } static inline gfp_t gfp_any(void) -- cgit v1.2.3 From 41c6d650f6537e55a1b53438c646fbc3f49176bf Mon Sep 17 00:00:00 2001 From: "Reshetova, Elena" Date: Fri, 30 Jun 2017 13:08:01 +0300 Subject: net: convert sock.sk_refcnt from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. This patch uses refcount_inc_not_zero() instead of atomic_inc_not_zero_hint() due to absense of a _hint() version of refcount API. If the hint() version must be used, we might need to revisit API. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: David S. Miller --- include/net/sock.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 5284e50fc81a..60200f4f4028 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -66,6 +66,7 @@ #include #include +#include #include #include #include @@ -219,7 +220,7 @@ struct sock_common { u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ }; - atomic_t skc_refcnt; + refcount_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; union { @@ -611,7 +612,7 @@ static inline bool __sk_del_node_init(struct sock *sk) static __always_inline void sock_hold(struct sock *sk) { - atomic_inc(&sk->sk_refcnt); + refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt @@ -619,7 +620,7 @@ static __always_inline void sock_hold(struct sock *sk) */ static __always_inline void __sock_put(struct sock *sk) { - atomic_dec(&sk->sk_refcnt); + refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) @@ -628,7 +629,7 @@ static inline bool sk_del_node_init(struct sock *sk) if (rc) { /* paranoid for a while -acme */ - WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; @@ -650,7 +651,7 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) if (rc) { /* paranoid for a while -acme */ - WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; @@ -1144,9 +1145,9 @@ static inline void sk_refcnt_debug_dec(struct sock *sk) static inline void sk_refcnt_debug_release(const struct sock *sk) { - if (atomic_read(&sk->sk_refcnt) != 1) + if (refcount_read(&sk->sk_refcnt) != 1) printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", - sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); + sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt)); } #else /* SOCK_REFCNT_DEBUG */ #define sk_refcnt_debug_inc(sk) do { } while (0) @@ -1636,7 +1637,7 @@ void sock_init_data(struct socket *sock, struct sock *sk); /* Ungrab socket and destroy it, if it was the last reference. */ static inline void sock_put(struct sock *sk) { - if (atomic_dec_and_test(&sk->sk_refcnt)) + if (refcount_dec_and_test(&sk->sk_refcnt)) sk_free(sk); } /* Generic version of sock_put(), dealing with all sockets -- cgit v1.2.3