diff options
author | Eric Dumazet <eric.dumazet@gmail.com> | 2012-07-11 05:50:31 +0000 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-07-11 18:12:59 -0700 |
commit | 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 (patch) | |
tree | 771200292431be56c6ebcb23af9206bc03d40e65 /include | |
parent | 2100844ca9d7055d5cddce2f8ed13af94c01f85b (diff) | |
download | linux-stable-46d3ceabd8d98ed0ad10f20c595ca784e34786c5.tar.gz linux-stable-46d3ceabd8d98ed0ad10f20c595ca784e34786c5.tar.bz2 linux-stable-46d3ceabd8d98ed0ad10f20c595ca784e34786c5.zip |
tcp: TCP Small Queues
This introduce TSQ (TCP Small Queues)
TSQ goal is to reduce number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.
sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a
given time.
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2 : It can help to reduce
latencies of high prio packets, having smaller TSO packets.
This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.
Without reduction of nominal bandwidth, we have reduction of buffering
per bulk sender :
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)
I no longer have 4 MBytes backlogged in qdisc by a single netperf
session, and both side socket autotuning no longer use 4 Mbytes.
As skb destructor cannot restart xmit itself ( as qdisc lock might be
taken at this point ), we delegate the work to a tasklet. We use one
tasklest per cpu for performance reasons.
If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(),
to eventually send new segments.
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/tcp.h | 9 | ||||
-rw-r--r-- | include/net/sock.h | 2 | ||||
-rw-r--r-- | include/net/tcp.h | 4 |
3 files changed, 15 insertions, 0 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2de9cf46f9fc..1888169e07c7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -339,6 +339,9 @@ struct tcp_sock { u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ + unsigned long tsq_flags; + /* Data for direct copy to user */ struct { struct sk_buff_head prequeue; @@ -494,6 +497,12 @@ struct tcp_sock { struct tcp_cookie_values *cookie_values; }; +enum tsq_flags { + TSQ_THROTTLED, + TSQ_QUEUED, + TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */ +}; + static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; diff --git a/include/net/sock.h b/include/net/sock.h index dcb54a0793ec..88de092df50f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -858,6 +858,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + void (*release_cb)(struct sock *sk); + /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 3618fefae049..439984b9af49 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; +extern int sysctl_tcp_limit_output_bytes; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -321,6 +322,8 @@ extern struct proto tcp_prot; extern void tcp_init_mem(struct net *net); +extern void tcp_tasklet_init(void); + extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); @@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); extern int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +extern void tcp_release_cb(struct sock *sk); extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); |