summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 12:31:59 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 12:31:59 -0700
commit5518b69b76680a4f2df96b1deca260059db0c2de (patch)
treef33cd1519c8efb4590500f2f9617400be233238c /net/ipv4/tcp.c
parent8ad06e56dcbc1984ef0ff8f6e3c19982c5809f73 (diff)
parent0e72582270c07850b92cac351c8b97d4f9c123b9 (diff)
downloadlinux-stable-5518b69b76680a4f2df96b1deca260059db0c2de.tar.gz
linux-stable-5518b69b76680a4f2df96b1deca260059db0c2de.tar.bz2
linux-stable-5518b69b76680a4f2df96b1deca260059db0c2de.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Reasonably busy this cycle, but perhaps not as busy as in the 4.12 merge window: 1) Several optimizations for UDP processing under high load from Paolo Abeni. 2) Support pacing internally in TCP when using the sch_fq packet scheduler for this is not practical. From Eric Dumazet. 3) Support mutliple filter chains per qdisc, from Jiri Pirko. 4) Move to 1ms TCP timestamp clock, from Eric Dumazet. 5) Add batch dequeueing to vhost_net, from Jason Wang. 6) Flesh out more completely SCTP checksum offload support, from Davide Caratti. 7) More plumbing of extended netlink ACKs, from David Ahern, Pablo Neira Ayuso, and Matthias Schiffer. 8) Add devlink support to nfp driver, from Simon Horman. 9) Add RTM_F_FIB_MATCH flag to RTM_GETROUTE queries, from Roopa Prabhu. 10) Add stack depth tracking to BPF verifier and use this information in the various eBPF JITs. From Alexei Starovoitov. 11) Support XDP on qed device VFs, from Yuval Mintz. 12) Introduce BPF PROG ID for better introspection of installed BPF programs. From Martin KaFai Lau. 13) Add bpf_set_hash helper for TC bpf programs, from Daniel Borkmann. 14) For loads, allow narrower accesses in bpf verifier checking, from Yonghong Song. 15) Support MIPS in the BPF selftests and samples infrastructure, the MIPS eBPF JIT will be merged in via the MIPS GIT tree. From David Daney. 16) Support kernel based TLS, from Dave Watson and others. 17) Remove completely DST garbage collection, from Wei Wang. 18) Allow installing TCP MD5 rules using prefixes, from Ivan Delalande. 19) Add XDP support to Intel i40e driver, from Björn Töpel 20) Add support for TC flower offload in nfp driver, from Simon Horman, Pieter Jansen van Vuuren, Benjamin LaHaise, Jakub Kicinski, and Bert van Leeuwen. 21) IPSEC offloading support in mlx5, from Ilan Tayari. 22) Add HW PTP support to macb driver, from Rafal Ozieblo. 23) Networking refcount_t conversions, From Elena Reshetova. 24) Add sock_ops support to BPF, from Lawrence Brako. This is useful for tuning the TCP sockopt settings of a group of applications, currently via CGROUPs" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1899 commits) net: phy: dp83867: add workaround for incorrect RX_CTRL pin strap dt-bindings: phy: dp83867: provide a workaround for incorrect RX_CTRL pin strap cxgb4: Support for get_ts_info ethtool method cxgb4: Add PTP Hardware Clock (PHC) support cxgb4: time stamping interface for PTP nfp: default to chained metadata prepend format nfp: remove legacy MAC address lookup nfp: improve order of interfaces in breakout mode net: macb: remove extraneous return when MACB_EXT_DESC is defined bpf: add missing break in for the TCP_BPF_SNDCWND_CLAMP case bpf: fix return in load_bpf_file mpls: fix rtm policy in mpls_getroute net, ax25: convert ax25_cb.refcount from atomic_t to refcount_t net, ax25: convert ax25_route.refcount from atomic_t to refcount_t net, ax25: convert ax25_uid_assoc.refcount from atomic_t to refcount_t net, sctp: convert sctp_ep_common.refcnt from atomic_t to refcount_t net, sctp: convert sctp_transport.refcnt from atomic_t to refcount_t net, sctp: convert sctp_chunk.refcnt from atomic_t to refcount_t net, sctp: convert sctp_datamsg.refcnt from atomic_t to refcount_t net, sctp: convert sctp_auth_bytes.refcnt from atomic_t to refcount_t ...
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c93
1 files changed, 74 insertions, 19 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 40aca7803cf2..71ce33decd97 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -320,17 +320,36 @@ struct tcp_splice_state {
* All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
-int tcp_memory_pressure __read_mostly;
-EXPORT_SYMBOL(tcp_memory_pressure);
+unsigned long tcp_memory_pressure __read_mostly;
+EXPORT_SYMBOL_GPL(tcp_memory_pressure);
void tcp_enter_memory_pressure(struct sock *sk)
{
- if (!tcp_memory_pressure) {
+ unsigned long val;
+
+ if (tcp_memory_pressure)
+ return;
+ val = jiffies;
+
+ if (!val)
+ val--;
+ if (!cmpxchg(&tcp_memory_pressure, 0, val))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
- tcp_memory_pressure = 1;
- }
}
-EXPORT_SYMBOL(tcp_enter_memory_pressure);
+EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
+
+void tcp_leave_memory_pressure(struct sock *sk)
+{
+ unsigned long val;
+
+ if (!tcp_memory_pressure)
+ return;
+ val = xchg(&tcp_memory_pressure, 0);
+ if (val)
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
+ jiffies_to_msecs(jiffies - val));
+}
+EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
@@ -386,7 +405,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
+ minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -645,7 +664,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
return skb->len < size_goal &&
sysctl_tcp_autocorking &&
skb != tcp_write_queue_head(sk) &&
- atomic_read(&sk->sk_wmem_alloc) > skb->truesize;
+ refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
static void tcp_push(struct sock *sk, int flags, int mss_now,
@@ -673,7 +692,7 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
*/
- if (atomic_read(&sk->sk_wmem_alloc) > skb->truesize)
+ if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
return;
}
@@ -882,8 +901,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -1013,6 +1032,7 @@ out_err:
}
return sk_stream_error(sk, flags, err);
}
+EXPORT_SYMBOL_GPL(do_tcp_sendpages);
int tcp_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
@@ -2186,7 +2206,7 @@ adjudge_to_death:
/* Now socket is owned by kernel and we acquire BH lock
- to finish close. No need to check for user refs.
+ * to finish close. No need to check for user refs.
*/
local_bh_disable();
bh_lock_sock(sk);
@@ -2461,7 +2481,25 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(sk, name);
+ err = tcp_set_congestion_control(sk, name, true);
+ release_sock(sk);
+ return err;
+ }
+ case TCP_ULP: {
+ char name[TCP_ULP_NAME_MAX];
+
+ if (optlen < 1)
+ return -EINVAL;
+
+ val = strncpy_from_user(name, optval,
+ min_t(long, TCP_ULP_NAME_MAX - 1,
+ optlen));
+ if (val < 0)
+ return -EFAULT;
+ name[val] = 0;
+
+ lock_sock(sk);
+ err = tcp_set_ulp(sk, name);
release_sock(sk);
return err;
}
@@ -2482,7 +2520,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_MAXSEG:
/* Values greater than interface MTU won't take effect. However
* at the point when this call is done we typically don't yet
- * know which interface is going to be used */
+ * know which interface is going to be used
+ */
if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
err = -EINVAL;
break;
@@ -2677,8 +2716,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
+ case TCP_MD5SIG_EXT:
/* Read the IP->Key mappings from userspace */
- err = tp->af_specific->md5_parse(sk, optval, optlen);
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
case TCP_USER_TIMEOUT:
@@ -2717,7 +2757,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (!tp->repair)
err = -EPERM;
else
- tp->tsoffset = val - tcp_time_stamp;
+ tp->tsoffset = val - tcp_time_stamp_raw();
break;
case TCP_REPAIR_WINDOW:
err = tcp_repair_set_window(tp, optval, optlen);
@@ -2768,7 +2808,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
stats[i] = tp->chrono_stat[i - 1];
if (i == tp->chrono_type)
- stats[i] += tcp_time_stamp - tp->chrono_start;
+ stats[i] += tcp_jiffies32 - tp->chrono_start;
stats[i] *= USEC_PER_SEC / HZ;
total += stats[i];
}
@@ -2852,7 +2892,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_retrans = tp->retrans_out;
info->tcpi_fackets = tp->fackets_out;
- now = tcp_time_stamp;
+ now = tcp_jiffies32;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
@@ -3020,6 +3060,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
+ case TCP_ULP:
+ if (get_user(len, optlen))
+ return -EFAULT;
+ len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
+ if (!icsk->icsk_ulp_ops) {
+ if (put_user(0, optlen))
+ return -EFAULT;
+ return 0;
+ }
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
+ return -EFAULT;
+ return 0;
+
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
@@ -3083,7 +3138,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_TIMESTAMP:
- val = tcp_time_stamp + tp->tsoffset;
+ val = tcp_time_stamp_raw() + tp->tsoffset;
break;
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;