author     Maxime Ripard <mripard@kernel.org>    2023-07-11 09:23:20 +0200
committer  Maxime Ripard <mripard@kernel.org>    2023-07-11 09:23:20 +0200
commit     2f98e686ef59b5d19af5847d755798e2031bee3a (patch)
tree       7b47919242853d088decf898ca79d6cda0d49381 /net/ipv4/tcp_output.c
parent     a2848d08742c8e8494675892c02c0d22acbe3cf8 (diff)
parent     06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5 (diff)
Merge v6.5-rc1 into drm-misc-fixes
Boris needs 6.5-rc1 in drm-misc-fixes to prevent a conflict.
Signed-off-by: Maxime Ripard <mripard@kernel.org>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  195
1 file changed, 142 insertions(+), 53 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cfe128b81a01..2cb39b6dad02 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -260,8 +260,8 @@ static u16 tcp_select_window(struct sock *sk)
         u32 old_win = tp->rcv_wnd;
         u32 cur_win = tcp_receive_window(tp);
         u32 new_win = __tcp_select_window(sk);
+        struct net *net = sock_net(sk);
 
-        /* Never shrink the offered window */
         if (new_win < cur_win) {
                 /* Danger Will Robinson!
                  * Don't update rcv_wup/rcv_wnd here or else
@@ -270,11 +270,14 @@ static u16 tcp_select_window(struct sock *sk)
                  *
                  * Relax Will Robinson.
                  */
-                if (new_win == 0)
-                        NET_INC_STATS(sock_net(sk),
-                                      LINUX_MIB_TCPWANTZEROWINDOWADV);
-                new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+                if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
+                        /* Never shrink the offered window */
+                        if (new_win == 0)
+                                NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
+                        new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+                }
         }
+
         tp->rcv_wnd = new_win;
         tp->rcv_wup = tp->rcv_nxt;
 
@@ -282,7 +285,7 @@ static u16 tcp_select_window(struct sock *sk)
          * scaled window.
          */
         if (!tp->rx_opt.rcv_wscale &&
-            READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
+            READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
                 new_win = min(new_win, MAX_TCP_WINDOW);
         else
                 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -294,10 +297,9 @@ static u16 tcp_select_window(struct sock *sk)
         if (new_win == 0) {
                 tp->pred_flags = 0;
                 if (old_win)
-                        NET_INC_STATS(sock_net(sk),
-                                      LINUX_MIB_TCPTOZEROWINDOWADV);
+                        NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
         } else if (old_win == 0) {
-                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+                NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
         }
 
         return new_win;
@@ -1530,7 +1532,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *buff;
-        int nsize, old_factor;
+        int old_factor;
         long limit;
         int nlen;
         u8 flags;
@@ -1538,9 +1540,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
         if (WARN_ON(len > skb->len))
                 return -EINVAL;
 
-        nsize = skb_headlen(skb) - len;
-        if (nsize < 0)
-                nsize = 0;
+        DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
 
         /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
          * We need some allowance to not penalize applications setting small
@@ -1560,7 +1560,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
                 return -ENOMEM;
 
         /* Get a new skb... force flag on. */
-        buff = tcp_stream_alloc_skb(sk, nsize, gfp, true);
+        buff = tcp_stream_alloc_skb(sk, gfp, true);
         if (!buff)
                 return -ENOMEM; /* We'll just try again later. */
         skb_copy_decrypted(buff, skb);
@@ -1568,7 +1568,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 
         sk_wmem_queued_add(sk, buff->truesize);
         sk_mem_charge(sk, buff->truesize);
-        nlen = skb->len - len - nsize;
+        nlen = skb->len - len;
         buff->truesize += nlen;
         skb->truesize -= nlen;
 
@@ -1626,13 +1626,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
         struct skb_shared_info *shinfo;
         int i, k, eat;
 
-        eat = min_t(int, len, skb_headlen(skb));
-        if (eat) {
-                __skb_pull(skb, eat);
-                len -= eat;
-                if (!len)
-                        return 0;
-        }
+        DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
         eat = len;
         k = 0;
         shinfo = skb_shinfo(skb);
@@ -1671,12 +1665,10 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 
         TCP_SKB_CB(skb)->seq += len;
 
-        if (delta_truesize) {
-                skb->truesize -= delta_truesize;
-                sk_wmem_queued_add(sk, -delta_truesize);
-                if (!skb_zcopy_pure(skb))
-                        sk_mem_uncharge(sk, delta_truesize);
-        }
+        skb->truesize -= delta_truesize;
+        sk_wmem_queued_add(sk, -delta_truesize);
+        if (!skb_zcopy_pure(skb))
+                sk_mem_uncharge(sk, delta_truesize);
 
         /* Any change of skb->len requires recalculation of tso factor. */
         if (tcp_skb_pcount(skb) > 1)
@@ -2126,11 +2118,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
         u8 flags;
 
         /* All of a TSO frame must be composed of paged data. */
-        if (skb->len != skb->data_len)
-                return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
-                                    skb, len, mss_now, gfp);
+        DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);
 
-        buff = tcp_stream_alloc_skb(sk, 0, gfp, true);
+        buff = tcp_stream_alloc_skb(sk, gfp, true);
         if (unlikely(!buff))
                 return -ENOMEM;
         skb_copy_decrypted(buff, skb);
@@ -2319,6 +2309,57 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
         return true;
 }
 
+static int tcp_clone_payload(struct sock *sk, struct sk_buff *to,
+                             int probe_size)
+{
+        skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags;
+        int i, todo, len = 0, nr_frags = 0;
+        const struct sk_buff *skb;
+
+        if (!sk_wmem_schedule(sk, to->truesize + probe_size))
+                return -ENOMEM;
+
+        skb_queue_walk(&sk->sk_write_queue, skb) {
+                const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;
+
+                if (skb_headlen(skb))
+                        return -EINVAL;
+
+                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
+                        if (len >= probe_size)
+                                goto commit;
+                        todo = min_t(int, skb_frag_size(fragfrom),
+                                     probe_size - len);
+                        len += todo;
+                        if (lastfrag &&
+                            skb_frag_page(fragfrom) == skb_frag_page(lastfrag) &&
+                            skb_frag_off(fragfrom) == skb_frag_off(lastfrag) +
+                                                      skb_frag_size(lastfrag)) {
+                                skb_frag_size_add(lastfrag, todo);
+                                continue;
+                        }
+                        if (unlikely(nr_frags == MAX_SKB_FRAGS))
+                                return -E2BIG;
+                        skb_frag_page_copy(fragto, fragfrom);
+                        skb_frag_off_copy(fragto, fragfrom);
+                        skb_frag_size_set(fragto, todo);
+                        nr_frags++;
+                        lastfrag = fragto++;
+                }
+        }
+commit:
+        WARN_ON_ONCE(len != probe_size);
+        for (i = 0; i < nr_frags; i++)
+                skb_frag_ref(to, i);
+
+        skb_shinfo(to)->nr_frags = nr_frags;
+        to->truesize += probe_size;
+        to->len += probe_size;
+        to->data_len += probe_size;
+        __skb_header_release(to);
+        return 0;
+}
+
 /* Create a new MTU probe if we are ready.
  * MTU probe is regularly attempting to increase the path MTU by
  * deliberately sending larger packets. This discovers routing
@@ -2395,9 +2436,15 @@ static int tcp_mtu_probe(struct sock *sk)
                 return -1;
 
         /* We're allowed to probe. Build it now. */
-        nskb = tcp_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
+        nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false);
         if (!nskb)
                 return -1;
+
+        /* build the payload, and be prepared to abort if this fails. */
+        if (tcp_clone_payload(sk, nskb, probe_size)) {
+                consume_skb(nskb);
+                return -1;
+        }
         sk_wmem_queued_add(sk, nskb->truesize);
         sk_mem_charge(sk, nskb->truesize);
 
@@ -2415,7 +2462,6 @@ static int tcp_mtu_probe(struct sock *sk)
         len = 0;
         tcp_for_write_queue_from_safe(skb, next, sk) {
                 copy = min_t(int, skb->len, probe_size - len);
-                skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
 
                 if (skb->len <= copy) {
                         /* We've eaten all the data from this skb.
@@ -2431,12 +2477,8 @@ static int tcp_mtu_probe(struct sock *sk)
                 } else {
                         TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
                                                    ~(TCPHDR_FIN|TCPHDR_PSH);
-                        if (!skb_shinfo(skb)->nr_frags) {
-                                skb_pull(skb, copy);
-                        } else {
-                                __pskb_trim_head(skb, copy);
-                                tcp_set_skb_tso_segs(skb, mss_now);
-                        }
+                        __pskb_trim_head(skb, copy);
+                        tcp_set_skb_tso_segs(skb, mss_now);
                         TCP_SKB_CB(skb)->seq += copy;
                 }
 
@@ -2947,6 +2989,7 @@ u32 __tcp_select_window(struct sock *sk)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
+        struct net *net = sock_net(sk);
         /* MSS for the peer's data. Previous versions used mss_clamp
          * here. I don't know if the value based on our guesses
          * of peer's MSS is better for the performance. It's more correct
@@ -2968,6 +3011,15 @@ u32 __tcp_select_window(struct sock *sk)
                 if (mss <= 0)
                         return 0;
         }
+
+        /* Only allow window shrink if the sysctl is enabled and we have
+         * a non-zero scaling factor in effect.
+         */
+        if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
+                goto shrink_window_allowed;
+
+        /* do not allow window to shrink */
+
         if (free_space < (full_space >> 1)) {
                 icsk->icsk_ack.quick = 0;
 
@@ -3022,6 +3074,36 @@ u32 __tcp_select_window(struct sock *sk)
         }
 
         return window;
+
+shrink_window_allowed:
+        /* new window should always be an exact multiple of scaling factor */
+        free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+        if (free_space < (full_space >> 1)) {
+                icsk->icsk_ack.quick = 0;
+
+                if (tcp_under_memory_pressure(sk))
+                        tcp_adjust_rcv_ssthresh(sk);
+
+                /* if free space is too low, return a zero window */
+                if (free_space < (allowed_space >> 4) || free_space < mss ||
+                    free_space < (1 << tp->rx_opt.rcv_wscale))
+                        return 0;
+        }
+
+        if (free_space > tp->rcv_ssthresh) {
+                free_space = tp->rcv_ssthresh;
+                /* new window should always be an exact multiple of scaling factor
+                 *
+                 * For this case, we ALIGN "up" (increase free_space) because
+                 * we know free_space is not zero here, it has been reduced from
+                 * the memory-based limit, and rcv_ssthresh is not a hard limit
+                 * (unlike sk_rcvbuf).
+                 */
+                free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
+        }
+
+        return free_space;
 }
 
 void tcp_skb_collapse_tstamp(struct sk_buff *skb,
@@ -3746,8 +3828,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         struct tcp_fastopen_request *fo = tp->fastopen_req;
-        int space, err = 0;
+        struct page_frag *pfrag = sk_page_frag(sk);
         struct sk_buff *syn_data;
+        int space, err = 0;
 
         tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
         if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
@@ -3766,25 +3849,31 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 
         space = min_t(size_t, space, fo->size);
 
-        /* limit to order-0 allocations */
-        space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
-
-        syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false);
+        if (space &&
+            !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
+                                  pfrag, sk->sk_allocation))
+                goto fallback;
+        syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false);
         if (!syn_data)
                 goto fallback;
         memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
         if (space) {
-                int copied = copy_from_iter(skb_put(syn_data, space), space,
-                                            &fo->data->msg_iter);
-                if (unlikely(!copied)) {
+                space = min_t(size_t, space, pfrag->size - pfrag->offset);
+                space = tcp_wmem_schedule(sk, space);
+        }
+        if (space) {
+                space = copy_page_from_iter(pfrag->page, pfrag->offset,
+                                            space, &fo->data->msg_iter);
+                if (unlikely(!space)) {
                         tcp_skb_tsorted_anchor_cleanup(syn_data);
                         kfree_skb(syn_data);
                         goto fallback;
                 }
-                if (copied != space) {
-                        skb_trim(syn_data, copied);
-                        space = copied;
-                }
+                skb_fill_page_desc(syn_data, 0, pfrag->page,
+                                   pfrag->offset, space);
+                page_ref_inc(pfrag->page);
+                pfrag->offset += space;
+                skb_len_add(syn_data, space);
                 skb_zcopy_set(syn_data, fo->uarg, NULL);
         }
         /* No more data pending in inet_wait_for_connect() */
@@ -3849,7 +3938,7 @@ int tcp_connect(struct sock *sk)
                 return 0;
         }
 
-        buff = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
+        buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
         if (unlikely(!buff))
                 return -ENOBUFS;
 
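As a side note on the new shrink_window_allowed path of __tcp_select_window() above: the rounding it performs can be illustrated with a small standalone C sketch. This is not kernel code; round_down_pow2()/align_up_pow2() merely mimic the kernel's round_down()/ALIGN() macros for power-of-two granules, and the sample values (rcv_wscale, free_space, rcv_ssthresh) are hypothetical.

/*
 * Standalone illustration (NOT kernel code) of the rounding used in the
 * shrink_window_allowed path: the offered window is kept an exact multiple
 * of the scale granule (1 << rcv_wscale), rounded down from free_space but
 * rounded up when clamped to the soft rcv_ssthresh limit.
 */
#include <stdio.h>

static unsigned int round_down_pow2(unsigned int x, unsigned int granule)
{
        return x & ~(granule - 1);              /* round down to a multiple of granule */
}

static unsigned int align_up_pow2(unsigned int x, unsigned int granule)
{
        return (x + granule - 1) & ~(granule - 1);      /* round up to a multiple of granule */
}

int main(void)
{
        unsigned int rcv_wscale = 7;                    /* hypothetical scale factor */
        unsigned int granule = 1U << rcv_wscale;        /* 128-byte advertisement unit */
        unsigned int free_space = 70000;                /* hypothetical free receive space */
        unsigned int rcv_ssthresh = 65535;              /* hypothetical soft clamp */

        /* new window should always be an exact multiple of the granule */
        unsigned int win = round_down_pow2(free_space, granule);

        /* rcv_ssthresh is a soft limit, so the clamp rounds up rather than down */
        if (win > rcv_ssthresh)
                win = align_up_pow2(rcv_ssthresh, granule);

        printf("granule=%u rounded_down=%u advertised=%u\n",
               granule, round_down_pow2(free_space, granule), win);
        return 0;
}

With these sample numbers, free_space is first rounded down to 69888 (a multiple of 128), then the rcv_ssthresh clamp aligns it up to 65536, which is the value that would be advertised.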