Diffstat (limited to 'target/linux/generic/pending-6.1/680-net-add-TCP-fraglist-GRO-support.patch')
-rw-r--r--  target/linux/generic/pending-6.1/680-net-add-TCP-fraglist-GRO-support.patch  627
1 file changed, 627 insertions, 0 deletions
diff --git a/target/linux/generic/pending-6.1/680-net-add-TCP-fraglist-GRO-support.patch b/target/linux/generic/pending-6.1/680-net-add-TCP-fraglist-GRO-support.patch
new file mode 100644
index 0000000000..f52233fe90
--- /dev/null
+++ b/target/linux/generic/pending-6.1/680-net-add-TCP-fraglist-GRO-support.patch
@@ -0,0 +1,627 @@
+From: Felix Fietkau <nbd@nbd.name>
+Date: Tue, 23 Apr 2024 11:23:03 +0200
+Subject: [PATCH] net: add TCP fraglist GRO support
+
+When forwarding TCP after GRO, software segmentation is very expensive,
+especially when the checksum needs to be recalculated.
+One case where that's currently unavoidable is when routing packets over
+PPPoE. Performance improves significantly when using fraglist GRO
+implemented in the same way as for UDP.
+
+Here's a measurement of running 2 TCP streams through a MediaTek MT7622
+device (2-core Cortex-A53), which runs NAT with flow offload enabled from
+one ethernet port to PPPoE on another ethernet port, with a cake qdisc set
+to 1Gbps.
+
+rx-gro-list off: 630 Mbit/s, CPU 35% idle
+rx-gro-list on: 770 Mbit/s, CPU 40% idle
+
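+For reference, fraglist GRO is a per-device feature toggled via ethtool.
+A usage sketch (assuming the ingress ethernet interface is named eth0):
+
+  ethtool -K eth0 rx-gro-list on
+  ethtool -k eth0 | grep rx-gro-list
+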
+Signed-off-by: Felix Fietkau <nbd@nbd.name>
+---
+
+--- a/include/net/gro.h
++++ b/include/net/gro.h
+@@ -424,6 +424,7 @@ static inline __wsum ip6_gro_compute_pse
+ }
+
+ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
++int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
+
+ /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
+ static inline void gro_normal_list(struct napi_struct *napi)
+@@ -446,5 +447,48 @@ static inline void gro_normal_one(struct
+ gro_normal_list(napi);
+ }
+
++/* This function is the alternative of 'inet_iif' and 'inet_sdif'
++ * functions in case we can not rely on fields of IPCB.
++ *
++ * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
++ * The caller must hold the RCU read lock.
++ */
++static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
++{
++ *iif = inet_iif(skb) ?: skb->dev->ifindex;
++ *sdif = 0;
++
++#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
++ if (netif_is_l3_slave(skb->dev)) {
++ struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
++
++ *sdif = *iif;
++ *iif = master ? master->ifindex : 0;
++ }
++#endif
++}
++
++/* This function is the alternative of 'inet6_iif' and 'inet6_sdif'
++ * functions in case we can not rely on fields of IP6CB.
++ *
++ * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized.
++ * The caller must hold the RCU read lock.
++ */
++static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif)
++{
++ /* using skb->dev->ifindex because skb_dst(skb) is not initialized */
++ *iif = skb->dev->ifindex;
++ *sdif = 0;
++
++#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
++ if (netif_is_l3_slave(skb->dev)) {
++ struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev);
++
++ *sdif = *iif;
++ *iif = master ? master->ifindex : 0;
++ }
++#endif
++}
++
+
+ #endif /* _NET_IPV6_GRO_H */
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -2057,7 +2057,10 @@ void tcp_v4_destroy_sock(struct sock *sk
+
+ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
+ netdev_features_t features);
+-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
++struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
++struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
++struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
++ struct tcphdr *th);
+ INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
+ INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
+ INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
+--- a/net/core/gro.c
++++ b/net/core/gro.c
+@@ -290,6 +290,33 @@ done:
+ return 0;
+ }
+
++int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
++{
++ if (unlikely(p->len + skb->len >= 65536))
++ return -E2BIG;
++
++ if (NAPI_GRO_CB(p)->last == p)
++ skb_shinfo(p)->frag_list = skb;
++ else
++ NAPI_GRO_CB(p)->last->next = skb;
++
++ skb_pull(skb, skb_gro_offset(skb));
++
++ NAPI_GRO_CB(p)->last = skb;
++ NAPI_GRO_CB(p)->count++;
++ p->data_len += skb->len;
++
++ /* sk ownership - if any - completely transferred to the aggregated packet */
++ skb->destructor = NULL;
++ skb->sk = NULL;
++ p->truesize += skb->truesize;
++ p->len += skb->len;
++
++ NAPI_GRO_CB(skb)->same_flow = 1;
++
++ return 0;
++}
++
+
+ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
+ {
+--- a/net/ipv4/tcp_offload.c
++++ b/net/ipv4/tcp_offload.c
+@@ -27,6 +27,70 @@ static void tcp_gso_tstamp(struct sk_buf
+ }
+ }
+
++static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
++ __be32 *oldip, __be32 newip,
++ __be16 *oldport, __be16 newport)
++{
++ struct tcphdr *th;
++ struct iphdr *iph;
++
++ if (*oldip == newip && *oldport == newport)
++ return;
++
++ th = tcp_hdr(seg);
++ iph = ip_hdr(seg);
++
++ inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
++ inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
++ *oldport = newport;
++
++ csum_replace4(&iph->check, *oldip, newip);
++ *oldip = newip;
++}
++
++static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
++{
++ const struct tcphdr *th;
++ const struct iphdr *iph;
++ struct sk_buff *seg;
++ struct tcphdr *th2;
++ struct iphdr *iph2;
++
++ seg = segs;
++ th = tcp_hdr(seg);
++ iph = ip_hdr(seg);
++ th2 = tcp_hdr(seg->next);
++ iph2 = ip_hdr(seg->next);
++
++ if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
++ iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
++ return segs;
++
++ while ((seg = seg->next)) {
++ th2 = tcp_hdr(seg);
++ iph2 = ip_hdr(seg);
++
++ __tcpv4_gso_segment_csum(seg,
++ &iph2->saddr, iph->saddr,
++ &th2->source, th->source);
++ __tcpv4_gso_segment_csum(seg,
++ &iph2->daddr, iph->daddr,
++ &th2->dest, th->dest);
++ }
++
++ return segs;
++}
++
++static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
++ netdev_features_t features)
++{
++ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
++ if (IS_ERR(skb))
++ return skb;
++
++ return __tcpv4_gso_segment_list_csum(skb);
++}
++
+ static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+ {
+@@ -36,6 +100,9 @@ static struct sk_buff *tcp4_gso_segment(
+ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+ return ERR_PTR(-EINVAL);
+
++ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
++ return __tcp4_gso_segment_list(skb, features);
++
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+@@ -177,61 +244,76 @@ out:
+ return segs;
+ }
+
+-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
++struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
+ {
+- struct sk_buff *pp = NULL;
++ struct tcphdr *th2;
+ struct sk_buff *p;
++
++ list_for_each_entry(p, head, list) {
++ if (!NAPI_GRO_CB(p)->same_flow)
++ continue;
++
++ th2 = tcp_hdr(p);
++ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
++ NAPI_GRO_CB(p)->same_flow = 0;
++ continue;
++ }
++
++ return p;
++ }
++
++ return NULL;
++}
++
++struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
++{
++ unsigned int thlen, hlen, off;
+ struct tcphdr *th;
+- struct tcphdr *th2;
+- unsigned int len;
+- unsigned int thlen;
+- __be32 flags;
+- unsigned int mss = 1;
+- unsigned int hlen;
+- unsigned int off;
+- int flush = 1;
+- int i;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header(skb, hlen, off);
+ if (unlikely(!th))
+- goto out;
++ return NULL;
+
+ thlen = th->doff * 4;
+ if (thlen < sizeof(*th))
+- goto out;
++ return NULL;
+
+ hlen = off + thlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+- goto out;
++ return NULL;
+ }
+
+ skb_gro_pull(skb, thlen);
+
+- len = skb_gro_len(skb);
+- flags = tcp_flag_word(th);
+-
+- list_for_each_entry(p, head, list) {
+- if (!NAPI_GRO_CB(p)->same_flow)
+- continue;
++ return th;
++}
+
+- th2 = tcp_hdr(p);
++struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
++ struct tcphdr *th)
++{
++ unsigned int thlen = th->doff * 4;
++ struct sk_buff *pp = NULL;
++ struct sk_buff *p;
++ struct tcphdr *th2;
++ unsigned int len;
++ __be32 flags;
++ unsigned int mss = 1;
++ int flush = 1;
++ int i;
+
+- if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+- NAPI_GRO_CB(p)->same_flow = 0;
+- continue;
+- }
++ len = skb_gro_len(skb);
++ flags = tcp_flag_word(th);
+
+- goto found;
+- }
+- p = NULL;
+- goto out_check_final;
++ p = tcp_gro_lookup(head, th);
++ if (!p)
++ goto out_check_final;
+
+-found:
+ /* Include the IP ID check below from the inner most IP hdr */
++ th2 = tcp_hdr(p);
+ flush = NAPI_GRO_CB(p)->flush;
+ flush |= (__force int)(flags & TCP_FLAG_CWR);
+ flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+@@ -268,6 +350,19 @@ found:
+ flush |= p->decrypted ^ skb->decrypted;
+ #endif
+
++ if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
++ flush |= (__force int)(flags ^ tcp_flag_word(th2));
++ flush |= skb->ip_summed != p->ip_summed;
++ flush |= skb->csum_level != p->csum_level;
++ flush |= !pskb_may_pull(skb, skb_gro_offset(skb));
++ flush |= NAPI_GRO_CB(p)->count >= 64;
++
++ if (flush || skb_gro_receive_list(p, skb))
++ mss = 1;
++
++ goto out_check_final;
++ }
++
+ if (flush || skb_gro_receive(p, skb)) {
+ mss = 1;
+ goto out_check_final;
+@@ -289,7 +384,6 @@ out_check_final:
+ if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+ pp = p;
+
+-out:
+ NAPI_GRO_CB(skb)->flush |= (flush != 0);
+
+ return pp;
+@@ -315,18 +409,58 @@ int tcp_gro_complete(struct sk_buff *skb
+ }
+ EXPORT_SYMBOL(tcp_gro_complete);
+
++static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
++ struct tcphdr *th)
++{
++ const struct iphdr *iph;
++ struct sk_buff *p;
++ struct sock *sk;
++ struct net *net;
++ int iif, sdif;
++
++ if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
++ return;
++
++ p = tcp_gro_lookup(head, th);
++ if (p) {
++ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
++ return;
++ }
++
++ inet_get_iif_sdif(skb, &iif, &sdif);
++ iph = skb_gro_network_header(skb);
++ net = dev_net(skb->dev);
++ sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
++ iph->saddr, th->source,
++ iph->daddr, ntohs(th->dest),
++ iif, sdif);
++ NAPI_GRO_CB(skb)->is_flist = !sk;
++ if (sk)
++ sock_put(sk);
++}
++
+ INDIRECT_CALLABLE_SCOPE
+ struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+ {
++ struct tcphdr *th;
++
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+- inet_gro_compute_pseudo)) {
+- NAPI_GRO_CB(skb)->flush = 1;
+- return NULL;
+- }
++ inet_gro_compute_pseudo))
++ goto flush;
++
++ th = tcp_gro_pull_header(skb);
++ if (!th)
++ goto flush;
+
+- return tcp_gro_receive(head, skb);
++ tcp4_check_fraglist_gro(head, skb, th);
++
++ return tcp_gro_receive(head, skb, th);
++
++flush:
++ NAPI_GRO_CB(skb)->flush = 1;
++ return NULL;
+ }
+
+ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+@@ -334,6 +468,15 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_com
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
++ if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
++ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
++ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
++
++ __skb_incr_checksum_unnecessary(skb);
++
++ return 0;
++ }
++
+ th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
+ iph->daddr, 0);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -425,33 +425,6 @@ out:
+ return segs;
+ }
+
+-static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+-{
+- if (unlikely(p->len + skb->len >= 65536))
+- return -E2BIG;
+-
+- if (NAPI_GRO_CB(p)->last == p)
+- skb_shinfo(p)->frag_list = skb;
+- else
+- NAPI_GRO_CB(p)->last->next = skb;
+-
+- skb_pull(skb, skb_gro_offset(skb));
+-
+- NAPI_GRO_CB(p)->last = skb;
+- NAPI_GRO_CB(p)->count++;
+- p->data_len += skb->len;
+-
+- /* sk ownership - if any - completely transferred to the aggregated packet */
+- skb->destructor = NULL;
+- skb->sk = NULL;
+- p->truesize += skb->truesize;
+- p->len += skb->len;
+-
+- NAPI_GRO_CB(skb)->same_flow = 1;
+-
+- return 0;
+-}
+-
+
+ #define UDP_GRO_CNT_MAX 64
+ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+--- a/net/ipv6/tcpv6_offload.c
++++ b/net/ipv6/tcpv6_offload.c
+@@ -7,24 +7,67 @@
+ */
+ #include <linux/indirect_call_wrapper.h>
+ #include <linux/skbuff.h>
++#include <net/inet6_hashtables.h>
+ #include <net/gro.h>
+ #include <net/protocol.h>
+ #include <net/tcp.h>
+ #include <net/ip6_checksum.h>
+ #include "ip6_offload.h"
+
++static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
++ struct tcphdr *th)
++{
++#if IS_ENABLED(CONFIG_IPV6)
++ const struct ipv6hdr *hdr;
++ struct sk_buff *p;
++ struct sock *sk;
++ struct net *net;
++ int iif, sdif;
++
++ if (!(skb->dev->features & NETIF_F_GRO_FRAGLIST))
++ return;
++
++ p = tcp_gro_lookup(head, th);
++ if (p) {
++ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
++ return;
++ }
++
++ inet6_get_iif_sdif(skb, &iif, &sdif);
++ hdr = skb_gro_network_header(skb);
++ net = dev_net(skb->dev);
++ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
++ &hdr->saddr, th->source,
++ &hdr->daddr, ntohs(th->dest),
++ iif, sdif);
++ NAPI_GRO_CB(skb)->is_flist = !sk;
++ if (sk)
++ sock_put(sk);
++#endif /* IS_ENABLED(CONFIG_IPV6) */
++}
++
+ INDIRECT_CALLABLE_SCOPE
+ struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
+ {
++ struct tcphdr *th;
++
+ /* Don't bother verifying checksum if we're going to flush anyway. */
+ if (!NAPI_GRO_CB(skb)->flush &&
+ skb_gro_checksum_validate(skb, IPPROTO_TCP,
+- ip6_gro_compute_pseudo)) {
+- NAPI_GRO_CB(skb)->flush = 1;
+- return NULL;
+- }
++ ip6_gro_compute_pseudo))
++ goto flush;
+
+- return tcp_gro_receive(head, skb);
++ th = tcp_gro_pull_header(skb);
++ if (!th)
++ goto flush;
++
++ tcp6_check_fraglist_gro(head, skb, th);
++
++ return tcp_gro_receive(head, skb, th);
++
++flush:
++ NAPI_GRO_CB(skb)->flush = 1;
++ return NULL;
+ }
+
+ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
+@@ -32,6 +75,15 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);
+
++ if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
++ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
++ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
++
++ __skb_incr_checksum_unnecessary(skb);
++
++ return 0;
++ }
++
+ th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
+ &iph->daddr, 0);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+@@ -39,6 +91,61 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_com
+ return tcp_gro_complete(skb);
+ }
+
++static void __tcpv6_gso_segment_csum(struct sk_buff *seg,
++ __be16 *oldport, __be16 newport)
++{
++ struct tcphdr *th;
++
++ if (*oldport == newport)
++ return;
++
++ th = tcp_hdr(seg);
++ inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
++ *oldport = newport;
++}
++
++static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs)
++{
++ const struct tcphdr *th;
++ const struct ipv6hdr *iph;
++ struct sk_buff *seg;
++ struct tcphdr *th2;
++ struct ipv6hdr *iph2;
++
++ seg = segs;
++ th = tcp_hdr(seg);
++ iph = ipv6_hdr(seg);
++ th2 = tcp_hdr(seg->next);
++ iph2 = ipv6_hdr(seg->next);
++
++ if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
++ ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
++ ipv6_addr_equal(&iph->daddr, &iph2->daddr))
++ return segs;
++
++ while ((seg = seg->next)) {
++ th2 = tcp_hdr(seg);
++ iph2 = ipv6_hdr(seg);
++
++ iph2->saddr = iph->saddr;
++ iph2->daddr = iph->daddr;
++ __tcpv6_gso_segment_csum(seg, &th2->source, th->source);
++ __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest);
++ }
++
++ return segs;
++}
++
++static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb,
++ netdev_features_t features)
++{
++ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
++ if (IS_ERR(skb))
++ return skb;
++
++ return __tcpv6_gso_segment_list_csum(skb);
++}
++
+ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+ {
+@@ -50,6 +157,9 @@ static struct sk_buff *tcp6_gso_segment(
+ if (!pskb_may_pull(skb, sizeof(*th)))
+ return ERR_PTR(-EINVAL);
+
++ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
++ return __tcp6_gso_segment_list(skb, features);
++
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct tcphdr *th = tcp_hdr(skb);