diff options
author | David Ahern <dsahern@kernel.org> | 2022-03-14 14:45:51 -0600 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2022-03-15 20:20:02 -0700 |
commit | 40867d74c374b235e14d839f3a77f26684feefe5 (patch) | |
tree | c48cb6e159ac445e5171fe550d9c58437d46a0e0 /net | |
parent | c84d86a0295c24487db5b7db1a61d9c0eddfbb66 (diff) | |
download | linux-40867d74c374b235e14d839f3a77f26684feefe5.tar.gz linux-40867d74c374b235e14d839f3a77f26684feefe5.tar.bz2 linux-40867d74c374b235e14d839f3a77f26684feefe5.zip |
net: Add l3mdev index to flow struct and avoid oif reset for port devices
The fundamental premise of VRF and l3mdev core code is binding a socket
to a device (l3mdev or netdev with an L3 domain) to indicate L3 scope.
Legacy code resets flowi_oif to the l3mdev losing any original port
device binding. Ben (among others) has demonstrated use cases where the
original port device binding is important and needs to be retained.
This patch handles that by adding a new entry to the common flow struct
that can indicate the l3mdev index for later rule and table matching
avoiding the need to reset flowi_oif.
In addition to allowing more use cases that require port device binds,
this patch brings a few datapath simplications:
1. l3mdev_fib_rule_match is only called when walking fib rules and
always after l3mdev_update_flow. That allows an optimization to bail
early for non-VRF type uses cases when flowi_l3mdev is not set. Also,
only that index needs to be checked for the FIB table id.
2. l3mdev_update_flow can be called with flowi_oif set to a l3mdev
(e.g., VRF) device. By resetting flowi_oif only for this case the
FLOWI_FLAG_SKIP_NH_OIF flag is not longer needed and can be removed,
removing several checks in the datapath. The flowi_iif path can be
simplified to only be called if the it is not loopback (loopback can
not be assigned to an L3 domain) and the l3mdev index is not already
set.
3. Avoid another device lookup in the output path when the fib lookup
returns a reject failure.
Note: 2 functional tests for local traffic with reject fib rules are
updated to reflect the new direct failure at FIB lookup time for ping
rather than the failure on packet path. The current code fails like this:
HINT: Fails since address on vrf device is out of device scope
COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1
ping: Warning: source address might be selected on device other than: eth1
PING 172.16.3.1 (172.16.3.1) from 172.16.3.1 eth1: 56(84) bytes of data.
--- 172.16.3.1 ping statistics ---
1 packets transmitted, 0 received, 100% packet loss, time 0ms
where the test now directly fails:
HINT: Fails since address on vrf device is out of device scope
COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1
ping: connect: No route to host
Signed-off-by: David Ahern <dsahern@kernel.org>
Tested-by: Ben Greear <greearb@candelatech.com>
Link: https://lore.kernel.org/r/20220314204551.16369-1-dsahern@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r-- | net/ipv4/fib_frontend.c | 7 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 2 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 7 | ||||
-rw-r--r-- | net/ipv4/route.c | 4 | ||||
-rw-r--r-- | net/ipv4/xfrm4_policy.c | 4 | ||||
-rw-r--r-- | net/ipv6/ip6_output.c | 3 | ||||
-rw-r--r-- | net/ipv6/route.c | 12 | ||||
-rw-r--r-- | net/ipv6/xfrm6_policy.c | 3 | ||||
-rw-r--r-- | net/l3mdev/l3mdev.c | 43 |
9 files changed, 28 insertions, 57 deletions
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7408051632ac..af8209f912ab 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -291,7 +291,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, - .flowi4_oif = l3mdev_master_ifindex_rcu(dev), + .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, @@ -353,9 +353,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev); - if (!fl4.flowi4_iif) - fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; + fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c5a29703185a..cc8e84ef2ae4 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2234,7 +2234,7 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { - if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) + if (fl4->flowi4_oif) goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2af2b99e0bea..fb0e49c36c2e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1429,11 +1429,8 @@ bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) return false; - if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { - if (flp->flowi4_oif && - flp->flowi4_oif != nhc->nhc_oif) - return false; - } + if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) + return false; return true; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f444f5983405..63f3256a407d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2263,6 +2263,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* * Now we are ready to route packet. */ + fl4.flowi4_l3mdev = 0; fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; @@ -2738,8 +2739,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, res->fi = NULL; res->table = NULL; if (fl4->flowi4_oif && - (ipv4_is_multicast(fl4->daddr) || - !netif_index_is_l3_master(net, fl4->flowi4_oif))) { + (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) { /* Apparently, routing tables are wrong. Assume, * that the destination is on link. * diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 9e83bcb6bc99..6fde0b184791 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -28,13 +28,11 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; - fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); + fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(net, oif); fl4->flowi4_mark = mark; if (saddr) fl4->saddr = saddr->a4; - fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; - rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e69fac576970..a76fba3dd47a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1035,8 +1035,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && - (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) { + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { dst_release(dst); dst = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6188712f24b0..2fa10e60cccd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1209,9 +1209,6 @@ INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_node *fn; struct rt6_info *rt; - if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) - flags &= ~RT6_LOOKUP_F_IFACE; - rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: @@ -2181,9 +2178,6 @@ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; - if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) - oif = 0; - redo_rt6_select: rt6_select(net, fn, oif, res, strict); if (res->f6i == net->ipv6.fib6_null_entry) { @@ -3058,12 +3052,6 @@ INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_info *rt; struct fib6_node *fn; - /* l3mdev_update_flow overrides oif if the device is enslaved; in - * this case we must match on the real ingress device, so reset it - */ - if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) - fl6->flowi6_oif = skb->dev->ifindex; - /* Get the "current" route for this destination and * check if the redirect has come from appropriate router. * diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 55bb2cbae13d..e64e427a51cf 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -33,8 +33,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); - fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; + fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_mark = mark; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 17927966abb3..4eb8892fb2ff 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -250,25 +250,19 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct net_device *dev; int rc = 0; - rcu_read_lock(); + /* update flow ensures flowi_l3mdev is set when relevant */ + if (!fl->flowi_l3mdev) + return 0; - dev = dev_get_by_index_rcu(net, fl->flowi_oif); - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_fib_table) { - arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); - rc = 1; - goto out; - } + rcu_read_lock(); - dev = dev_get_by_index_rcu(net, fl->flowi_iif); + dev = dev_get_by_index_rcu(net, fl->flowi_l3mdev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_fib_table) { arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); rc = 1; - goto out; } -out: rcu_read_unlock(); return rc; @@ -277,31 +271,28 @@ out: void l3mdev_update_flow(struct net *net, struct flowi *fl) { struct net_device *dev; - int ifindex; rcu_read_lock(); if (fl->flowi_oif) { dev = dev_get_by_index_rcu(net, fl->flowi_oif); if (dev) { - ifindex = l3mdev_master_ifindex_rcu(dev); - if (ifindex) { - fl->flowi_oif = ifindex; - fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; - goto out; - } + if (!fl->flowi_l3mdev) + fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); + + /* oif set to L3mdev directs lookup to its table; + * reset to avoid oif match in fib_lookup + */ + if (netif_is_l3_master(dev)) + fl->flowi_oif = 0; + goto out; } } - if (fl->flowi_iif) { + if (fl->flowi_iif > LOOPBACK_IFINDEX && !fl->flowi_l3mdev) { dev = dev_get_by_index_rcu(net, fl->flowi_iif); - if (dev) { - ifindex = l3mdev_master_ifindex_rcu(dev); - if (ifindex) { - fl->flowi_iif = ifindex; - fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; - } - } + if (dev) + fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); } out: |