From 430a049190de3c9e219f43084de9f1122da04570 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 24 May 2019 14:43:08 -0700 Subject: nexthop: Add support for nexthop groups Allow the creation of nexthop groups which reference other nexthop objects to create multipath routes: +--------------+ +------------+ +--------------+ | | nh nh_grp --->| nh_grp_entry |-+ +------------+ +---------|----+ ^ | | +------------+ +----------------+ +--->| nh, weight | nh_parent +------------+ A group entry points to a nexthop with a weight for that hop within the group. The nexthop has a list_head, grp_list, for tracking which groups it is a member of and the group entry has a reference back to the parent. The grp_list is used when a nexthop is deleted - to efficiently remove it from groups using it. If a nexthop group spec is given, no other attributes can be set. Each nexthop id in a group spec must already exist. Similar to single nexthops, the specification of a nexthop group can be updated so that data is managed with rcu locking. Add path selection function to account for multiple paths and add ipv{4,6}_good_nh helpers to know that if a neighbor entry exists it is in a good state. Update NETDEV event handling to rebalance multipath nexthop groups if a nexthop is deleted due to a link event (down or unregister). When a nexthop is removed any groups using it are updated. Groups using a nexthop a tracked via a grp_list. Nexthop dumps can be limited to groups only by adding NHA_GROUPS to the request. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/nexthop.c | 504 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 481 insertions(+), 23 deletions(-) (limited to 'net/ipv4/nexthop.c') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 3a1cbcb96baa..1af8a329dacb 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -8,12 +8,17 @@ #include #include #include +#include #include #include +#include #include #include #include +static void remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo); + #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) @@ -53,9 +58,20 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) hlist_add_head(&nhi->dev_hash, head); } -void nexthop_free_rcu(struct rcu_head *head) +static void nexthop_free_mpath(struct nexthop *nh) +{ + struct nh_group *nhg; + int i; + + nhg = rcu_dereference_raw(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) + WARN_ON(nhg->nh_entries[i].nh); + + kfree(nhg); +} + +static void nexthop_free_single(struct nexthop *nh) { - struct nexthop *nh = container_of(head, struct nexthop, rcu); struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); @@ -68,6 +84,16 @@ void nexthop_free_rcu(struct rcu_head *head) break; } kfree(nhi); +} + +void nexthop_free_rcu(struct rcu_head *head) +{ + struct nexthop *nh = container_of(head, struct nexthop, rcu); + + if (nh->is_group) + nexthop_free_mpath(nh); + else + nexthop_free_single(nh); kfree(nh); } @@ -78,9 +104,26 @@ static struct nexthop *nexthop_alloc(void) struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); + if (nh) { + INIT_LIST_HEAD(&nh->grp_list); + } return nh; } +static struct nh_group *nexthop_grp_alloc(u16 num_nh) +{ + size_t sz = offsetof(struct nexthop, nh_grp) + + sizeof(struct nh_group) + + sizeof(struct nh_grp_entry) * num_nh; + struct nh_group *nhg; + + nhg = kzalloc(sz, GFP_KERNEL); + if (nhg) + nhg->num_nh = num_nh; + + return nhg; +} + static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) @@ -129,6 +172,37 @@ static u32 nh_find_unused_id(struct net *net) return 0; } +static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nexthop_grp *p; + size_t len = nhg->num_nh * sizeof(*p); + struct nlattr *nla; + u16 group_type = 0; + int i; + + if (nhg->mpath) + group_type = NEXTHOP_GRP_TYPE_MPATH; + + if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) + goto nla_put_failure; + + nla = nla_reserve(skb, NHA_GROUP, len); + if (!nla) + goto nla_put_failure; + + p = nla_data(nla); + for (i = 0; i < nhg->num_nh; ++i) { + p->id = nhg->nh_entries[i].nh->id; + p->weight = nhg->nh_entries[i].weight - 1; + p += 1; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags) { @@ -152,6 +226,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + + if (nla_put_nh_group(skb, nhg)) + goto nla_put_failure; + goto out; + } + nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { @@ -196,15 +278,24 @@ nla_put_failure: return -EMSGSIZE; } -static size_t nh_nlmsg_size(struct nexthop *nh) +static size_t nh_nlmsg_size_grp(struct nexthop *nh) +{ + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + + return nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ +} + +static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); - size_t sz = nla_total_size(4); /* NHA_ID */ + size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ - sz += nla_total_size(4); /* NHA_OIF */ + sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: @@ -227,6 +318,18 @@ static size_t nh_nlmsg_size(struct nexthop *nh) return sz; } +static size_t nh_nlmsg_size(struct nexthop *nh) +{ + size_t sz = nla_total_size(4); /* NHA_ID */ + + if (nh->is_group) + sz += nh_nlmsg_size_grp(nh); + else + sz += nh_nlmsg_size_single(nh); + + return sz; +} + static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; @@ -254,17 +357,274 @@ errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } -static void __remove_nexthop(struct net *net, struct nexthop *nh) +static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, + struct netlink_ext_ack *extack) { - struct nh_info *nhi; + if (nh->is_group) { + struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - nhi = rtnl_dereference(nh->nh_info); - if (nhi->fib_nhc.nhc_dev) - hlist_del(&nhi->dev_hash); + /* nested multipath (group within a group) is not + * supported + */ + if (nhg->mpath) { + NL_SET_ERR_MSG(extack, + "Multipath group can not be a nexthop within a group"); + return false; + } + } else { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + + if (nhi->reject_nh && npaths > 1) { + NL_SET_ERR_MSG(extack, + "Blackhole nexthop can not be used in a group with more than 1 path"); + return false; + } + } + + return true; +} + +static int nh_check_attr_group(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + unsigned int len = nla_len(tb[NHA_GROUP]); + struct nexthop_grp *nhg; + unsigned int i, j; + + if (len & (sizeof(struct nexthop_grp) - 1)) { + NL_SET_ERR_MSG(extack, + "Invalid length for nexthop group attribute"); + return -EINVAL; + } + + /* convert len to number of nexthop ids */ + len /= sizeof(*nhg); + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + if (nhg[i].resvd1 || nhg[i].resvd2) { + NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); + return -EINVAL; + } + if (nhg[i].weight > 254) { + NL_SET_ERR_MSG(extack, "Invalid value for weight"); + return -EINVAL; + } + for (j = i + 1; j < len; ++j) { + if (nhg[i].id == nhg[j].id) { + NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); + return -EINVAL; + } + } + } + + nhg = nla_data(tb[NHA_GROUP]); + for (i = 0; i < len; ++i) { + struct nexthop *nh; + + nh = nexthop_find_by_id(net, nhg[i].id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + if (!valid_group_nh(nh, len, extack)) + return -EINVAL; + } + for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) { + if (!tb[i]) + continue; + + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static bool ipv6_good_nh(const struct fib6_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +static bool ipv4_good_nh(const struct fib_nh *nh) +{ + int state = NUD_REACHABLE; + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, + (__force u32)nh->fib_nh_gw4); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + + return !!(state & NUD_VALID); +} + +struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) +{ + struct nexthop *rc = NULL; + struct nh_group *nhg; + int i; + + if (!nh->is_group) + return nh; + + nhg = rcu_dereference(nh->nh_grp); + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + + if (hash > atomic_read(&nhge->upper_bound)) + continue; + + /* nexthops always check if it is good and does + * not rely on a sysctl for this behavior + */ + nhi = rcu_dereference(nhge->nh->nh_info); + switch (nhi->family) { + case AF_INET: + if (ipv4_good_nh(&nhi->fib_nh)) + return nhge->nh; + break; + case AF_INET6: + if (ipv6_good_nh(&nhi->fib6_nh)) + return nhge->nh; + break; + } + + if (!rc) + rc = nhge->nh; + } + + return rc; +} +EXPORT_SYMBOL_GPL(nexthop_select_path); + +static void nh_group_rebalance(struct nh_group *nhg) +{ + int total = 0; + int w = 0; + int i; + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; + atomic_set(&nhge->upper_bound, upper_bound); + } +} + +static void remove_nh_grp_entry(struct nh_grp_entry *nhge, + struct nh_group *nhg, + struct nl_info *nlinfo) +{ + struct nexthop *nh = nhge->nh; + struct nh_grp_entry *nhges; + bool found = false; + int i; + + WARN_ON(!nh); + + nhges = nhg->nh_entries; + for (i = 0; i < nhg->num_nh; ++i) { + if (found) { + nhges[i-1].nh = nhges[i].nh; + nhges[i-1].weight = nhges[i].weight; + list_del(&nhges[i].nh_list); + list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list); + } else if (nhg->nh_entries[i].nh == nh) { + found = true; + } + } + + if (WARN_ON(!found)) + return; + + nhg->num_nh--; + nhg->nh_entries[nhg->num_nh].nh = NULL; + + nh_group_rebalance(nhg); + + nexthop_put(nh); + + if (nlinfo) + nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo); +} + +static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + struct nh_grp_entry *nhge, *tmp; + + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { + struct nh_group *nhg; + + list_del(&nhge->nh_list); + nhg = rtnl_dereference(nhge->nh_parent->nh_grp); + remove_nh_grp_entry(nhge, nhg, nlinfo); + + /* if this group has no more entries then remove it */ + if (!nhg->num_nh) + remove_nexthop(net, nhge->nh_parent, nlinfo); + } +} + +static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) +{ + struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + int i, num_nh = nhg->num_nh; + + for (i = 0; i < num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + + if (WARN_ON(!nhge->nh)) + continue; + + list_del(&nhge->nh_list); + nexthop_put(nhge->nh); + nhge->nh = NULL; + nhg->num_nh--; + } +} + +static void __remove_nexthop(struct net *net, struct nexthop *nh, + struct nl_info *nlinfo) +{ + if (nh->is_group) { + remove_nexthop_group(nh, nlinfo); + } else { + struct nh_info *nhi; + + nhi = rtnl_dereference(nh->nh_info); + if (nhi->fib_nhc.nhc_dev) + hlist_del(&nhi->dev_hash); + + remove_nexthop_from_groups(net, nh, nlinfo); + } } static void remove_nexthop(struct net *net, struct nexthop *nh, - bool skip_fib, struct nl_info *nlinfo) + struct nl_info *nlinfo) { /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); @@ -272,7 +632,7 @@ static void remove_nexthop(struct net *net, struct nexthop *nh, if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); - __remove_nexthop(net, nh); + __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); @@ -353,7 +713,7 @@ static void nexthop_flush_dev(struct net_device *dev) if (nhi->fib_nhc.nhc_dev != dev) continue; - remove_nexthop(net, nhi->nh_parent, false, NULL); + remove_nexthop(net, nhi->nh_parent, NULL); } } @@ -366,11 +726,69 @@ static void flush_all_nexthops(struct net *net) while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); - remove_nexthop(net, nh, false, NULL); + remove_nexthop(net, nh, NULL); cond_resched(); } } +static struct nexthop *nexthop_create_group(struct net *net, + struct nh_config *cfg) +{ + struct nlattr *grps_attr = cfg->nh_grp; + struct nexthop_grp *entry = nla_data(grps_attr); + struct nh_group *nhg; + struct nexthop *nh; + int i; + + nh = nexthop_alloc(); + if (!nh) + return ERR_PTR(-ENOMEM); + + nh->is_group = 1; + + nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); + if (!nhg) { + kfree(nh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nhg->num_nh; ++i) { + struct nexthop *nhe; + struct nh_info *nhi; + + nhe = nexthop_find_by_id(net, entry[i].id); + if (!nexthop_get(nhe)) + goto out_no_nh; + + nhi = rtnl_dereference(nhe->nh_info); + if (nhi->family == AF_INET) + nhg->has_v4 = true; + + nhg->nh_entries[i].nh = nhe; + nhg->nh_entries[i].weight = entry[i].weight + 1; + list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); + nhg->nh_entries[i].nh_parent = nh; + } + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->mpath = 1; + nh_group_rebalance(nhg); + } + + rcu_assign_pointer(nh->nh_grp, nhg); + + return nh; + +out_no_nh: + for (; i >= 0; --i) + nexthop_put(nhg->nh_entries[i].nh); + + kfree(nhg); + kfree(nh); + + return ERR_PTR(-ENOENT); +} + static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) @@ -506,7 +924,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, } } - nh = nexthop_create(net, cfg, extack); + if (cfg->nh_grp) + nh = nexthop_create_group(net, cfg); + else + nh = nexthop_create(net, cfg, extack); + if (IS_ERR(nh)) return nh; @@ -517,7 +939,7 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, err = insert_nexthop(net, nh, cfg, extack); if (err) { - __remove_nexthop(net, nh); + __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } @@ -552,6 +974,10 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, case AF_INET: case AF_INET6: break; + case AF_UNSPEC: + if (tb[NHA_GROUP]) + break; + /* fallthrough */ default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; @@ -575,6 +1001,27 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); + if (tb[NHA_GROUP]) { + if (nhm->nh_family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid family for group"); + goto out; + } + cfg->nh_grp = tb[NHA_GROUP]; + + cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; + if (tb[NHA_GROUP_TYPE]) + cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); + + if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { + NL_SET_ERR_MSG(extack, "Invalid group type"); + goto out; + } + err = nh_check_attr_group(net, tb, extack); + + /* no other attributes should be set */ + goto out; + } + if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { @@ -752,7 +1199,7 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, if (!nh) return -ENOENT; - remove_nexthop(net, nh, false, &nlinfo); + remove_nexthop(net, nh, &nlinfo); return 0; } @@ -796,15 +1243,21 @@ errout_free: goto out; } -static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, - int master_idx, u8 family) +static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, + bool group_filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; + if (group_filter && !nh->is_group) + return true; + if (!dev_idx && !master_idx && !family) return false; + if (nh->is_group) + return true; + nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; @@ -827,8 +1280,8 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, return false; } -static int nh_valid_dump_req(const struct nlmsghdr *nlh, - int *dev_idx, int *master_idx, +static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, + int *master_idx, bool *group_filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; @@ -863,6 +1316,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, } *master_idx = idx; break; + case NHA_GROUPS: + *group_filter = true; + break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; @@ -885,11 +1341,13 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) int dev_filter_idx = 0, master_idx = 0; struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; + bool group_filter = false; struct rb_node *node; int idx = 0, s_idx; int err; - err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, cb); + err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, + &group_filter, cb); if (err < 0) return err; @@ -902,7 +1360,7 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) nh = rb_entry(node, struct nexthop, rb_node); if (nh_dump_filtered(nh, dev_filter_idx, master_idx, - nhm->nh_family)) + group_filter, nhm->nh_family)) goto cont; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, -- cgit v1.2.3