From c8d1da4000b0b95bf95d3e13b7450eec5428da1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 11 Nov 2018 11:43:59 -0800 Subject: netfilter: Replace call_rcu_bh(), rcu_barrier_bh(), and synchronize_rcu_bh() Now that call_rcu()'s callback is not invoked until after bh-disable regions of code have completed (in addition to explicitly marked RCU read-side critical sections), call_rcu() can be used in place of call_rcu_bh(). Similarly, rcu_barrier() can be used in place of rcu_barrier_bh() and synchronize_rcu() in place of synchronize_rcu_bh(). This commit therefore makes these changes. Signed-off-by: Paul E. McKenney Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2c8d313ae216..5b0c1ee6ae26 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -94,7 +94,7 @@ static inline void clusterip_config_put(struct clusterip_config *c) { if (refcount_dec_and_test(&c->refcount)) - call_rcu_bh(&c->rcu, clusterip_config_rcu_free); + call_rcu(&c->rcu, clusterip_config_rcu_free); } /* decrease the count of entries using/referencing this config. If last @@ -876,8 +876,8 @@ static void __exit clusterip_tg_exit(void) xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops); - /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ - rcu_barrier_bh(); + /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ + rcu_barrier(); } module_init(clusterip_tg_init); -- cgit v1.2.3 From 912da924a29fc6bd466b98a8791d6f7cf74caf61 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Dec 2018 16:01:27 +0100 Subject: netfilter: remove NF_NAT_RANGE_PROTO_RANDOM support Historically this was net_random() based, and was then converted to a hash based algorithm (private boot seed + hash of endpoint addresses) due to concerns of leaking net_random() bits. RANDOM_FULLY mode was added later to avoid problems with hash based mode (see commit 34ce324019e76, "netfilter: nf_nat: add full port randomization support" for details). Just make prandom_u32() the default search starting point and get rid of ->secure_port() altogether. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 78a67f961d86..4d755a6f73ad 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -69,12 +69,6 @@ static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); } -static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t, - __be16 dport) -{ - return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport); -} - static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_nat_l4proto *l4proto, @@ -162,7 +156,6 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .l3proto = NFPROTO_IPV4, .in_range = nf_nat_ipv4_in_range, - .secure_port = nf_nat_ipv4_secure_port, .manip_pkt = nf_nat_ipv4_manip_pkt, .csum_update = nf_nat_ipv4_csum_update, .csum_recalc = nf_nat_ipv4_csum_recalc, -- cgit v1.2.3 From 203f2e78200c27e42e9f7d063091f950bf5fe4a0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Dec 2018 16:01:29 +0100 Subject: netfilter: nat: remove l4proto->unique_tuple fold remaining users (icmp, icmpv6, gre) into nf_nat_l4proto_unique_tuple. The static-save of old incarnation of resolved key in gre and icmp is removed as well, just use the prandom based offset like the others. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_proto_gre.c | 44 ---------------------------------- net/ipv4/netfilter/nf_nat_proto_icmp.c | 27 --------------------- 2 files changed, 71 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 00fda6331ce5..a04ff7665e4c 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -37,49 +37,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); -/* generate unique tuple ... */ -static void -gre_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - static u_int16_t key; - __be16 *keyptr; - unsigned int min, i, range_size; - - /* If there is no master conntrack we are not PPTP, - do not change tuples */ - if (!ct->master) - return; - - if (maniptype == NF_NAT_MANIP_SRC) - keyptr = &tuple->src.u.gre.key; - else - keyptr = &tuple->dst.u.gre.key; - - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { - pr_debug("%p: NATing GRE PPTP\n", ct); - min = 1; - range_size = 0xffff; - } else { - min = ntohs(range->min_proto.gre.key); - range_size = ntohs(range->max_proto.gre.key) - min + 1; - } - - pr_debug("min = %u, range_size = %u\n", min, range_size); - - for (i = 0; ; ++key) { - *keyptr = htons(min + key % range_size); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) - return; - } - - pr_debug("%p: no NAT mapping\n", ct); - return; -} - /* manipulate a GRE packet according to maniptype */ static bool gre_manip_pkt(struct sk_buff *skb, @@ -124,7 +81,6 @@ static const struct nf_nat_l4proto gre = { .l4proto = IPPROTO_GRE, .manip_pkt = gre_manip_pkt, .in_range = nf_nat_l4proto_in_range, - .unique_tuple = gre_unique_tuple, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 6d7cf1d79baf..70d7fabdbb01 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -27,32 +27,6 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); } -static void -icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, - struct nf_conntrack_tuple *tuple, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, - const struct nf_conn *ct) -{ - static u_int16_t id; - unsigned int range_size; - unsigned int i; - - range_size = ntohs(range->max_proto.icmp.id) - - ntohs(range->min_proto.icmp.id) + 1; - /* If no range specified... */ - if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) - range_size = 0xFFFF; - - for (i = 0; ; ++id) { - tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + - (id % range_size)); - if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) - return; - } - return; -} - static bool icmp_manip_pkt(struct sk_buff *skb, const struct nf_nat_l3proto *l3proto, @@ -76,7 +50,6 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .l4proto = IPPROTO_ICMP, .manip_pkt = icmp_manip_pkt, .in_range = icmp_in_range, - .unique_tuple = icmp_unique_tuple, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif -- cgit v1.2.3 From 40e786bd296d5517b1f6c4bcc9ed13e502606ced Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Dec 2018 16:01:30 +0100 Subject: netfilter: nat: fold in_range indirection into caller No need for indirections here, we only support ipv4 and ipv6 and the called functions are very small. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 4d755a6f73ad..00904e605e85 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -62,13 +62,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, } #endif /* CONFIG_XFRM */ -static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range2 *range) -{ - return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && - ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); -} - static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_nat_l4proto *l4proto, @@ -155,7 +148,6 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = { .l3proto = NFPROTO_IPV4, - .in_range = nf_nat_ipv4_in_range, .manip_pkt = nf_nat_ipv4_manip_pkt, .csum_update = nf_nat_ipv4_csum_update, .csum_recalc = nf_nat_ipv4_csum_recalc, -- cgit v1.2.3 From fe2d0020994cd9d4f451e3024109319af287413b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Dec 2018 16:01:31 +0100 Subject: netfilter: nat: remove l4proto->in_range With exception of icmp, all of the l4 nat protocols set this to nf_nat_l4proto_in_range. Get rid of this and just check the l4proto in the caller. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_proto_gre.c | 1 - net/ipv4/netfilter/nf_nat_proto_icmp.c | 11 ----------- 2 files changed, 12 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index a04ff7665e4c..94b735dd570d 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -80,7 +80,6 @@ gre_manip_pkt(struct sk_buff *skb, static const struct nf_nat_l4proto gre = { .l4proto = IPPROTO_GRE, .manip_pkt = gre_manip_pkt, - .in_range = nf_nat_l4proto_in_range, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 70d7fabdbb01..f532e2215970 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -17,16 +17,6 @@ #include #include -static bool -icmp_in_range(const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype, - const union nf_conntrack_man_proto *min, - const union nf_conntrack_man_proto *max) -{ - return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && - ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); -} - static bool icmp_manip_pkt(struct sk_buff *skb, const struct nf_nat_l3proto *l3proto, @@ -49,7 +39,6 @@ icmp_manip_pkt(struct sk_buff *skb, const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .l4proto = IPPROTO_ICMP, .manip_pkt = icmp_manip_pkt, - .in_range = icmp_in_range, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, #endif -- cgit v1.2.3 From 76b90019e03d866eab85cb57c2a6416ab94284dc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Dec 2018 16:01:32 +0100 Subject: netfilter: nat: remove l4proto->nlattr_to_range all protocols did set this to nf_nat_l4proto_nlattr_to_range, so just call it directly. The important difference is that we'll now also call it for protocols that we don't support (i.e., nf_nat_proto_unknown did not provide .nlattr_to_range). However, there should be no harm, even icmp provided this callback. If we don't implement a specific l4nat for this, nothing would make use of this information, so adding a big switch/case construct listing all supported l4protocols seems a bit pointless. This change leaves a single function pointer in the l4proto struct. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_proto_gre.c | 3 --- net/ipv4/netfilter/nf_nat_proto_icmp.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 94b735dd570d..86af36651edd 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -80,9 +80,6 @@ gre_manip_pkt(struct sk_buff *skb, static const struct nf_nat_l4proto gre = { .l4proto = IPPROTO_GRE, .manip_pkt = gre_manip_pkt, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif }; static int __init nf_nat_proto_gre_init(void) diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index f532e2215970..4fecb3f2c55a 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -39,7 +39,4 @@ icmp_manip_pkt(struct sk_buff *skb, const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .l4proto = IPPROTO_ICMP, .manip_pkt = icmp_manip_pkt, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, -#endif }; -- cgit v1.2.3 From faec18dbb0405c7d4dda025054511dc3a6696918 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Dec 2018 16:01:33 +0100 Subject: netfilter: nat: remove l4proto->manip_pkt This removes the last l4proto indirection, the two callers, the l3proto packet mangling helpers for ipv4 and ipv6, now call the nf_nat_l4proto_manip_pkt() helper. nf_nat_proto_{dccp,tcp,sctp,gre,icmp,icmpv6} are left behind, even though they contain no functionality anymore to not clutter this patch. Next patch will remove the empty files and the nf_nat_l4proto struct. nf_nat_proto_udp.c is renamed to nf_nat_proto.c, as it now contains the other nat manip functionality as well, not just udp and udplite. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 5 ---- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 4 ++-- net/ipv4/netfilter/nf_nat_pptp.c | 2 -- net/ipv4/netfilter/nf_nat_proto_gre.c | 41 -------------------------------- net/ipv4/netfilter/nf_nat_proto_icmp.c | 21 ---------------- 5 files changed, 2 insertions(+), 71 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 184bf2e0a1ed..80f72cc5ca8d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -156,15 +156,10 @@ config NF_NAT_SNMP_BASIC To compile it as a module, choose M here. If unsure, say N. -config NF_NAT_PROTO_GRE - tristate - depends on NF_CT_PROTO_GRE - config NF_NAT_PPTP tristate depends on NF_CONNTRACK default NF_CONNTRACK_PPTP - select NF_NAT_PROTO_GRE config NF_NAT_H323 tristate diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 00904e605e85..65fdb7a74621 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -77,8 +77,8 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; - if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff, - target, maniptype)) + if (!nf_nat_l4proto_manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, + hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 5d259a12e25f..68b4d450391b 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -299,8 +299,6 @@ pptp_inbound_pkt(struct sk_buff *skb, static int __init nf_nat_helper_pptp_init(void) { - nf_nat_need_gre(); - BUG_ON(nf_nat_pptp_hook_outbound != NULL); RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 86af36651edd..25849295d537 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -37,49 +37,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); -/* manipulate a GRE packet according to maniptype */ -static bool -gre_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - const struct gre_base_hdr *greh; - struct pptp_gre_header *pgreh; - - /* pgreh includes two optional 32bit fields which are not required - * to be there. That's where the magic '8' comes from */ - if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) - return false; - - greh = (void *)skb->data + hdroff; - pgreh = (struct pptp_gre_header *)greh; - - /* we only have destination manip of a packet, since 'source key' - * is not present in the packet itself */ - if (maniptype != NF_NAT_MANIP_DST) - return true; - - switch (greh->flags & GRE_VERSION) { - case GRE_VERSION_0: - /* We do not currently NAT any GREv0 packets. - * Try to behave like "nf_nat_proto_unknown" */ - break; - case GRE_VERSION_1: - pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); - pgreh->call_id = tuple->dst.u.gre.key; - break; - default: - pr_debug("can't nat unknown GRE version\n"); - return false; - } - return true; -} - static const struct nf_nat_l4proto gre = { .l4proto = IPPROTO_GRE, - .manip_pkt = gre_manip_pkt, }; static int __init nf_nat_proto_gre_init(void) diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 4fecb3f2c55a..c2b7fd1a997b 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -10,33 +10,12 @@ #include #include #include -#include #include #include #include #include -static bool -icmp_manip_pkt(struct sk_buff *skb, - const struct nf_nat_l3proto *l3proto, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ - struct icmphdr *hdr; - - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) - return false; - - hdr = (struct icmphdr *)(skb->data + hdroff); - inet_proto_csum_replace2(&hdr->checksum, skb, - hdr->un.echo.id, tuple->src.u.icmp.id, false); - hdr->un.echo.id = tuple->src.u.icmp.id; - return true; -} - const struct nf_nat_l4proto nf_nat_l4proto_icmp = { .l4proto = IPPROTO_ICMP, - .manip_pkt = icmp_manip_pkt, }; -- cgit v1.2.3 From 5cbabeec1eb758233b35683123de446a57852932 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Dec 2018 16:01:34 +0100 Subject: netfilter: nat: remove nf_nat_l4proto struct This removes the (now empty) nf_nat_l4proto struct, all its instances and all the no longer needed runtime (un)register functionality. nf_nat_need_gre() can be axed as well: the module that calls it (to load the no-longer-existing nat_gre module) also calls other nat core functions. GRE nat is now always available if kernel is built with it. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Makefile | 5 +-- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 24 ++----------- net/ipv4/netfilter/nf_nat_proto_gre.c | 61 -------------------------------- net/ipv4/netfilter/nf_nat_proto_icmp.c | 21 ----------- 4 files changed, 4 insertions(+), 107 deletions(-) delete mode 100644 net/ipv4/netfilter/nf_nat_proto_gre.c delete mode 100644 net/ipv4/netfilter/nf_nat_proto_icmp.c (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 367993adf4d3..fd7122e0e2c9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,7 +3,7 @@ # Makefile for the netfilter modules on top of IPv4. # -nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o +nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o @@ -28,9 +28,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -# NAT protocols (nf_nat) -obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o - obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 65fdb7a74621..2687db015b6f 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -64,7 +64,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, - const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { @@ -171,7 +170,6 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); - const struct nf_nat_l4proto *l4proto; struct nf_conntrack_tuple target; unsigned long statusbit; @@ -202,9 +200,8 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, if (!(ct->status & statusbit)) return 1; - l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol); if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), - l4proto, &ct->tuplehash[!dir].tuple, !manip)) + &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { @@ -218,8 +215,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0); - if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip)) + if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; @@ -376,26 +372,12 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn); static int __init nf_nat_l3proto_ipv4_init(void) { - int err; - - err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp); - if (err < 0) - goto err1; - err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); - if (err < 0) - goto err2; - return err; - -err2: - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); -err1: - return err; + return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4); } static void __exit nf_nat_l3proto_ipv4_exit(void) { nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4); - nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp); } MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c deleted file mode 100644 index 25849295d537..000000000000 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * nf_nat_proto_gre.c - * - * NAT protocol helper module for GRE. - * - * GRE is a generic encapsulation protocol, which is generally not very - * suited for NAT, as it has no protocol-specific part as port numbers. - * - * It has an optional key field, which may help us distinguishing two - * connections between the same two hosts. - * - * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 - * - * PPTP is built on top of a modified version of GRE, and has a mandatory - * field called "CallID", which serves us for the same purpose as the key - * field in plain GRE. - * - * Documentation about PPTP can be found in RFC 2637 - * - * (C) 2000-2005 by Harald Welte - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - * - * (C) 2006-2012 Patrick McHardy - * - */ - -#include -#include -#include - -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); - -static const struct nf_nat_l4proto gre = { - .l4proto = IPPROTO_GRE, -}; - -static int __init nf_nat_proto_gre_init(void) -{ - return nf_nat_l4proto_register(NFPROTO_IPV4, &gre); -} - -static void __exit nf_nat_proto_gre_fini(void) -{ - nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre); -} - -module_init(nf_nat_proto_gre_init); -module_exit(nf_nat_proto_gre_fini); - -void nf_nat_need_gre(void) -{ - return; -} -EXPORT_SYMBOL_GPL(nf_nat_need_gre); diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c deleted file mode 100644 index c2b7fd1a997b..000000000000 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ /dev/null @@ -1,21 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -const struct nf_nat_l4proto nf_nat_l4proto_icmp = { - .l4proto = IPPROTO_ICMP, -}; -- cgit v1.2.3 From 5a86d68bcf02f2d1e9a5897dd482079fd5f75e7f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 5 Nov 2018 18:22:44 +0900 Subject: netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 155 +++++++++++++++++++++---------------- 1 file changed, 87 insertions(+), 68 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 5b0c1ee6ae26..091d163d215b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -57,17 +57,14 @@ struct clusterip_config { enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ struct rcu_head rcu; - + struct net *net; /* netns for pernet list */ char ifname[IFNAMSIZ]; /* device ifname */ - struct notifier_block notifier; /* refresh c->ifindex in it */ }; #ifdef CONFIG_PROC_FS static const struct file_operations clusterip_proc_fops; #endif -static unsigned int clusterip_net_id __read_mostly; - struct clusterip_net { struct list_head configs; /* lock protects the configs list */ @@ -78,16 +75,30 @@ struct clusterip_net { #endif }; +static unsigned int clusterip_net_id __read_mostly; +static inline struct clusterip_net *clusterip_pernet(struct net *net) +{ + return net_generic(net, clusterip_net_id); +} + static inline void clusterip_config_get(struct clusterip_config *c) { refcount_inc(&c->refcount); } - static void clusterip_config_rcu_free(struct rcu_head *head) { - kfree(container_of(head, struct clusterip_config, rcu)); + struct clusterip_config *config; + struct net_device *dev; + + config = container_of(head, struct clusterip_config, rcu); + dev = dev_get_by_name(config->net, config->ifname); + if (dev) { + dev_mc_del(dev, config->clustermac); + dev_put(dev); + } + kfree(config); } static inline void @@ -101,9 +112,9 @@ clusterip_config_put(struct clusterip_config *c) * entry(rule) is removed, remove the config from lists, but don't free it * yet, since proc-files could still be holding references */ static inline void -clusterip_config_entry_put(struct net *net, struct clusterip_config *c) +clusterip_config_entry_put(struct clusterip_config *c) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(c->net); local_bh_disable(); if (refcount_dec_and_lock(&c->entries, &cn->lock)) { @@ -118,8 +129,6 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c) spin_unlock(&cn->lock); local_bh_enable(); - unregister_netdevice_notifier(&c->notifier); - return; } local_bh_enable(); @@ -129,7 +138,7 @@ static struct clusterip_config * __clusterip_config_find(struct net *net, __be32 clusterip) { struct clusterip_config *c; - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); list_for_each_entry_rcu(c, &cn->configs, list) { if (c->clusterip == clusterip) @@ -181,32 +190,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; - c = container_of(this, struct clusterip_config, notifier); - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } else if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; + spin_lock_bh(&cn->lock); + list_for_each_entry_rcu(c, &cn->configs, list) { + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } else if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; } - break; } + spin_unlock_bh(&cn->lock); return NOTIFY_DONE; } @@ -215,30 +229,44 @@ static struct clusterip_config * clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, __be32 ip, const char *iniface) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; + struct net_device *dev; int err; + if (iniface[0] == '\0') { + pr_info("Please specify an interface name\n"); + return ERR_PTR(-EINVAL); + } + c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return ERR_PTR(-ENOMEM); - strcpy(c->ifname, iniface); - c->ifindex = -1; - c->clusterip = ip; + dev = dev_get_by_name(net, iniface); + if (!dev) { + pr_info("no such interface %s\n", iniface); + kfree(c); + return ERR_PTR(-ENOENT); + } + c->ifindex = dev->ifindex; + strcpy(c->ifname, dev->name); memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); + dev_mc_add(dev, c->clustermac); + dev_put(dev); + + c->clusterip = ip; c->num_total_nodes = i->num_total_nodes; clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; + c->net = net; refcount_set(&c->refcount, 1); spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { - spin_unlock_bh(&cn->lock); - kfree(c); - - return ERR_PTR(-EBUSY); + err = -EBUSY; + goto out_config_put; } list_add_rcu(&c->list, &cn->configs); @@ -260,22 +288,17 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, } #endif - c->notifier.notifier_call = clusterip_netdev_event; - err = register_netdevice_notifier(&c->notifier); - if (!err) { - refcount_set(&c->entries, 1); - return c; - } + refcount_set(&c->entries, 1); + return c; #ifdef CONFIG_PROC_FS - proc_remove(c->pde); err: #endif spin_lock_bh(&cn->lock); list_del_rcu(&c->list); +out_config_put: spin_unlock_bh(&cn->lock); clusterip_config_put(c); - return ERR_PTR(err); } @@ -475,21 +498,6 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) &e->ip.dst.s_addr); return -EINVAL; } else { - struct net_device *dev; - - if (e->ip.iniface[0] == '\0') { - pr_info("Please specify an interface name\n"); - return -EINVAL; - } - - dev = dev_get_by_name(par->net, e->ip.iniface); - if (!dev) { - pr_info("no such interface %s\n", - e->ip.iniface); - return -ENOENT; - } - dev_put(dev); - config = clusterip_config_init(par->net, cipinfo, e->ip.dst.s_addr, e->ip.iniface); @@ -502,7 +510,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); - clusterip_config_entry_put(par->net, config); + clusterip_config_entry_put(config); clusterip_config_put(config); return ret; } @@ -524,7 +532,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ - clusterip_config_entry_put(par->net, cipinfo->config); + clusterip_config_entry_put(cipinfo->config); clusterip_config_put(cipinfo->config); @@ -806,7 +814,7 @@ static const struct file_operations clusterip_proc_fops = { static int clusterip_net_init(struct net *net) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); int ret; INIT_LIST_HEAD(&cn->configs); @@ -831,7 +839,7 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); #ifdef CONFIG_PROC_FS proc_remove(cn->procdir); cn->procdir = NULL; @@ -847,6 +855,10 @@ static struct pernet_operations clusterip_net_ops = { .size = sizeof(struct clusterip_net), }; +struct notifier_block cip_netdev_notifier = { + .notifier_call = clusterip_netdev_event +}; + static int __init clusterip_tg_init(void) { int ret; @@ -859,11 +871,17 @@ static int __init clusterip_tg_init(void) if (ret < 0) goto cleanup_subsys; + ret = register_netdevice_notifier(&cip_netdev_notifier); + if (ret < 0) + goto unregister_target; + pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); return 0; +unregister_target: + xt_unregister_target(&clusterip_tg_reg); cleanup_subsys: unregister_pernet_subsys(&clusterip_net_ops); return ret; @@ -873,6 +891,7 @@ static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); + unregister_netdevice_notifier(&cip_netdev_notifier); xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops); -- cgit v1.2.3 From b12f7bad5ad3724d19754390a3e80928525c0769 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 5 Nov 2018 18:22:55 +0900 Subject: netfilter: ipt_CLUSTERIP: remove wrong WARN_ON_ONCE in netns exit routine When network namespace is destroyed, both clusterip_tg_destroy() and clusterip_net_exit() are called. and clusterip_net_exit() is called before clusterip_tg_destroy(). Hence cleanup check code in clusterip_net_exit() doesn't make sense. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.184508] WARNING: CPU: 1 PID: 87 at net/ipv4/netfilter/ipt_CLUSTERIP.c:840 clusterip_net_exit+0x319/0x380 [ipt_CLUSTERIP] [ 341.184850] Modules linked in: ipt_CLUSTERIP nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 xt_tcpudp iptable_filter bpfilter ip_tables x_tables [ 341.184850] CPU: 1 PID: 87 Comm: kworker/u4:2 Not tainted 4.19.0-rc5+ #16 [ 341.227509] Workqueue: netns cleanup_net [ 341.227509] RIP: 0010:clusterip_net_exit+0x319/0x380 [ipt_CLUSTERIP] [ 341.227509] Code: 0f 85 7f fe ff ff 48 c7 c2 80 64 2c c0 be a8 02 00 00 48 c7 c7 a0 63 2c c0 c6 05 18 6e 00 00 01 e8 bc 38 ff f5 e9 5b fe ff ff <0f> 0b e9 33 ff ff ff e8 4b 90 50 f6 e9 2d fe ff ff 48 89 df e8 de [ 341.227509] RSP: 0018:ffff88011086f408 EFLAGS: 00010202 [ 341.227509] RAX: dffffc0000000000 RBX: 1ffff1002210de85 RCX: 0000000000000000 [ 341.227509] RDX: 1ffff1002210de85 RSI: ffff880110813be8 RDI: ffffed002210de58 [ 341.227509] RBP: ffff88011086f4d0 R08: 0000000000000000 R09: 0000000000000000 [ 341.227509] R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff1002210de81 [ 341.227509] R13: ffff880110625a48 R14: ffff880114cec8c8 R15: 0000000000000014 [ 341.227509] FS: 0000000000000000(0000) GS:ffff880116600000(0000) knlGS:0000000000000000 [ 341.227509] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 341.227509] CR2: 00007f11fd38e000 CR3: 000000013ca16000 CR4: 00000000001006e0 [ 341.227509] Call Trace: [ 341.227509] ? __clusterip_config_find+0x460/0x460 [ipt_CLUSTERIP] [ 341.227509] ? default_device_exit+0x1ca/0x270 [ 341.227509] ? remove_proc_entry+0x1cd/0x390 [ 341.227509] ? dev_change_net_namespace+0xd00/0xd00 [ 341.227509] ? __init_waitqueue_head+0x130/0x130 [ 341.227509] ops_exit_list.isra.10+0x94/0x140 [ 341.227509] cleanup_net+0x45b/0x900 [ ... ] Fixes: 613d0776d3fe ("netfilter: exit_net cleanup check added") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 091d163d215b..ddf9a878932a 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -845,7 +845,6 @@ static void clusterip_net_exit(struct net *net) cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); - WARN_ON_ONCE(!list_empty(&cn->configs)); } static struct pernet_operations clusterip_net_ops = { -- cgit v1.2.3 From 2a61d8b883bbad26b06d2e6cc3777a697e78830d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 5 Nov 2018 18:23:13 +0900 Subject: netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index ddf9a878932a..d4d549a46c04 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -56,7 +56,7 @@ struct clusterip_config { #endif enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ - struct rcu_head rcu; + struct rcu_head rcu; /* for call_rcu_bh */ struct net *net; /* netns for pernet list */ char ifname[IFNAMSIZ]; /* device ifname */ }; @@ -72,6 +72,8 @@ struct clusterip_net { #ifdef CONFIG_PROC_FS struct proc_dir_entry *procdir; + /* mutex protects the config->pde*/ + struct mutex mutex; #endif }; @@ -118,17 +120,18 @@ clusterip_config_entry_put(struct clusterip_config *c) local_bh_disable(); if (refcount_dec_and_lock(&c->entries, &cn->lock)) { + list_del_rcu(&c->list); + spin_unlock(&cn->lock); + local_bh_enable(); /* In case anyone still accesses the file, the open/close * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. */ #ifdef CONFIG_PROC_FS + mutex_lock(&cn->mutex); if (cn->procdir) proc_remove(c->pde); + mutex_unlock(&cn->mutex); #endif - list_del_rcu(&c->list); - spin_unlock(&cn->lock); - local_bh_enable(); - return; } local_bh_enable(); @@ -278,9 +281,11 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); + mutex_lock(&cn->mutex); c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_fops, c); + mutex_unlock(&cn->mutex); if (!c->pde) { err = -ENOMEM; goto err; @@ -832,6 +837,7 @@ static int clusterip_net_init(struct net *net) pr_err("Unable to proc dir entry\n"); return -ENOMEM; } + mutex_init(&cn->mutex); #endif /* CONFIG_PROC_FS */ return 0; @@ -840,9 +846,12 @@ static int clusterip_net_init(struct net *net) static void clusterip_net_exit(struct net *net) { struct clusterip_net *cn = clusterip_pernet(net); + #ifdef CONFIG_PROC_FS + mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; + mutex_unlock(&cn->mutex); #endif nf_unregister_net_hook(net, &cip_arp_ops); } -- cgit v1.2.3 From 06aa151ad1fc74a49b45336672515774a678d78d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 5 Nov 2018 18:23:25 +0900 Subject: netfilter: ipt_CLUSTERIP: check MAC address when duplicate config is set If same destination IP address config is already existing, that config is just used. MAC address also should be same. However, there is no MAC address checking routine. So that MAC address checking routine is added. test commands: %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 2 --local-node 1 After this patch, above commands are disallowed. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/netfilter') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index d4d549a46c04..b61977db9b7f 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -509,7 +509,8 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (IS_ERR(config)) return PTR_ERR(config); } - } + } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) + return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { -- cgit v1.2.3