From e3245a7b7b34bd2e97f744fd79463add6e9d41f4 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 10 Aug 2021 15:59:20 +0300 Subject: netfilter: nft_ct: protect nft_ct_pcpu_template_refcnt with mutex Syzbot hit use-after-free in nf_tables_dump_sets. The problem was in missing lock protection for nft_ct_pcpu_template_refcnt. Before commit f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions") all transactions were serialized by global mutex, but then global mutex was changed to local per netnamespace commit_mutex. This change causes use-after-free bug, when 2 netnamespaces concurently changing nft_ct_pcpu_template_refcnt without proper locking. Fix it by adding nft_ct_pcpu_mutex and protect all nft_ct_pcpu_template_refcnt changes with it. Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions") Reported-and-tested-by: syzbot+649e339fa6658ee623d3@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 337e22d8b40b..99b1de14ff7e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -41,6 +41,7 @@ struct nft_ct_helper_obj { #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; +static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, @@ -525,8 +526,10 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: + mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); + mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: @@ -564,9 +567,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: - if (!nft_ct_tmpl_alloc_pcpu()) + mutex_lock(&nft_ct_pcpu_mutex); + if (!nft_ct_tmpl_alloc_pcpu()) { + mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; + } nft_ct_pcpu_template_refcnt++; + mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif -- cgit v1.2.3 From d532bcd0b2699d84d71a0c71d37157ac6eb3be25 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Aug 2021 15:54:19 +0200 Subject: netfilter: conntrack: sanitize table size default settings conntrack has two distinct table size settings: nf_conntrack_max and nf_conntrack_buckets. The former limits how many conntrack objects are allowed to exist in each namespace. The second sets the size of the hashtable. As all entries are inserted twice (once for original direction, once for reply), there should be at least twice as many buckets in the table than the maximum number of conntrack objects that can exist at the same time. Change the default multiplier to 1 and increase the chosen bucket sizes. This results in the same nf_conntrack_max settings as before but reduces the average bucket list length. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d31dbccbe7bd..cdd8a1dc2275 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2594,26 +2594,24 @@ int nf_conntrack_init_start(void) spin_lock_init(&nf_conntrack_locks[i]); if (!nf_conntrack_htable_size) { - /* Idea from tcp.c: use 1/16384 of memory. - * On i386: 32MB machine has 512 buckets. - * >= 1GB machines have 16384 buckets. - * >= 4GB machines have 65536 buckets. - */ nf_conntrack_htable_size = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) - nf_conntrack_htable_size = 65536; + if (BITS_PER_LONG >= 64 && + nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + nf_conntrack_htable_size = 262144; else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 16384; - if (nf_conntrack_htable_size < 32) - nf_conntrack_htable_size = 32; - - /* Use a max. factor of four by default to get the same max as - * with the old struct list_heads. When a table size is given - * we use the old value of 8 to avoid reducing the max. - * entries. */ - max_factor = 4; + nf_conntrack_htable_size = 65536; + + if (nf_conntrack_htable_size < 1024) + nf_conntrack_htable_size = 1024; + /* Use a max. factor of one by default to keep the average + * hash chain length at 2 entries. Each entry has to be added + * twice (once for original direction, once for reply). + * When a table size is given we use the old value of 8 to + * avoid implicit reduction of the max entries setting. + */ + max_factor = 1; } nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); -- cgit v1.2.3 From dd6d2910c5e071a8683827df1a89e527aa5145ab Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Aug 2021 15:54:20 +0200 Subject: netfilter: conntrack: switch to siphash Replace jhash in conntrack and nat core with siphash. While at it, use the netns mix value as part of the input key rather than abuse the seed value. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 31 ++++++++++++++++++------------- net/netfilter/nf_conntrack_expect.c | 25 ++++++++++++++++++------- net/netfilter/nf_nat_core.c | 18 ++++++++++++++---- 3 files changed, 50 insertions(+), 24 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cdd8a1dc2275..da2650f872e1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -184,25 +183,31 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_spinlock_t nf_conntrack_generation __read_mostly; -static unsigned int nf_conntrack_hash_rnd __read_mostly; +static siphash_key_t nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, const struct net *net) { - unsigned int n; - u32 seed; + struct { + struct nf_conntrack_man src; + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u16 proto; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - /* The direction must be ignored, so we hash everything up to the - * destination ports (which is a multiple of 4) and treat the last - * three bytes manually. - */ - seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); - n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, seed ^ - (((__force __u16)tuple->dst.u.all << 16) | - tuple->dst.protonum)); + memset(&combined, 0, sizeof(combined)); + + /* The direction must be ignored, so handle usable members manually. */ + combined.src = tuple->src; + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(net); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.proto = tuple->dst.protonum; + + return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); } static u32 scale_hash(u32 hash) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 1e851bc2e61a..f562eeef4234 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; -static unsigned int nf_ct_expect_hashrnd __read_mostly; +static siphash_key_t nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -81,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t) static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash, seed; + struct { + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u8 l3num; + u8 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; + u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); - seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); + memset(&combined, 0, sizeof(combined)); - hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), - (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ seed); + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(n); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.l3num = tuple->src.l3num; + combined.protonum = tuple->dst.protonum; + + hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..7008961f5cb0 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -34,7 +34,7 @@ static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +static siphash_key_t nf_nat_hash_rnd __read_mostly; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; @@ -153,12 +153,22 @@ static unsigned int hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + struct { + struct nf_conntrack_man src; + u32 net_mix; + u32 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + memset(&combined, 0, sizeof(combined)); + /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); + combined.src = tuple->src; + combined.net_mix = net_hash_mix(n); + combined.protonum = tuple->dst.protonum; + + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } -- cgit v1.2.3 From d7e7747ac5c2496c98291944c6066adaa9f3b975 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 26 Aug 2021 15:54:22 +0200 Subject: netfilter: refuse insertion if chain has grown too large Also add a stat counter for this that gets exported both via old /proc interface and ctnetlink. Assuming the old default size of 16536 buckets and max hash occupancy of 64k, this results in 128k insertions (origin+reply), so ~8 entries per chain on average. The revised settings in this series will result in about two entries per bucket on average. This allows a hard-limit ceiling of 64. This is not tunable at the moment, but its possible to either increase nf_conntrack_buckets or decrease nf_conntrack_max to reduce average lengths. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 42 +++++++++++++++++++++++++++------ net/netfilter/nf_conntrack_netlink.c | 4 +++- net/netfilter/nf_conntrack_standalone.c | 4 ++-- 3 files changed, 40 insertions(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da2650f872e1..94e18fb9690d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -77,6 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all; #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) +#define MAX_CHAINLEN 64u + static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -840,7 +842,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int chainlen = 0; unsigned int sequence; + int err = -EEXIST; zone = nf_ct_zone(ct); @@ -854,15 +858,24 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } + + chainlen = 0; + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } smp_wmb(); /* The caller holds a reference to this object */ @@ -872,11 +885,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); return 0; - +chaintoolong: + NF_CT_STAT_INC(net, chaintoolong); + err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); - return -EEXIST; + return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); @@ -1089,6 +1104,7 @@ int __nf_conntrack_confirm(struct sk_buff *skb) { const struct nf_conntrack_zone *zone; + unsigned int chainlen = 0, sequence; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -1096,7 +1112,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - unsigned int sequence; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); @@ -1156,15 +1171,28 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + chainlen = 0; + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) { +chaintoolong: + nf_ct_add_to_dying_list(ct); + NF_CT_STAT_INC(net, chaintoolong); + NF_CT_STAT_INC(net, insert_failed); + ret = NF_DROP; + goto dying; + } + } /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e81af33b233b..3f081ae08266 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2484,7 +2484,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, htonl(st->search_restart)) || nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, - htonl(st->clash_resolve))) + htonl(st->clash_resolve)) || + nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, + htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e84b499b7bfa..f94ebd5194b5 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -429,7 +429,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { - seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } @@ -444,7 +444,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->invalid, 0, 0, - 0, + st->chaintoolong, st->insert, st->insert_failed, st->drop, -- cgit v1.2.3