From a70e483460d58e64504dd679fd127e9549385c86 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Nov 2022 12:21:58 +0100 Subject: netfilter: conntrack: merge ipv4+ipv6 confirm functions No need to have distinct functions. After merge, ipv6 can avoid protooff computation if the connection neither needs sequence adjustment nor helper invocation -- this is the normal case. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index b2b9de70d9f4..71d1269fe4d4 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -71,8 +71,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo); +unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); -- cgit v1.2.3 From 5df7d714d8cbcce7642936cc0f6532f0c4c3d197 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:45:59 +0200 Subject: ipvs: add rcu protection to stats In preparation to using RCU locking for the list with estimators, make sure the struct ip_vs_stats are released after RCU grace period by using RCU callbacks. This affects ipvs->tot_stats where we can not use RCU callbacks for ipvs, so we use allocated struct ip_vs_stats_rcu. For services and dests we force RCU callbacks for all cases. Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff1804a0c469..bd8ae137e43b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -405,6 +405,11 @@ struct ip_vs_stats { struct ip_vs_kstats kstats0; /* reset values */ }; +struct ip_vs_stats_rcu { + struct ip_vs_stats s; + struct rcu_head rcu_head; +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -688,6 +693,7 @@ struct ip_vs_dest { union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ + struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; @@ -869,7 +875,7 @@ struct netns_ipvs { atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ - struct ip_vs_stats tot_stats; /* Statistics & est. */ + struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ -- cgit v1.2.3 From de39afb3d811ba2c028de8662adafedb4899327b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:00 +0200 Subject: ipvs: use common functions for stats allocation Move alloc_percpu/free_percpu logic in new functions Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index bd8ae137e43b..e5582c01a4a3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -410,6 +410,11 @@ struct ip_vs_stats_rcu { struct rcu_head rcu_head; }; +int ip_vs_stats_init_alloc(struct ip_vs_stats *s); +struct ip_vs_stats *ip_vs_stats_alloc(void); +void ip_vs_stats_release(struct ip_vs_stats *stats); +void ip_vs_stats_free(struct ip_vs_stats *stats); + struct dst_entry; struct iphdr; struct ip_vs_conn; -- cgit v1.2.3 From 1dbd8d9a82e3f26b9d063292d47ece673f48fce2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:01 +0200 Subject: ipvs: use u64_stats_t for the per-cpu counters Use the provided u64_stats_t type to avoid load/store tearing. Fixes: 316580b69d0a ("u64_stats: provide u64_stats_t type") Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e5582c01a4a3..a4d44138c2a8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -351,11 +351,11 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u64 conns; /* connections scheduled */ - __u64 inpkts; /* incoming packets */ - __u64 outpkts; /* outgoing packets */ - __u64 inbytes; /* incoming bytes */ - __u64 outbytes; /* outgoing bytes */ + u64_stats_t conns; /* connections scheduled */ + u64_stats_t inpkts; /* incoming packets */ + u64_stats_t outpkts; /* outgoing packets */ + u64_stats_t inbytes; /* incoming bytes */ + u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { -- cgit v1.2.3 From 705dd34440812735ece298eb5bc153fde9544d42 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:02 +0200 Subject: ipvs: use kthreads for stats estimation Estimating all entries in single list in timer context by single CPU causes large latency with multiple IPVS rules as reported in [1], [2], [3]. Spread the estimator structures in multiple chains and use kthread(s) for the estimation. The chains are processed in multiple (50) timer ticks to ensure the 2-second interval between estimations with some accuracy. Every chain is processed under RCU lock. Every kthread works over its own data structure and all such contexts are attached to array. The contexts can be preserved while the kthread tasks are stopped or restarted. When estimators are removed, unused kthread contexts are released and the slots in array are left empty. First kthread determines parameters to use, eg. maximum number of estimators to process per kthread based on chain's length (chain_max), allowing sub-100us cond_resched rate and estimation taking up to 1/8 of the CPU capacity to avoid any problems if chain_max is not correctly calculated. chain_max is calculated taking into account factors such as CPU speed and memory/cache speed where the cache_factor (4) is selected from real tests with current generation of CPU/NUMA configurations to correct the difference in CPU usage between cached (during calc phase) and non-cached (working) state of the estimated per-cpu data. First kthread also plays the role of distributor of added estimators to all kthreads, keeping low the time to add estimators. The optimization is based on the fact that newly added estimator should be estimated after 2 seconds, so we have the time to offload the adding to chain from controlling process to kthread 0. The allocated kthread context may grow from 1 to 50 allocated structures for timer ticks which saves memory for setups with small number of estimators. We also add delayed work est_reload_work that will make sure the kthread tasks are properly started/stopped. ip_vs_start_estimator() is changed to report errors which allows to safely store the estimators in allocated structures. Many thanks to Jiri Wiesner for his valuable comments and for spending a lot of time reviewing and testing the changes on different platforms with 48-256 CPUs and 1-8 NUMA nodes under different cpufreq governors. [1] Report from Yunhong Jiang: https://lore.kernel.org/netdev/D25792C1-1B89-45DE-9F10-EC350DC04ADC@gmail.com/ [2] https://marc.info/?l=linux-virtual-server&m=159679809118027&w=2 [3] Report from Dust: https://archive.linuxvirtualserver.org/html/lvs-devel/2020-12/msg00000.html Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a4d44138c2a8..04960dc6228f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -42,6 +42,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; +extern struct mutex __ip_vs_mutex; + struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -365,7 +367,7 @@ struct ip_vs_cpu_stats { /* IPVS statistics objects */ struct ip_vs_estimator { - struct list_head list; + struct hlist_node list; u64 last_inbytes; u64 last_outbytes; @@ -378,6 +380,10 @@ struct ip_vs_estimator { u64 outpps; u64 inbps; u64 outbps; + + s32 ktid:16, /* kthread ID, -1=temp list */ + ktrow:8, /* row/tick ID for kthread */ + ktcid:8; /* chain ID for kthread tick */ }; /* @@ -415,6 +421,66 @@ struct ip_vs_stats *ip_vs_stats_alloc(void); void ip_vs_stats_release(struct ip_vs_stats *stats); void ip_vs_stats_free(struct ip_vs_stats *stats); +/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ +#define IPVS_EST_NTICKS 50 +/* Estimation uses a 2-second period containing ticks (in jiffies) */ +#define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) + +/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). + * Value of 4 and above ensures kthreads will take work without exceeding + * the CPU capacity under different circumstances. + */ +#define IPVS_EST_LOAD_DIVISOR 8 + +/* Kthreads should not have work that exceeds the CPU load above 50% */ +#define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) + +/* Desired number of chains per timer tick (chain load factor in 100us units), + * 48=4.8ms of 40ms tick (12% CPU usage): + * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 + */ +#define IPVS_EST_CHAIN_FACTOR \ + ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) + +/* Compiled number of chains per tick + * The defines should match cond_resched_rcu + */ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) +#define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR +#else +#define IPVS_EST_TICK_CHAINS 1 +#endif + +#if IPVS_EST_NTICKS > 127 +#error Too many timer ticks for ktrow +#endif + +/* Multiple chains processed in same tick */ +struct ip_vs_est_tick_data { + struct hlist_head chains[IPVS_EST_TICK_CHAINS]; + DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); + DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); + int chain_len[IPVS_EST_TICK_CHAINS]; +}; + +/* Context for estimation kthread */ +struct ip_vs_est_kt_data { + struct netns_ipvs *ipvs; + struct task_struct *task; /* task if running */ + struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; + DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ + unsigned long est_timer; /* estimation timer (jiffies) */ + struct ip_vs_stats *calc_stats; /* Used for calculation */ + int tick_len[IPVS_EST_NTICKS]; /* est count */ + int id; /* ktid per netns */ + int chain_max; /* max ests per tick chain */ + int tick_max; /* max ests per tick */ + int est_count; /* attached ests to kthread */ + int est_max_count; /* max ests per kthread */ + int add_row; /* row for new ests */ + int est_row; /* estimated row */ +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -953,9 +1019,17 @@ struct netns_ipvs { struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ - struct list_head est_list; /* estimator list */ - spinlock_t est_lock; - struct timer_list est_timer; /* Estimation timer */ + struct delayed_work est_reload_work;/* Reload kthread tasks */ + struct mutex est_mutex; /* protect kthread tasks */ + struct hlist_head est_temp_list; /* Ests during calc phase */ + struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ + unsigned long est_max_threads;/* Hard limit of kthreads */ + int est_calc_phase; /* Calculation phase */ + int est_chain_max; /* Calculated chain_max */ + int est_kt_count; /* Allocated ptrs */ + int est_add_ktid; /* ktid where to add ests */ + atomic_t est_genid; /* kthreads reload genid */ + atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; @@ -1486,10 +1560,14 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd); +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, -- cgit v1.2.3 From f0be83d5421718ead31707b6ece34cf77d411c00 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:03 +0200 Subject: ipvs: add est_cpulist and est_nice sysctl vars Allow the kthreads for stats to be configured for specific cpulist (isolation) and niceness (scheduling priority). Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 04960dc6228f..dc51b5497cf7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -29,6 +29,7 @@ #include #endif #include /* Netw namespace */ +#include #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -365,6 +366,9 @@ struct ip_vs_cpu_stats { struct u64_stats_sync syncp; }; +/* Default nice for estimator kthreads */ +#define IPVS_EST_NICE 0 + /* IPVS statistics objects */ struct ip_vs_estimator { struct hlist_node list; @@ -1009,6 +1013,12 @@ struct netns_ipvs { int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; +#ifdef CONFIG_SYSCTL + cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ + int est_cpulist_valid; /* cpulist set */ + int sysctl_est_nice; /* kthread nice */ + int est_stopped; /* stop tasks */ +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1162,6 +1172,19 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_est_nice; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1259,6 +1282,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return IPVS_EST_NICE; +} + #endif /* IPVS core functions @@ -1569,6 +1602,31 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ipvs->est_stopped = ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs)); +#endif +} + +static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + return ipvs->est_stopped; +#else + return false; +#endif +} + +static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) +{ + unsigned int limit = IPVS_EST_CPU_KTHREADS * + cpumask_weight(sysctl_est_cpulist(ipvs)); + + return max(1U, limit); +} + /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); -- cgit v1.2.3 From 144361c1949f227df9244302da02c258a363b674 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:04 +0200 Subject: ipvs: run_estimation should control the kthread tasks Change the run_estimation flag to start/stop the kthread tasks. Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index dc51b5497cf7..c6c61100d244 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1605,8 +1605,10 @@ void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL - ipvs->est_stopped = ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs)); + /* Stop tasks while cpulist is empty or if disabled with flag */ + ipvs->est_stopped = !sysctl_run_estimation(ipvs) || + (ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs))); #endif } -- cgit v1.2.3