summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2023-02-09 17:45:57 -0800
committerJakub Kicinski <kuba@kernel.org>2023-02-09 17:45:57 -0800
commit2894d3530948acd30f2cc5366728a6967fc6569f (patch)
treea408253f02823f66dc55b29f1ba5b085ee7b1666
parent8697a258ae24703267d2a37d91ab757c91ef027e (diff)
parentc12e0d5f267d7eb45a2f8eaa9fd44eaa2871a95e (diff)
downloadlinux-stable-2894d3530948acd30f2cc5366728a6967fc6569f.tar.gz
linux-stable-2894d3530948acd30f2cc5366728a6967fc6569f.tar.bz2
linux-stable-2894d3530948acd30f2cc5366728a6967fc6569f.zip
Merge branch 'net-introduce-rps_default_mask'
Paolo Abeni says: ==================== net: introduce rps_default_mask Real-time setups try hard to ensure proper isolation between time critical applications and e.g. network processing performed by the network stack in softirq and RPS is used to move the softirq activity away from the isolated core. If the network configuration is dynamic, with netns and devices routinely created at run-time, enforcing the correct RPS setting on each newly created device allowing to transient bad configuration became complex. Additionally, when multi-queue devices are involved, configuring rps in user-space on each queue easily becomes very expensive, e.g. some setups use veths with per cpu queues. These series try to address the above, introducing a new sysctl knob: rps_default_mask. The new sysctl entry allows configuring a netns-wide RPS mask, to be enforced since receive queue creation time without any fourther per device configuration required. Additionally, a simple self-test is introduced to check the rps_default_mask behavior. ==================== Link: https://lore.kernel.org/r/cover.1675789134.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--Documentation/admin-guide/sysctl/net.rst6
-rw-r--r--include/linux/netdevice.h1
-rw-r--r--net/core/dev.h2
-rw-r--r--net/core/net-sysfs.c79
-rw-r--r--net/core/sysctl_net_core.c81
-rw-r--r--tools/testing/selftests/net/Makefile1
-rw-r--r--tools/testing/selftests/net/config3
-rwxr-xr-xtools/testing/selftests/net/rps_default_mask.sh57
8 files changed, 182 insertions, 48 deletions
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 6394f5dc2303..466c560b0c30 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -215,6 +215,12 @@ rmem_max
The maximum receive socket buffer size in bytes.
+rps_default_mask
+----------------
+
+The default RPS CPU mask used on newly created network devices. An empty
+mask means RPS disabled by default.
+
tstamp_allow_data
-----------------
Allow processes to receive tx timestamps looped together with the original
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d5ef4c1fedd2..38ab96ae0d68 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -223,6 +223,7 @@ struct net_device_core_stats {
#include <linux/static_key.h>
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
+extern struct cpumask rps_default_mask;
#endif
struct neighbour;
diff --git a/net/core/dev.h b/net/core/dev.h
index a065b7571441..e075e198092c 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -9,6 +9,7 @@ struct net_device;
struct netdev_bpf;
struct netdev_phys_item_id;
struct netlink_ext_ack;
+struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
@@ -134,4 +135,5 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
+int rps_cpumask_housekeeping(struct cpumask *mask);
#endif
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ca55dd747d6c..4b361ac6a252 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t store_rps_map(struct netdev_rx_queue *queue,
- const char *buf, size_t len)
+static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
+ cpumask_var_t mask)
{
- struct rps_map *old_map, *map;
- cpumask_var_t mask;
- int err, cpu, i;
static DEFINE_MUTEX(rps_map_mutex);
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
- if (err) {
- free_cpumask_var(mask);
- return err;
- }
-
- if (!cpumask_empty(mask)) {
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
- if (cpumask_empty(mask)) {
- free_cpumask_var(mask);
- return -EINVAL;
- }
- }
+ struct rps_map *old_map, *map;
+ int cpu, i;
map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
- if (!map) {
- free_cpumask_var(mask);
+ if (!map)
return -ENOMEM;
- }
i = 0;
for_each_cpu_and(cpu, mask, cpu_online_mask)
@@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
if (old_map)
kfree_rcu(old_map, rcu);
+ return 0;
+}
+int rps_cpumask_housekeeping(struct cpumask *mask)
+{
+ if (!cpumask_empty(mask)) {
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ if (cpumask_empty(mask))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ cpumask_var_t mask;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err)
+ goto out;
+
+ err = rps_cpumask_housekeeping(mask);
+ if (err)
+ goto out;
+
+ err = netdev_rx_queue_set_rps_mask(queue, mask);
+
+out:
free_cpumask_var(mask);
- return len;
+ return err ? : len;
}
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
@@ -1071,6 +1083,13 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
goto err;
}
+#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL)
+ if (!cpumask_empty(&rps_default_mask)) {
+ error = netdev_rx_queue_set_rps_mask(queue, &rps_default_mask);
+ if (error)
+ goto err;
+ }
+#endif
kobject_uevent(kobj, KOBJ_ADD);
return error;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index e7b98162c632..7130e6d9e263 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -16,6 +16,7 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/sched/isolation.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -45,7 +46,59 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
+#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
+static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
+ struct cpumask *mask)
+{
+ char kbuf[128];
+ int len;
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return;
+ }
+
+ len = min(sizeof(kbuf) - 1, *lenp);
+ len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
+ if (!len) {
+ *lenp = 0;
+ return;
+ }
+
+ if (len < *lenp)
+ kbuf[len++] = '\n';
+ memcpy(buffer, kbuf, len);
+ *lenp = len;
+ *ppos += len;
+}
+#endif
+
#ifdef CONFIG_RPS
+struct cpumask rps_default_mask;
+
+static int rps_default_mask_sysctl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+
+ rtnl_lock();
+ if (write) {
+ err = cpumask_parse(buffer, &rps_default_mask);
+ if (err)
+ goto done;
+
+ err = rps_cpumask_housekeeping(&rps_default_mask);
+ if (err)
+ goto done;
+ } else {
+ dump_cpumask(buffer, lenp, ppos, &rps_default_mask);
+ }
+
+done:
+ rtnl_unlock();
+ return err;
+}
+
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -155,13 +208,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
write_unlock:
mutex_unlock(&flow_limit_update_mutex);
} else {
- char kbuf[128];
-
- if (*ppos || !*lenp) {
- *lenp = 0;
- goto done;
- }
-
cpumask_clear(mask);
rcu_read_lock();
for_each_possible_cpu(i) {
@@ -171,17 +217,7 @@ write_unlock:
}
rcu_read_unlock();
- len = min(sizeof(kbuf) - 1, *lenp);
- len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
- if (!len) {
- *lenp = 0;
- goto done;
- }
- if (len < *lenp)
- kbuf[len++] = '\n';
- memcpy(buffer, kbuf, len);
- *lenp = len;
- *ppos += len;
+ dump_cpumask(buffer, lenp, ppos, mask);
}
done:
@@ -472,6 +508,11 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = rps_sock_flow_sysctl
},
+ {
+ .procname = "rps_default_mask",
+ .mode = 0644,
+ .proc_handler = rps_default_mask_sysctl
+ },
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
{
@@ -675,6 +716,10 @@ static __net_initdata struct pernet_operations sysctl_core_ops = {
static __init int sysctl_core_init(void)
{
+#if IS_ENABLED(CONFIG_RPS)
+ cpumask_copy(&rps_default_mask, cpu_none_mask);
+#endif
+
register_net_sysctl(&init_net, "net/core", net_core_table);
return register_pernet_subsys(&sysctl_core_ops);
}
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 951bd5342bc6..3364c548a23b 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -46,6 +46,7 @@ TEST_PROGS += stress_reuseport_listen.sh
TEST_PROGS += l2_tos_ttl_inherit.sh
TEST_PROGS += bind_bhash.sh
TEST_PROGS += ip_local_port_range.sh
+TEST_PROGS += rps_default_mask.sh
TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh
TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh
TEST_GEN_FILES = socket nettest
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index bd89198cd817..cc9fd55ab869 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -3,6 +3,9 @@ CONFIG_NET_NS=y
CONFIG_BPF_SYSCALL=y
CONFIG_TEST_BPF=m
CONFIG_NUMA=y
+CONFIG_RPS=y
+CONFIG_SYSFS=y
+CONFIG_PROC_SYSCTL=y
CONFIG_NET_VRF=y
CONFIG_NET_L3_MASTER_DEV=y
CONFIG_IPV6=y
diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh
new file mode 100755
index 000000000000..c81c0ac7ddfe
--- /dev/null
+++ b/tools/testing/selftests/net/rps_default_mask.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+readonly ksft_skip=4
+readonly cpus=$(nproc)
+ret=0
+
+[ $cpus -gt 2 ] || exit $ksft_skip
+
+readonly INITIAL_RPS_DEFAULT_MASK=$(cat /proc/sys/net/core/rps_default_mask)
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+setup() {
+ ip netns add "${NETNS}"
+ ip -netns "${NETNS}" link set lo up
+}
+
+cleanup() {
+ echo $INITIAL_RPS_DEFAULT_MASK > /proc/sys/net/core/rps_default_mask
+ ip netns del $NETNS
+}
+
+chk_rps() {
+ local rps_mask expected_rps_mask=$3
+ local dev_name=$2
+ local msg=$1
+
+ rps_mask=$(ip netns exec $NETNS cat /sys/class/net/$dev_name/queues/rx-0/rps_cpus)
+ printf "%-60s" "$msg"
+ if [ $rps_mask -eq $expected_rps_mask ]; then
+ echo "[ ok ]"
+ else
+ echo "[fail] expected $expected_rps_mask found $rps_mask"
+ ret=1
+ fi
+}
+
+trap cleanup EXIT
+
+echo 0 > /proc/sys/net/core/rps_default_mask
+setup
+chk_rps "empty rps_default_mask" lo 0
+cleanup
+
+echo 1 > /proc/sys/net/core/rps_default_mask
+setup
+chk_rps "non zero rps_default_mask" lo 1
+
+echo 3 > /proc/sys/net/core/rps_default_mask
+chk_rps "changing rps_default_mask dont affect existing netns" lo 1
+
+ip -n $NETNS link add type veth
+ip -n $NETNS link set dev veth0 up
+ip -n $NETNS link set dev veth1 up
+chk_rps "changing rps_default_mask affect newly created devices" veth0 3
+chk_rps "changing rps_default_mask affect newly created devices[II]" veth1 3
+exit $ret