From aa80c6343fcf53cbc29f84ba9f89ca87d4e41350 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Thu, 14 Dec 2023 23:37:11 +0530
Subject: powerpc/smp: Enable Asym packing for cores on shared processor

On shared processor LPARs, the underlying hypervisor can have more
virtual cores to schedule than there are physical cores. Starting with
Power9, a big core (aka SMT8 core) has two nearly independent thread
groups. On shared processor LPARs, it helps to pack threads onto fewer
cores so that overall system performance and utilization improve.
PowerVM schedules at big-core granularity, so packing onto fewer cores
helps. Since each thread group is independent, running threads on both
thread groups of an SMT8 core should have minimal adverse impact in
non-over-provisioned scenarios. The changes in this patchset have no
effect in the over-provisioned scenario: if there are more threads than
SMT domains, asym_packing does not kick in.

For example, say two 8-core shared LPARs share an 8-core physical pool
and each runs 8 threads. Consolidating the 8 threads onto 4 cores in
each LPAR helps both perform better, because each LPAR then gets 100%
of the time to run its applications and the hypervisor does not have to
switch between them.

To achieve this, enable the SD_ASYM_PACKING flag at the CACHE, MC and
PKG levels when the system is running in shared processor mode and has
big cores.

Signed-off-by: Srikar Dronamraju
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231214180720.310852-2-srikar@linux.vnet.ibm.com
---
 arch/powerpc/kernel/smp.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ab691c89d787..3fc8ad9646a4 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1003,6 +1003,13 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
+/*
+ * On shared processor LPARs scheduled on a big core (which has two or more
+ * independent thread groups per core), prefer lower numbered CPUs, so
+ * that workload consolidates to lesser number of cores.
+ */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
+
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent cores
@@ -1011,9 +1018,20 @@ static int powerpc_smt_flags(void)
  */
 static int powerpc_shared_cache_flags(void)
 {
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
+
 	return SD_SHARE_PKG_RESOURCES;
 }
 
+static int powerpc_shared_proc_flags(void)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return SD_ASYM_PACKING;
+
+	return 0;
+}
+
 /*
  * We can't just pass cpu_l2_cache_mask() directly because
  * returns a non-const pointer and the compiler barfs on that.
@@ -1050,8 +1068,8 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_mc_mask, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	{ cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) },
+	{ cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
@@ -1686,6 +1704,9 @@ static void __init fixup_topology(void)
 {
 	int i;
 
+	if (is_shared_processor() && has_big_cores)
+		static_branch_enable(&splpar_asym_pack);
+
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-- cgit v1.2.3

From 0e1c1986e0e65746daa05405d7747ce882f83cf1 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Thu, 14 Dec 2023 23:37:12 +0530
Subject: powerpc/smp: Disable MC domain for shared processor

Like the L2-cache information, the coregroup information used to
determine MC sched domains is only present on dedicated LPARs; PowerVM
does not export coregroup information for shared processor LPARs.
Hence, do not create MC domains on shared processor LPAR systems.

Signed-off-by: Srikar Dronamraju
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231214180720.310852-3-srikar@linux.vnet.ibm.com
---
 arch/powerpc/kernel/smp.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3fc8ad9646a4..2cebc53e97f9 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1055,6 +1055,10 @@ static struct cpumask *cpu_coregroup_mask(int cpu)
 
 static bool has_coregroup_support(void)
 {
+	/* Coregroup identification not available on shared systems */
+	if (is_shared_processor())
+		return 0;
+
 	return coregroup_enabled;
 }
 
-- cgit v1.2.3

From fd535a858ebeb1f478b1d065b6c057f52aad483a Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Thu, 14 Dec 2023 23:37:13 +0530
Subject: powerpc/smp: Add __ro_after_init attribute

Some variables are only updated at boot time, so add the
__ro_after_init attribute to them.

Signed-off-by: Srikar Dronamraju
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231214180720.310852-4-srikar@linux.vnet.ibm.com
---
 arch/powerpc/kernel/smp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2cebc53e97f9..aea149627209 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -77,10 +77,10 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 #endif
 
 struct task_struct *secondary_current;
-bool has_big_cores;
-bool coregroup_enabled;
-bool thread_group_shares_l2;
-bool thread_group_shares_l3;
+bool has_big_cores __ro_after_init;
+bool coregroup_enabled __ro_after_init;
+bool thread_group_shares_l2 __ro_after_init;
+bool thread_group_shares_l3 __ro_after_init;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
@@ -987,7 +987,7 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property)
 	return 0;
 }
 
-static bool shared_caches;
+static bool shared_caches __ro_after_init;
 
 #ifdef CONFIG_SCHED_SMT
 /* cpumask of CPUs with asymmetric SMT dependency */
-- cgit v1.2.3

From 0e93f1c780e8fd315f1262467b7d35eb6f766d2f Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Thu, 14 Dec 2023 23:37:14 +0530
Subject: powerpc/smp: Avoid asym packing within thread_group of a core

The PowerVM hypervisor schedules at core granularity.
However, each core can have more than one thread group. For better
utilization on a shared processor, it is preferable for the scheduler
to pack onto the lowest numbered core, but there is no benefit in
moving a thread between the two thread groups of the same core.

Signed-off-by: Srikar Dronamraju
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231214180720.310852-5-srikar@linux.vnet.ibm.com
---
 arch/powerpc/kernel/smp.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index aea149627209..9d8bb9a084bd 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1763,6 +1763,19 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	set_sched_topology(powerpc_topology);
 }
 
+/*
+ * For asym packing, by default lower numbered CPU has higher priority.
+ * On shared processors, pack to lower numbered core. However avoid moving
+ * between thread_groups within the same core.
+ */
+int arch_asym_cpu_priority(int cpu)
+{
+	if (static_branch_unlikely(&splpar_asym_pack))
+		return -cpu / threads_per_core;
+
+	return -cpu;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 int __cpu_disable(void)
 {
-- cgit v1.2.3

From c46975715f5a7b941aa09bc0539a8dbe297f308f Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Thu, 14 Dec 2023 23:37:15 +0530
Subject: powerpc/smp: Dynamically build Powerpc topology

Currently there are four powerpc-specific sched topologies, all
statically defined. However, not every topology is used by every
powerpc system. To avoid unnecessary degenerations by the scheduler,
the current code compares masks and flags. If the sched topologies are
instead built dynamically, the code is simpler and there is a greater
chance of avoiding degenerations.

Note: x86 also builds its sched topologies dynamically, and the
proposed changes are very similar to the way x86 builds its topologies.
Signed-off-by: Srikar Dronamraju
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231214180720.310852-6-srikar@linux.vnet.ibm.com
---
 arch/powerpc/kernel/smp.c | 78 +++++++++++++++++------------------------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9d8bb9a084bd..693334c20d07 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -93,15 +93,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
-enum {
-#ifdef CONFIG_SCHED_SMT
-	smt_idx,
-#endif
-	cache_idx,
-	mc_idx,
-	die_idx,
-};
-
 #define MAX_THREAD_LIST_SIZE 8
 #define THREAD_GROUP_SHARE_L1 1
 #define THREAD_GROUP_SHARE_L2_L3 2
@@ -1067,16 +1058,6 @@ static const struct cpumask *cpu_mc_mask(int cpu)
 	return cpu_coregroup_mask(cpu);
 }
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) },
-	{ NULL, },
-};
-
 static int __init init_big_cores(void)
 {
 	int cpu;
@@ -1704,9 +1685,11 @@ void start_secondary(void *unused)
 	BUG();
 }
 
-static void __init fixup_topology(void)
+static struct sched_domain_topology_level powerpc_topology[6];
+
+static void __init build_sched_topology(void)
 {
-	int i;
+	int i = 0;
 
 	if (is_shared_processor() && has_big_cores)
 		static_branch_enable(&splpar_asym_pack);
@@ -1714,36 +1697,33 @@ static void __init fixup_topology(void)
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[smt_idx].mask = smallcore_smt_mask;
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
+		};
+	} else {
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
+		};
 	}
 #endif
+	if (shared_caches) {
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
+		};
+	}
+	if (has_coregroup_support()) {
+		powerpc_topology[i++] = (struct sched_domain_topology_level){
+			cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
+		};
+	}
+	powerpc_topology[i++] = (struct sched_domain_topology_level){
+		cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
+	};
-
-	if (!has_coregroup_support())
-		powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask;
-
-	/*
-	 * Try to consolidate topology levels here instead of
-	 * allowing scheduler to degenerate.
-	 * - Dont consolidate if masks are different.
-	 * - Dont consolidate if sd_flags exists and are different.
-	 */
-	for (i = 1; i <= die_idx; i++) {
-		if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask)
-			continue;
-
-		if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags &&
-		    powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags)
-			continue;
-
-		if (!powerpc_topology[i - 1].sd_flags)
-			powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags;
+
+	/* There must be one trailing NULL entry left. */
+	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
-
-		powerpc_topology[i].mask = powerpc_topology[i + 1].mask;
-		powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags;
-#ifdef CONFIG_SCHED_DEBUG
-		powerpc_topology[i].name = powerpc_topology[i + 1].name;
-#endif
-	}
+
+	set_sched_topology(powerpc_topology);
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
@@ -1758,9 +1738,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 		smp_ops->bringup_done();
 
 	dump_numa_cpu_topology();
-
-	fixup_topology();
-	set_sched_topology(powerpc_topology);
+	build_sched_topology();
 }
 
 /*
-- cgit v1.2.3
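
Taken together, the series packs load onto lower-numbered big cores on
shared processor LPARs while staying neutral between the two thread
groups inside a core. The standalone userspace sketch below only
illustrates the arch_asym_cpu_priority() math from the "Avoid asym
packing within thread_group of a core" patch; the CPU count (32) and
threads_per_core value (8, i.e. SMT8) are assumptions chosen for the
example, not values taken from the patches.

/*
 * Standalone illustration (not kernel code) of the asym-packing CPU
 * priority calculation added above.  Assumed values: 32 CPUs, SMT8.
 */
#include <stdio.h>

static const int threads_per_core = 8;	/* assumed SMT8 big core */
static const int splpar_asym_pack = 1;	/* assume shared processor LPAR with big cores */

/* Mirrors arch_asym_cpu_priority(): one priority value per big core. */
static int asym_cpu_priority(int cpu)
{
	if (splpar_asym_pack)
		return -cpu / threads_per_core;

	return -cpu;
}

int main(void)
{
	for (int cpu = 0; cpu < 32; cpu++)
		printf("CPU %2d -> asym packing priority %d\n",
		       cpu, asym_cpu_priority(cpu));

	/*
	 * Output: CPUs 0-7 all report priority 0, CPUs 8-15 report -1,
	 * 16-23 report -2, 24-31 report -3.  Higher priority wins, so
	 * load is pulled toward core 0, while the scheduler sees no
	 * difference between the two thread groups within one core.
	 */
	return 0;
}

Dividing by threads_per_core is what keeps the priority constant across
a core, which is why the change avoids needless migrations between
thread groups of the same core.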