From 12753d71e8c5c3e716cedba23ddeed508da0bdc4 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:57 +0800 Subject: ACPI: CPPC: Add helper to get the highest performance value Add support for getting the highest performance to the generic CPPC driver. This enables downstream drivers such as amd-pstate to discover and use these values. Refer to Chapter 8.4.6.1.1.1. Highest Performance of ACPI Specification 6.5 for details on continuous performance control of CPPC (linked below). Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Wyes Karny Reviewed-by: Perry Yuan Acked-by: Huang Rui Signed-off-by: Meng Li Link: https://uefi.org/specs/ACPI/6.5/08_Processor_Configuration_and_Control.html?highlight=cppc#highest-performance [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers') diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index d155a86a8614..a50e70abdf19 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1157,6 +1157,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); } +/** + * cppc_get_highest_perf - Get the highest performance register value. + * @cpunum: CPU from which to get highest performance. + * @highest_perf: Return address. + * + * Return: 0 for success, -EIO otherwise. + */ +int cppc_get_highest_perf(int cpunum, u64 *highest_perf) +{ + return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf); +} +EXPORT_SYMBOL_GPL(cppc_get_highest_perf); + /** * cppc_get_epp_perf - Get the epp register value. * @cpunum: CPU from which to get epp preference value. -- cgit v1.2.3
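For reference, a minimal sketch of how a downstream driver might consume the new helper; my_query_highest_perf() is hypothetical (and assumes the matching declaration is exposed via acpi/cppc_acpi.h), while cppc_get_highest_perf() has exactly the signature added above:

	#include <acpi/cppc_acpi.h>
	#include <linux/printk.h>

	/* Hypothetical sketch: query one CPU's highest perf value. */
	static int my_query_highest_perf(int cpu)
	{
		u64 highest_perf;
		int ret;

		ret = cppc_get_highest_perf(cpu, &highest_perf);
		if (ret)
			return ret;	/* -EIO when the register read fails */

		pr_debug("CPU%d highest perf: %llu\n", cpu, highest_perf);
		return 0;
	}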
From f3a052391822b772b4e27f2594526cf1eb103cab Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:58 +0800 Subject: cpufreq: amd-pstate: Enable amd-pstate preferred core support The amd-pstate driver utilizes the functions and data structures provided by the ITMT architecture to enable the scheduler to favor scheduling on cores that can achieve a higher frequency with lower voltage. We call this amd-pstate preferred core. Here, sched_set_itmt_core_prio() is called to set the priorities and sched_set_itmt_support() is called to enable the ITMT feature. The amd-pstate driver uses the highest performance value to indicate the priority of a CPU; a higher value means a higher priority. The initial core rankings are set up by amd-pstate when the system boots. Add a variable hw_prefcore to the cpudata structure, indicating whether the processor and power firmware support the preferred core feature. Add a new early parameter value, `disable`, to allow users to turn preferred core support off. The amd-pstate driver supports the preferred core feature only when the hardware supports it and the user has not disabled it via the early parameter (it is enabled by default). Tested-by: Oleksandr Natalenko Reviewed-by: Huang Rui Reviewed-by: Wyes Karny Reviewed-by: Mario Limonciello Co-developed-by: Perry Yuan Signed-off-by: Perry Yuan Signed-off-by: Meng Li Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 131 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 1791d37fbc53..7c533ad91f4f 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,7 @@ #define AMD_PSTATE_TRANSITION_LATENCY 20000 #define AMD_PSTATE_TRANSITION_DELAY 1000 +#define AMD_PSTATE_PREFCORE_THRESHOLD 166 /* * TODO: We need more time to fine tune processors with shared memory solution @@ -64,6 +66,7 @@ static struct cpufreq_driver amd_pstate_driver; static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; static bool cppc_enabled; +static bool amd_pstate_prefcore = true; /* * AMD Energy Preference Performance (EPP) @@ -297,13 +300,14 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) if (ret) return ret; - /* - * TODO: Introduce AMD specific power feature. - * - * CPPC entry doesn't indicate the highest performance in some ASICs. + /* For platforms that do not support the preferred core feature, the + * highest_perf may be configured with 166 or 255, to avoid the max frequency + * being calculated wrongly. We take the AMD_CPPC_HIGHEST_PERF(cap1) value as + * the default max perf. */ - highest_perf = amd_get_highest_perf(); - if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) + if (cpudata->hw_prefcore) + highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; + else highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); WRITE_ONCE(cpudata->highest_perf, highest_perf); @@ -324,8 +328,9 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) if (ret) return ret; - highest_perf = amd_get_highest_perf(); - if (highest_perf > cppc_perf.highest_perf) + if (cpudata->hw_prefcore) + highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; + else highest_perf = cppc_perf.highest_perf; WRITE_ONCE(cpudata->highest_perf, highest_perf); @@ -706,6 +711,80 @@ static void amd_perf_ctl_reset(unsigned int cpu) wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); } +/* + * Enabling amd-pstate preferred core support can't be done directly from cpufreq callbacks + * due to locking, so queue the work for later. + */ +static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) +{ + sched_set_itmt_support(); +} +static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); + +/* + * Get the highest performance register value. + * @cpu: CPU from which to get highest performance. + * @highest_perf: Return address. + * + * Return: 0 for success, -EIO otherwise.
+ */ +static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) +{ + int ret; + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 cap1; + + ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); + if (ret) + return ret; + WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); + } else { + u64 cppc_highest_perf; + + ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); + if (ret) + return ret; + WRITE_ONCE(*highest_perf, cppc_highest_perf); + } + + return (ret); +} + +#define CPPC_MAX_PERF U8_MAX + +static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) +{ + int ret, prio; + u32 highest_perf; + + ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); + if (ret) + return; + + cpudata->hw_prefcore = true; + /* check if CPPC preferred core feature is enabled*/ + if (highest_perf < CPPC_MAX_PERF) + prio = (int)highest_perf; + else { + pr_debug("AMD CPPC preferred core is unsupported!\n"); + cpudata->hw_prefcore = false; + return; + } + + if (!amd_pstate_prefcore) + return; + + /* + * The priorities can be set regardless of whether or not + * sched_set_itmt_support(true) has been called and it is valid to + * update them at any time after it has been called. + */ + sched_set_itmt_core_prio(prio, cpudata->cpu); + + schedule_work(&sched_prefcore_work); +} + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; @@ -727,6 +806,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) cpudata->cpu = policy->cpu; + amd_pstate_init_prefcore(cpudata); + ret = amd_pstate_init_perf(cpudata); if (ret) goto free_cpudata1; @@ -877,6 +958,17 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, return sysfs_emit(buf, "%u\n", perf); } +static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, + char *buf) +{ + bool hw_prefcore; + struct amd_cpudata *cpudata = policy->driver_data; + + hw_prefcore = READ_ONCE(cpudata->hw_prefcore); + + return sysfs_emit(buf, "%s\n", str_enabled_disabled(hw_prefcore)); +} + static ssize_t show_energy_performance_available_preferences( struct cpufreq_policy *policy, char *buf) { @@ -1074,18 +1166,27 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, return ret < 0 ? 
ret : count; } +static ssize_t prefcore_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); +} + cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); cpufreq_freq_attr_ro(amd_pstate_highest_perf); +cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); cpufreq_freq_attr_rw(energy_performance_preference); cpufreq_freq_attr_ro(energy_performance_available_preferences); static DEVICE_ATTR_RW(status); +static DEVICE_ATTR_RO(prefcore); static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, &amd_pstate_lowest_nonlinear_freq, &amd_pstate_highest_perf, + &amd_pstate_hw_prefcore, NULL, }; @@ -1093,6 +1194,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { &amd_pstate_max_freq, &amd_pstate_lowest_nonlinear_freq, &amd_pstate_highest_perf, + &amd_pstate_hw_prefcore, &energy_performance_preference, &energy_performance_available_preferences, NULL, @@ -1100,6 +1202,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, + &dev_attr_prefcore.attr, NULL }; @@ -1151,6 +1254,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) cpudata->cpu = policy->cpu; cpudata->epp_policy = 0; + amd_pstate_init_prefcore(cpudata); + ret = amd_pstate_init_perf(cpudata); if (ret) goto free_cpudata1; @@ -1567,7 +1672,17 @@ static int __init amd_pstate_param(char *str) return amd_pstate_set_driver(mode_idx); } + +static int __init amd_prefcore_param(char *str) +{ + if (!strcmp(str, "disable")) + amd_pstate_prefcore = false; + + return 0; +} + early_param("amd_pstate", amd_pstate_param); early_param("amd_prefcore", amd_prefcore_param); MODULE_AUTHOR("Huang Rui "); MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); -- cgit v1.2.3
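To make the ITMT handshake above concrete, here is a minimal, hypothetical sketch of the pattern the driver follows; my_rank[] and my_init_itmt() are illustrative only, while sched_set_itmt_core_prio() and sched_set_itmt_support() are the x86 interfaces used by the patch:

	#include <linux/cpumask.h>
	#include <linux/kernel.h>
	#include <asm/topology.h>

	/* Illustrative per-CPU rankings, e.g. as read from CPPC highest perf. */
	static const int my_rank[] = { 231, 166, 211, 166 };

	static void my_init_itmt(void)
	{
		int cpu;

		/* A higher priority makes the scheduler prefer that core. */
		for_each_online_cpu(cpu)
			sched_set_itmt_core_prio(my_rank[cpu % ARRAY_SIZE(my_rank)], cpu);

		/*
		 * Needs a sleepable context, which is why the driver above
		 * defers this call to a work item.
		 */
		sched_set_itmt_support();
	}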
From 9c4a13a08a9b7afa4bc33f57675358f0195e302c Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:59 +0800 Subject: ACPI: cpufreq: Add highest perf change notification Platform firmware sends notify 0x85 to inform the OS that the highest performance of a CPU has changed. This will be used by the AMD P-state driver to update the ranking of preferred cores and set the priority of cores accordingly. Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Huang Rui Reviewed-by: Perry Yuan Signed-off-by: Meng Li Link: https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-device-notification-values [ rjw: New subject, changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_driver.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 4bd16b3f0781..67db60eda370 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -27,6 +27,7 @@ #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 #define ACPI_PROCESSOR_NOTIFY_POWER 0x81 #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 +#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED 0x85 MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Processor Driver"); @@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) acpi_bus_generate_netlink_event(device->pnp.device_class, dev_name(&device->dev), event, 0); break; + case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED: + cpufreq_update_limits(pr->id); + acpi_bus_generate_netlink_event(device->pnp.device_class, + dev_name(&device->dev), event, 0); + break; default: acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); break; -- cgit v1.2.3

From e571a5e2068ef57945fcd5d0fb950f8f96da6dc8 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:05:00 +0800 Subject: cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically Preferred core rankings can be changed dynamically by the platform based on the workload and platform conditions, accounting for thermals and aging. When this occurs, the CPU priorities need to be updated accordingly. Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Wyes Karny Reviewed-by: Huang Rui Reviewed-by: Perry Yuan Signed-off-by: Meng Li Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 52 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'drivers') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 7c533ad91f4f..08e112444c27 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -315,6 +315,7 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); + WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1)); return 0; } @@ -339,6 +340,7 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) WRITE_ONCE(cpudata->lowest_nonlinear_perf, cppc_perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); + WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf); WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf); if (cppc_state == AMD_PSTATE_ACTIVE) @@ -785,6 +787,40 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) schedule_work(&sched_prefcore_work); } +static void amd_pstate_update_limits(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct amd_cpudata *cpudata = policy->driver_data; + u32 prev_high = 0, cur_high = 0; + int ret; + bool highest_perf_changed = false; + + mutex_lock(&amd_pstate_driver_lock); + if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) + goto free_cpufreq_put; + + ret = amd_pstate_get_highest_perf(cpu, &cur_high); + if (ret) + goto free_cpufreq_put; + + prev_high = READ_ONCE(cpudata->prefcore_ranking); + if (prev_high != cur_high) { + highest_perf_changed = true; + WRITE_ONCE(cpudata->prefcore_ranking, cur_high); + + if
(cur_high < CPPC_MAX_PERF) + sched_set_itmt_core_prio((int)cur_high, cpu); + } + +free_cpufreq_put: + cpufreq_cpu_put(policy); + + if (!highest_perf_changed) + cpufreq_update_policy(cpu); + + mutex_unlock(&amd_pstate_driver_lock); +} + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; @@ -958,6 +994,17 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, return sysfs_emit(buf, "%u\n", perf); } +static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, + char *buf) +{ + u32 perf; + struct amd_cpudata *cpudata = policy->driver_data; + + perf = READ_ONCE(cpudata->prefcore_ranking); + + return sysfs_emit(buf, "%u\n", perf); +} + static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, char *buf) { @@ -1176,6 +1223,7 @@ cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); cpufreq_freq_attr_ro(amd_pstate_highest_perf); +cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); cpufreq_freq_attr_rw(energy_performance_preference); cpufreq_freq_attr_ro(energy_performance_available_preferences); @@ -1186,6 +1234,7 @@ static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, &amd_pstate_lowest_nonlinear_freq, &amd_pstate_highest_perf, + &amd_pstate_prefcore_ranking, &amd_pstate_hw_prefcore, NULL, }; @@ -1194,6 +1243,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { &amd_pstate_max_freq, &amd_pstate_lowest_nonlinear_freq, &amd_pstate_highest_perf, + &amd_pstate_prefcore_ranking, &amd_pstate_hw_prefcore, &energy_performance_preference, &energy_performance_available_preferences, @@ -1537,6 +1587,7 @@ static struct cpufreq_driver amd_pstate_driver = { .suspend = amd_pstate_cpu_suspend, .resume = amd_pstate_cpu_resume, .set_boost = amd_pstate_set_boost, + .update_limits = amd_pstate_update_limits, .name = "amd-pstate", .attr = amd_pstate_attr, }; @@ -1551,6 +1602,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { .online = amd_pstate_epp_cpu_online, .suspend = amd_pstate_epp_suspend, .resume = amd_pstate_epp_resume, + .update_limits = amd_pstate_update_limits, .name = "amd-pstate-epp", .attr = amd_pstate_epp_attr, }; -- cgit v1.2.3 From b26ffbf800ae3c8d01bdf90d9cd8a37e1606ff06 Mon Sep 17 00:00:00 2001 From: Tor Vic Date: Fri, 9 Feb 2024 16:42:26 +0100 Subject: cpufreq: amd-pstate: Fix min_perf assignment in amd_pstate_adjust_perf() In the function amd_pstate_adjust_perf(), the 'min_perf' variable is set to 'highest_perf' instead of 'lowest_perf'. Fixes: 1d215f0319c2 ("cpufreq: amd-pstate: Add fast switch function for AMD P-State") Reported-by: Oleksandr Natalenko Reviewed-by: Perry Yuan Signed-off-by: Tor Vic Reviewed-by: Mario Limonciello Cc: 6.1+ # 6.1+ Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 08e112444c27..aa5e57e27d2b 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -577,7 +577,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (target_perf < capacity) des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity); - min_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); if (_min_perf < capacity) min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity); -- cgit v1.2.3

From e13aa799c2a6a0be23294b630109d23940cf8079 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Mon, 5 Feb 2024 02:25:00 +0000 Subject: cpufreq: Change default transition delay to 2ms 10 ms is too high for today's hardware, even low-end ones. This default ends up being used a lot on Arm machines at least. Pine64, Mac mini and Pixel 6 all end up with a 10 ms rate_limit_us when using schedutil, and it is too high for all of them. Change the default to 2 ms, which should be 'pessimistic' enough for worst-case scenarios, but not too high for platforms with fast DVFS hardware. Signed-off-by: Qais Yousef Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 44db4f59c4cc..66cef33c4ec7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -582,11 +582,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) * for platforms where transition_latency is in milliseconds, it * ends up giving unrealistic values. * - * Cap the default transition delay to 10 ms, which seems to be + * Cap the default transition delay to 2 ms, which seems to be * a reasonable amount of time after which we should reevaluate * the frequency. */ - return min(latency * LATENCY_MULTIPLIER, (unsigned int)10000); + return min(latency * LATENCY_MULTIPLIER, (unsigned int)(2 * MSEC_PER_SEC)); } return LATENCY_MULTIPLIER; -- cgit v1.2.3
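As a worked example of the new cap (LATENCY_MULTIPLIER is 1000 in cpufreq.h, and 2 * MSEC_PER_SEC evaluates to 2000, used here as microseconds):

	/*
	 * Illustrative numbers, not from the patch:
	 *   transition_latency = 20000 ns -> latency = 20000 / NSEC_PER_USEC = 20 us
	 *   20 us * LATENCY_MULTIPLIER = 20000 us -> capped to 2000 us (2 ms)
	 * whereas a platform reporting 1000 ns gets
	 *   1 us * LATENCY_MULTIPLIER = 1000 us, already below the new cap.
	 */

Under the old 10000 us cap, the first platform would have reevaluated its frequency only every 10 ms.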
From 4615ac9010be84d676f9e893e5a7ea4b5febd1e8 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 13 Feb 2024 12:16:00 +0100 Subject: cpufreq: intel_pstate: remove cpudata::prev_cummulative_iowait Commit 09c448d3c61f ("cpufreq: intel_pstate: Use IOWAIT flag in Atom algorithm") removed the last user of cpudata::prev_cummulative_iowait. Remove the member too. Found by https://github.com/jirislaby/clang-struct. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index ca94e60e705a..5ad3542c0e1e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -201,8 +201,6 @@ struct global_params { * @prev_aperf: Last APERF value read from APERF MSR * @prev_mperf: Last MPERF value read from MPERF MSR * @prev_tsc: Last timestamp counter (TSC) value - * @prev_cummulative_iowait: IO Wait time difference from last and - * current sample * @sample: Storage for storing last Sample data * @min_perf_ratio: Minimum capacity in terms of PERF or HWP ratios * @max_perf_ratio: Maximum capacity in terms of PERF or HWP ratios @@ -241,7 +239,6 @@ struct cpudata { u64 prev_aperf; u64 prev_mperf; u64 prev_tsc; - u64 prev_cummulative_iowait; struct sample sample; int32_t min_perf_ratio; int32_t max_perf_ratio; -- cgit v1.2.3

From 240a8da623008eb9f4e32c7a19ce16a6605911dc Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 19 Feb 2024 18:26:06 -0800 Subject: cpufreq: intel_pstate: Allow model specific EPPs The current implementation allows a model-specific EPP override for balanced_performance only. Add a feature to allow model-specific EPPs for all predefined EPP strings. For example, for some CPU models, even changing the performance EPP has benefits. Use a mask of EPPs as driver_data instead of just the balanced_performance value. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 5ad3542c0e1e..001f6d95453e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -3401,14 +3402,29 @@ static bool intel_pstate_hwp_is_enabled(void) return !!(value & 0x1); } -static const struct x86_cpu_id intel_epp_balance_perf[] = { +#define POWERSAVE_MASK GENMASK(7, 0) +#define BALANCE_POWER_MASK GENMASK(15, 8) +#define BALANCE_PERFORMANCE_MASK GENMASK(23, 16) +#define PERFORMANCE_MASK GENMASK(31, 24) +
+#define HWP_SET_EPP_VALUES(powersave, balance_power, balance_perf, performance) \ + (FIELD_PREP_CONST(POWERSAVE_MASK, powersave) |\ + FIELD_PREP_CONST(BALANCE_POWER_MASK, balance_power) |\ + FIELD_PREP_CONST(BALANCE_PERFORMANCE_MASK, balance_perf) |\ + FIELD_PREP_CONST(PERFORMANCE_MASK, performance)) + +#define HWP_SET_DEF_BALANCE_PERF_EPP(balance_perf) \ + (HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE, HWP_EPP_BALANCE_POWERSAVE,\ + balance_perf, HWP_EPP_PERFORMANCE)) + +static const struct x86_cpu_id intel_epp_default[] = { /* * Set EPP value as 102, this is the max suggested EPP * which can result in one core turbo frequency for * AlderLake Mobile CPUs.
*/ - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 102), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 32), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, HWP_SET_DEF_BALANCE_PERF_EPP(102)), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)), {} }; @@ -3506,11 +3522,24 @@ hwp_cpu_matched: intel_pstate_sysfs_expose_params(); if (hwp_active) { - const struct x86_cpu_id *id = x86_match_cpu(intel_epp_balance_perf); + const struct x86_cpu_id *id = x86_match_cpu(intel_epp_default); const struct x86_cpu_id *hybrid_id = x86_match_cpu(intel_hybrid_scaling_factor); - if (id) - epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = id->driver_data; + if (id) { + epp_values[EPP_INDEX_POWERSAVE] = + FIELD_GET(POWERSAVE_MASK, id->driver_data); + epp_values[EPP_INDEX_BALANCE_POWERSAVE] = + FIELD_GET(BALANCE_POWER_MASK, id->driver_data); + epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = + FIELD_GET(BALANCE_PERFORMANCE_MASK, id->driver_data); + epp_values[EPP_INDEX_PERFORMANCE] = + FIELD_GET(PERFORMANCE_MASK, id->driver_data); + pr_debug("Updated EPPs powersave:%x balanced power:%x balanced perf:%x performance:%x\n", + epp_values[EPP_INDEX_POWERSAVE], + epp_values[EPP_INDEX_BALANCE_POWERSAVE], + epp_values[EPP_INDEX_BALANCE_PERFORMANCE], + epp_values[EPP_INDEX_PERFORMANCE]); + } if (hybrid_id) { hybrid_scaling_factor = hybrid_id->driver_data; -- cgit v1.2.3
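To illustrate the packing scheme, a sketch of what HWP_SET_DEF_BALANCE_PERF_EPP(102) expands to, assuming the usual HWP_EPP_* values from msr-index.h (HWP_EPP_POWERSAVE 0xff, HWP_EPP_BALANCE_POWERSAVE 0xc0, HWP_EPP_PERFORMANCE 0x00):

	u32 data = HWP_SET_DEF_BALANCE_PERF_EPP(102);
	/*
	 * Packs to: bits 7:0 = 0xff, bits 15:8 = 0xc0,
	 * bits 23:16 = 0x66 (102), bits 31:24 = 0x00,
	 * i.e. data == 0x0066c0ff.
	 */
	u8 balance_perf = FIELD_GET(BALANCE_PERFORMANCE_MASK, data);	/* 102 */

This is why only the balance_performance slot varies per model in the table above, while the other three slots keep their architectural defaults.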
From 1f4b7fdd71e066aa7c01e3e26ceeb39b47dd5461 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 19 Feb 2024 18:26:07 -0800 Subject: cpufreq: intel_pstate: Update default EPPs for Meteor Lake Update the default balanced_performance EPP to 115 and the performance EPP to 16. Changing the balanced_performance EPP gives better performance/watt compared to the default power-up EPP value of 128. Changing the performance EPP to 0x10 shows reduced power at similar performance as EPP 0. On small form factor devices this is beneficial, as lower power results in lower CPU and skin temperature. This results in reduced thermal throttling and higher performance. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 001f6d95453e..4d2a1aed6c39 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3425,6 +3425,8 @@ static const struct x86_cpu_id intel_epp_default[] = { */ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, HWP_SET_DEF_BALANCE_PERF_EPP(102)), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE, + HWP_EPP_BALANCE_POWERSAVE, 115, 16)), {} }; -- cgit v1.2.3

From 88debc69754f7fe5186954941bb1cc4d744f4f25 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 22 Feb 2024 16:34:15 +0100 Subject: cpufreq: Remove references to 10ms min sampling rate A minimum sampling rate value of 10ms was introduced in: commit cef9615a853e ("[CPUFREQ] ondemand: Uncouple minimal sampling rate from HZ in NO_HZ case") The use of this value was removed in: commit ed4676e25463 ("cpufreq: Replace "max_transition_latency" with "dynamic_switching"") Remove: - a comment referencing this value - an unused macro associated with this value Signed-off-by: Pierre Gondois Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_ondemand.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index c52d19d67557..a7c38b8b3e78 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -22,7 +22,6 @@ #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_UP_THRESHOLD (95) -#define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) #define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) -- cgit v1.2.3

From 8164f743326404fbe00a721a12efd86b2a8d74d2 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Tue, 27 Feb 2024 15:39:24 +0800 Subject: cpufreq: amd-pstate: adjust min/max limit perf The min/max limit perf values calculated based on frequency may exceed the reasonable range of perf values (bounded by the lowest perf and the highest perf), so clamp them accordingly. Signed-off-by: Meng Li Reviewed-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd-pstate.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index aa5e57e27d2b..2015c9fcc3c9 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -484,12 +484,19 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy) static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u32 max_limit_perf, min_limit_perf; + u32 max_limit_perf, min_limit_perf, lowest_perf; struct amd_cpudata *cpudata = policy->driver_data; max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); + lowest_perf = READ_ONCE(cpudata->lowest_perf); + if (min_limit_perf < lowest_perf) + min_limit_perf = lowest_perf; + + if (max_limit_perf < min_limit_perf) + max_limit_perf = min_limit_perf; + WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); WRITE_ONCE(cpudata->max_limit_freq, policy->max); @@ -1387,6 +1394,12 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); + if (min_limit_perf < min_perf) + min_limit_perf = min_perf; + + if (max_limit_perf < min_limit_perf) + max_limit_perf = min_limit_perf; + WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); -- cgit v1.2.3
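A worked example of the new clamping, with illustrative numbers (highest_perf = 166, lowest_perf = 10, max_freq = 4000 MHz are assumptions, not values from the patch):

	/*
	 * policy->min = 100 MHz: min_limit_perf = 100 * 166 / 4000 = 4,
	 * below lowest_perf (10), so it is raised to 10.
	 * policy->max = 200 MHz: max_limit_perf = 200 * 166 / 4000 = 8,
	 * now below min_limit_perf, so it is raised to 10 as well.
	 */

Without the clamp, such requests could push the min/max limit perf below the range the hardware actually supports.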
From a755d0e2d41b9b0c7b4e0bc387c6225e3edba128 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 27 Feb 2024 23:34:52 +0000 Subject: cpufreq: Honour transition_latency over transition_delay_us Some platforms, like Arm's Juno, can have a transition latency that is larger than the newly introduced 2 ms cap. If a driver reports a transition_latency that is higher than the cap, use it as-is. Also update the comment (s/10/2/) to reflect the new cap of 2 ms. Reported-by: Pierre Gondois Signed-off-by: Qais Yousef Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 66cef33c4ec7..926a51cb7e52 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -576,8 +576,17 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; if (latency) { + unsigned int max_delay_us = 2 * MSEC_PER_SEC; + + /* + * If the platform already has high transition_latency, use it + * as-is. + */ + if (latency > max_delay_us) + return latency; + /* - * For platforms that can change the frequency very fast (< 10 + * For platforms that can change the frequency very fast (< 2 * us), the above formula gives a decent transition delay. But * for platforms where transition_latency is in milliseconds, it * ends up giving unrealistic values. @@ -586,7 +595,7 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) * a reasonable amount of time after which we should reevaluate * the frequency. */ - return min(latency * LATENCY_MULTIPLIER, (unsigned int)(2 * MSEC_PER_SEC)); + return min(latency * LATENCY_MULTIPLIER, max_delay_us); } return LATENCY_MULTIPLIER; -- cgit v1.2.3
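Taken together with the earlier 2 ms change, the resulting selection logic is roughly the following sketch (not the verbatim kernel function; my_transition_delay_us() is illustrative):

	static unsigned int my_transition_delay_us(unsigned int latency_us)
	{
		const unsigned int max_delay_us = 2000;	/* 2 * MSEC_PER_SEC */

		if (!latency_us)
			return 1000;			/* LATENCY_MULTIPLIER */

		/* Slow hardware, e.g. Juno: trust the reported latency as-is. */
		if (latency_us > max_delay_us)
			return latency_us;

		/* Fast hardware: scale up, but never beyond 2 ms. */
		return min(latency_us * 1000u, max_delay_us);
	}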
From c4d61a529db788d2e52654f5b02c8d1de4952c5b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 29 Feb 2024 13:42:07 +0530 Subject: cpufreq: Don't unregister cpufreq cooling on CPU hotplug Offlining a CPU and bringing it back online is a common operation and it happens frequently during system suspend/resume, where the non-boot CPUs are hotplugged out during suspend and brought back at resume. The cpufreq core already tries to make this path as fast as possible, since the changes are only temporary in nature and full cleanup of resources isn't required in this case. For example, drivers can implement online()/offline() callbacks to avoid a lot of resource teardown. Along similar lines, there is no need to unregister the cpufreq cooling device during suspend/resume, but only while the policy is being removed. Moreover, unregistering the cpufreq cooling device results in an unwanted outcome, where the system suspend is eventually aborted in the process. Currently, during system suspend the cpufreq core unregisters the cooling device, which in turn removes a kobject using device_del(), and that generates a notification to userspace via uevent broadcast. This causes system suspend to abort in some setups. This was also earlier reported (indirectly) by Roman [1]. Maybe there is another way to fix that problem properly, but this change makes sense anyway. Move the registering and unregistering of the cooling device to policy creation and removal times only. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218521 Reported-by: Manaf Meethalavalappu Pallikunhi Reported-by: Roman Stratiienko Link: https://patchwork.kernel.org/project/linux-pm/patch/20220710164026.541466-1-r.stratiienko@gmail.com/ [1] Tested-by: Manaf Meethalavalappu Pallikunhi Signed-off-by: Viresh Kumar Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 926a51cb7e52..f6f8d7f450e7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1580,7 +1580,8 @@ static int cpufreq_online(unsigned int cpu) if (cpufreq_driver->ready) cpufreq_driver->ready(policy); - if (cpufreq_thermal_control_enabled(cpufreq_driver)) + /* Register cpufreq cooling only for a new policy */ + if (new_policy && cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); pr_debug("initialization complete\n"); @@ -1664,11 +1665,6 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) else policy->last_policy = policy->policy; - if (cpufreq_thermal_control_enabled(cpufreq_driver)) { - cpufreq_cooling_unregister(policy->cdev); - policy->cdev = NULL; - } - if (has_target()) cpufreq_exit_governor(policy); @@ -1729,6 +1725,15 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) return; } + /* + * Unregister cpufreq cooling once all the CPUs of the policy are + * removed. + */ + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { + cpufreq_cooling_unregister(policy->cdev); + policy->cdev = NULL; + } + /* We did light-weight exit earlier, do full tear down now */ if (cpufreq_driver->offline) cpufreq_driver->exit(policy); -- cgit v1.2.3