From 5861381d486601430cccf64849bd0a226154bc0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2019 23:18:01 +0100 Subject: PM / arch: x86: Rework the MSR_IA32_ENERGY_PERF_BIAS handling The current handling of MSR_IA32_ENERGY_PERF_BIAS in the kernel is problematic, because it may cause changes made by user space to that MSR (with the help of the x86_energy_perf_policy tool, for example) to be lost every time a CPU goes offline and then back online as well as during system-wide power management transitions into sleep states and back into the working state. The first problem is that if the current EPB value for a CPU going online is 0 ('performance'), the kernel will change it to 6 ('normal') regardless of whether or not this is the first bring-up of that CPU. That also happens during system-wide resume from sleep states (including, but not limited to, hibernation). However, the EPB may have been adjusted by user space this way and the kernel should not blindly override that setting. The second problem is that if the platform firmware resets the EPB values for any CPUs during system-wide resume from a sleep state, the kernel will not restore their previous EPB values that may have been set by user space before the preceding system-wide suspend transition. Again, that behavior may at least be confusing from the user space perspective. In order to address these issues, rework the handling of MSR_IA32_ENERGY_PERF_BIAS so that the EPB value is saved on CPU offline and restored on CPU online as well as (for the boot CPU) during the syscore stages of system-wide suspend and resume transitions, respectively. However, retain the policy by which the EPB is set to 6 ('normal') on the first bring-up of each CPU if its initial value is 0, based on the observation that 0 may mean 'not initialized' just as well as 'performance' in that case. While at it, move the MSR_IA32_ENERGY_PERF_BIAS handling code into a separate file and document it in Documentation/admin-guide. Fixes: abe48b108247 (x86, intel, power: Initialize MSR_IA32_ENERGY_PERF_BIAS) Fixes: b51ef52df71c (x86/cpu: Restore MSR_IA32_ENERGY_PERF_BIAS after resume) Reported-by: Thomas Renninger Signed-off-by: Rafael J. Wysocki Reviewed-by: Hannes Reinecke Acked-by: Borislav Petkov Acked-by: Thomas Gleixner --- Documentation/admin-guide/pm/intel_epb.rst | 6 ++ Documentation/admin-guide/pm/working-state.rst | 1 + arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 17 ---- arch/x86/kernel/cpu/cpu.h | 1 - arch/x86/kernel/cpu/intel.c | 34 ------- arch/x86/kernel/cpu/intel_epb.c | 131 +++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 8 files changed, 140 insertions(+), 53 deletions(-) create mode 100644 Documentation/admin-guide/pm/intel_epb.rst create mode 100644 arch/x86/kernel/cpu/intel_epb.c diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst new file mode 100644 index 000000000000..e9cfa7ec5420 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -0,0 +1,6 @@ +====================================== +Intel Performance and Energy Bias Hint +====================================== + +.. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c + :doc: overview diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index b6cef9b5e961..beb004d3632b 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -8,3 +8,4 @@ Working-State Power Management cpuidle cpufreq intel_pstate + intel_epb diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfd24f9f7614..1796d2bdcaaa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..5e37dfa4d9df 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1864,23 +1864,6 @@ void cpu_init(void) } #endif -static void bsp_resume(void) -{ - if (this_cpu->c_bsp_resume) - this_cpu->c_bsp_resume(&boot_cpu_data); -} - -static struct syscore_ops cpu_syscore_ops = { - .resume = bsp_resume, -}; - -static int __init init_cpu_syscore(void) -{ - register_syscore_ops(&cpu_syscore_ops); - return 0; -} -core_initcall(init_cpu_syscore); - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 5eb946b9a9f3..c0e2407abdd6 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -14,7 +14,6 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); - void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df5..f17c1a714779 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -596,36 +596,6 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } -static void init_intel_energy_perf(struct cpuinfo_x86 *c) -{ - u64 epb; - - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. - * (x86_energy_perf_policy(8) is available to change it at run-time.) - */ - if (!cpu_has(c, X86_FEATURE_EPB)) - return; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) - return; - - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -} - -static void intel_bsp_resume(struct cpuinfo_x86 *c) -{ - /* - * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, - * so reinitialize it properly like during bootup: - */ - init_intel_energy_perf(c); -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -763,8 +733,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TME)) detect_tme(c); - init_intel_energy_perf(c); - init_intel_misc_features(c); } @@ -1023,9 +991,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, - .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 000000000000..8d53cc88bd22 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki + */ + +#include +#include +#include + +#include +#include + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space through + * the generic MSR interface (with the help of the x86_energy_perf_policy tool), + * but there are two reasons for the kernel to touch it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). + */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = ENERGY_PERF_BIAS_NORMAL; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static int intel_epb_online(unsigned int cpu) +{ + intel_epb_restore(); + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + return intel_epb_save(); +} + +static __init int intel_epb_init(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e78281d07b70..dbfdd0fadbef 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -147,6 +147,7 @@ enum cpuhp_state { CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, + CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, -- cgit v1.2.3 From b9c273babce791cf228fc466577f55056a699f9c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2019 23:20:17 +0100 Subject: PM / arch: x86: MSR_IA32_ENERGY_PERF_BIAS sysfs interface The Performance and Energy Bias Hint (EPB) is expected to be set by user space through the generic MSR interface, but that interface is not particularly nice and there are security concerns regarding it, so it is not always available. For this reason, add a sysfs interface for reading and updating the EPB, in the form of a new attribute, energy_perf_bias, located under /sys/devices/system/cpu/cpu#/power/ for online CPUs that support the EPB feature. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hannes Reinecke Acked-by: Borislav Petkov --- Documentation/ABI/testing/sysfs-devices-system-cpu | 18 +++++ Documentation/admin-guide/pm/intel_epb.rst | 27 +++++++ arch/x86/kernel/cpu/intel_epb.c | 93 +++++++++++++++++++++- 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9605dbd4b5b5..7f4af7da3fbc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -518,3 +518,21 @@ Description: Control Symetric Multi Threading (SMT) If control status is "forceoff" or "notsupported" writes are rejected. + +What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias +Date: March 2019 +Contact: linux-pm@vger.kernel.org +Description: Intel Energy and Performance Bias Hint (EPB) + + EPB for the given CPU in a sliding scale 0 - 15, where a value + of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to change the EPB value for the CPU, write either + a number in the 0 - 15 sliding scale above, or one of the + strings: "performance", "balance-performance", "normal", + "balance-power", "power" (that represent values reflected by + their meaning), to this attribute. + + This attribute is present for all online CPUs supporting the + Intel EPB feature. diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst index e9cfa7ec5420..d100849edfc4 100644 --- a/Documentation/admin-guide/pm/intel_epb.rst +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -4,3 +4,30 @@ Intel Performance and Energy Bias Hint .. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c :doc: overview + +Intel Performance and Energy Bias Attribute in ``sysfs`` +======================================================== + +The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU +can be checked or updated through a ``sysfs`` attribute (file) under +:file:`/sys/devices/system/cpu/cpu/power/`, where the CPU number ```` +is allocated at the system initialization time: + +``energy_perf_bias`` + Shows the current EPB value for the CPU in a sliding scale 0 - 15, where + a value of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to update the EPB value for the CPU, this attribute can be + written to, either with a number in the 0 - 15 sliding scale above, or + with one of the strings: "performance", "balance-performance", "normal", + "balance-power", "power" that represent values reflected by their + meaning. + + This attribute is present for all online CPUs supporting the EPB + feature. + +Note that while the EPB interface to the processor is defined at the logical CPU +level, the physical register backing it may be shared by multiple CPUs (for +example, SMT siblings or cores in one package). For this reason, updating the +EPB value for one CPU may cause the EPB values for other CPUs to change. diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 8d53cc88bd22..f4dd73396f28 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -9,8 +9,12 @@ */ #include +#include +#include #include +#include #include +#include #include #include @@ -20,9 +24,9 @@ * * The Performance and Energy Bias Hint (EPB) allows software to specify its * preference with respect to the power-performance tradeoffs present in the - * processor. Generally, the EPB is expected to be set by user space through - * the generic MSR interface (with the help of the x86_energy_perf_policy tool), - * but there are two reasons for the kernel to touch it. + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. * * First, there are systems where the platform firmware resets the EPB during * system-wide transitions from sleep states back into the working state @@ -52,6 +56,7 @@ static DEFINE_PER_CPU(u8, saved_epb); #define EPB_MASK 0x0fULL #define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK static int intel_epb_save(void) { @@ -97,15 +102,95 @@ static struct syscore_ops intel_epb_syscore_ops = { .resume = intel_epb_restore, }; +static const char * const energy_perf_strings[] = { + "performance", + "balance-performance", + "normal", + "balance-power", + "power" +}; +static const u8 energ_perf_values[] = { + ENERGY_PERF_BIAS_PERFORMANCE, + ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + ENERGY_PERF_BIAS_NORMAL, + ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + ENERGY_PERF_BIAS_POWERSAVE +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + static int intel_epb_online(unsigned int cpu) { + struct device *cpu_dev = get_cpu_device(cpu); + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + return 0; } static int intel_epb_offline(unsigned int cpu) { - return intel_epb_save(); + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; } static __init int intel_epb_init(void) -- cgit v1.2.3 From 7973b799dbea1770742851487a98276a24c961a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Apr 2019 00:08:18 +0200 Subject: admin-guide: pm: intel_epb: Add SPDX license tag and copyright notice Add an SPDX license tag and a copyright notice to the intel_epb.rst file under Documentation/admin-quide/pm. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- Documentation/admin-guide/pm/intel_epb.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst index d100849edfc4..005121167af7 100644 --- a/Documentation/admin-guide/pm/intel_epb.rst +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -1,7 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + ====================================== Intel Performance and Energy Bias Hint ====================================== +:Copyright: |copy| 2019 Intel Corporation + +:Author: Rafael J. Wysocki + + .. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c :doc: overview -- cgit v1.2.3 From c208ac8f8f862dba7b01eb54557f4803b3c17296 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 18 Apr 2019 16:11:37 +0200 Subject: x86: tsc: Rework time_cpufreq_notifier() There are problems with running time_cpufreq_notifier() on SMP systems. First off, the rdtsc() called from there runs on the CPU executing that code and not necessarily on the CPU whose sched_clock() rate is updated which is questionable at best. Second, in the cases when the frequencies of all CPUs in an SMP system are always in sync, it is not sufficient to update just one of them or the set associated with a given cpufreq policy on frequency changes - all CPUs in the system should be updated and that would require more than a simple transition notifier. Note, however, that the underlying issue (the TSC rate depending on the CPU frequency) has not been present in hardware shipping for the last few years and in quite a few relevant cases (acpi-cpufreq in particular) running time_cpufreq_notifier() will cause the TSC to be marked as unstable anyway. For this reason, make time_cpufreq_notifier() simply mark the TSC as unstable and give up when run on SMP and only try to carry out any adjustments otherwise. Signed-off-by: Rafael J. Wysocki Reviewed-by: Viresh Kumar --- arch/x86/kernel/tsc.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3fae23834069..cc6df5c6d7b3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -185,8 +185,7 @@ static void __init cyc2ns_init_boot_cpu(void) /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { @@ -937,12 +936,12 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency +/* + * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. + * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC + * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. @@ -956,22 +955,22 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - unsigned long *lpj; - lpj = &boot_cpu_data.loops_per_jiffy; -#ifdef CONFIG_SMP - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#endif + if (num_online_cpus() > 1) { + mark_tsc_unstable("cpufreq changes on SMP"); + return 0; + } if (!ref_freq) { ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; + loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { + boot_cpu_data.loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -- cgit v1.2.3