60 files changed, 2463 insertions, 1005 deletions
diff --git a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt deleted file mode 100644 index 58fc8a6cebc7..000000000000 --- a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt +++ /dev/null @@ -1,212 +0,0 @@ -* Rockchip rk3399 DMC (Dynamic Memory Controller) device - -Required properties: -- compatible: Must be "rockchip,rk3399-dmc". -- devfreq-events: Node to get DDR loading, Refer to - Documentation/devicetree/bindings/devfreq/event/ - rockchip-dfi.txt -- clocks: Phandles for clock specified in "clock-names" property -- clock-names : The name of clock used by the DFI, must be - "pclk_ddr_mon"; -- operating-points-v2: Refer to Documentation/devicetree/bindings/opp/opp-v2.yaml - for details. -- center-supply: DMC supply node. -- status: Marks the node enabled/disabled. -- rockchip,pmu: Phandle to the syscon managing the "PMU general register - files". - -Optional properties: -- interrupts: The CPU interrupt number. The interrupt specifier - format depends on the interrupt controller. - It should be a DCF interrupt. When DDR DVFS finishes - a DCF interrupt is triggered. -- rockchip,pmu: Phandle to the syscon managing the "PMU general register - files". - -Following properties relate to DDR timing: - -- rockchip,dram_speed_bin : Value reference include/dt-bindings/clock/rk3399-ddr.h, - it selects the DDR3 cl-trp-trcd type. It must be - set according to "Speed Bin" in DDR3 datasheet, - DO NOT use a smaller "Speed Bin" than specified - for the DDR3 being used. - -- rockchip,pd_idle : Configure the PD_IDLE value. Defines the - power-down idle period in which memories are - placed into power-down mode if bus is idle - for PD_IDLE DFI clock cycles. - -- rockchip,sr_idle : Configure the SR_IDLE value. Defines the - self-refresh idle period in which memories are - placed into self-refresh mode if bus is idle - for SR_IDLE * 1024 DFI clock cycles (DFI - clocks freq is half of DRAM clock), default - value is "0". - -- rockchip,sr_mc_gate_idle : Defines the memory self-refresh and controller - clock gating idle period. Memories are placed - into self-refresh mode and memory controller - clock arg gating started if bus is idle for - sr_mc_gate_idle*1024 DFI clock cycles. - -- rockchip,srpd_lite_idle : Defines the self-refresh power down idle - period in which memories are placed into - self-refresh power down mode if bus is idle - for srpd_lite_idle * 1024 DFI clock cycles. - This parameter is for LPDDR4 only. - -- rockchip,standby_idle : Defines the standby idle period in which - memories are placed into self-refresh mode. - The controller, pi, PHY and DRAM clock will - be gated if bus is idle for standby_idle * DFI - clock cycles. - -- rockchip,dram_dll_dis_freq : Defines the DDR3 DLL bypass frequency in MHz. - When DDR frequency is less than DRAM_DLL_DISB_FREQ, - DDR3 DLL will be bypassed. Note: if DLL was bypassed, - the odt will also stop working. - -- rockchip,phy_dll_dis_freq : Defines the PHY dll bypass frequency in - MHz (Mega Hz). When DDR frequency is less than - DRAM_DLL_DISB_FREQ, PHY DLL will be bypassed. - Note: PHY DLL and PHY ODT are independent. - -- rockchip,ddr3_odt_dis_freq : When the DRAM type is DDR3, this parameter defines - the ODT disable frequency in MHz (Mega Hz). - when the DDR frequency is less then ddr3_odt_dis_freq, - the ODT on the DRAM side and controller side are - both disabled. - -- rockchip,ddr3_drv : When the DRAM type is DDR3, this parameter defines - the DRAM side driver strength in ohms. 
Default - value is 40. - -- rockchip,ddr3_odt : When the DRAM type is DDR3, this parameter defines - the DRAM side ODT strength in ohms. Default value - is 120. - -- rockchip,phy_ddr3_ca_drv : When the DRAM type is DDR3, this parameter defines - the phy side CA line (incluing command line, - address line and clock line) driver strength. - Default value is 40. - -- rockchip,phy_ddr3_dq_drv : When the DRAM type is DDR3, this parameter defines - the PHY side DQ line (including DQS/DQ/DM line) - driver strength. Default value is 40. - -- rockchip,phy_ddr3_odt : When the DRAM type is DDR3, this parameter defines - the PHY side ODT strength. Default value is 240. - -- rockchip,lpddr3_odt_dis_freq : When the DRAM type is LPDDR3, this parameter defines - then ODT disable frequency in MHz (Mega Hz). - When DDR frequency is less then ddr3_odt_dis_freq, - the ODT on the DRAM side and controller side are - both disabled. - -- rockchip,lpddr3_drv : When the DRAM type is LPDDR3, this parameter defines - the DRAM side driver strength in ohms. Default - value is 34. - -- rockchip,lpddr3_odt : When the DRAM type is LPDDR3, this parameter defines - the DRAM side ODT strength in ohms. Default value - is 240. - -- rockchip,phy_lpddr3_ca_drv : When the DRAM type is LPDDR3, this parameter defines - the PHY side CA line (including command line, - address line and clock line) driver strength. - Default value is 40. - -- rockchip,phy_lpddr3_dq_drv : When the DRAM type is LPDDR3, this parameter defines - the PHY side DQ line (including DQS/DQ/DM line) - driver strength. Default value is 40. - -- rockchip,phy_lpddr3_odt : When dram type is LPDDR3, this parameter define - the phy side odt strength, default value is 240. - -- rockchip,lpddr4_odt_dis_freq : When the DRAM type is LPDDR4, this parameter - defines the ODT disable frequency in - MHz (Mega Hz). When the DDR frequency is less then - ddr3_odt_dis_freq, the ODT on the DRAM side and - controller side are both disabled. - -- rockchip,lpddr4_drv : When the DRAM type is LPDDR4, this parameter defines - the DRAM side driver strength in ohms. Default - value is 60. - -- rockchip,lpddr4_dq_odt : When the DRAM type is LPDDR4, this parameter defines - the DRAM side ODT on DQS/DQ line strength in ohms. - Default value is 40. - -- rockchip,lpddr4_ca_odt : When the DRAM type is LPDDR4, this parameter defines - the DRAM side ODT on CA line strength in ohms. - Default value is 40. - -- rockchip,phy_lpddr4_ca_drv : When the DRAM type is LPDDR4, this parameter defines - the PHY side CA line (including command address - line) driver strength. Default value is 40. - -- rockchip,phy_lpddr4_ck_cs_drv : When the DRAM type is LPDDR4, this parameter defines - the PHY side clock line and CS line driver - strength. Default value is 80. - -- rockchip,phy_lpddr4_dq_drv : When the DRAM type is LPDDR4, this parameter defines - the PHY side DQ line (including DQS/DQ/DM line) - driver strength. Default value is 80. - -- rockchip,phy_lpddr4_odt : When the DRAM type is LPDDR4, this parameter defines - the PHY side ODT strength. Default value is 60. 
- -Example: - dmc_opp_table: dmc_opp_table { - compatible = "operating-points-v2"; - - opp00 { - opp-hz = /bits/ 64 <300000000>; - opp-microvolt = <900000>; - }; - opp01 { - opp-hz = /bits/ 64 <666000000>; - opp-microvolt = <900000>; - }; - }; - - dmc: dmc { - compatible = "rockchip,rk3399-dmc"; - devfreq-events = <&dfi>; - interrupts = <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&cru SCLK_DDRC>; - clock-names = "dmc_clk"; - operating-points-v2 = <&dmc_opp_table>; - center-supply = <&ppvar_centerlogic>; - upthreshold = <15>; - downdifferential = <10>; - rockchip,ddr3_speed_bin = <21>; - rockchip,pd_idle = <0x40>; - rockchip,sr_idle = <0x2>; - rockchip,sr_mc_gate_idle = <0x3>; - rockchip,srpd_lite_idle = <0x4>; - rockchip,standby_idle = <0x2000>; - rockchip,dram_dll_dis_freq = <300>; - rockchip,phy_dll_dis_freq = <125>; - rockchip,auto_pd_dis_freq = <666>; - rockchip,ddr3_odt_dis_freq = <333>; - rockchip,ddr3_drv = <40>; - rockchip,ddr3_odt = <120>; - rockchip,phy_ddr3_ca_drv = <40>; - rockchip,phy_ddr3_dq_drv = <40>; - rockchip,phy_ddr3_odt = <240>; - rockchip,lpddr3_odt_dis_freq = <333>; - rockchip,lpddr3_drv = <34>; - rockchip,lpddr3_odt = <240>; - rockchip,phy_lpddr3_ca_drv = <40>; - rockchip,phy_lpddr3_dq_drv = <40>; - rockchip,phy_lpddr3_odt = <240>; - rockchip,lpddr4_odt_dis_freq = <333>; - rockchip,lpddr4_drv = <60>; - rockchip,lpddr4_dq_odt = <40>; - rockchip,lpddr4_ca_odt = <40>; - rockchip,phy_lpddr4_ca_drv = <40>; - rockchip,phy_lpddr4_ck_cs_drv = <80>; - rockchip,phy_lpddr4_dq_drv = <80>; - rockchip,phy_lpddr4_odt = <60>; - }; diff --git a/Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml b/Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml new file mode 100644 index 000000000000..fb4920397d08 --- /dev/null +++ b/Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml @@ -0,0 +1,384 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# %YAML 1.2 +--- +$id: http://devicetree.org/schemas/memory-controllers/rockchip,rk3399-dmc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Rockchip rk3399 DMC (Dynamic Memory Controller) device + +maintainers: + - Brian Norris <briannorris@chromium.org> + +properties: + compatible: + enum: + - rockchip,rk3399-dmc + + devfreq-events: + $ref: /schemas/types.yaml#/definitions/phandle + description: + Node to get DDR loading. Refer to + Documentation/devicetree/bindings/devfreq/event/rockchip-dfi.txt. + + clocks: + maxItems: 1 + + clock-names: + items: + - const: dmc_clk + + operating-points-v2: true + + center-supply: + description: + DMC regulator supply. + + rockchip,pmu: + $ref: /schemas/types.yaml#/definitions/phandle + description: + Phandle to the syscon managing the "PMU general register files". + + interrupts: + maxItems: 1 + description: + The CPU interrupt number. It should be a DCF interrupt. When DDR DVFS + finishes, a DCF interrupt is triggered. + + rockchip,ddr3_speed_bin: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + For values, reference include/dt-bindings/clock/rk3399-ddr.h. Selects the + DDR3 cl-trp-trcd type. It must be set according to "Speed Bin" in DDR3 + datasheet; DO NOT use a smaller "Speed Bin" than specified for the DDR3 + being used. + + rockchip,pd_idle: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Configure the PD_IDLE value. 
Defines the power-down idle period in which + memories are placed into power-down mode if bus is idle for PD_IDLE DFI + clock cycles. + See also rockchip,pd-idle-ns. + + rockchip,sr_idle: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Configure the SR_IDLE value. Defines the self-refresh idle period in + which memories are placed into self-refresh mode if bus is idle for + SR_IDLE * 1024 DFI clock cycles (DFI clocks freq is half of DRAM clock). + See also rockchip,sr-idle-ns. + default: 0 + + rockchip,sr_mc_gate_idle: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Defines the memory self-refresh and controller clock gating idle period. + Memories are placed into self-refresh mode and memory controller clock + gating is started if bus is idle for sr_mc_gate_idle*1024 DFI clock + cycles. + See also rockchip,sr-mc-gate-idle-ns. + + rockchip,srpd_lite_idle: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Defines the self-refresh power down idle period in which memories are + placed into self-refresh power down mode if bus is idle for + srpd_lite_idle * 1024 DFI clock cycles. This parameter is for LPDDR4 + only. + See also rockchip,srpd-lite-idle-ns. + + rockchip,standby_idle: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Defines the standby idle period in which memories are placed into + self-refresh mode. The controller, pi, PHY and DRAM clock will be gated + if bus is idle for standby_idle * DFI clock cycles. + See also rockchip,standby-idle-ns. + + rockchip,dram_dll_dis_freq: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: | + Defines the DDR3 DLL bypass frequency in MHz. When DDR frequency is less + than DRAM_DLL_DISB_FREQ, DDR3 DLL will be bypassed. + Note: if DLL was bypassed, the ODT will also stop working. + + rockchip,phy_dll_dis_freq: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: | + Defines the PHY DLL bypass frequency in MHz (Mega Hz). When DDR frequency + is less than DRAM_DLL_DISB_FREQ, PHY DLL will be bypassed. + Note: PHY DLL and PHY ODT are independent. + + rockchip,auto_pd_dis_freq: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Defines the auto PD disable frequency in MHz. + + rockchip,ddr3_odt_dis_freq: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 1000000 # In case anyone thought this was MHz. + description: + When the DRAM type is DDR3, this parameter defines the ODT disable + frequency in Hz. When the DDR frequency is less than ddr3_odt_dis_freq, + the ODT on the DRAM side and controller side are both disabled. + + rockchip,ddr3_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is DDR3, this parameter defines the DRAM side drive + strength in ohms. + default: 40 + + rockchip,ddr3_odt: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is DDR3, this parameter defines the DRAM side ODT + strength in ohms. + default: 120 + + rockchip,phy_ddr3_ca_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is DDR3, this parameter defines the PHY side CA line + (including command line, address line and clock line) drive strength.
+ default: 40 + + rockchip,phy_ddr3_dq_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is DDR3, this parameter defines the PHY side DQ line + (including DQS/DQ/DM line) drive strength. + default: 40 + + rockchip,phy_ddr3_odt: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is DDR3, this parameter defines the PHY side ODT + strength. + default: 240 + + rockchip,lpddr3_odt_dis_freq: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 1000000 # In case anyone thought this was MHz. + description: + When the DRAM type is LPDDR3, this parameter defines the ODT disable + frequency in Hz. When DDR frequency is less than lpddr3_odt_dis_freq, the + ODT on the DRAM side and controller side are both disabled. + + rockchip,lpddr3_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR3, this parameter defines the DRAM side drive + strength in ohms. + default: 34 + + rockchip,lpddr3_odt: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR3, this parameter defines the DRAM side ODT + strength in ohms. + default: 240 + + rockchip,phy_lpddr3_ca_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR3, this parameter defines the PHY side CA line + (including command line, address line and clock line) drive strength. + default: 40 + + rockchip,phy_lpddr3_dq_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR3, this parameter defines the PHY side DQ line + (including DQS/DQ/DM line) drive strength. + default: 40 + + rockchip,phy_lpddr3_odt: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR3, this parameter defines the PHY side ODT + strength. Default value is 240. + + rockchip,lpddr4_odt_dis_freq: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 1000000 # In case anyone thought this was MHz. + description: + When the DRAM type is LPDDR4, this parameter defines the ODT disable + frequency in Hz. When the DDR frequency is less than lpddr4_odt_dis_freq, + the ODT on the DRAM side and controller side are both disabled. + + rockchip,lpddr4_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR4, this parameter defines the DRAM side drive + strength in ohms. + default: 60 + + rockchip,lpddr4_dq_odt: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR4, this parameter defines the DRAM side ODT on + DQS/DQ line strength in ohms. + default: 40 + + rockchip,lpddr4_ca_odt: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR4, this parameter defines the DRAM side ODT on + CA line strength in ohms. + default: 40 + + rockchip,phy_lpddr4_ca_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR4, this parameter defines the PHY side CA line + (including command address line) drive strength. + default: 40 + + rockchip,phy_lpddr4_ck_cs_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR4, this parameter defines the PHY side clock + line and CS line drive strength.
+ default: 80 + + rockchip,phy_lpddr4_dq_drv: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR4, this parameter defines the PHY side DQ line + (including DQS/DQ/DM line) drive strength. + default: 80 + + rockchip,phy_lpddr4_odt: + deprecated: true + $ref: /schemas/types.yaml#/definitions/uint32 + description: + When the DRAM type is LPDDR4, this parameter defines the PHY side ODT + strength. + default: 60 + + rockchip,pd-idle-ns: + description: + Configure the PD_IDLE value in nanoseconds. Defines the power-down idle + period in which memories are placed into power-down mode if bus is idle + for PD_IDLE nanoseconds. + + rockchip,sr-idle-ns: + description: + Configure the SR_IDLE value in nanoseconds. Defines the self-refresh idle + period in which memories are placed into self-refresh mode if bus is idle + for SR_IDLE nanoseconds. + default: 0 + + rockchip,sr-mc-gate-idle-ns: + description: + Defines the memory self-refresh and controller clock gating idle period + in nanoseconds. Memories are placed into self-refresh mode and memory + controller clock gating is started if bus is idle for sr_mc_gate_idle + nanoseconds. + + rockchip,srpd-lite-idle-ns: + description: + Defines the self-refresh power down idle period in which memories are + placed into self-refresh power down mode if bus is idle for + srpd_lite_idle nanoseconds. This parameter is for LPDDR4 only. + + rockchip,standby-idle-ns: + description: + Defines the standby idle period in which memories are placed into + self-refresh mode. The controller, pi, PHY and DRAM clock will be gated + if bus is idle for standby_idle nanoseconds. + + rockchip,pd-idle-dis-freq-hz: + description: + Defines the power-down idle disable frequency in Hz. When the DDR + frequency is greater than pd-idle-dis-freq, power-down idle is disabled. + See also rockchip,pd-idle-ns. + + rockchip,sr-idle-dis-freq-hz: + description: + Defines the self-refresh idle disable frequency in Hz. When the DDR + frequency is greater than sr-idle-dis-freq, self-refresh idle is + disabled. See also rockchip,sr-idle-ns. + + rockchip,sr-mc-gate-idle-dis-freq-hz: + description: + Defines the self-refresh and memory-controller clock gating disable + frequency in Hz. When the DDR frequency is greater than + sr-mc-gate-idle-dis-freq, the clock will not be gated when idle. See also + rockchip,sr-mc-gate-idle-ns. + + rockchip,srpd-lite-idle-dis-freq-hz: + description: + Defines the self-refresh power down idle disable frequency in Hz. When + the DDR frequency is greater than srpd-lite-idle-dis-freq, memory will + not be placed into self-refresh power down mode when idle. See also + rockchip,srpd-lite-idle-ns. + + rockchip,standby-idle-dis-freq-hz: + description: + Defines the standby idle disable frequency in Hz. When the DDR frequency + is greater than standby-idle-dis-freq, standby idle is disabled. See also + rockchip,standby-idle-ns.
+ +required: + - compatible + - devfreq-events + - clocks + - clock-names + - operating-points-v2 + - center-supply + +additionalProperties: false + +examples: + - | + #include <dt-bindings/clock/rk3399-cru.h> + #include <dt-bindings/interrupt-controller/arm-gic.h> + memory-controller { + compatible = "rockchip,rk3399-dmc"; + devfreq-events = <&dfi>; + rockchip,pmu = <&pmu>; + interrupts = <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&cru SCLK_DDRC>; + clock-names = "dmc_clk"; + operating-points-v2 = <&dmc_opp_table>; + center-supply = <&ppvar_centerlogic>; + rockchip,pd-idle-ns = <160>; + rockchip,sr-idle-ns = <10240>; + rockchip,sr-mc-gate-idle-ns = <40960>; + rockchip,srpd-lite-idle-ns = <61440>; + rockchip,standby-idle-ns = <81920>; + rockchip,ddr3_odt_dis_freq = <333000000>; + rockchip,lpddr3_odt_dis_freq = <333000000>; + rockchip,lpddr4_odt_dis_freq = <333000000>; + rockchip,pd-idle-dis-freq-hz = <1000000000>; + rockchip,sr-idle-dis-freq-hz = <1000000000>; + rockchip,sr-mc-gate-idle-dis-freq-hz = <1000000000>; + rockchip,srpd-lite-idle-dis-freq-hz = <0>; + rockchip,standby-idle-dis-freq-hz = <928000000>; + }; diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 49549aab41b4..feb257b7f350 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -123,6 +123,26 @@ allows a platform to register EM power values which are reflecting total power (static + dynamic). These power values might be coming directly from experiments and measurements. +Registration of 'artificial' EM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is an option to provide a custom callback for drivers missing detailed +knowledge about the power value for each performance state. The callback +.get_cost() is optional and provides the 'cost' values used by EAS. +This is useful for platforms that only provide information on the relative +efficiency between CPU types, where one could use that information to +create an abstract power model. But even an abstract power model can +sometimes be hard to fit in, given the restrictions on the size of the input +power values. The .get_cost() callback makes it possible to provide 'cost' +values that reflect the relative efficiency of the CPUs, allowing EAS to be +fed a different relationship than the one the EM internal formulas for +calculating 'cost' values would otherwise impose. To register an EM for such +a platform, the driver must set the 'milliwatts' flag to 0 and provide both +the .get_power() and .get_cost() callbacks. The EM framework handles such a +platform properly during registration and sets the EM_PERF_DOMAIN_ARTIFICIAL +flag for it. Other frameworks that use the EM should take special care to +test for this flag and treat it properly.
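+ +A minimal sketch of such an 'artificial' EM registration is shown below. The +foo_* names are hypothetical, and the sketch assumes the EM_ADV_DATA_CB() +helper and the final 'milliwatts' argument of em_dev_register_perf_domain():: + + -> drivers/cpufreq/foo_cpufreq.c + + 01 static int foo_get_power(struct device *dev, unsigned long *mW, + 02 unsigned long *KHz) + 03 { + 04 /* Abstract, not physical, power for the state at *KHz. */ + 05 *mW = foo_estimate_abstract_power(dev, *KHz); + 06 return 0; + 07 } + 08 + 09 static int foo_get_cost(struct device *dev, unsigned long KHz, + 10 unsigned long *cost) + 11 { + 12 /* 'cost' reflecting the relative efficiency of this CPU. */ + 13 *cost = foo_estimate_cost(dev, KHz); + 14 return 0; + 15 } + 16 + 17 static struct em_data_callback em_cb = + 18 EM_ADV_DATA_CB(foo_get_power, foo_get_cost); + 19 + 20 static int foo_cpufreq_init(struct cpufreq_policy *policy) + 21 { + 22 struct device *cpu_dev = get_cpu_device(policy->cpu); + 23 int nr_opp = foo_get_nr_perf_states(policy); + 24 + 25 /* 'milliwatts' set to 0 marks the resulting EM as artificial. */ + 26 return em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, + 27 policy->cpus, 0); + 28 } + +With such a registration, EAS consumes the 'cost' values produced by +.get_cost() instead of values derived from the registered power numbers.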
+ Registration of 'simple' EM ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -181,8 +201,8 @@ EM framework:: -> drivers/cpufreq/foo_cpufreq.c - 01 static int est_power(unsigned long *mW, unsigned long *KHz, - 02 struct device *dev) + 01 static int est_power(struct device *dev, unsigned long *mW, + 02 unsigned long *KHz) 03 { 04 long freq, power; 05 diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 3b46041f2b97..62ed361a4376 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -512,6 +512,7 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) { return &cpu_madt_gicc[cpu]; } +EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc); /* * acpi_map_gic_cpu_interface - parse processor MADT entry diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 14154b8b1720..403e83b4adc8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -319,6 +319,7 @@ /* Run Time Average Power Limiting (RAPL) Interface */ +#define MSR_VR_CURRENT_CONFIG 0x00000601 #define MSR_RAPL_POWER_UNIT 0x00000606 #define MSR_PKG_POWER_LIMIT 0x00000610 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 058e7b924189..907cc98b1938 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1862,7 +1862,7 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820__range_add(addr, size, E820_TYPE_ACPI); + e820__range_add(addr, size, E820_TYPE_NVS); e820__update_table_print(); } diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index fe0000eb7cae..b67d2ee77cd1 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -279,6 +279,20 @@ bool osc_pc_lpi_support_confirmed; EXPORT_SYMBOL_GPL(osc_pc_lpi_support_confirmed); /* + * ACPI 6.2 Section 6.2.11.2 'Platform-Wide OSPM Capabilities': + * Starting with ACPI Specification 6.2, all _CPC registers can be in + * PCC, System Memory, System IO, or Functional Fixed Hardware address + * spaces. OSPM support for this more flexible register space scheme is + * indicated by the “Flexible Address Space for CPPC Registers” _OSC bit. + * + * Otherwise (cf ACPI 6.1, s8.4.7.1.1.X), _CPC registers must be in: + * - PCC or Functional Fixed Hardware address space if defined + * - SystemMemory address space (NULL register) if not defined + */ +bool osc_cpc_flexible_adr_space_confirmed; +EXPORT_SYMBOL_GPL(osc_cpc_flexible_adr_space_confirmed); + +/* * ACPI 6.4 Operating System Capabilities for USB. 
*/ bool osc_sb_native_usb4_support_confirmed; @@ -315,12 +329,15 @@ static void acpi_bus_osc_negotiate_platform_control(void) #endif #ifdef CONFIG_X86 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; - if (boot_cpu_has(X86_FEATURE_HWP)) { - capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; - capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; - } #endif +#ifdef CONFIG_ACPI_CPPC_LIB + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; +#endif + + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_FLEXIBLE_ADR_SPACE; + if (IS_ENABLED(CONFIG_SCHED_MC_PRIO)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT; @@ -341,10 +358,9 @@ static void acpi_bus_osc_negotiate_platform_control(void) return; } -#ifdef CONFIG_X86 - if (boot_cpu_has(X86_FEATURE_HWP)) - osc_sb_cppc_not_supported = !(capbuf_ret[OSC_SUPPORT_DWORD] & - (OSC_SB_CPC_SUPPORT | OSC_SB_CPCV2_SUPPORT)); +#ifdef CONFIG_ACPI_CPPC_LIB + osc_sb_cppc_not_supported = !(capbuf_ret[OSC_SUPPORT_DWORD] & + (OSC_SB_CPC_SUPPORT | OSC_SB_CPCV2_SUPPORT)); #endif /* @@ -366,6 +382,8 @@ static void acpi_bus_osc_negotiate_platform_control(void) capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_PCLPI_SUPPORT; osc_sb_native_usb4_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_NATIVE_USB4_SUPPORT; + osc_cpc_flexible_adr_space_confirmed = + capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPC_FLEXIBLE_ADR_SPACE; } kfree(context.ret.pointer); diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index bc1454789a06..3b299b28a8af 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -100,6 +100,16 @@ static DEFINE_PER_CPU(struct cpc_desc *, cpc_desc_ptr); (cpc)->cpc_entry.reg.space_id == \ ACPI_ADR_SPACE_PLATFORM_COMM) +/* Check if a CPC register is in SystemMemory */ +#define CPC_IN_SYSTEM_MEMORY(cpc) ((cpc)->type == ACPI_TYPE_BUFFER && \ + (cpc)->cpc_entry.reg.space_id == \ + ACPI_ADR_SPACE_SYSTEM_MEMORY) + +/* Check if a CPC register is in SystemIo */ +#define CPC_IN_SYSTEM_IO(cpc) ((cpc)->type == ACPI_TYPE_BUFFER && \ + (cpc)->cpc_entry.reg.space_id == \ + ACPI_ADR_SPACE_SYSTEM_IO) + /* Evaluates to True if reg is a NULL register descriptor */ #define IS_NULL_REG(reg) ((reg)->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY && \ (reg)->address == 0 && \ @@ -424,6 +434,24 @@ bool acpi_cpc_valid(void) } EXPORT_SYMBOL_GPL(acpi_cpc_valid); +bool cppc_allow_fast_switch(void) +{ + struct cpc_register_resource *desired_reg; + struct cpc_desc *cpc_ptr; + int cpu; + + for_each_possible_cpu(cpu) { + cpc_ptr = per_cpu(cpc_desc_ptr, cpu); + desired_reg = &cpc_ptr->cpc_regs[DESIRED_PERF]; + if (!CPC_IN_SYSTEM_MEMORY(desired_reg) && + !CPC_IN_SYSTEM_IO(desired_reg)) + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(cppc_allow_fast_switch); + /** * acpi_get_psd_map - Map the CPUs in the freq domain of a given cpu * @cpu: Find all CPUs that share a domain with cpu. 
@@ -736,6 +764,11 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) if (gas_t->address) { void __iomem *addr; + if (!osc_cpc_flexible_adr_space_confirmed) { + pr_debug("Flexible address space capability not supported\n"); + goto out_free; + } + addr = ioremap(gas_t->address, gas_t->bit_width/8); if (!addr) goto out_free; @@ -758,6 +791,10 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) gas_t->address); goto out_free; } + if (!osc_cpc_flexible_adr_space_confirmed) { + pr_debug("Flexible address space capability not supported\n"); + goto out_free; + } } else { if (gas_t->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE || !cpc_ffh_supported()) { /* Support only PCC, SystemMemory, SystemIO, and FFH type regs. */ @@ -1447,6 +1484,9 @@ EXPORT_SYMBOL_GPL(cppc_set_perf); * transition latency for performance change requests. The closest we have * is the timing information from the PCCT tables which provides the info * on the number and frequency of PCC commands the platform can handle. + * + * If desired_reg is in the SystemMemory or SystemIo ACPI address space, + * then assume there is no latency. */ unsigned int cppc_get_transition_latency(int cpu_num) { @@ -1472,7 +1512,9 @@ unsigned int cppc_get_transition_latency(int cpu_num) return CPUFREQ_ETERNAL; desired_reg = &cpc_desc->cpc_regs[DESIRED_PERF]; - if (!CPC_IN_PCC(desired_reg)) + if (CPC_IN_SYSTEM_MEMORY(desired_reg) || CPC_IN_SYSTEM_IO(desired_reg)) + return 0; + else if (!CPC_IN_PCC(desired_reg)) return CPUFREQ_ETERNAL; if (pcc_ss_id < 0) diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c index bbddb267c2e6..72115917e0bd 100644 --- a/drivers/base/power/common.c +++ b/drivers/base/power/common.c @@ -172,10 +172,10 @@ EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_name); * @dev: Device to detach. * @power_off: Used to indicate whether we should power off the device. * - * This functions will reverse the actions from dev_pm_domain_attach() and - * dev_pm_domain_attach_by_id(), thus it detaches @dev from its PM domain. - * Typically it should be invoked during the remove phase, either from - * subsystem level code or from drivers. + * This functions will reverse the actions from dev_pm_domain_attach(), + * dev_pm_domain_attach_by_id() and dev_pm_domain_attach_by_name(), thus it + * detaches @dev from its PM domain. Typically it should be invoked during the + * remove phase, either from subsystem level code or from drivers. * * Callers must ensure proper synchronization of this function with power * management callbacks. diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 1ee878d126fd..739e52cd4aba 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -131,7 +131,7 @@ static const struct genpd_lock_ops genpd_spin_ops = { #define genpd_is_cpu_domain(genpd) (genpd->flags & GENPD_FLAG_CPU_DOMAIN) #define genpd_is_rpm_always_on(genpd) (genpd->flags & GENPD_FLAG_RPM_ALWAYS_ON) -static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev, +static inline bool irq_safe_dev_in_sleep_domain(struct device *dev, const struct generic_pm_domain *genpd) { bool ret; @@ -139,11 +139,14 @@ static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev, ret = pm_runtime_is_irq_safe(dev) && !genpd_is_irq_safe(genpd); /* - * Warn once if an IRQ safe device is attached to a no sleep domain, as - * to indicate a suboptimal configuration for PM. For an always on - * domain this isn't case, thus don't warn. 
+ * Warn once if an IRQ safe device is attached to a domain, which + * callbacks are allowed to sleep. This indicates a suboptimal + * configuration for PM, but it doesn't matter for an always on domain. */ - if (ret && !genpd_is_always_on(genpd)) + if (genpd_is_always_on(genpd) || genpd_is_rpm_always_on(genpd)) + return ret; + + if (ret) dev_warn_once(dev, "PM domain %s will not be powered off\n", genpd->name); @@ -225,24 +228,23 @@ static void genpd_debug_remove(struct generic_pm_domain *genpd) static void genpd_update_accounting(struct generic_pm_domain *genpd) { - ktime_t delta, now; + u64 delta, now; - now = ktime_get(); - delta = ktime_sub(now, genpd->accounting_time); + now = ktime_get_mono_fast_ns(); + if (now <= genpd->accounting_time) + return; + + delta = now - genpd->accounting_time; /* * If genpd->status is active, it means we are just * out of off and so update the idle time and vice * versa. */ - if (genpd->status == GENPD_STATE_ON) { - int state_idx = genpd->state_idx; - - genpd->states[state_idx].idle_time = - ktime_add(genpd->states[state_idx].idle_time, delta); - } else { - genpd->on_time = ktime_add(genpd->on_time, delta); - } + if (genpd->status == GENPD_STATE_ON) + genpd->states[genpd->state_idx].idle_time += delta; + else + genpd->on_time += delta; genpd->accounting_time = now; } @@ -476,15 +478,16 @@ EXPORT_SYMBOL_GPL(dev_pm_genpd_set_performance_state); */ void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next) { - struct generic_pm_domain_data *gpd_data; struct generic_pm_domain *genpd; + struct gpd_timing_data *td; genpd = dev_to_genpd_safe(dev); if (!genpd) return; - gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); - gpd_data->next_wakeup = next; + td = to_gpd_data(dev->power.subsys_data->domain_data)->td; + if (td) + td->next_wakeup = next; } EXPORT_SYMBOL_GPL(dev_pm_genpd_set_next_wakeup); @@ -506,6 +509,7 @@ static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) if (!genpd->power_on) goto out; + timed = timed && genpd->gd && !genpd->states[state_idx].fwnode; if (!timed) { ret = genpd->power_on(genpd); if (ret) @@ -524,7 +528,7 @@ static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) goto out; genpd->states[state_idx].power_on_latency_ns = elapsed_ns; - genpd->max_off_time_changed = true; + genpd->gd->max_off_time_changed = true; pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n", genpd->name, "on", elapsed_ns); @@ -555,6 +559,7 @@ static int _genpd_power_off(struct generic_pm_domain *genpd, bool timed) if (!genpd->power_off) goto out; + timed = timed && genpd->gd && !genpd->states[state_idx].fwnode; if (!timed) { ret = genpd->power_off(genpd); if (ret) @@ -573,7 +578,7 @@ static int _genpd_power_off(struct generic_pm_domain *genpd, bool timed) goto out; genpd->states[state_idx].power_off_latency_ns = elapsed_ns; - genpd->max_off_time_changed = true; + genpd->gd->max_off_time_changed = true; pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n", genpd->name, "off", elapsed_ns); @@ -649,18 +654,12 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, } list_for_each_entry(pdd, &genpd->dev_list, list_node) { - enum pm_qos_flags_status stat; - - stat = dev_pm_qos_flags(pdd->dev, PM_QOS_FLAG_NO_POWER_OFF); - if (stat > PM_QOS_FLAGS_NONE) - return -EBUSY; - /* * Do not allow PM domain to be powered off, when an IRQ safe * device is part of a non-IRQ safe domain. 
*/ if (!pm_runtime_suspended(pdd->dev) || - irq_safe_dev_in_no_sleep_domain(pdd->dev, genpd)) + irq_safe_dev_in_sleep_domain(pdd->dev, genpd)) not_suspended++; } @@ -775,25 +774,27 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb, dev = gpd_data->base.dev; for (;;) { - struct generic_pm_domain *genpd; + struct generic_pm_domain *genpd = ERR_PTR(-ENODATA); struct pm_domain_data *pdd; + struct gpd_timing_data *td; spin_lock_irq(&dev->power.lock); pdd = dev->power.subsys_data ? dev->power.subsys_data->domain_data : NULL; if (pdd) { - to_gpd_data(pdd)->td.constraint_changed = true; - genpd = dev_to_genpd(dev); - } else { - genpd = ERR_PTR(-ENODATA); + td = to_gpd_data(pdd)->td; + if (td) { + td->constraint_changed = true; + genpd = dev_to_genpd(dev); + } } spin_unlock_irq(&dev->power.lock); if (!IS_ERR(genpd)) { genpd_lock(genpd); - genpd->max_off_time_changed = true; + genpd->gd->max_off_time_changed = true; genpd_unlock(genpd); } @@ -879,9 +880,9 @@ static int genpd_runtime_suspend(struct device *dev) struct generic_pm_domain *genpd; bool (*suspend_ok)(struct device *__dev); struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev); - struct gpd_timing_data *td = &gpd_data->td; + struct gpd_timing_data *td = gpd_data->td; bool runtime_pm = pm_runtime_enabled(dev); - ktime_t time_start; + ktime_t time_start = 0; s64 elapsed_ns; int ret; @@ -902,8 +903,7 @@ static int genpd_runtime_suspend(struct device *dev) return -EBUSY; /* Measure suspend latency. */ - time_start = 0; - if (runtime_pm) + if (td && runtime_pm) time_start = ktime_get(); ret = __genpd_runtime_suspend(dev); @@ -917,13 +917,13 @@ static int genpd_runtime_suspend(struct device *dev) } /* Update suspend latency value if the measured time exceeds it. */ - if (runtime_pm) { + if (td && runtime_pm) { elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start)); if (elapsed_ns > td->suspend_latency_ns) { td->suspend_latency_ns = elapsed_ns; dev_dbg(dev, "suspend latency exceeded, %lld ns\n", elapsed_ns); - genpd->max_off_time_changed = true; + genpd->gd->max_off_time_changed = true; td->constraint_changed = true; } } @@ -932,7 +932,7 @@ static int genpd_runtime_suspend(struct device *dev) * If power.irq_safe is set, this routine may be run with * IRQs disabled, so suspend only if the PM domain also is irq_safe. */ - if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) + if (irq_safe_dev_in_sleep_domain(dev, genpd)) return 0; genpd_lock(genpd); @@ -955,12 +955,11 @@ static int genpd_runtime_resume(struct device *dev) { struct generic_pm_domain *genpd; struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev); - struct gpd_timing_data *td = &gpd_data->td; - bool runtime_pm = pm_runtime_enabled(dev); - ktime_t time_start; + struct gpd_timing_data *td = gpd_data->td; + bool timed = td && pm_runtime_enabled(dev); + ktime_t time_start = 0; s64 elapsed_ns; int ret; - bool timed = true; dev_dbg(dev, "%s()\n", __func__); @@ -972,10 +971,8 @@ static int genpd_runtime_resume(struct device *dev) * As we don't power off a non IRQ safe domain, which holds * an IRQ safe device, we don't need to restore power to it. */ - if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) { - timed = false; + if (irq_safe_dev_in_sleep_domain(dev, genpd)) goto out; - } genpd_lock(genpd); ret = genpd_power_on(genpd, 0); @@ -988,8 +985,7 @@ static int genpd_runtime_resume(struct device *dev) out: /* Measure resume latency. 
*/ - time_start = 0; - if (timed && runtime_pm) + if (timed) time_start = ktime_get(); ret = genpd_start_dev(genpd, dev); @@ -1001,13 +997,13 @@ static int genpd_runtime_resume(struct device *dev) goto err_stop; /* Update resume latency value if the measured time exceeds it. */ - if (timed && runtime_pm) { + if (timed) { elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start)); if (elapsed_ns > td->resume_latency_ns) { td->resume_latency_ns = elapsed_ns; dev_dbg(dev, "resume latency exceeded, %lld ns\n", elapsed_ns); - genpd->max_off_time_changed = true; + genpd->gd->max_off_time_changed = true; td->constraint_changed = true; } } @@ -1500,9 +1496,11 @@ EXPORT_SYMBOL_GPL(dev_pm_genpd_resume); #endif /* CONFIG_PM_SLEEP */ -static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev) +static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev, + bool has_governor) { struct generic_pm_domain_data *gpd_data; + struct gpd_timing_data *td; int ret; ret = dev_pm_get_subsys_data(dev); @@ -1516,26 +1514,38 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev) } gpd_data->base.dev = dev; - gpd_data->td.constraint_changed = true; - gpd_data->td.effective_constraint_ns = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS; gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier; - gpd_data->next_wakeup = KTIME_MAX; - spin_lock_irq(&dev->power.lock); + /* Allocate data used by a governor. */ + if (has_governor) { + td = kzalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + ret = -ENOMEM; + goto err_free; + } - if (dev->power.subsys_data->domain_data) { - ret = -EINVAL; - goto err_free; + td->constraint_changed = true; + td->effective_constraint_ns = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS; + td->next_wakeup = KTIME_MAX; + gpd_data->td = td; } - dev->power.subsys_data->domain_data = &gpd_data->base; + spin_lock_irq(&dev->power.lock); + + if (dev->power.subsys_data->domain_data) + ret = -EINVAL; + else + dev->power.subsys_data->domain_data = &gpd_data->base; spin_unlock_irq(&dev->power.lock); + if (ret) + goto err_free; + return gpd_data; err_free: - spin_unlock_irq(&dev->power.lock); + kfree(gpd_data->td); kfree(gpd_data); err_put: dev_pm_put_subsys_data(dev); @@ -1551,6 +1561,7 @@ static void genpd_free_dev_data(struct device *dev, spin_unlock_irq(&dev->power.lock); + kfree(gpd_data->td); kfree(gpd_data); dev_pm_put_subsys_data(dev); } @@ -1607,6 +1618,7 @@ static int genpd_get_cpu(struct generic_pm_domain *genpd, struct device *dev) static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, struct device *base_dev) { + struct genpd_governor_data *gd = genpd->gd; struct generic_pm_domain_data *gpd_data; int ret; @@ -1615,7 +1627,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev)) return -EINVAL; - gpd_data = genpd_alloc_dev_data(dev); + gpd_data = genpd_alloc_dev_data(dev, gd); if (IS_ERR(gpd_data)) return PTR_ERR(gpd_data); @@ -1631,7 +1643,8 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, dev_pm_domain_set(dev, &genpd->domain); genpd->device_count++; - genpd->max_off_time_changed = true; + if (gd) + gd->max_off_time_changed = true; list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); @@ -1685,7 +1698,8 @@ static int genpd_remove_device(struct generic_pm_domain *genpd, } genpd->device_count--; - genpd->max_off_time_changed = true; + if (genpd->gd) + genpd->gd->max_off_time_changed = true; 
genpd_clear_cpumask(genpd, gpd_data->cpu); dev_pm_domain_set(dev, NULL); @@ -1958,6 +1972,53 @@ static int genpd_set_default_power_state(struct generic_pm_domain *genpd) return 0; } +static int genpd_alloc_data(struct generic_pm_domain *genpd) +{ + struct genpd_governor_data *gd = NULL; + int ret; + + if (genpd_is_cpu_domain(genpd) && + !zalloc_cpumask_var(&genpd->cpus, GFP_KERNEL)) + return -ENOMEM; + + if (genpd->gov) { + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) { + ret = -ENOMEM; + goto free; + } + + gd->max_off_time_ns = -1; + gd->max_off_time_changed = true; + gd->next_wakeup = KTIME_MAX; + } + + /* Use only one "off" state if there were no states declared */ + if (genpd->state_count == 0) { + ret = genpd_set_default_power_state(genpd); + if (ret) + goto free; + } + + genpd->gd = gd; + return 0; + +free: + if (genpd_is_cpu_domain(genpd)) + free_cpumask_var(genpd->cpus); + kfree(gd); + return ret; +} + +static void genpd_free_data(struct generic_pm_domain *genpd) +{ + if (genpd_is_cpu_domain(genpd)) + free_cpumask_var(genpd->cpus); + if (genpd->free_states) + genpd->free_states(genpd->states, genpd->state_count); + kfree(genpd->gd); +} + static void genpd_lock_init(struct generic_pm_domain *genpd) { if (genpd->flags & GENPD_FLAG_IRQ_SAFE) { @@ -1995,11 +2056,9 @@ int pm_genpd_init(struct generic_pm_domain *genpd, atomic_set(&genpd->sd_count, 0); genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON; genpd->device_count = 0; - genpd->max_off_time_ns = -1; - genpd->max_off_time_changed = true; genpd->provider = NULL; genpd->has_provider = false; - genpd->accounting_time = ktime_get(); + genpd->accounting_time = ktime_get_mono_fast_ns(); genpd->domain.ops.runtime_suspend = genpd_runtime_suspend; genpd->domain.ops.runtime_resume = genpd_runtime_resume; genpd->domain.ops.prepare = genpd_prepare; @@ -2017,26 +2076,22 @@ int pm_genpd_init(struct generic_pm_domain *genpd, genpd->dev_ops.start = pm_clk_resume; } + /* The always-on governor works better with the corresponding flag. */ + if (gov == &pm_domain_always_on_gov) + genpd->flags |= GENPD_FLAG_RPM_ALWAYS_ON; + /* Always-on domains must be powered on at initialization. */ if ((genpd_is_always_on(genpd) || genpd_is_rpm_always_on(genpd)) && !genpd_status_on(genpd)) return -EINVAL; - if (genpd_is_cpu_domain(genpd) && - !zalloc_cpumask_var(&genpd->cpus, GFP_KERNEL)) - return -ENOMEM; - - /* Use only one "off" state if there were no states declared */ - if (genpd->state_count == 0) { - ret = genpd_set_default_power_state(genpd); - if (ret) { - if (genpd_is_cpu_domain(genpd)) - free_cpumask_var(genpd->cpus); - return ret; - } - } else if (!gov && genpd->state_count > 1) { + /* Multiple states but no governor doesn't make sense. 
*/ + if (!gov && genpd->state_count > 1) pr_warn("%s: no governor for states\n", genpd->name); - } + + ret = genpd_alloc_data(genpd); + if (ret) + return ret; device_initialize(&genpd->dev); dev_set_name(&genpd->dev, "%s", genpd->name); @@ -2081,10 +2136,7 @@ static int genpd_remove(struct generic_pm_domain *genpd) genpd_unlock(genpd); genpd_debug_remove(genpd); cancel_work_sync(&genpd->power_off_work); - if (genpd_is_cpu_domain(genpd)) - free_cpumask_var(genpd->cpus); - if (genpd->free_states) - genpd->free_states(genpd->states, genpd->state_count); + genpd_free_data(genpd); pr_debug("%s: removed %s\n", __func__, genpd->name); @@ -3163,6 +3215,7 @@ static int sub_domains_show(struct seq_file *s, void *data) static int idle_states_show(struct seq_file *s, void *data) { struct generic_pm_domain *genpd = s->private; + u64 now, delta, idle_time = 0; unsigned int i; int ret = 0; @@ -3173,17 +3226,19 @@ static int idle_states_show(struct seq_file *s, void *data) seq_puts(s, "State Time Spent(ms) Usage Rejected\n"); for (i = 0; i < genpd->state_count; i++) { - ktime_t delta = 0; - s64 msecs; + idle_time += genpd->states[i].idle_time; - if ((genpd->status == GENPD_STATE_OFF) && - (genpd->state_idx == i)) - delta = ktime_sub(ktime_get(), genpd->accounting_time); + if (genpd->status == GENPD_STATE_OFF && genpd->state_idx == i) { + now = ktime_get_mono_fast_ns(); + if (now > genpd->accounting_time) { + delta = now - genpd->accounting_time; + idle_time += delta; + } + } - msecs = ktime_to_ms( - ktime_add(genpd->states[i].idle_time, delta)); - seq_printf(s, "S%-13i %-14lld %-14llu %llu\n", i, msecs, - genpd->states[i].usage, genpd->states[i].rejected); + do_div(idle_time, NSEC_PER_MSEC); + seq_printf(s, "S%-13i %-14llu %-14llu %llu\n", i, idle_time, + genpd->states[i].usage, genpd->states[i].rejected); } genpd_unlock(genpd); @@ -3193,18 +3248,22 @@ static int idle_states_show(struct seq_file *s, void *data) static int active_time_show(struct seq_file *s, void *data) { struct generic_pm_domain *genpd = s->private; - ktime_t delta = 0; + u64 now, on_time, delta = 0; int ret = 0; ret = genpd_lock_interruptible(genpd); if (ret) return -ERESTARTSYS; - if (genpd->status == GENPD_STATE_ON) - delta = ktime_sub(ktime_get(), genpd->accounting_time); + if (genpd->status == GENPD_STATE_ON) { + now = ktime_get_mono_fast_ns(); + if (now > genpd->accounting_time) + delta = now - genpd->accounting_time; + } - seq_printf(s, "%lld ms\n", ktime_to_ms( - ktime_add(genpd->on_time, delta))); + on_time = genpd->on_time + delta; + do_div(on_time, NSEC_PER_MSEC); + seq_printf(s, "%llu ms\n", on_time); genpd_unlock(genpd); return ret; @@ -3213,7 +3272,7 @@ static int active_time_show(struct seq_file *s, void *data) static int total_idle_time_show(struct seq_file *s, void *data) { struct generic_pm_domain *genpd = s->private; - ktime_t delta = 0, total = 0; + u64 now, delta, total = 0; unsigned int i; int ret = 0; @@ -3222,16 +3281,19 @@ static int total_idle_time_show(struct seq_file *s, void *data) return -ERESTARTSYS; for (i = 0; i < genpd->state_count; i++) { + total += genpd->states[i].idle_time; - if ((genpd->status == GENPD_STATE_OFF) && - (genpd->state_idx == i)) - delta = ktime_sub(ktime_get(), genpd->accounting_time); - - total = ktime_add(total, genpd->states[i].idle_time); + if (genpd->status == GENPD_STATE_OFF && genpd->state_idx == i) { + now = ktime_get_mono_fast_ns(); + if (now > genpd->accounting_time) { + delta = now - genpd->accounting_time; + total += delta; + } + } } - total = ktime_add(total, delta); - 
seq_printf(s, "%lld ms\n", ktime_to_ms(total)); + do_div(total, NSEC_PER_MSEC); + seq_printf(s, "%llu ms\n", total); genpd_unlock(genpd); return ret; diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c index cd08c5885190..282a3a135827 100644 --- a/drivers/base/power/domain_governor.c +++ b/drivers/base/power/domain_governor.c @@ -18,6 +18,8 @@ static int dev_update_qos_constraint(struct device *dev, void *data) s64 constraint_ns; if (dev->power.subsys_data && dev->power.subsys_data->domain_data) { + struct gpd_timing_data *td = dev_gpd_data(dev)->td; + /* * Only take suspend-time QoS constraints of devices into * account, because constraints updated after the device has @@ -25,7 +27,8 @@ static int dev_update_qos_constraint(struct device *dev, void *data) * anyway. In order for them to take effect, the device has to * be resumed and suspended again. */ - constraint_ns = dev_gpd_data(dev)->td.effective_constraint_ns; + constraint_ns = td ? td->effective_constraint_ns : + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS; } else { /* * The child is not in a domain and there's no info on its @@ -49,7 +52,7 @@ static int dev_update_qos_constraint(struct device *dev, void *data) */ static bool default_suspend_ok(struct device *dev) { - struct gpd_timing_data *td = &dev_gpd_data(dev)->td; + struct gpd_timing_data *td = dev_gpd_data(dev)->td; unsigned long flags; s64 constraint_ns; @@ -136,26 +139,28 @@ static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t n * is able to enter its optimal idle state. */ list_for_each_entry(pdd, &genpd->dev_list, list_node) { - next_wakeup = to_gpd_data(pdd)->next_wakeup; + next_wakeup = to_gpd_data(pdd)->td->next_wakeup; if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now)) if (ktime_before(next_wakeup, domain_wakeup)) domain_wakeup = next_wakeup; } list_for_each_entry(link, &genpd->parent_links, parent_node) { - next_wakeup = link->child->next_wakeup; + struct genpd_governor_data *cgd = link->child->gd; + + next_wakeup = cgd ? cgd->next_wakeup : KTIME_MAX; if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now)) if (ktime_before(next_wakeup, domain_wakeup)) domain_wakeup = next_wakeup; } - genpd->next_wakeup = domain_wakeup; + genpd->gd->next_wakeup = domain_wakeup; } static bool next_wakeup_allows_state(struct generic_pm_domain *genpd, unsigned int state, ktime_t now) { - ktime_t domain_wakeup = genpd->next_wakeup; + ktime_t domain_wakeup = genpd->gd->next_wakeup; s64 idle_time_ns, min_sleep_ns; min_sleep_ns = genpd->states[state].power_off_latency_ns + @@ -185,8 +190,9 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, * All subdomains have been powered off already at this point. */ list_for_each_entry(link, &genpd->parent_links, parent_node) { - struct generic_pm_domain *sd = link->child; - s64 sd_max_off_ns = sd->max_off_time_ns; + struct genpd_governor_data *cgd = link->child->gd; + + s64 sd_max_off_ns = cgd ? cgd->max_off_time_ns : -1; if (sd_max_off_ns < 0) continue; @@ -215,7 +221,7 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, * domain to turn off and on (that's how much time it will * have to wait worst case). 
*/ - td = &to_gpd_data(pdd)->td; + td = to_gpd_data(pdd)->td; constraint_ns = td->effective_constraint_ns; /* * Zero means "no suspend at all" and this runs only when all @@ -244,7 +250,7 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, * time and the time needed to turn the domain on is the maximum * theoretical time this domain can spend in the "off" state. */ - genpd->max_off_time_ns = min_off_time_ns - + genpd->gd->max_off_time_ns = min_off_time_ns - genpd->states[state].power_on_latency_ns; return true; } @@ -259,6 +265,7 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now) { struct generic_pm_domain *genpd = pd_to_genpd(pd); + struct genpd_governor_data *gd = genpd->gd; int state_idx = genpd->state_count - 1; struct gpd_link *link; @@ -269,11 +276,11 @@ static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now) * cannot be met. */ update_domain_next_wakeup(genpd, now); - if ((genpd->flags & GENPD_FLAG_MIN_RESIDENCY) && (genpd->next_wakeup != KTIME_MAX)) { + if ((genpd->flags & GENPD_FLAG_MIN_RESIDENCY) && (gd->next_wakeup != KTIME_MAX)) { /* Let's find out the deepest domain idle state, the devices prefer */ while (state_idx >= 0) { if (next_wakeup_allows_state(genpd, state_idx, now)) { - genpd->max_off_time_changed = true; + gd->max_off_time_changed = true; break; } state_idx--; @@ -281,14 +288,14 @@ static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now) if (state_idx < 0) { state_idx = 0; - genpd->cached_power_down_ok = false; + gd->cached_power_down_ok = false; goto done; } } - if (!genpd->max_off_time_changed) { - genpd->state_idx = genpd->cached_power_down_state_idx; - return genpd->cached_power_down_ok; + if (!gd->max_off_time_changed) { + genpd->state_idx = gd->cached_power_down_state_idx; + return gd->cached_power_down_ok; } /* @@ -297,12 +304,16 @@ static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now) * going to be called for any parent until this instance * returns. 
*/ - list_for_each_entry(link, &genpd->child_links, child_node) - link->parent->max_off_time_changed = true; + list_for_each_entry(link, &genpd->child_links, child_node) { + struct genpd_governor_data *pgd = link->parent->gd; + + if (pgd) + pgd->max_off_time_changed = true; + } - genpd->max_off_time_ns = -1; - genpd->max_off_time_changed = false; - genpd->cached_power_down_ok = true; + gd->max_off_time_ns = -1; + gd->max_off_time_changed = false; + gd->cached_power_down_ok = true; /* * Find a state to power down to, starting from the state @@ -310,7 +321,7 @@ static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now) */ while (!__default_power_down_ok(pd, state_idx)) { if (state_idx == 0) { - genpd->cached_power_down_ok = false; + gd->cached_power_down_ok = false; break; } state_idx--; @@ -318,8 +329,8 @@ static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now) done: genpd->state_idx = state_idx; - genpd->cached_power_down_state_idx = genpd->state_idx; - return genpd->cached_power_down_ok; + gd->cached_power_down_state_idx = genpd->state_idx; + return gd->cached_power_down_ok; } static bool default_power_down_ok(struct dev_pm_domain *pd) @@ -327,11 +338,6 @@ static bool default_power_down_ok(struct dev_pm_domain *pd) return _default_power_down_ok(pd, ktime_get()); } -static bool always_on_power_down_ok(struct dev_pm_domain *domain) -{ - return false; -} - #ifdef CONFIG_CPU_IDLE static bool cpu_power_down_ok(struct dev_pm_domain *pd) { @@ -401,6 +407,5 @@ struct dev_power_governor simple_qos_governor = { * pm_genpd_gov_always_on - A governor implementing an always-on policy */ struct dev_power_governor pm_domain_always_on_gov = { - .power_down_ok = always_on_power_down_ok, .suspend_ok = default_suspend_ok, }; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index d4059e6ffeae..676dc72d912d 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -263,7 +263,7 @@ static int rpm_check_suspend_allowed(struct device *dev) retval = -EINVAL; else if (dev->power.disable_depth > 0) retval = -EACCES; - else if (atomic_read(&dev->power.usage_count) > 0) + else if (atomic_read(&dev->power.usage_count)) retval = -EAGAIN; else if (!dev->power.ignore_children && atomic_read(&dev->power.child_count)) @@ -1039,13 +1039,33 @@ int pm_schedule_suspend(struct device *dev, unsigned int delay) } EXPORT_SYMBOL_GPL(pm_schedule_suspend); +static int rpm_drop_usage_count(struct device *dev) +{ + int ret; + + ret = atomic_sub_return(1, &dev->power.usage_count); + if (ret >= 0) + return ret; + + /* + * Because rpm_resume() does not check the usage counter, it will resume + * the device even if the usage counter is 0 or negative, so it is + * sufficient to increment the usage counter here to reverse the change + * made above. + */ + atomic_inc(&dev->power.usage_count); + dev_warn(dev, "Runtime PM usage count underflow!\n"); + return -EINVAL; +} + /** * __pm_runtime_idle - Entry point for runtime idle operations. * @dev: Device to send idle notification for. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and - * return immediately if it is larger than zero. Then carry out an idle + * return immediately if it is larger than zero (if it becomes negative, log a + * warning, increment it, and return an error). Then carry out an idle * notification, either synchronous or asynchronous. 
* * This routine may be called in atomic context if the RPM_ASYNC flag is set, @@ -1057,7 +1077,10 @@ int __pm_runtime_idle(struct device *dev, int rpmflags) int retval; if (rpmflags & RPM_GET_PUT) { - if (!atomic_dec_and_test(&dev->power.usage_count)) { + retval = rpm_drop_usage_count(dev); + if (retval < 0) { + return retval; + } else if (retval > 0) { trace_rpm_usage_rcuidle(dev, rpmflags); return 0; } @@ -1079,7 +1102,8 @@ EXPORT_SYMBOL_GPL(__pm_runtime_idle); * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and - * return immediately if it is larger than zero. Then carry out a suspend, + * return immediately if it is larger than zero (if it becomes negative, log a + * warning, increment it, and return an error). Then carry out a suspend, * either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, @@ -1091,7 +1115,10 @@ int __pm_runtime_suspend(struct device *dev, int rpmflags) int retval; if (rpmflags & RPM_GET_PUT) { - if (!atomic_dec_and_test(&dev->power.usage_count)) { + retval = rpm_drop_usage_count(dev); + if (retval < 0) { + return retval; + } else if (retval > 0) { trace_rpm_usage_rcuidle(dev, rpmflags); return 0; } @@ -1210,12 +1237,13 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) { struct device *parent = dev->parent; bool notify_parent = false; + unsigned long flags; int error = 0; if (status != RPM_ACTIVE && status != RPM_SUSPENDED) return -EINVAL; - spin_lock_irq(&dev->power.lock); + spin_lock_irqsave(&dev->power.lock, flags); /* * Prevent PM-runtime from being enabled for the device or return an @@ -1226,7 +1254,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) else error = -EAGAIN; - spin_unlock_irq(&dev->power.lock); + spin_unlock_irqrestore(&dev->power.lock, flags); if (error) return error; @@ -1247,7 +1275,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) device_links_read_unlock(idx); } - spin_lock_irq(&dev->power.lock); + spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.runtime_status == status || !parent) goto out_set; @@ -1288,7 +1316,7 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) dev->power.runtime_error = 0; out: - spin_unlock_irq(&dev->power.lock); + spin_unlock_irqrestore(&dev->power.lock, flags); if (notify_parent) pm_request_idle(parent); @@ -1527,14 +1555,17 @@ EXPORT_SYMBOL_GPL(pm_runtime_forbid); */ void pm_runtime_allow(struct device *dev) { + int ret; + spin_lock_irq(&dev->power.lock); if (dev->power.runtime_auto) goto out; dev->power.runtime_auto = true; - if (atomic_dec_and_test(&dev->power.usage_count)) + ret = rpm_drop_usage_count(dev); + if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); - else + else if (ret > 0) trace_rpm_usage_rcuidle(dev, RPM_AUTO | RPM_ASYNC); out: diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 82d370ae6a4a..d092c9bb4ba3 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -389,6 +389,27 @@ static int cppc_cpufreq_set_target(struct cpufreq_policy *policy, return ret; } +static unsigned int cppc_cpufreq_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + unsigned int cpu = policy->cpu; + u32 desired_perf; + int ret; + + desired_perf = cppc_cpufreq_khz_to_perf(cpu_data, target_freq); + cpu_data->perf_ctrls.desired_perf = desired_perf; + ret = cppc_set_perf(cpu, 
&cpu_data->perf_ctrls); + + if (ret) { + pr_debug("Failed to set target on CPU:%d. ret:%d\n", + cpu, ret); + return 0; + } + + return target_freq; +} + static int cppc_verify_policy(struct cpufreq_policy_data *policy) { cpufreq_verify_within_cpu_limits(policy); @@ -420,12 +441,197 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; } +static DEFINE_PER_CPU(unsigned int, efficiency_class); +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy); + +/* Create an artificial performance state every CPPC_EM_CAP_STEP capacity unit. */ +#define CPPC_EM_CAP_STEP (20) +/* Increase the cost value by CPPC_EM_COST_STEP every performance state. */ +#define CPPC_EM_COST_STEP (1) +/* Add a cost gap corresponding to the energy of 4 CPUs. */ +#define CPPC_EM_COST_GAP (4 * SCHED_CAPACITY_SCALE * CPPC_EM_COST_STEP \ + / CPPC_EM_CAP_STEP) + +static unsigned int get_perf_level_count(struct cpufreq_policy *policy) +{ + struct cppc_perf_caps *perf_caps; + unsigned int min_cap, max_cap; + struct cppc_cpudata *cpu_data; + int cpu = policy->cpu; + + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, perf_caps->highest_perf); + if ((min_cap == 0) || (max_cap < min_cap)) + return 0; + return 1 + max_cap / CPPC_EM_CAP_STEP - min_cap / CPPC_EM_CAP_STEP; +} + +/* + * The cost is defined as: + * cost = power * max_frequency / frequency + */ +static inline unsigned long compute_cost(int cpu, int step) +{ + return CPPC_EM_COST_GAP * per_cpu(efficiency_class, cpu) + + step * CPPC_EM_COST_STEP; +} + +static int cppc_get_cpu_power(struct device *cpu_dev, + unsigned long *power, unsigned long *KHz) +{ + unsigned long perf_step, perf_prev, perf, perf_check; + unsigned int min_step, max_step, step, step_check; + unsigned long prev_freq = *KHz; + unsigned int min_cap, max_cap; + struct cpufreq_policy *policy; + + struct cppc_perf_caps *perf_caps; + struct cppc_cpudata *cpu_data; + + policy = cpufreq_cpu_get_raw(cpu_dev->id); + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + min_cap = div_u64(max_cap * perf_caps->lowest_perf, + perf_caps->highest_perf); + + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + min_step = min_cap / CPPC_EM_CAP_STEP; + max_step = max_cap / CPPC_EM_CAP_STEP; + + perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step = perf_prev / perf_step; + + if (step > max_step) + return -EINVAL; + + if (min_step == max_step) { + step = max_step; + perf = perf_caps->highest_perf; + } else if (step < min_step) { + step = min_step; + perf = perf_caps->lowest_perf; + } else { + step++; + if (step == max_step) + perf = perf_caps->highest_perf; + else + perf = step * perf_step; + } + + *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); + perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step_check = perf_check / perf_step; + + /* + * To avoid a bad integer approximation, check that the new frequency value + * increased and that the new frequency will be converted to the + * desired step value. + */ + while ((*KHz == prev_freq) || (step_check != step)) { + perf++; + *KHz = cppc_cpufreq_perf_to_khz(cpu_data, perf); + perf_check = cppc_cpufreq_khz_to_perf(cpu_data, *KHz); + step_check = perf_check / perf_step; + } + + /* + * With an artificial EM, only the cost value is used.
Still, the power + * is populated such that 0 < power < EM_MAX_POWER. This makes the + * artificial performance states more meaningful. + */ + *power = compute_cost(cpu_dev->id, step); + + return 0; +} + +static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, + unsigned long *cost) +{ + unsigned long perf_step, perf_prev; + struct cppc_perf_caps *perf_caps; + struct cpufreq_policy *policy; + struct cppc_cpudata *cpu_data; + unsigned int max_cap; + int step; + + policy = cpufreq_cpu_get_raw(cpu_dev->id); + cpu_data = policy->driver_data; + perf_caps = &cpu_data->perf_caps; + max_cap = arch_scale_cpu_capacity(cpu_dev->id); + + perf_prev = cppc_cpufreq_khz_to_perf(cpu_data, KHz); + perf_step = CPPC_EM_CAP_STEP * perf_caps->highest_perf / max_cap; + step = perf_prev / perf_step; + + *cost = compute_cost(cpu_dev->id, step); + + return 0; +} + +static int populate_efficiency_class(void) +{ + struct acpi_madt_generic_interrupt *gicc; + DECLARE_BITMAP(used_classes, 256) = {}; + int class, cpu, index; + + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + class = gicc->efficiency_class; + bitmap_set(used_classes, class, 1); + } + + if (bitmap_weight(used_classes, 256) <= 1) { + pr_debug("Efficiency classes are all equal (=%d). " "No EM registered", class); + return -EINVAL; + } + + /* + * Squeeze efficiency class values into [0:#efficiency_class-1]. + * Per the spec, values are in [0:255]. + */ + index = 0; + for_each_set_bit(class, used_classes, 256) { + for_each_possible_cpu(cpu) { + gicc = acpi_cpu_get_madt_gicc(cpu); + if (gicc->efficiency_class == class) + per_cpu(efficiency_class, cpu) = index; + } + index++; + } + cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; + + return 0; +} + +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data; + struct em_data_callback em_cb = + EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); + + cpu_data = policy->driver_data; + em_dev_register_perf_domain(get_cpu_device(policy->cpu), + get_perf_level_count(policy), &em_cb, + cpu_data->shared_cpu_map, 0); +} + #else static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) { return cppc_get_transition_latency(cpu) / NSEC_PER_USEC; } +static int populate_efficiency_class(void) +{ + return 0; +} +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ +} #endif @@ -536,6 +742,9 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) goto out; } + policy->fast_switch_possible = cppc_allow_fast_switch(); + policy->dvfs_possible_from_any_cpu = true; + /* * If 'highest_perf' is greater than 'nominal_perf', we assume CPU Boost * is supported.
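To make the cost model above concrete, here is a standalone sketch of the arithmetic (illustrative only; it assumes SCHED_CAPACITY_SCALE is 1024, its usual value, and mirrors the CPPC_EM_* macros and compute_cost() from the patch):

/* CPPC_EM_COST_GAP = 4 * 1024 * 1 / 20 = 204 with the values above. */
static unsigned long example_cost(unsigned int efficiency_class, int step)
{
	return 204 * efficiency_class + step * 1;
}

/*
 * example_cost(0, 10) == 10 and example_cost(1, 10) == 214. Since step
 * can be at most max_cap / CPPC_EM_CAP_STEP = 1024 / 20 = 51, every
 * state of a higher (less efficient) class costs more than any state of
 * a lower class, which is the ordering the energy model relies on.
 */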
@@ -681,6 +890,7 @@ static struct cpufreq_driver cppc_cpufreq_driver = { .verify = cppc_verify_policy, .target = cppc_cpufreq_set_target, .get = cppc_cpufreq_get_rate, + .fast_switch = cppc_cpufreq_fast_switch, .init = cppc_cpufreq_cpu_init, .exit = cppc_cpufreq_cpu_exit, .set_boost = cppc_cpufreq_set_boost, @@ -742,6 +952,7 @@ static int __init cppc_cpufreq_init(void) cppc_check_hisi_workaround(); cppc_freq_invariance_init(); + populate_efficiency_class(); ret = cpufreq_register_driver(&cppc_cpufreq_driver); if (ret) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 80f535cc8a75..2cad42774164 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -28,6 +28,7 @@ #include <linux/suspend.h> #include <linux/syscore_ops.h> #include <linux/tick.h> +#include <linux/units.h> #include <trace/events/power.h> static LIST_HEAD(cpufreq_policy_list); @@ -947,13 +948,14 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); - ssize_t ret; + ssize_t ret = -EBUSY; if (!fattr->show) return -EIO; down_read(&policy->rwsem); - ret = fattr->show(policy, buf); + if (likely(!policy_is_inactive(policy))) + ret = fattr->show(policy, buf); up_read(&policy->rwsem); return ret; @@ -964,7 +966,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, { struct cpufreq_policy *policy = to_policy(kobj); struct freq_attr *fattr = to_attr(attr); - ssize_t ret = -EINVAL; + ssize_t ret = -EBUSY; if (!fattr->store) return -EIO; @@ -978,7 +980,8 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, if (cpu_online(policy->cpu)) { down_write(&policy->rwsem); - ret = fattr->store(policy, buf, count); + if (likely(!policy_is_inactive(policy))) + ret = fattr->store(policy, buf, count); up_write(&policy->rwsem); } @@ -1019,11 +1022,12 @@ static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu, dev_err(dev, "cpufreq symlink creation failed\n"); } -static void remove_cpu_dev_symlink(struct cpufreq_policy *policy, +static void remove_cpu_dev_symlink(struct cpufreq_policy *policy, int cpu, struct device *dev) { dev_dbg(dev, "%s: Removing symlink\n", __func__); sysfs_remove_link(&dev->kobj, "cpufreq"); + cpumask_clear_cpu(cpu, policy->real_cpus); } static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) @@ -1337,12 +1341,12 @@ static int cpufreq_online(unsigned int cpu) down_write(&policy->rwsem); policy->cpu = cpu; policy->governor = NULL; - up_write(&policy->rwsem); } else { new_policy = true; policy = cpufreq_policy_alloc(cpu); if (!policy) return -ENOMEM; + down_write(&policy->rwsem); } if (!new_policy && cpufreq_driver->online) { @@ -1382,7 +1386,6 @@ static int cpufreq_online(unsigned int cpu) cpumask_copy(policy->related_cpus, policy->cpus); } - down_write(&policy->rwsem); /* * affected cpus must always be the one, which are online. We aren't * managing offline cpus here. 
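The net effect of the cpufreq_online() rework above is that policy->rwsem is now taken before the driver's ->init() callback runs and is held across all error paths, rather than being dropped and re-acquired around initialization. A condensed sketch of the resulting pattern (not the full function; the labels and call names abbreviate the real ones):

	down_write(&policy->rwsem);		/* taken up front now */

	ret = cpufreq_driver->init(policy);	/* may fail */
	if (ret)
		goto out_free_policy;

	/* ... cpumask setup, sysfs links, governor start ... */

out_free_policy:
	up_write(&policy->rwsem);		/* released only at the end */
	cpufreq_policy_free(policy);

Together with the policy_is_inactive() checks added to show() and store(), concurrent sysfs accesses now either see a fully initialized policy or back off with -EBUSY.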
@@ -1531,9 +1534,9 @@ static int cpufreq_online(unsigned int cpu) out_destroy_policy: for_each_cpu(j, policy->real_cpus) - remove_cpu_dev_symlink(policy, get_cpu_device(j)); + remove_cpu_dev_symlink(policy, j, get_cpu_device(j)); - up_write(&policy->rwsem); + cpumask_clear(policy->cpus); out_offline_policy: if (cpufreq_driver->offline) @@ -1544,6 +1547,8 @@ out_exit_policy: cpufreq_driver->exit(policy); out_free_policy: + up_write(&policy->rwsem); + cpufreq_policy_free(policy); return ret; } @@ -1575,47 +1580,36 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) return 0; } -static int cpufreq_offline(unsigned int cpu) +static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) { - struct cpufreq_policy *policy; int ret; - pr_debug("%s: unregistering CPU %u\n", __func__, cpu); - - policy = cpufreq_cpu_get_raw(cpu); - if (!policy) { - pr_debug("%s: No cpu_data found\n", __func__); - return 0; - } - - down_write(&policy->rwsem); if (has_target()) cpufreq_stop_governor(policy); cpumask_clear_cpu(cpu, policy->cpus); - if (policy_is_inactive(policy)) { - if (has_target()) - strncpy(policy->last_governor, policy->governor->name, - CPUFREQ_NAME_LEN); - else - policy->last_policy = policy->policy; - } else if (cpu == policy->cpu) { - /* Nominate new CPU */ - policy->cpu = cpumask_any(policy->cpus); - } - - /* Start governor again for active policy */ if (!policy_is_inactive(policy)) { + /* Nominate a new CPU if necessary. */ + if (cpu == policy->cpu) + policy->cpu = cpumask_any(policy->cpus); + + /* Start the governor again for the active policy. */ if (has_target()) { ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } - goto unlock; + return; } + if (has_target()) + strncpy(policy->last_governor, policy->governor->name, + CPUFREQ_NAME_LEN); + else + policy->last_policy = policy->policy; + if (cpufreq_thermal_control_enabled(cpufreq_driver)) { cpufreq_cooling_unregister(policy->cdev); policy->cdev = NULL; @@ -1634,8 +1628,24 @@ static int cpufreq_offline(unsigned int cpu) cpufreq_driver->exit(policy); policy->freq_table = NULL; } +} + +static int cpufreq_offline(unsigned int cpu) +{ + struct cpufreq_policy *policy; + + pr_debug("%s: unregistering CPU %u\n", __func__, cpu); + + policy = cpufreq_cpu_get_raw(cpu); + if (!policy) { + pr_debug("%s: No cpu_data found\n", __func__); + return 0; + } + + down_write(&policy->rwsem); + + __cpufreq_offline(cpu, policy); -unlock: up_write(&policy->rwsem); return 0; } @@ -1653,19 +1663,25 @@ static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) if (!policy) return; - if (cpu_online(cpu)) - cpufreq_offline(cpu); + down_write(&policy->rwsem); - cpumask_clear_cpu(cpu, policy->real_cpus); - remove_cpu_dev_symlink(policy, dev); + if (cpu_online(cpu)) + __cpufreq_offline(cpu, policy); - if (cpumask_empty(policy->real_cpus)) { - /* We did light-weight exit earlier, do full tear down now */ - if (cpufreq_driver->offline) - cpufreq_driver->exit(policy); + remove_cpu_dev_symlink(policy, cpu, dev); - cpufreq_policy_free(policy); + if (!cpumask_empty(policy->real_cpus)) { + up_write(&policy->rwsem); + return; } + + /* We did light-weight exit earlier, do full tear down now */ + if (cpufreq_driver->offline) + cpufreq_driver->exit(policy); + + up_write(&policy->rwsem); + + cpufreq_policy_free(policy); } /** @@ -1707,6 +1723,16 @@ static unsigned int cpufreq_verify_current_freq(struct cpufreq_policy *policy, b return new_freq; if (policy->cur != 
new_freq) { + /* + * For some platforms, the frequency returned by hardware may be + * slightly different from what is provided in the frequency + * table, for example hardware may return 499 MHz instead of 500 + * MHz. In such cases it is better to avoid getting into + * unnecessary frequency updates. + */ + if (abs(policy->cur - new_freq) < HZ_PER_MHZ) + return policy->cur; + cpufreq_out_of_sync(policy, new_freq); if (update) schedule_work(&policy->update); diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 0d42cf8b88d8..85da677c43d6 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -388,6 +388,15 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs, gov->free(policy_dbs); } +static void cpufreq_dbs_data_release(struct kobject *kobj) +{ + struct dbs_data *dbs_data = to_dbs_data(to_gov_attr_set(kobj)); + struct dbs_governor *gov = dbs_data->gov; + + gov->exit(dbs_data); + kfree(dbs_data); +} + int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) { struct dbs_governor *gov = dbs_governor_of(policy); @@ -425,6 +434,7 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) goto free_policy_dbs_info; } + dbs_data->gov = gov; gov_attr_set_init(&dbs_data->attr_set, &policy_dbs->list); ret = gov->init(dbs_data); @@ -447,6 +457,7 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) policy->governor_data = policy_dbs; gov->kobj_type.sysfs_ops = &governor_sysfs_ops; + gov->kobj_type.release = cpufreq_dbs_data_release; ret = kobject_init_and_add(&dbs_data->attr_set.kobj, &gov->kobj_type, get_governor_parent_kobj(policy), "%s", gov->gov.name); @@ -488,13 +499,8 @@ void cpufreq_dbs_governor_exit(struct cpufreq_policy *policy) policy->governor_data = NULL; - if (!count) { - if (!have_governor_per_policy()) - gov->gdbs_data = NULL; - - gov->exit(dbs_data); - kfree(dbs_data); - } + if (!count && !have_governor_per_policy()) + gov->gdbs_data = NULL; free_policy_dbs_info(policy_dbs, gov); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index a5a0bc3cc23e..168c23fd7fca 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -37,6 +37,7 @@ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; /* Governor demand based switching data (per-policy or global). 
*/ struct dbs_data { struct gov_attr_set attr_set; + struct dbs_governor *gov; void *tuners; unsigned int ignore_nice_load; unsigned int sampling_rate; diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 846bb3a78788..57cdb3679885 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1322,6 +1322,7 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b, mutex_unlock(&intel_pstate_limits_lock); intel_pstate_update_policies(); + arch_set_max_freq_ratio(global.no_turbo); mutex_unlock(&intel_pstate_driver_lock); @@ -2424,6 +2425,7 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { X86_MATCH(BROADWELL_X, core_funcs), X86_MATCH(SKYLAKE_X, core_funcs), X86_MATCH(ICELAKE_X, core_funcs), + X86_MATCH(SAPPHIRERAPIDS_X, core_funcs), {} }; diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index 0a94c56ddad2..813cccbfe934 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -51,8 +51,8 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = { }; static int __maybe_unused -mtk_cpufreq_get_cpu_power(unsigned long *mW, - unsigned long *KHz, struct device *cpu_dev) +mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW, + unsigned long *KHz) { struct mtk_cpufreq_data *data; struct cpufreq_policy *policy; diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c index 815645170c4d..039a66bbe1be 100644 --- a/drivers/cpufreq/pasemi-cpufreq.c +++ b/drivers/cpufreq/pasemi-cpufreq.c @@ -18,7 +18,6 @@ #include <asm/hw_irq.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/time.h> #include <asm/smp.h> diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index 4f20c6a9108d..20f64a8b0a35 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -24,7 +24,7 @@ #include <linux/device.h> #include <linux/hardirq.h> #include <linux/of_device.h> -#include <asm/prom.h> + #include <asm/machdep.h> #include <asm/irq.h> #include <asm/pmac_feature.h> diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c index d7542a106e6b..ba9c31d98bd6 100644 --- a/drivers/cpufreq/pmac64-cpufreq.c +++ b/drivers/cpufreq/pmac64-cpufreq.c @@ -22,7 +22,7 @@ #include <linux/completion.h> #include <linux/mutex.h> #include <linux/of_device.h> -#include <asm/prom.h> + #include <asm/machdep.h> #include <asm/irq.h> #include <asm/sections.h> diff --git a/drivers/cpufreq/ppc_cbe_cpufreq.c b/drivers/cpufreq/ppc_cbe_cpufreq.c index c58abb4cca3a..e3313ce63b38 100644 --- a/drivers/cpufreq/ppc_cbe_cpufreq.c +++ b/drivers/cpufreq/ppc_cbe_cpufreq.c @@ -12,7 +12,6 @@ #include <linux/of_platform.h> #include <asm/machdep.h> -#include <asm/prom.h> #include <asm/cell-regs.h> #include "ppc_cbe_cpufreq.h" diff --git a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c index 037fe23bc6ed..4fba3637b115 100644 --- a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c +++ b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c @@ -13,9 +13,9 @@ #include <linux/init.h> #include <linux/of_platform.h> #include <linux/pm_qos.h> +#include <linux/slab.h> #include <asm/processor.h> -#include <asm/prom.h> #include <asm/pmi.h> #include <asm/cell-regs.h> diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 919fa6e3f462..6d2a4cf46db7 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -96,8 +96,8 @@ 
scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) } static int __maybe_unused -scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, - struct device *cpu_dev) +scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power, + unsigned long *KHz) { unsigned long Hz; int ret, domain; diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index 755bbdfc5b82..3db4fca1172b 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -52,7 +52,7 @@ static int psci_pd_init(struct device_node *np, bool use_osi) struct generic_pm_domain *pd; struct psci_pd_provider *pd_provider; struct dev_power_governor *pd_gov; - int ret = -ENOMEM, state_count = 0; + int ret = -ENOMEM; pd = dt_idle_pd_alloc(np, psci_dt_parse_state_node); if (!pd) @@ -71,7 +71,7 @@ static int psci_pd_init(struct device_node *np, bool use_osi) pd->flags |= GENPD_FLAG_ALWAYS_ON; /* Use governor for CPU PM domains if it has some states to manage. */ - pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL; + pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; ret = pm_genpd_init(pd, pd_gov, false); if (ret) diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index b51b5df08450..540105ca0781 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -23,6 +23,7 @@ #include <linux/pm_runtime.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/syscore_ops.h> #include <asm/cpuidle.h> @@ -131,6 +132,49 @@ static int psci_idle_cpuhp_down(unsigned int cpu) return 0; } +static void psci_idle_syscore_switch(bool suspend) +{ + bool cleared = false; + struct device *dev; + int cpu; + + for_each_possible_cpu(cpu) { + dev = per_cpu_ptr(&psci_cpuidle_data, cpu)->dev; + + if (dev && suspend) { + dev_pm_genpd_suspend(dev); + } else if (dev) { + dev_pm_genpd_resume(dev); + + /* Account for userspace having offlined a CPU. */ + if (pm_runtime_status_suspended(dev)) + pm_runtime_set_active(dev); + + /* Clear domain state to re-start fresh. */ + if (!cleared) { + psci_set_domain_state(0); + cleared = true; + } + } + } +} + +static int psci_idle_syscore_suspend(void) +{ + psci_idle_syscore_switch(true); + return 0; +} + +static void psci_idle_syscore_resume(void) +{ + psci_idle_syscore_switch(false); +} + +static struct syscore_ops psci_idle_syscore_ops = { + .suspend = psci_idle_syscore_suspend, + .resume = psci_idle_syscore_resume, +}; + static void psci_idle_init_cpuhp(void) { int err; @@ -138,6 +182,8 @@ static void psci_idle_init_cpuhp(void) if (!psci_cpuidle_use_cpuhp) return; + register_syscore_ops(&psci_idle_syscore_ops); + err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING, "cpuidle/psci:online", psci_idle_cpuhp_up, diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index 5c852e671992..1151e5e2ba82 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -414,7 +414,7 @@ static int sbi_pd_init(struct device_node *np) struct generic_pm_domain *pd; struct sbi_pd_provider *pd_provider; struct dev_power_governor *pd_gov; - int ret = -ENOMEM, state_count = 0; + int ret = -ENOMEM; pd = dt_idle_pd_alloc(np, sbi_dt_parse_state_node); if (!pd) @@ -433,7 +433,7 @@ static int sbi_pd_init(struct device_node *np) pd->flags |= GENPD_FLAG_ALWAYS_ON; /* Use governor for CPU PM domains if it has some states to manage. */ - pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL; + pd_gov = pd->states ? 
&pm_domain_cpu_gov : NULL; ret = pm_genpd_init(pd, pd_gov, false); if (ret) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index a525a609dfc6..01474daf4548 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -112,16 +112,16 @@ static unsigned long find_available_max_freq(struct devfreq *devfreq) } /** - * get_freq_range() - Get the current freq range + * devfreq_get_freq_range() - Get the current freq range * @devfreq: the devfreq instance * @min_freq: the min frequency * @max_freq: the max frequency * * This takes into consideration all constraints. */ -static void get_freq_range(struct devfreq *devfreq, - unsigned long *min_freq, - unsigned long *max_freq) +void devfreq_get_freq_range(struct devfreq *devfreq, + unsigned long *min_freq, + unsigned long *max_freq) { unsigned long *freq_table = devfreq->profile->freq_table; s32 qos_min_freq, qos_max_freq; @@ -158,6 +158,7 @@ static void get_freq_range(struct devfreq *devfreq, if (*min_freq > *max_freq) *min_freq = *max_freq; } +EXPORT_SYMBOL(devfreq_get_freq_range); /** * devfreq_get_freq_level() - Lookup freq_table for the frequency @@ -418,7 +419,7 @@ int devfreq_update_target(struct devfreq *devfreq, unsigned long freq) err = devfreq->governor->get_target_freq(devfreq, &freq); if (err) return err; - get_freq_range(devfreq, &min_freq, &max_freq); + devfreq_get_freq_range(devfreq, &min_freq, &max_freq); if (freq < min_freq) { freq = min_freq; @@ -785,6 +786,7 @@ struct devfreq *devfreq_add_device(struct device *dev, { struct devfreq *devfreq; struct devfreq_governor *governor; + unsigned long min_freq, max_freq; int err = 0; if (!dev || !profile || !governor_name) { @@ -849,6 +851,8 @@ struct devfreq *devfreq_add_device(struct device *dev, goto err_dev; } + devfreq_get_freq_range(devfreq, &min_freq, &max_freq); + devfreq->suspend_freq = dev_pm_opp_get_suspend_opp_freq(dev); devfreq->opp_table = dev_pm_opp_get_opp_table(dev); if (IS_ERR(devfreq->opp_table)) @@ -1587,7 +1591,7 @@ static ssize_t min_freq_show(struct device *dev, struct device_attribute *attr, unsigned long min_freq, max_freq; mutex_lock(&df->lock); - get_freq_range(df, &min_freq, &max_freq); + devfreq_get_freq_range(df, &min_freq, &max_freq); mutex_unlock(&df->lock); return sprintf(buf, "%lu\n", min_freq); @@ -1641,7 +1645,7 @@ static ssize_t max_freq_show(struct device *dev, struct device_attribute *attr, unsigned long min_freq, max_freq; mutex_lock(&df->lock); - get_freq_range(df, &min_freq, &max_freq); + devfreq_get_freq_range(df, &min_freq, &max_freq); mutex_unlock(&df->lock); return sprintf(buf, "%lu\n", max_freq); @@ -1955,7 +1959,7 @@ static int devfreq_summary_show(struct seq_file *s, void *data) mutex_lock(&devfreq->lock); cur_freq = devfreq->previous_freq; - get_freq_range(devfreq, &min_freq, &max_freq); + devfreq_get_freq_range(devfreq, &min_freq, &max_freq); timer = devfreq->profile->timer; if (IS_SUPPORTED_ATTR(devfreq->governor->attrs, POLLING_INTERVAL)) diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h index 002a7d67e39d..0adfebc0467a 100644 --- a/drivers/devfreq/governor.h +++ b/drivers/devfreq/governor.h @@ -48,6 +48,31 @@ #define DEVFREQ_GOV_ATTR_TIMER BIT(1) /** + * struct devfreq_cpu_data - Hold the per-cpu data + * @node: list node + * @dev: reference to cpu device. + * @first_cpu: the cpumask of the first cpu of a policy. + * @opp_table: reference to cpu opp table. + * @cur_freq: the current frequency of the cpu. + * @min_freq: the min frequency of the cpu. 
+ * @max_freq: the max frequency of the cpu. + * + * This structure stores the required cpu_data of a cpu. + * This is auto-populated by the governor. + */ +struct devfreq_cpu_data { + struct list_head node; + + struct device *dev; + unsigned int first_cpu; + + struct opp_table *opp_table; + unsigned int cur_freq; + unsigned int min_freq; + unsigned int max_freq; +}; + +/** * struct devfreq_governor - Devfreq policy governor * @node: list node - contains registered devfreq governors * @name: Governor's name @@ -89,6 +114,8 @@ int devm_devfreq_add_governor(struct device *dev, int devfreq_update_status(struct devfreq *devfreq, unsigned long freq); int devfreq_update_target(struct devfreq *devfreq, unsigned long freq); +void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq, + unsigned long *max_freq); static inline int devfreq_update_stats(struct devfreq *df) { diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c index fc09324a03e0..72c67979ebe1 100644 --- a/drivers/devfreq/governor_passive.c +++ b/drivers/devfreq/governor_passive.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only + // SPDX-License-Identifier: GPL-2.0-only /* * linux/drivers/devfreq/governor_passive.c * @@ -8,76 +8,129 @@ */ #include <linux/module.h> +#include <linux/cpu.h> +#include <linux/cpufreq.h> +#include <linux/cpumask.h> +#include <linux/slab.h> #include <linux/device.h> #include <linux/devfreq.h> #include "governor.h" -static int devfreq_passive_get_target_freq(struct devfreq *devfreq, - unsigned long *freq) +#define HZ_PER_KHZ 1000 + +static struct devfreq_cpu_data * +get_parent_cpu_data(struct devfreq_passive_data *p_data, + struct cpufreq_policy *policy) { - struct devfreq_passive_data *p_data - = (struct devfreq_passive_data *)devfreq->data; - struct devfreq *parent_devfreq = (struct devfreq *)p_data->parent; - unsigned long child_freq = ULONG_MAX; - struct dev_pm_opp *opp, *p_opp; - int i, count; + struct devfreq_cpu_data *parent_cpu_data; - /* - * If the devfreq device with passive governor has the specific method - * to determine the next frequency, should use the get_target_freq() - * of struct devfreq_passive_data. - */ - if (p_data->get_target_freq) - return p_data->get_target_freq(devfreq, freq); + if (!p_data || !policy) + return NULL; - /* - * If the parent and passive devfreq device uses the OPP table, - * get the next frequency by using the OPP table. - */ + list_for_each_entry(parent_cpu_data, &p_data->cpu_data_list, node) + if (parent_cpu_data->first_cpu == cpumask_first(policy->related_cpus)) + return parent_cpu_data; - /* - * - parent devfreq device uses the governors except for passive. - * - passive devfreq device uses the passive governor. - * - * Each devfreq has the OPP table. After deciding the new frequency - * from the governor of parent devfreq device, the passive governor - * need to get the index of new frequency on OPP table of parent - * device. And then the index is used for getting the suitable - * new frequency for passive devfreq device. - */ - if (!devfreq->profile || !devfreq->profile->freq_table - || devfreq->profile->max_state <= 0) - return -EINVAL; + return NULL; +} - /* - * The passive governor have to get the correct frequency from OPP - * list of parent device. Because in this case, *freq is temporary - * value which is decided by ondemand governor. 
- */ - if (devfreq->opp_table && parent_devfreq->opp_table) { - p_opp = devfreq_recommended_opp(parent_devfreq->dev.parent, - freq, 0); - if (IS_ERR(p_opp)) - return PTR_ERR(p_opp); +static unsigned long get_target_freq_by_required_opp(struct device *p_dev, + struct opp_table *p_opp_table, + struct opp_table *opp_table, + unsigned long *freq) +{ + struct dev_pm_opp *opp = NULL, *p_opp = NULL; + unsigned long target_freq; - opp = dev_pm_opp_xlate_required_opp(parent_devfreq->opp_table, - devfreq->opp_table, p_opp); - dev_pm_opp_put(p_opp); + if (!p_dev || !p_opp_table || !opp_table || !freq) + return 0; - if (IS_ERR(opp)) - goto no_required_opp; + p_opp = devfreq_recommended_opp(p_dev, freq, 0); + if (IS_ERR(p_opp)) + return 0; - *freq = dev_pm_opp_get_freq(opp); - dev_pm_opp_put(opp); + opp = dev_pm_opp_xlate_required_opp(p_opp_table, opp_table, p_opp); + dev_pm_opp_put(p_opp); + if (IS_ERR(opp)) return 0; + + target_freq = dev_pm_opp_get_freq(opp); + dev_pm_opp_put(opp); + + return target_freq; +} + +static int get_target_freq_with_cpufreq(struct devfreq *devfreq, + unsigned long *target_freq) +{ + struct devfreq_passive_data *p_data = + (struct devfreq_passive_data *)devfreq->data; + struct devfreq_cpu_data *parent_cpu_data; + struct cpufreq_policy *policy; + unsigned long cpu, cpu_cur, cpu_min, cpu_max, cpu_percent; + unsigned long dev_min, dev_max; + unsigned long freq = 0; + int ret = 0; + + for_each_online_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); + if (!policy) { + ret = -EINVAL; + continue; + } + + parent_cpu_data = get_parent_cpu_data(p_data, policy); + if (!parent_cpu_data) { + cpufreq_cpu_put(policy); + continue; + } + + /* Get target freq via required opps */ + cpu_cur = parent_cpu_data->cur_freq * HZ_PER_KHZ; + freq = get_target_freq_by_required_opp(parent_cpu_data->dev, + parent_cpu_data->opp_table, + devfreq->opp_table, &cpu_cur); + if (freq) { + *target_freq = max(freq, *target_freq); + cpufreq_cpu_put(policy); + continue; + } + + /* Use interpolation if required opps is not available */ + devfreq_get_freq_range(devfreq, &dev_min, &dev_max); + + cpu_min = parent_cpu_data->min_freq; + cpu_max = parent_cpu_data->max_freq; + cpu_cur = parent_cpu_data->cur_freq; + + cpu_percent = ((cpu_cur - cpu_min) * 100) / (cpu_max - cpu_min); + freq = dev_min + mult_frac(dev_max - dev_min, cpu_percent, 100); + + *target_freq = max(freq, *target_freq); + cpufreq_cpu_put(policy); } -no_required_opp: - /* - * Get the OPP table's index of decided frequency by governor - * of parent device. - */ + return ret; +} + +static int get_target_freq_with_devfreq(struct devfreq *devfreq, + unsigned long *freq) +{ + struct devfreq_passive_data *p_data + = (struct devfreq_passive_data *)devfreq->data; + struct devfreq *parent_devfreq = (struct devfreq *)p_data->parent; + unsigned long child_freq = ULONG_MAX; + int i, count; + + /* Get target freq via required opps */ + child_freq = get_target_freq_by_required_opp(parent_devfreq->dev.parent, + parent_devfreq->opp_table, + devfreq->opp_table, freq); + if (child_freq) + goto out; + + /* Use interpolation if required opps is not available */ for (i = 0; i < parent_devfreq->profile->max_state; i++) if (parent_devfreq->profile->freq_table[i] == *freq) break; @@ -85,7 +138,6 @@ no_required_opp: if (i == parent_devfreq->profile->max_state) return -EINVAL; - /* Get the suitable frequency by using index of parent device. 
*/ if (i < devfreq->profile->max_state) { child_freq = devfreq->profile->freq_table[i]; } else { @@ -93,12 +145,202 @@ no_required_opp: child_freq = devfreq->profile->freq_table[count - 1]; } - /* Return the suitable frequency for passive device. */ +out: *freq = child_freq; return 0; } +static int devfreq_passive_get_target_freq(struct devfreq *devfreq, + unsigned long *freq) +{ + struct devfreq_passive_data *p_data = + (struct devfreq_passive_data *)devfreq->data; + int ret; + + if (!p_data) + return -EINVAL; + + /* + * If the devfreq device with passive governor has the specific method + * to determine the next frequency, should use the get_target_freq() + * of struct devfreq_passive_data. + */ + if (p_data->get_target_freq) + return p_data->get_target_freq(devfreq, freq); + + switch (p_data->parent_type) { + case DEVFREQ_PARENT_DEV: + ret = get_target_freq_with_devfreq(devfreq, freq); + break; + case CPUFREQ_PARENT_DEV: + ret = get_target_freq_with_cpufreq(devfreq, freq); + break; + default: + ret = -EINVAL; + dev_err(&devfreq->dev, "Invalid parent type\n"); + break; + } + + return ret; +} + +static int cpufreq_passive_notifier_call(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct devfreq_passive_data *p_data = + container_of(nb, struct devfreq_passive_data, nb); + struct devfreq *devfreq = (struct devfreq *)p_data->this; + struct devfreq_cpu_data *parent_cpu_data; + struct cpufreq_freqs *freqs = ptr; + unsigned int cur_freq; + int ret; + + if (event != CPUFREQ_POSTCHANGE || !freqs) + return 0; + + parent_cpu_data = get_parent_cpu_data(p_data, freqs->policy); + if (!parent_cpu_data || parent_cpu_data->cur_freq == freqs->new) + return 0; + + cur_freq = parent_cpu_data->cur_freq; + parent_cpu_data->cur_freq = freqs->new; + + mutex_lock(&devfreq->lock); + ret = devfreq_update_target(devfreq, freqs->new); + mutex_unlock(&devfreq->lock); + if (ret) { + parent_cpu_data->cur_freq = cur_freq; + dev_err(&devfreq->dev, "failed to update the frequency.\n"); + return ret; + } + + return 0; +} + +static int cpufreq_passive_unregister_notifier(struct devfreq *devfreq) +{ + struct devfreq_passive_data *p_data + = (struct devfreq_passive_data *)devfreq->data; + struct devfreq_cpu_data *parent_cpu_data; + int cpu, ret = 0; + + if (p_data->nb.notifier_call) { + ret = cpufreq_unregister_notifier(&p_data->nb, + CPUFREQ_TRANSITION_NOTIFIER); + if (ret < 0) + return ret; + } + + for_each_possible_cpu(cpu) { + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + if (!policy) { + ret = -EINVAL; + continue; + } + + parent_cpu_data = get_parent_cpu_data(p_data, policy); + if (!parent_cpu_data) { + cpufreq_cpu_put(policy); + continue; + } + + list_del(&parent_cpu_data->node); + if (parent_cpu_data->opp_table) + dev_pm_opp_put_opp_table(parent_cpu_data->opp_table); + kfree(parent_cpu_data); + cpufreq_cpu_put(policy); + } + + return ret; +} + +static int cpufreq_passive_register_notifier(struct devfreq *devfreq) +{ + struct devfreq_passive_data *p_data + = (struct devfreq_passive_data *)devfreq->data; + struct device *dev = devfreq->dev.parent; + struct opp_table *opp_table = NULL; + struct devfreq_cpu_data *parent_cpu_data; + struct cpufreq_policy *policy; + struct device *cpu_dev; + unsigned int cpu; + int ret; + + p_data->cpu_data_list + = (struct list_head)LIST_HEAD_INIT(p_data->cpu_data_list); + + p_data->nb.notifier_call = cpufreq_passive_notifier_call; + ret = cpufreq_register_notifier(&p_data->nb, CPUFREQ_TRANSITION_NOTIFIER); + if (ret) { + dev_err(dev, "failed to register 
cpufreq notifier\n"); + p_data->nb.notifier_call = NULL; + goto err; + } + + for_each_possible_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); + if (!policy) { + ret = -EPROBE_DEFER; + goto err; + } + + parent_cpu_data = get_parent_cpu_data(p_data, policy); + if (parent_cpu_data) { + cpufreq_cpu_put(policy); + continue; + } + + parent_cpu_data = kzalloc(sizeof(*parent_cpu_data), + GFP_KERNEL); + if (!parent_cpu_data) { + ret = -ENOMEM; + goto err_put_policy; + } + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) { + dev_err(dev, "failed to get cpu device\n"); + ret = -ENODEV; + goto err_free_cpu_data; + } + + opp_table = dev_pm_opp_get_opp_table(cpu_dev); + if (IS_ERR(opp_table)) { + dev_err(dev, "failed to get opp_table of cpu%d\n", cpu); + ret = PTR_ERR(opp_table); + goto err_free_cpu_data; + } + + parent_cpu_data->dev = cpu_dev; + parent_cpu_data->opp_table = opp_table; + parent_cpu_data->first_cpu = cpumask_first(policy->related_cpus); + parent_cpu_data->cur_freq = policy->cur; + parent_cpu_data->min_freq = policy->cpuinfo.min_freq; + parent_cpu_data->max_freq = policy->cpuinfo.max_freq; + + list_add_tail(&parent_cpu_data->node, &p_data->cpu_data_list); + cpufreq_cpu_put(policy); + } + + mutex_lock(&devfreq->lock); + ret = devfreq_update_target(devfreq, 0L); + mutex_unlock(&devfreq->lock); + if (ret) + dev_err(dev, "failed to update the frequency\n"); + + return ret; + +err_free_cpu_data: + kfree(parent_cpu_data); +err_put_policy: + cpufreq_cpu_put(policy); +err: + WARN_ON(cpufreq_passive_unregister_notifier(devfreq)); + + return ret; +} + static int devfreq_passive_notifier_call(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -131,30 +373,55 @@ static int devfreq_passive_notifier_call(struct notifier_block *nb, return NOTIFY_DONE; } -static int devfreq_passive_event_handler(struct devfreq *devfreq, - unsigned int event, void *data) +static int devfreq_passive_unregister_notifier(struct devfreq *devfreq) +{ + struct devfreq_passive_data *p_data + = (struct devfreq_passive_data *)devfreq->data; + struct devfreq *parent = (struct devfreq *)p_data->parent; + struct notifier_block *nb = &p_data->nb; + + return devfreq_unregister_notifier(parent, nb, DEVFREQ_TRANSITION_NOTIFIER); +} + +static int devfreq_passive_register_notifier(struct devfreq *devfreq) { struct devfreq_passive_data *p_data = (struct devfreq_passive_data *)devfreq->data; struct devfreq *parent = (struct devfreq *)p_data->parent; struct notifier_block *nb = &p_data->nb; - int ret = 0; if (!parent) return -EPROBE_DEFER; + nb->notifier_call = devfreq_passive_notifier_call; + return devfreq_register_notifier(parent, nb, DEVFREQ_TRANSITION_NOTIFIER); +} + +static int devfreq_passive_event_handler(struct devfreq *devfreq, + unsigned int event, void *data) +{ + struct devfreq_passive_data *p_data + = (struct devfreq_passive_data *)devfreq->data; + int ret = 0; + + if (!p_data) + return -EINVAL; + + if (!p_data->this) + p_data->this = devfreq; + switch (event) { case DEVFREQ_GOV_START: - if (!p_data->this) - p_data->this = devfreq; - - nb->notifier_call = devfreq_passive_notifier_call; - ret = devfreq_register_notifier(parent, nb, - DEVFREQ_TRANSITION_NOTIFIER); + if (p_data->parent_type == DEVFREQ_PARENT_DEV) + ret = devfreq_passive_register_notifier(devfreq); + else if (p_data->parent_type == CPUFREQ_PARENT_DEV) + ret = cpufreq_passive_register_notifier(devfreq); break; case DEVFREQ_GOV_STOP: - WARN_ON(devfreq_unregister_notifier(parent, nb, - DEVFREQ_TRANSITION_NOTIFIER)); + if (p_data->parent_type == 
DEVFREQ_PARENT_DEV) + WARN_ON(devfreq_passive_unregister_notifier(devfreq)); + else if (p_data->parent_type == CPUFREQ_PARENT_DEV) + WARN_ON(cpufreq_passive_unregister_notifier(devfreq)); break; default: break; diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c index 293857ebfd75..daff40702615 100644 --- a/drivers/devfreq/rk3399_dmc.c +++ b/drivers/devfreq/rk3399_dmc.c @@ -5,6 +5,7 @@ */ #include <linux/arm-smccc.h> +#include <linux/bitfield.h> #include <linux/clk.h> #include <linux/delay.h> #include <linux/devfreq.h> @@ -20,55 +21,49 @@ #include <linux/rwsem.h> #include <linux/suspend.h> +#include <soc/rockchip/pm_domains.h> #include <soc/rockchip/rk3399_grf.h> #include <soc/rockchip/rockchip_sip.h> -struct dram_timing { - unsigned int ddr3_speed_bin; - unsigned int pd_idle; - unsigned int sr_idle; - unsigned int sr_mc_gate_idle; - unsigned int srpd_lite_idle; - unsigned int standby_idle; - unsigned int auto_pd_dis_freq; - unsigned int dram_dll_dis_freq; - unsigned int phy_dll_dis_freq; - unsigned int ddr3_odt_dis_freq; - unsigned int ddr3_drv; - unsigned int ddr3_odt; - unsigned int phy_ddr3_ca_drv; - unsigned int phy_ddr3_dq_drv; - unsigned int phy_ddr3_odt; - unsigned int lpddr3_odt_dis_freq; - unsigned int lpddr3_drv; - unsigned int lpddr3_odt; - unsigned int phy_lpddr3_ca_drv; - unsigned int phy_lpddr3_dq_drv; - unsigned int phy_lpddr3_odt; - unsigned int lpddr4_odt_dis_freq; - unsigned int lpddr4_drv; - unsigned int lpddr4_dq_odt; - unsigned int lpddr4_ca_odt; - unsigned int phy_lpddr4_ca_drv; - unsigned int phy_lpddr4_ck_cs_drv; - unsigned int phy_lpddr4_dq_drv; - unsigned int phy_lpddr4_odt; -}; +#define NS_TO_CYCLE(NS, MHz) (((NS) * (MHz)) / NSEC_PER_USEC) + +#define RK3399_SET_ODT_PD_0_SR_IDLE GENMASK(7, 0) +#define RK3399_SET_ODT_PD_0_SR_MC_GATE_IDLE GENMASK(15, 8) +#define RK3399_SET_ODT_PD_0_STANDBY_IDLE GENMASK(31, 16) + +#define RK3399_SET_ODT_PD_1_PD_IDLE GENMASK(11, 0) +#define RK3399_SET_ODT_PD_1_SRPD_LITE_IDLE GENMASK(27, 16) + +#define RK3399_SET_ODT_PD_2_ODT_ENABLE BIT(0) struct rk3399_dmcfreq { struct device *dev; struct devfreq *devfreq; + struct devfreq_dev_profile profile; struct devfreq_simple_ondemand_data ondemand_data; struct clk *dmc_clk; struct devfreq_event_dev *edev; struct mutex lock; - struct dram_timing timing; struct regulator *vdd_center; struct regmap *regmap_pmu; unsigned long rate, target_rate; unsigned long volt, target_volt; unsigned int odt_dis_freq; - int odt_pd_arg0, odt_pd_arg1; + + unsigned int pd_idle_ns; + unsigned int sr_idle_ns; + unsigned int sr_mc_gate_idle_ns; + unsigned int srpd_lite_idle_ns; + unsigned int standby_idle_ns; + unsigned int ddr3_odt_dis_freq; + unsigned int lpddr3_odt_dis_freq; + unsigned int lpddr4_odt_dis_freq; + + unsigned int pd_idle_dis_freq; + unsigned int sr_idle_dis_freq; + unsigned int sr_mc_gate_idle_dis_freq; + unsigned int srpd_lite_idle_dis_freq; + unsigned int standby_idle_dis_freq; }; static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, @@ -78,10 +73,14 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, struct dev_pm_opp *opp; unsigned long old_clk_rate = dmcfreq->rate; unsigned long target_volt, target_rate; + unsigned int ddrcon_mhz; struct arm_smccc_res res; - bool odt_enable = false; int err; + u32 odt_pd_arg0 = 0; + u32 odt_pd_arg1 = 0; + u32 odt_pd_arg2 = 0; + opp = devfreq_recommended_opp(dev, freq, flags); if (IS_ERR(opp)) return PTR_ERR(opp); @@ -95,19 +94,71 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long 
*freq, mutex_lock(&dmcfreq->lock); + /* + * Ensure power-domain transitions don't interfere with ARM Trusted + * Firmware power-domain idling. + */ + err = rockchip_pmu_block(); + if (err) { + dev_err(dev, "Failed to block PMU: %d\n", err); + goto out_unlock; + } + + /* + * Some idle parameters may be based on the DDR controller clock, which + * is half of the DDR frequency. + * pd_idle and standby_idle are based on the controller clock cycle. + * sr_idle_cycle, sr_mc_gate_idle_cycle, and srpd_lite_idle_cycle + * are based on 1024 controller clock cycles. + */ + ddrcon_mhz = target_rate / USEC_PER_SEC / 2; + + u32p_replace_bits(&odt_pd_arg1, + NS_TO_CYCLE(dmcfreq->pd_idle_ns, ddrcon_mhz), + RK3399_SET_ODT_PD_1_PD_IDLE); + u32p_replace_bits(&odt_pd_arg0, + NS_TO_CYCLE(dmcfreq->standby_idle_ns, ddrcon_mhz), + RK3399_SET_ODT_PD_0_STANDBY_IDLE); + u32p_replace_bits(&odt_pd_arg0, + DIV_ROUND_UP(NS_TO_CYCLE(dmcfreq->sr_idle_ns, + ddrcon_mhz), 1024), + RK3399_SET_ODT_PD_0_SR_IDLE); + u32p_replace_bits(&odt_pd_arg0, + DIV_ROUND_UP(NS_TO_CYCLE(dmcfreq->sr_mc_gate_idle_ns, + ddrcon_mhz), 1024), + RK3399_SET_ODT_PD_0_SR_MC_GATE_IDLE); + u32p_replace_bits(&odt_pd_arg1, + DIV_ROUND_UP(NS_TO_CYCLE(dmcfreq->srpd_lite_idle_ns, + ddrcon_mhz), 1024), + RK3399_SET_ODT_PD_1_SRPD_LITE_IDLE); + if (dmcfreq->regmap_pmu) { + if (target_rate >= dmcfreq->sr_idle_dis_freq) + odt_pd_arg0 &= ~RK3399_SET_ODT_PD_0_SR_IDLE; + + if (target_rate >= dmcfreq->sr_mc_gate_idle_dis_freq) + odt_pd_arg0 &= ~RK3399_SET_ODT_PD_0_SR_MC_GATE_IDLE; + + if (target_rate >= dmcfreq->standby_idle_dis_freq) + odt_pd_arg0 &= ~RK3399_SET_ODT_PD_0_STANDBY_IDLE; + + if (target_rate >= dmcfreq->pd_idle_dis_freq) + odt_pd_arg1 &= ~RK3399_SET_ODT_PD_1_PD_IDLE; + + if (target_rate >= dmcfreq->srpd_lite_idle_dis_freq) + odt_pd_arg1 &= ~RK3399_SET_ODT_PD_1_SRPD_LITE_IDLE; + if (target_rate >= dmcfreq->odt_dis_freq) - odt_enable = true; + odt_pd_arg2 |= RK3399_SET_ODT_PD_2_ODT_ENABLE; /* * This makes an SMC call to the TF-A to set the DDR PD * (power-down) timings and to enable or disable the * ODT (on-die termination) resistors.
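 *
 * The arguments carry the cycle counts computed above. A worked example
 * with illustrative numbers: at a hypothetical DDR rate of 800 MHz,
 * ddrcon_mhz = 800000000 / USEC_PER_SEC / 2 = 400. A DT value of
 * rockchip,sr-idle-ns = 10240 then gives
 * NS_TO_CYCLE(10240, 400) = 10240 * 400 / 1000 = 4096 cycles, and
 * since SR_IDLE is programmed in units of 1024 cycles,
 * DIV_ROUND_UP(4096, 1024) = 4 is the value packed into the
 * RK3399_SET_ODT_PD_0_SR_IDLE field above.
 *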
*/ - arm_smccc_smc(ROCKCHIP_SIP_DRAM_FREQ, dmcfreq->odt_pd_arg0, - dmcfreq->odt_pd_arg1, - ROCKCHIP_SIP_CONFIG_DRAM_SET_ODT_PD, - odt_enable, 0, 0, 0, &res); + arm_smccc_smc(ROCKCHIP_SIP_DRAM_FREQ, odt_pd_arg0, odt_pd_arg1, + ROCKCHIP_SIP_CONFIG_DRAM_SET_ODT_PD, odt_pd_arg2, + 0, 0, 0, &res); } /* @@ -158,6 +209,8 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, dmcfreq->volt = target_volt; out: + rockchip_pmu_unblock(); +out_unlock: mutex_unlock(&dmcfreq->lock); return err; } @@ -189,13 +242,6 @@ static int rk3399_dmcfreq_get_cur_freq(struct device *dev, unsigned long *freq) return 0; } -static struct devfreq_dev_profile rk3399_devfreq_dmc_profile = { - .polling_ms = 200, - .target = rk3399_dmcfreq_target, - .get_dev_status = rk3399_dmcfreq_get_dev_status, - .get_cur_freq = rk3399_dmcfreq_get_cur_freq, -}; - static __maybe_unused int rk3399_dmcfreq_suspend(struct device *dev) { struct rk3399_dmcfreq *dmcfreq = dev_get_drvdata(dev); @@ -238,69 +284,48 @@ static __maybe_unused int rk3399_dmcfreq_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(rk3399_dmcfreq_pm, rk3399_dmcfreq_suspend, rk3399_dmcfreq_resume); -static int of_get_ddr_timings(struct dram_timing *timing, - struct device_node *np) +static int rk3399_dmcfreq_of_props(struct rk3399_dmcfreq *data, + struct device_node *np) { int ret = 0; - ret = of_property_read_u32(np, "rockchip,ddr3_speed_bin", - &timing->ddr3_speed_bin); - ret |= of_property_read_u32(np, "rockchip,pd_idle", - &timing->pd_idle); - ret |= of_property_read_u32(np, "rockchip,sr_idle", - &timing->sr_idle); - ret |= of_property_read_u32(np, "rockchip,sr_mc_gate_idle", - &timing->sr_mc_gate_idle); - ret |= of_property_read_u32(np, "rockchip,srpd_lite_idle", - &timing->srpd_lite_idle); - ret |= of_property_read_u32(np, "rockchip,standby_idle", - &timing->standby_idle); - ret |= of_property_read_u32(np, "rockchip,auto_pd_dis_freq", - &timing->auto_pd_dis_freq); - ret |= of_property_read_u32(np, "rockchip,dram_dll_dis_freq", - &timing->dram_dll_dis_freq); - ret |= of_property_read_u32(np, "rockchip,phy_dll_dis_freq", - &timing->phy_dll_dis_freq); + /* + * These are all optional, and serve as minimum bounds. Give them large + * (i.e., never "disabled") values if the DT doesn't specify one. 
+ */ + data->pd_idle_dis_freq = + data->sr_idle_dis_freq = + data->sr_mc_gate_idle_dis_freq = + data->srpd_lite_idle_dis_freq = + data->standby_idle_dis_freq = UINT_MAX; + + ret |= of_property_read_u32(np, "rockchip,pd-idle-ns", + &data->pd_idle_ns); + ret |= of_property_read_u32(np, "rockchip,sr-idle-ns", + &data->sr_idle_ns); + ret |= of_property_read_u32(np, "rockchip,sr-mc-gate-idle-ns", + &data->sr_mc_gate_idle_ns); + ret |= of_property_read_u32(np, "rockchip,srpd-lite-idle-ns", + &data->srpd_lite_idle_ns); + ret |= of_property_read_u32(np, "rockchip,standby-idle-ns", + &data->standby_idle_ns); ret |= of_property_read_u32(np, "rockchip,ddr3_odt_dis_freq", - &timing->ddr3_odt_dis_freq); - ret |= of_property_read_u32(np, "rockchip,ddr3_drv", - &timing->ddr3_drv); - ret |= of_property_read_u32(np, "rockchip,ddr3_odt", - &timing->ddr3_odt); - ret |= of_property_read_u32(np, "rockchip,phy_ddr3_ca_drv", - &timing->phy_ddr3_ca_drv); - ret |= of_property_read_u32(np, "rockchip,phy_ddr3_dq_drv", - &timing->phy_ddr3_dq_drv); - ret |= of_property_read_u32(np, "rockchip,phy_ddr3_odt", - &timing->phy_ddr3_odt); + &data->ddr3_odt_dis_freq); ret |= of_property_read_u32(np, "rockchip,lpddr3_odt_dis_freq", - &timing->lpddr3_odt_dis_freq); - ret |= of_property_read_u32(np, "rockchip,lpddr3_drv", - &timing->lpddr3_drv); - ret |= of_property_read_u32(np, "rockchip,lpddr3_odt", - &timing->lpddr3_odt); - ret |= of_property_read_u32(np, "rockchip,phy_lpddr3_ca_drv", - &timing->phy_lpddr3_ca_drv); - ret |= of_property_read_u32(np, "rockchip,phy_lpddr3_dq_drv", - &timing->phy_lpddr3_dq_drv); - ret |= of_property_read_u32(np, "rockchip,phy_lpddr3_odt", - &timing->phy_lpddr3_odt); + &data->lpddr3_odt_dis_freq); ret |= of_property_read_u32(np, "rockchip,lpddr4_odt_dis_freq", - &timing->lpddr4_odt_dis_freq); - ret |= of_property_read_u32(np, "rockchip,lpddr4_drv", - &timing->lpddr4_drv); - ret |= of_property_read_u32(np, "rockchip,lpddr4_dq_odt", - &timing->lpddr4_dq_odt); - ret |= of_property_read_u32(np, "rockchip,lpddr4_ca_odt", - &timing->lpddr4_ca_odt); - ret |= of_property_read_u32(np, "rockchip,phy_lpddr4_ca_drv", - &timing->phy_lpddr4_ca_drv); - ret |= of_property_read_u32(np, "rockchip,phy_lpddr4_ck_cs_drv", - &timing->phy_lpddr4_ck_cs_drv); - ret |= of_property_read_u32(np, "rockchip,phy_lpddr4_dq_drv", - &timing->phy_lpddr4_dq_drv); - ret |= of_property_read_u32(np, "rockchip,phy_lpddr4_odt", - &timing->phy_lpddr4_odt); + &data->lpddr4_odt_dis_freq); + + ret |= of_property_read_u32(np, "rockchip,pd-idle-dis-freq-hz", + &data->pd_idle_dis_freq); + ret |= of_property_read_u32(np, "rockchip,sr-idle-dis-freq-hz", + &data->sr_idle_dis_freq); + ret |= of_property_read_u32(np, "rockchip,sr-mc-gate-idle-dis-freq-hz", + &data->sr_mc_gate_idle_dis_freq); + ret |= of_property_read_u32(np, "rockchip,srpd-lite-idle-dis-freq-hz", + &data->srpd_lite_idle_dis_freq); + ret |= of_property_read_u32(np, "rockchip,standby-idle-dis-freq-hz", + &data->standby_idle_dis_freq); return ret; } @@ -311,8 +336,7 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct device_node *np = pdev->dev.of_node, *node; struct rk3399_dmcfreq *data; - int ret, index, size; - uint32_t *timing; + int ret; struct dev_pm_opp *opp; u32 ddr_type; u32 val; @@ -343,26 +367,7 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) return ret; } - /* - * Get dram timing and pass it to arm trust firmware, - * the dram driver in arm trust firmware will get these - * timing and to do dram initial. 
- */ - if (!of_get_ddr_timings(&data->timing, np)) { - timing = &data->timing.ddr3_speed_bin; - size = sizeof(struct dram_timing) / 4; - for (index = 0; index < size; index++) { - arm_smccc_smc(ROCKCHIP_SIP_DRAM_FREQ, *timing++, index, - ROCKCHIP_SIP_CONFIG_DRAM_SET_PARAM, - 0, 0, 0, 0, &res); - if (res.a0) { - dev_err(dev, "Failed to set dram param: %ld\n", - res.a0); - ret = -EINVAL; - goto err_edev; - } - } - } + rk3399_dmcfreq_of_props(data, np); node = of_parse_phandle(np, "rockchip,pmu", 0); if (!node) @@ -381,13 +386,13 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) switch (ddr_type) { case RK3399_PMUGRF_DDRTYPE_DDR3: - data->odt_dis_freq = data->timing.ddr3_odt_dis_freq; + data->odt_dis_freq = data->ddr3_odt_dis_freq; break; case RK3399_PMUGRF_DDRTYPE_LPDDR3: - data->odt_dis_freq = data->timing.lpddr3_odt_dis_freq; + data->odt_dis_freq = data->lpddr3_odt_dis_freq; break; case RK3399_PMUGRF_DDRTYPE_LPDDR4: - data->odt_dis_freq = data->timing.lpddr4_odt_dis_freq; + data->odt_dis_freq = data->lpddr4_odt_dis_freq; break; default: ret = -EINVAL; @@ -400,62 +405,45 @@ no_pmu: 0, 0, 0, 0, &res); /* - * In TF-A there is a platform SIP call to set the PD (power-down) - * timings and to enable or disable the ODT (on-die termination). - * This call needs three arguments as follows: - * - * arg0: - * bit[0-7] : sr_idle - * bit[8-15] : sr_mc_gate_idle - * bit[16-31] : standby idle - * arg1: - * bit[0-11] : pd_idle - * bit[16-27] : srpd_lite_idle - * arg2: - * bit[0] : odt enable - */ - data->odt_pd_arg0 = (data->timing.sr_idle & 0xff) | - ((data->timing.sr_mc_gate_idle & 0xff) << 8) | - ((data->timing.standby_idle & 0xffff) << 16); - data->odt_pd_arg1 = (data->timing.pd_idle & 0xfff) | - ((data->timing.srpd_lite_idle & 0xfff) << 16); - - /* * We add a devfreq driver to our parent since it has a device tree node * with operating points. */ - if (dev_pm_opp_of_add_table(dev)) { + if (devm_pm_opp_of_add_table(dev)) { dev_err(dev, "Invalid operating-points in device tree.\n"); ret = -EINVAL; goto err_edev; } - of_property_read_u32(np, "upthreshold", - &data->ondemand_data.upthreshold); - of_property_read_u32(np, "downdifferential", - &data->ondemand_data.downdifferential); + data->ondemand_data.upthreshold = 25; + data->ondemand_data.downdifferential = 15; data->rate = clk_get_rate(data->dmc_clk); opp = devfreq_recommended_opp(dev, &data->rate, 0); if (IS_ERR(opp)) { ret = PTR_ERR(opp); - goto err_free_opp; + goto err_edev; } data->rate = dev_pm_opp_get_freq(opp); data->volt = dev_pm_opp_get_voltage(opp); dev_pm_opp_put(opp); - rk3399_devfreq_dmc_profile.initial_freq = data->rate; + data->profile = (struct devfreq_dev_profile) { + .polling_ms = 200, + .target = rk3399_dmcfreq_target, + .get_dev_status = rk3399_dmcfreq_get_dev_status, + .get_cur_freq = rk3399_dmcfreq_get_cur_freq, + .initial_freq = data->rate, + }; data->devfreq = devm_devfreq_add_device(dev, - &rk3399_devfreq_dmc_profile, + &data->profile, DEVFREQ_GOV_SIMPLE_ONDEMAND, &data->ondemand_data); if (IS_ERR(data->devfreq)) { ret = PTR_ERR(data->devfreq); - goto err_free_opp; + goto err_edev; } devm_devfreq_register_opp_notifier(dev, data->devfreq); @@ -465,8 +453,6 @@ no_pmu: return 0; -err_free_opp: - dev_pm_opp_of_remove_table(&pdev->dev); err_edev: devfreq_event_disable_edev(data->edev); @@ -477,11 +463,7 @@ static int rk3399_dmcfreq_remove(struct platform_device *pdev) { struct rk3399_dmcfreq *dmcfreq = dev_get_drvdata(&pdev->dev); - /* - * Before remove the opp table we need to unregister the opp notifier. 
- */ - devm_devfreq_unregister_opp_notifier(dmcfreq->dev, dmcfreq->devfreq); - dev_pm_opp_of_remove_table(dmcfreq->dev); + devfreq_event_disable_edev(dmcfreq->edev); return 0; } diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 47551ab73ca8..b9bb94bd0f67 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -765,6 +765,106 @@ static struct cpuidle_state icx_cstates[] __initdata = { }; /* + * On AlderLake C1 has to be disabled if C1E is enabled, and vice versa. + * C1E is enabled only if "C1E promotion" bit is set in MSR_IA32_POWER_CTL. + * But in this case there is effectively no C1, because C1 requests are + * promoted to C1E. If the "C1E promotion" bit is cleared, then both C1 + * and C1E requests end up with C1, so there is effectively no C1E. + * + * By default we enable C1E and disable C1 by marking it with + * 'CPUIDLE_FLAG_UNUSABLE'. + */ +static struct cpuidle_state adl_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 220, + .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C8", + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 280, + .target_residency = 800, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 680, + .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + +static struct cpuidle_state adl_l_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE, + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 170, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C8", + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, + .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C10", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 230, + .target_residency = 700, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + +/* * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in * MSR_IA32_POWER_CTL. 
But in this case there effectively no C1, because C1 @@ -1147,6 +1247,14 @@ static const struct idle_cpu idle_cpu_icx __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_adl __initconst = { + .state_table = adl_cstates, +}; + +static const struct idle_cpu idle_cpu_adl_l __initconst = { + .state_table = adl_l_cstates, +}; + static const struct idle_cpu idle_cpu_spr __initconst = { .state_table = spr_cstates, .disable_promotion_to_c1e = true, @@ -1215,6 +1323,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &idle_cpu_adl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), @@ -1574,6 +1684,25 @@ static void __init skx_idle_state_table_update(void) } /** + * adl_idle_state_table_update - Adjust AlderLake idle states table. + */ +static void __init adl_idle_state_table_update(void) +{ + /* Check if user prefers C1 over C1E. */ + if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) { + cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE; + cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE; + + /* Disable C1E by clearing the "C1E promotion" bit. */ + c1e_promotion = C1E_PROMOTION_DISABLE; + return; + } + + /* Make sure C1E is enabled by default */ + c1e_promotion = C1E_PROMOTION_ENABLE; +} + +/** * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. */ static void __init spr_idle_state_table_update(void) @@ -1642,6 +1771,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_FAM6_SAPPHIRERAPIDS_X: spr_idle_state_table_update(); break; + case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: + adl_idle_state_table_update(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { diff --git a/drivers/iio/chemical/scd30.h b/drivers/iio/chemical/scd30.h index f60127bfe0f4..1ac9f3f79271 100644 --- a/drivers/iio/chemical/scd30.h +++ b/drivers/iio/chemical/scd30.h @@ -68,10 +68,7 @@ struct scd30_state { scd30_command_t command; }; -int scd30_suspend(struct device *dev); -int scd30_resume(struct device *dev); - -static __maybe_unused SIMPLE_DEV_PM_OPS(scd30_pm_ops, scd30_suspend, scd30_resume); +extern const struct dev_pm_ops scd30_pm_ops; int scd30_probe(struct device *dev, int irq, const char *name, void *priv, scd30_command_t command); diff --git a/drivers/iio/chemical/scd30_core.c b/drivers/iio/chemical/scd30_core.c index 9fe6bbe9ee04..682fca39d14d 100644 --- a/drivers/iio/chemical/scd30_core.c +++ b/drivers/iio/chemical/scd30_core.c @@ -517,7 +517,7 @@ static const struct iio_chan_spec scd30_channels[] = { IIO_CHAN_SOFT_TIMESTAMP(3), }; -int __maybe_unused scd30_suspend(struct device *dev) +static int scd30_suspend(struct device *dev) { struct iio_dev *indio_dev = dev_get_drvdata(dev); struct scd30_state *state = iio_priv(indio_dev); @@ -529,9 +529,8 @@ int __maybe_unused scd30_suspend(struct device *dev) return regulator_disable(state->vdd); } -EXPORT_SYMBOL(scd30_suspend); -int __maybe_unused scd30_resume(struct device *dev) +static int scd30_resume(struct device *dev) { struct iio_dev *indio_dev = dev_get_drvdata(dev); struct scd30_state *state = iio_priv(indio_dev); @@ 
-543,7 +542,8 @@ int __maybe_unused scd30_resume(struct device *dev) return scd30_command_write(state, CMD_START_MEAS, state->pressure_comp); } -EXPORT_SYMBOL(scd30_resume); + +EXPORT_NS_SIMPLE_DEV_PM_OPS(scd30_pm_ops, scd30_suspend, scd30_resume, IIO_SCD30); static void scd30_stop_meas(void *data) { @@ -759,7 +759,7 @@ int scd30_probe(struct device *dev, int irq, const char *name, void *priv, return devm_iio_device_register(dev, indio_dev); } -EXPORT_SYMBOL(scd30_probe); +EXPORT_SYMBOL_NS(scd30_probe, IIO_SCD30); MODULE_AUTHOR("Tomasz Duszynski <tomasz.duszynski@octakon.com>"); MODULE_DESCRIPTION("Sensirion SCD30 carbon dioxide sensor core driver"); diff --git a/drivers/iio/chemical/scd30_i2c.c b/drivers/iio/chemical/scd30_i2c.c index 875892a070ee..bae479a4721f 100644 --- a/drivers/iio/chemical/scd30_i2c.c +++ b/drivers/iio/chemical/scd30_i2c.c @@ -128,7 +128,7 @@ static struct i2c_driver scd30_i2c_driver = { .driver = { .name = KBUILD_MODNAME, .of_match_table = scd30_i2c_of_match, - .pm = &scd30_pm_ops, + .pm = pm_sleep_ptr(&scd30_pm_ops), }, .probe_new = scd30_i2c_probe, }; @@ -137,3 +137,4 @@ module_i2c_driver(scd30_i2c_driver); MODULE_AUTHOR("Tomasz Duszynski <tomasz.duszynski@octakon.com>"); MODULE_DESCRIPTION("Sensirion SCD30 carbon dioxide sensor i2c driver"); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(IIO_SCD30); diff --git a/drivers/iio/chemical/scd30_serial.c b/drivers/iio/chemical/scd30_serial.c index 568b34486c44..3c519103d30b 100644 --- a/drivers/iio/chemical/scd30_serial.c +++ b/drivers/iio/chemical/scd30_serial.c @@ -252,7 +252,7 @@ static struct serdev_device_driver scd30_serdev_driver = { .driver = { .name = KBUILD_MODNAME, .of_match_table = scd30_serdev_of_match, - .pm = &scd30_pm_ops, + .pm = pm_sleep_ptr(&scd30_pm_ops), }, .probe = scd30_serdev_probe, }; @@ -261,3 +261,4 @@ module_serdev_device_driver(scd30_serdev_driver); MODULE_AUTHOR("Tomasz Duszynski <tomasz.duszynski@octakon.com>"); MODULE_DESCRIPTION("Sensirion SCD30 carbon dioxide sensor serial driver"); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(IIO_SCD30); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 440ab5a03df9..485ea980bde7 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1448,7 +1448,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); * Returns 0 on success or a proper -EINVAL value in case of error. */ static int __maybe_unused -_get_dt_power(unsigned long *mW, unsigned long *kHz, struct device *dev) +_get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz) { struct dev_pm_opp *opp; unsigned long opp_freq, opp_power; @@ -1482,8 +1482,8 @@ _get_dt_power(unsigned long *mW, unsigned long *kHz, struct device *dev) * Returns -EINVAL if the power calculation failed because of missing * parameters, 0 otherwise. 
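The scd30 hunks above show both halves of the symbol-namespace pattern: the core module exports its dev_pm_ops into the IIO_SCD30 namespace with the new EXPORT_NS_SIMPLE_DEV_PM_OPS() helper, and each transport module that references the ops must import that namespace, wrapping the pointer in pm_sleep_ptr() so the reference compiles away without CONFIG_PM_SLEEP. A hedged sketch with a hypothetical FOO namespace and driver:

	#include <linux/i2c.h>
	#include <linux/module.h>
	#include <linux/pm.h>

	static int foo_suspend(struct device *dev);	/* hypothetical */
	static int foo_resume(struct device *dev);	/* hypothetical */

	/* Provider: define the ops and export them into the namespace. */
	EXPORT_NS_SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume, FOO);

	/* Consumer: reference the ops and import the namespace. */
	static struct i2c_driver foo_i2c_driver = {
		.driver = {
			.name = "foo",
			.pm = pm_sleep_ptr(&foo_pm_ops),
		},
	};
	MODULE_IMPORT_NS(FOO);
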
*/ -static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz, - struct device *dev) +static int __maybe_unused _get_power(struct device *dev, unsigned long *mW, + unsigned long *kHz) { struct dev_pm_opp *opp; struct device_node *np; diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index bca2f912d349..f5eced0842b3 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -211,7 +211,7 @@ static int __dtpm_cpu_setup(int cpu, struct dtpm *parent) return 0; pd = em_cpu_get(cpu); - if (!pd) + if (!pd || em_is_artificial(pd)) return -EINVAL; dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL); diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 07611a00b78f..a9c99d9e8b42 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1010,7 +1010,7 @@ static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value, * where time_unit is default to 1 sec. Never 0. */ if (!to_raw) - return (value) ? value *= rp->time_unit : rp->time_unit; + return (value) ? value * rp->time_unit : rp->time_unit; value = div64_u64(value, rp->time_unit); @@ -1107,6 +1107,8 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &rapl_defaults_core), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &rapl_defaults_core), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core), diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 1be45f36ab6c..9d23984d8931 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -140,6 +140,7 @@ static const struct x86_cpu_id pl4_support_ids[] = { { X86_VENDOR_INTEL, 6, INTEL_FAM6_TIGERLAKE_L, X86_FEATURE_ANY }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE, X86_FEATURE_ANY }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE_L, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_RAPTORLAKE, X86_FEATURE_ANY }, {} }; diff --git a/drivers/soc/rockchip/pm_domains.c b/drivers/soc/rockchip/pm_domains.c index 0868b7d406fb..b1cf7d29dafd 100644 --- a/drivers/soc/rockchip/pm_domains.c +++ b/drivers/soc/rockchip/pm_domains.c @@ -8,6 +8,7 @@ #include <linux/io.h> #include <linux/iopoll.h> #include <linux/err.h> +#include <linux/mutex.h> #include <linux/pm_clock.h> #include <linux/pm_domain.h> #include <linux/of_address.h> @@ -16,6 +17,7 @@ #include <linux/clk.h> #include <linux/regmap.h> #include <linux/mfd/syscon.h> +#include <soc/rockchip/pm_domains.h> #include <dt-bindings/power/px30-power.h> #include <dt-bindings/power/rk3036-power.h> #include <dt-bindings/power/rk3066-power.h> @@ -139,6 +141,109 @@ struct rockchip_pmu { #define DOMAIN_RK3568(name, pwr, req, wakeup) \ DOMAIN_M(name, pwr, pwr, req, req, req, wakeup) +/* + * Dynamic Memory Controller may need to coordinate with us -- see + * rockchip_pmu_block(). + * + * dmc_pmu_mutex protects registration-time races, so DMC driver doesn't try to + * block() while we're initializing the PMU. + */ +static DEFINE_MUTEX(dmc_pmu_mutex); +static struct rockchip_pmu *dmc_pmu; + +/* + * Block PMU transitions and make sure they don't interfere with ARM Trusted + * Firmware operations. There are two conflicts, noted in the comments below. 
+ * + * Caller must unblock PMU transitions via rockchip_pmu_unblock(). + */ +int rockchip_pmu_block(void) +{ + struct rockchip_pmu *pmu; + struct generic_pm_domain *genpd; + struct rockchip_pm_domain *pd; + int i, ret; + + mutex_lock(&dmc_pmu_mutex); + + /* No PMU (yet)? Then we just block rockchip_pmu_probe(). */ + if (!dmc_pmu) + return 0; + pmu = dmc_pmu; + + /* + * mutex blocks all idle transitions: we can't touch the + * PMU_BUS_IDLE_REQ (our ".idle_offset") register while ARM Trusted + * Firmware might be using it. + */ + mutex_lock(&pmu->mutex); + + /* + * Power domain clocks: Per Rockchip, we *must* keep certain clocks + * enabled for the duration of power-domain transitions. Most + * transitions are handled by this driver, but some cases (in + * particular, DRAM DVFS / memory-controller idle) must be handled by + * firmware. Firmware can handle most clock management via a special + * "ungate" register (PMU_CRU_GATEDIS_CON0), but unfortunately, this + * doesn't handle PLLs. We can assist this transition by doing the + * clock management on behalf of firmware. + */ + for (i = 0; i < pmu->genpd_data.num_domains; i++) { + genpd = pmu->genpd_data.domains[i]; + if (genpd) { + pd = to_rockchip_pd(genpd); + ret = clk_bulk_enable(pd->num_clks, pd->clks); + if (ret < 0) { + dev_err(pmu->dev, + "failed to enable clks for domain '%s': %d\n", + genpd->name, ret); + goto err; + } + } + } + + return 0; + +err: + for (i = i - 1; i >= 0; i--) { + genpd = pmu->genpd_data.domains[i]; + if (genpd) { + pd = to_rockchip_pd(genpd); + clk_bulk_disable(pd->num_clks, pd->clks); + } + } + mutex_unlock(&pmu->mutex); + mutex_unlock(&dmc_pmu_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rockchip_pmu_block); + +/* Unblock PMU transitions. */ +void rockchip_pmu_unblock(void) +{ + struct rockchip_pmu *pmu; + struct generic_pm_domain *genpd; + struct rockchip_pm_domain *pd; + int i; + + if (dmc_pmu) { + pmu = dmc_pmu; + for (i = 0; i < pmu->genpd_data.num_domains; i++) { + genpd = pmu->genpd_data.domains[i]; + if (genpd) { + pd = to_rockchip_pd(genpd); + clk_bulk_disable(pd->num_clks, pd->clks); + } + } + + mutex_unlock(&pmu->mutex); + } + + mutex_unlock(&dmc_pmu_mutex); +} +EXPORT_SYMBOL_GPL(rockchip_pmu_unblock); + static bool rockchip_pmu_domain_is_idle(struct rockchip_pm_domain *pd) { struct rockchip_pmu *pmu = pd->pmu; @@ -690,6 +795,12 @@ static int rockchip_pm_domain_probe(struct platform_device *pdev) error = -ENODEV; + /* + * Prevent any rockchip_pmu_block() from racing with the remainder of + * setup (clocks, register initialization). + */ + mutex_lock(&dmc_pmu_mutex); + for_each_available_child_of_node(np, node) { error = rockchip_pm_add_one_domain(pmu, node); if (error) { @@ -719,10 +830,17 @@ static int rockchip_pm_domain_probe(struct platform_device *pdev) goto err_out; } + /* We only expect one PMU. 
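The two calls are designed to bracket any firmware operation that could conflict with PMU idle transitions: rockchip_pmu_block() takes the PMU mutex and bulk-enables every domain's clocks (covering the PLL case firmware cannot manage itself), and rockchip_pmu_unblock() releases both. A minimal sketch of the calling sequence expected from the DMC driver, with the SMC itself elided:

	#include <soc/rockchip/pm_domains.h>

	static int foo_dmc_set_rate(void)
	{
		int ret;

		ret = rockchip_pmu_block();	/* may sleep: takes mutexes */
		if (ret)
			return ret;

		/*
		 * Safe window: firmware may idle the memory controller here
		 * without this driver touching PMU_BUS_IDLE_REQ concurrently.
		 */

		rockchip_pmu_unblock();		/* mandatory counterpart */
		return 0;
	}
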
*/ + if (!WARN_ON_ONCE(dmc_pmu)) + dmc_pmu = pmu; + + mutex_unlock(&dmc_pmu_mutex); + return 0; err_out: rockchip_pm_domain_cleanup(pmu); + mutex_unlock(&dmc_pmu_mutex); return error; } diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 0bfb8eebd126..b8151d95a806 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -328,7 +328,7 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev, struct cpufreq_policy *policy; unsigned int nr_levels; - if (!em) + if (!em || em_is_artificial(em)) return false; policy = cpufreq_cdev->policy; diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index 4310cb342a9f..b04dcbbf721a 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -358,6 +358,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, struct thermal_cooling_device *cdev; struct device *dev = df->dev.parent; struct devfreq_cooling_device *dfc; + struct em_perf_domain *em; char *name; int err, num_opps; @@ -367,8 +368,9 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, dfc->devfreq = df; - dfc->em_pd = em_pd_get(dev); - if (dfc->em_pd) { + em = em_pd_get(dev); + if (em && !em_is_artificial(em)) { + dfc->em_pd = em; devfreq_cooling_ops.get_requested_power = devfreq_cooling_get_requested_power; devfreq_cooling_ops.state2power = devfreq_cooling_state2power; @@ -379,7 +381,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, num_opps = em_pd_nr_perf_states(dfc->em_pd); } else { /* Backward compatibility for drivers which do not use IPA */ - dev_dbg(dev, "missing EM for cooling device\n"); + dev_dbg(dev, "missing proper EM for cooling device\n"); num_opps = dev_pm_opp_get_opp_count(dev); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 92b7ea8d8f5e..c6108581d97d 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -141,6 +141,7 @@ extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_set_enable(int cpu, bool enable); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); extern bool acpi_cpc_valid(void); +extern bool cppc_allow_fast_switch(void); extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); @@ -175,6 +176,10 @@ static inline bool acpi_cpc_valid(void) { return false; } +static inline bool cppc_allow_fast_switch(void) +{ + return false; +} static inline unsigned int cppc_get_transition_latency(int cpu) { return CPUFREQ_ETERNAL; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d7136d13aa44..03465db16b68 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -574,6 +574,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 #define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00002000 +#define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 #define OSC_SB_PRM_SUPPORT 0x00200000 @@ -581,6 +582,7 @@ extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; extern bool osc_sb_native_usb4_support_confirmed; extern bool osc_sb_cppc_not_supported; +extern bool osc_cpc_flexible_adr_space_confirmed; /* USB4 Capabilities */ #define OSC_USB_USB3_TUNNELING 
0x00000001 diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 142474b4af96..dc10bee75a72 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -38,6 +38,7 @@ enum devfreq_timer { struct devfreq; struct devfreq_governor; +struct devfreq_cpu_data; struct thermal_cooling_device; /** @@ -288,6 +289,11 @@ struct devfreq_simple_ondemand_data { #endif #if IS_ENABLED(CONFIG_DEVFREQ_GOV_PASSIVE) +enum devfreq_parent_dev_type { + DEVFREQ_PARENT_DEV, + CPUFREQ_PARENT_DEV, +}; + /** * struct devfreq_passive_data - ``void *data`` fed to struct devfreq * and devfreq_add_device @@ -299,8 +305,11 @@ struct devfreq_simple_ondemand_data { * using governors except for passive governor. * If the devfreq device has the specific method to decide * the next frequency, should use this callback. - * @this: the devfreq instance of own device. - * @nb: the notifier block for DEVFREQ_TRANSITION_NOTIFIER list + * @parent_type: the parent type of the device. + * @this: the devfreq instance of own device. + * @nb: the notifier block for DEVFREQ_TRANSITION_NOTIFIER or + * CPUFREQ_TRANSITION_NOTIFIER list. + * @cpu_data_list: the list of cpu frequency data for all cpufreq_policy. * * The devfreq_passive_data have to set the devfreq instance of parent * device with governors except for the passive governor. But, don't need to @@ -314,9 +323,13 @@ struct devfreq_passive_data { /* Optional callback to decide the next frequency of passvice device */ int (*get_target_freq)(struct devfreq *this, unsigned long *freq); + /* Should set the type of parent device */ + enum devfreq_parent_dev_type parent_type; + /* For passive governor's internal use. Don't need to set them */ struct devfreq *this; struct notifier_block nb; + struct list_head cpu_data_list; }; #endif diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 9f3c400bc52d..8419bffb4398 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -67,11 +67,16 @@ struct em_perf_domain { * * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating * energy consumption. + * + * EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be + * created by platform missing real power information */ #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) +#define EM_PERF_DOMAIN_ARTIFICIAL BIT(2) #define em_span_cpus(em) (to_cpumask((em)->cpus)) +#define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL) #ifdef CONFIG_ENERGY_MODEL #define EM_MAX_POWER 0xFFFF @@ -96,11 +101,11 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device + * @dev : Device for which we do this operation (can be a CPU) * @power : Active power at the performance state * (modified) * @freq : Frequency at the performance state in kHz * (modified) - * @dev : Device for which we do this operation (can be a CPU) * * active_power() must find the lowest performance state of 'dev' above * 'freq' and update 'power' and 'freq' to the matching active power @@ -112,11 +117,32 @@ struct em_data_callback { * * Return 0 on success. 
*/ - int (*active_power)(unsigned long *power, unsigned long *freq, - struct device *dev); + int (*active_power)(struct device *dev, unsigned long *power, + unsigned long *freq); + + /** + * get_cost() - Provide the cost at the given performance state of + * a device + * @dev : Device for which we do this operation (can be a CPU) + * @freq : Frequency at the performance state in kHz + * @cost : The cost value for the performance state + * (modified) + * + * In case of CPUs, the cost is the one of a single CPU in the domain. + * It is expected to fit in the [0, EM_MAX_POWER] range due to internal + * usage in EAS calculation. + * + * Return 0 on success, or appropriate error value in case of failure. + */ + int (*get_cost)(struct device *dev, unsigned long freq, + unsigned long *cost); }; -#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } #define EM_SET_ACTIVE_POWER_CB(em_cb, cb) ((em_cb).active_power = cb) +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) \ + { .active_power = _active_power_cb, \ + .get_cost = _cost_cb } +#define EM_DATA_CB(_active_power_cb) \ + EM_ADV_DATA_CB(_active_power_cb, NULL) struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); @@ -264,6 +290,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) #else struct em_data_callback {}; +#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } #define EM_DATA_CB(_active_power_cb) { } #define EM_SET_ACTIVE_POWER_CB(em_cb, cb) do { } while (0) diff --git a/include/linux/pm.h b/include/linux/pm.h index e65b3ab28377..ffe941958501 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -368,13 +368,13 @@ const struct dev_pm_ops name = { \ #ifdef CONFIG_PM #define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec) \ + runtime_resume_fn, idle_fn, sec, ns) \ _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ runtime_resume_fn, idle_fn); \ - _EXPORT_SYMBOL(name, sec) + __EXPORT_SYMBOL(name, sec, ns) #else #define _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, runtime_suspend_fn, \ - runtime_resume_fn, idle_fn, sec) \ + runtime_resume_fn, idle_fn, sec, ns) \ static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ resume_fn, runtime_suspend_fn, \ runtime_resume_fn, idle_fn) @@ -391,9 +391,13 @@ static __maybe_unused _DEFINE_DEV_PM_OPS(__static_##name, suspend_fn, \ _DEFINE_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL) #define EXPORT_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "") + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", "") #define EXPORT_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ - _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl") + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", "") +#define EXPORT_NS_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "", #ns) +#define EXPORT_NS_GPL_SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, suspend_fn, resume_fn, NULL, NULL, NULL, "_gpl", #ns) /* Deprecated. Use DEFINE_SIMPLE_DEV_PM_OPS() instead. 
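The new get_cost() hook is what separates an "artificial" energy model from a milliwatt-based one: active_power() still enumerates the performance states, while get_cost() hands EAS an abstract cost directly instead of having it derived from power. A hedged sketch of a callback pair wired up with EM_ADV_DATA_CB(), with the lookup helpers being hypothetical:

	static int foo_active_power(struct device *dev, unsigned long *power,
				    unsigned long *freq)
	{
		*freq = foo_next_freq_khz(dev, *freq);	/* hypothetical lookup */
		*power = 1;	/* non-zero placeholder; abstract units */
		return 0;
	}

	static int foo_get_cost(struct device *dev, unsigned long freq,
				unsigned long *cost)
	{
		/* Must fit in [0, EM_MAX_POWER] for use by EAS. */
		*cost = foo_cost_for(dev, freq);	/* hypothetical lookup */
		return 0;
	}

	static struct em_data_callback foo_em_cb =
		EM_ADV_DATA_CB(foo_active_power, foo_get_cost);

Registering such a callback with the milliwatts flag clear is what leads the core to set EM_PERF_DOMAIN_ARTIFICIAL, as the em_dev_register_perf_domain() hunk later in this diff shows.
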
*/ #define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \ diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 67017c9390c8..ebc351698090 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -91,6 +91,14 @@ struct gpd_dev_ops { int (*stop)(struct device *dev); }; +struct genpd_governor_data { + s64 max_off_time_ns; + bool max_off_time_changed; + ktime_t next_wakeup; + bool cached_power_down_ok; + bool cached_power_down_state_idx; +}; + struct genpd_power_state { s64 power_off_latency_ns; s64 power_on_latency_ns; @@ -98,7 +106,7 @@ struct genpd_power_state { u64 usage; u64 rejected; struct fwnode_handle *fwnode; - ktime_t idle_time; + u64 idle_time; void *data; }; @@ -114,6 +122,7 @@ struct generic_pm_domain { struct list_head child_links; /* Links with PM domain as a child */ struct list_head dev_list; /* List of devices */ struct dev_power_governor *gov; + struct genpd_governor_data *gd; /* Data used by a genpd governor. */ struct work_struct power_off_work; struct fwnode_handle *provider; /* Identity of the domain provider */ bool has_provider; @@ -134,11 +143,6 @@ struct generic_pm_domain { int (*set_performance_state)(struct generic_pm_domain *genpd, unsigned int state); struct gpd_dev_ops dev_ops; - s64 max_off_time_ns; /* Maximum allowed "suspended" time. */ - ktime_t next_wakeup; /* Maintained by the domain governor */ - bool max_off_time_changed; - bool cached_power_down_ok; - bool cached_power_down_state_idx; int (*attach_dev)(struct generic_pm_domain *domain, struct device *dev); void (*detach_dev)(struct generic_pm_domain *domain, @@ -149,8 +153,8 @@ struct generic_pm_domain { unsigned int state_count); unsigned int state_count; /* number of states */ unsigned int state_idx; /* state that genpd will go to when off */ - ktime_t on_time; - ktime_t accounting_time; + u64 on_time; + u64 accounting_time; const struct genpd_lock_ops *lock_ops; union { struct mutex mlock; @@ -182,6 +186,7 @@ struct gpd_timing_data { s64 suspend_latency_ns; s64 resume_latency_ns; s64 effective_constraint_ns; + ktime_t next_wakeup; bool constraint_changed; bool cached_suspend_ok; }; @@ -193,14 +198,13 @@ struct pm_domain_data { struct generic_pm_domain_data { struct pm_domain_data base; - struct gpd_timing_data td; + struct gpd_timing_data *td; struct notifier_block nb; struct notifier_block *power_nb; int cpu; unsigned int performance_state; unsigned int default_pstate; unsigned int rpm_pstate; - ktime_t next_wakeup; void *data; }; diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 2bff6a10095d..9e4d056967c6 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -41,10 +41,16 @@ #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ - suspend_fn, resume_fn, idle_fn, "") + suspend_fn, resume_fn, idle_fn, "", "") #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ - suspend_fn, resume_fn, idle_fn, "_gpl") + suspend_fn, resume_fn, idle_fn, "_gpl", "") +#define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ + suspend_fn, resume_fn, idle_fn, "", #ns) +#define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ + _EXPORT_DEV_PM_OPS(name, pm_runtime_force_suspend, pm_runtime_force_resume, \ + 
suspend_fn, resume_fn, idle_fn, "_gpl", #ns) #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 300273ff40cc..70f2921e2e70 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -542,22 +542,56 @@ static inline void unlock_system_sleep(void) {} #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; extern bool pm_debug_messages_on; -extern __printf(2, 3) void __pm_pr_dbg(bool defer, const char *fmt, ...); +static inline int pm_dyn_debug_messages_on(void) +{ +#ifdef CONFIG_DYNAMIC_DEBUG + return 1; +#else + return 0; +#endif +} +#ifndef pr_fmt +#define pr_fmt(fmt) "PM: " fmt +#endif +#define __pm_pr_dbg(fmt, ...) \ + do { \ + if (pm_debug_messages_on) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ + else if (pm_dyn_debug_messages_on()) \ + pr_debug(fmt, ##__VA_ARGS__); \ + } while (0) +#define __pm_deferred_pr_dbg(fmt, ...) \ + do { \ + if (pm_debug_messages_on) \ + printk_deferred(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ + } while (0) #else #define pm_print_times_enabled (false) #define pm_debug_messages_on (false) #include <linux/printk.h> -#define __pm_pr_dbg(defer, fmt, ...) \ - no_printk(KERN_DEBUG fmt, ##__VA_ARGS__) +#define __pm_pr_dbg(fmt, ...) \ + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) +#define __pm_deferred_pr_dbg(fmt, ...) \ + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif +/** + * pm_pr_dbg - print pm sleep debug messages + * + * If pm_debug_messages_on is enabled, print message. + * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is enabled, + * print message only from instances explicitly enabled on dynamic debug's + * control. + * If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is disabled, + * don't print message. + */ #define pm_pr_dbg(fmt, ...) \ - __pm_pr_dbg(false, fmt, ##__VA_ARGS__) + __pm_pr_dbg(fmt, ##__VA_ARGS__) #define pm_deferred_pr_dbg(fmt, ...) \ - __pm_pr_dbg(true, fmt, ##__VA_ARGS__) + __pm_deferred_pr_dbg(fmt, ##__VA_ARGS__) #ifdef CONFIG_PM_AUTOSLEEP diff --git a/include/soc/rockchip/pm_domains.h b/include/soc/rockchip/pm_domains.h new file mode 100644 index 000000000000..7dbd941fc937 --- /dev/null +++ b/include/soc/rockchip/pm_domains.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2022, The Chromium OS Authors. All rights reserved. 
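After this rework pm_pr_dbg() has three effective modes: it always prints (with the "PM: " prefix) when the pm_debug_messages switch is on, it degrades to an ordinary pr_debug() site controllable through dynamic debug when CONFIG_DYNAMIC_DEBUG is set, and it compiles to nothing otherwise. A usage sketch:

	#include <linux/suspend.h>

	static void foo_pm_notify(void)
	{
		/*
		 * Printed when pm_debug_messages_on is set, or when this
		 * call site is enabled via the dynamic debug control file.
		 */
		pm_pr_dbg("entering suspend, %d wakeup sources active\n", 0);
	}

The global switch is the usual one: write 1 to /sys/power/pm_debug_messages, or boot with the pm_debug_messages parameter handled by the __setup() hook in kernel/power/main.c.
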
+ */ + +#ifndef __SOC_ROCKCHIP_PM_DOMAINS_H__ +#define __SOC_ROCKCHIP_PM_DOMAINS_H__ + +#ifdef CONFIG_ROCKCHIP_PM_DOMAINS + +int rockchip_pmu_block(void); +void rockchip_pmu_unblock(void); + +#else /* CONFIG_ROCKCHIP_PM_DOMAINS */ + +static inline int rockchip_pmu_block(void) +{ + return 0; +} + +static inline void rockchip_pmu_unblock(void) { } + +#endif /* CONFIG_ROCKCHIP_PM_DOMAINS */ + +#endif /* __SOC_ROCKCHIP_PM_DOMAINS_H__ */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 5899260a8bef..874ad834dc8d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +ifeq ($(CONFIG_DYNAMIC_DEBUG), y) +CFLAGS_swap.o := -DDEBUG +CFLAGS_snapshot.o := -DDEBUG +CFLAGS_energy_model.o := -DDEBUG +endif KASAN_SANITIZE_snapshot.o := n diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0153b0ca7b23..6c373f2960e7 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static int em_debug_units_show(struct seq_file *s, void *unused) +static int em_debug_flags_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? - "milliWatts" : "bogoWatts"; - seq_printf(s, "%s\n", units); + seq_printf(s, "%#lx\n", pd->flags); return 0; } -DEFINE_SHOW_ATTRIBUTE(em_debug_units); - -static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) -{ - struct em_perf_domain *pd = s->private; - int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; - - seq_printf(s, "%d\n", enabled); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); static void em_debug_create_pd(struct device *dev) { @@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); - debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); - debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, - &em_debug_skip_inefficiencies_fops); + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) @@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {} #endif static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, - int nr_states, struct em_data_callback *cb) + int nr_states, struct em_data_callback *cb, + unsigned long flags) { unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; @@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); @@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. 
*/ fmax = (u64) table[nr_states - 1].frequency; for (i = nr_states - 1; i >= 0; i--) { - unsigned long power_res = em_scale_power(table[i].power); + unsigned long power_res, cost; + + if (flags & EM_PERF_DOMAIN_ARTIFICIAL) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + goto free_ps_table; + } + } else { + power_res = em_scale_power(table[i].power); + cost = div64_u64(fmax * power_res, table[i].frequency); + } + + table[i].cost = cost; - table[i].cost = div64_u64(fmax * power_res, - table[i].frequency); if (table[i].cost >= prev_cost) { table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", @@ -197,7 +196,8 @@ free_ps_table: } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + unsigned long flags) { struct em_perf_domain *pd; struct device *cpu_dev; @@ -215,7 +215,7 @@ static int em_create_pd(struct device *dev, int nr_states, return -ENOMEM; } - ret = em_create_perf_table(dev, pd, nr_states, cb); + ret = em_create_perf_table(dev, pd, nr_states, cb, flags); if (ret) { kfree(pd); return ret; @@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev) found++; } + cpufreq_cpu_put(policy); + if (!found) return; @@ -332,6 +334,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts) { unsigned long cap, prev_cap = 0; + unsigned long flags = 0; int cpu, ret; if (!dev || !nr_states || !cb) @@ -378,12 +381,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } } - ret = em_create_pd(dev, nr_states, cb, cpus); + if (milliwatts) + flags |= EM_PERF_DOMAIN_MILLIWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); if (ret) goto unlock; - if (milliwatts) - dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + dev->em_pd->flags |= flags; em_cpufreq_update_efficiencies(dev); diff --git a/kernel/power/main.c b/kernel/power/main.c index 7e646079fbeb..5242bf2ee469 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -545,35 +545,6 @@ static int __init pm_debug_messages_setup(char *str) } __setup("pm_debug_messages", pm_debug_messages_setup); -/** - * __pm_pr_dbg - Print a suspend debug message to the kernel log. - * @defer: Whether or not to use printk_deferred() to print the message. - * @fmt: Message format. - * - * The message will be emitted if enabled through the pm_debug_messages - * sysfs attribute. - */ -void __pm_pr_dbg(bool defer, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (!pm_debug_messages_on) - return; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - if (defer) - printk_deferred(KERN_DEBUG "PM: %pV", &vaf); - else - printk(KERN_DEBUG "PM: %pV", &vaf); - - va_end(args); -} - #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 11b570fcf049..3068601e585a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -6,9 +6,6 @@ * Originally from swsusp. 
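The cost formula in the non-artificial branch is cost = fmax * power / freq, i.e. energy per unit of work normalised to the highest frequency, and the walk from the fastest state downwards flags any state whose cost is not strictly below its faster neighbour's. With made-up numbers and fmax = 2000 MHz:

	2000 MHz at 900 mW -> cost = 2000 * 900 / 2000 = 900
	1500 MHz at 450 mW -> cost = 2000 * 450 / 1500 = 600
	1000 MHz at 300 mW -> cost = 2000 * 300 / 1000 = 600

Walking down, 900 and then 600 each undercut the previous cost, but the 1000 MHz state's cost (600) is not below the 1500 MHz state's (600), so it is marked EM_PERF_STATE_INEFFICIENT: running at 1500 MHz completes the same work for the same energy in less time. (em_scale_power() is ignored here for clarity.)
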
*/ - -#undef DEBUG - #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 330d49937692..2a406753af90 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -/** +/* * Data types related to memory bitmaps. * * Memory bitmap is a structure consisting of many linked lists of @@ -427,6 +427,10 @@ struct memory_bitmap { /** * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages not used before hibernation (restore only) + * @ca: Pointer to a linked list of pages ("a chain") to allocate from + * @list: Radix Tree node to add. * * This function is used to allocate inner nodes as well as the * leave nodes of the radix tree. It also adds the node to the @@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * memory_bm_next_pfn - Find the next set bit in a memory bitmap. * @bm: Memory bitmap. * * Starting from the last returned position this function searches for the next @@ -1937,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * alloc_highmem_pages - Allocate some highmem pages for the image. * * Try to allocate as many pages as needed, but if the number of free highmem * pages is less than that, allocate them all. @@ -2224,7 +2228,7 @@ static int check_header(struct swsusp_info *info) } /** - * load header - Check the image header and copy the data from it. + * load_header - Check the image header and copy the data from it. */ static int load_header(struct swsusp_info *info) { diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index f3e3c94ab9bd..92e139b9c792 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -9,7 +9,7 @@ ifeq ("$(origin O)", "command line") endif turbostat : turbostat.c -override CFLAGS += -O2 -Wall -I../../../include +override CFLAGS += -O2 -Wall -Wextra -I../../../include override CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' override CFLAGS += -DINTEL_FAMILY_HEADER='"../../../../arch/x86/include/asm/intel-family.h"' override CFLAGS += -D_FILE_OFFSET_BITS=64 diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 9b17097bc3d7..1e7d3de55a94 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -292,7 +292,7 @@ starts a new interval. must be run as root. Alternatively, non-root users can be enabled to run turbostat this way: -# setcap cap_sys_rawio=ep ./turbostat +# setcap cap_sys_admin,cap_sys_rawio,cap_sys_nice=+ep ./turbostat # chmod +r /dev/cpu/*/msr diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bc5ae0872fed..ede31a4287a0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2021 Intel Corporation. + * Copyright (c) 2022 Intel Corporation. 
* Len Brown <len.brown@intel.com> */ @@ -37,6 +37,171 @@ #include <asm/unistd.h> #include <stdbool.h> +#define UNUSED(x) (void)(x) + +/* + * This list matches the column headers, except + * 1. built-in only, the sysfs counters are not here -- we learn of those at run-time + * 2. Core and CPU are moved to the end, we can't have strings that contain them + * matching on them for --show and --hide. + */ + +/* + * buffer size used by sscanf() for added column names + * Usually truncated to 7 characters, but also handles 18 columns for raw 64-bit counters + */ +#define NAME_BYTES 20 +#define PATH_BYTES 128 + +enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; + +struct msr_counter { + unsigned int msr_num; + char name[NAME_BYTES]; + char path[PATH_BYTES]; + unsigned int width; + enum counter_type type; + enum counter_format format; + struct msr_counter *next; + unsigned int flags; +#define FLAGS_HIDE (1 << 0) +#define FLAGS_SHOW (1 << 1) +#define SYSFS_PERCPU (1 << 1) +}; + +struct msr_counter bic[] = { + { 0x0, "usec", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Time_Of_Day_Seconds", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Package", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Node", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Avg_MHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Busy%", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Bzy_MHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "TSC_MHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "IRQ", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL, 0 }, + { 0x0, "sysfs", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c1", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c3", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%c7", "", 0, 0, 0, NULL, 0 }, + { 0x0, "ThreadC", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CoreTmp", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CoreCnt", "", 0, 0, 0, NULL, 0 }, + { 0x0, "PkgTmp", "", 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%rc6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "GFXMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc2", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc3", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc7", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc8", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg%pc9", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pk%pc10", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CPU%LPI", "", 0, 0, 0, NULL, 0 }, + { 0x0, "SYS%LPI", "", 0, 0, 0, NULL, 0 }, + { 0x0, "PkgWatt", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CorWatt", "", 0, 0, 0, NULL, 0 }, + { 0x0, "GFXWatt", "", 0, 0, 0, NULL, 0 }, + { 0x0, "PkgCnt", "", 0, 0, 0, NULL, 0 }, + { 0x0, "RAMWatt", "", 0, 0, 0, NULL, 0 }, + { 0x0, "PKG_%", "", 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_%", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Pkg_J", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Cor_J", "", 0, 0, 0, NULL, 0 }, + { 0x0, "GFX_J", "", 0, 0, 0, NULL, 0 }, + { 0x0, "RAM_J", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Mod%c6", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Totl%C0", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Any%C0", "", 0, 0, 0, NULL, 0 }, + { 0x0, "GFX%C0", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CPUGFX%", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Core", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CPU", "", 0, 0, 0, NULL, 0 }, + { 0x0, "APIC", "", 0, 0, 0, NULL, 0 }, + { 0x0, "X2APIC", "", 0, 0, 0, NULL, 0 }, + { 0x0, "Die", "", 0, 0, 0, NULL, 0 }, + { 0x0, "GFXAMHz", "", 0, 0, 0, NULL, 0 }, + { 0x0, "IPC", "", 0, 0, 0, NULL, 0 }, + { 0x0, "CoreThr", "", 0, 0, 0, NULL, 0 }, +}; + +#define MAX_BIC (sizeof(bic) / 
sizeof(struct msr_counter)) +#define BIC_USEC (1ULL << 0) +#define BIC_TOD (1ULL << 1) +#define BIC_Package (1ULL << 2) +#define BIC_Node (1ULL << 3) +#define BIC_Avg_MHz (1ULL << 4) +#define BIC_Busy (1ULL << 5) +#define BIC_Bzy_MHz (1ULL << 6) +#define BIC_TSC_MHz (1ULL << 7) +#define BIC_IRQ (1ULL << 8) +#define BIC_SMI (1ULL << 9) +#define BIC_sysfs (1ULL << 10) +#define BIC_CPU_c1 (1ULL << 11) +#define BIC_CPU_c3 (1ULL << 12) +#define BIC_CPU_c6 (1ULL << 13) +#define BIC_CPU_c7 (1ULL << 14) +#define BIC_ThreadC (1ULL << 15) +#define BIC_CoreTmp (1ULL << 16) +#define BIC_CoreCnt (1ULL << 17) +#define BIC_PkgTmp (1ULL << 18) +#define BIC_GFX_rc6 (1ULL << 19) +#define BIC_GFXMHz (1ULL << 20) +#define BIC_Pkgpc2 (1ULL << 21) +#define BIC_Pkgpc3 (1ULL << 22) +#define BIC_Pkgpc6 (1ULL << 23) +#define BIC_Pkgpc7 (1ULL << 24) +#define BIC_Pkgpc8 (1ULL << 25) +#define BIC_Pkgpc9 (1ULL << 26) +#define BIC_Pkgpc10 (1ULL << 27) +#define BIC_CPU_LPI (1ULL << 28) +#define BIC_SYS_LPI (1ULL << 29) +#define BIC_PkgWatt (1ULL << 30) +#define BIC_CorWatt (1ULL << 31) +#define BIC_GFXWatt (1ULL << 32) +#define BIC_PkgCnt (1ULL << 33) +#define BIC_RAMWatt (1ULL << 34) +#define BIC_PKG__ (1ULL << 35) +#define BIC_RAM__ (1ULL << 36) +#define BIC_Pkg_J (1ULL << 37) +#define BIC_Cor_J (1ULL << 38) +#define BIC_GFX_J (1ULL << 39) +#define BIC_RAM_J (1ULL << 40) +#define BIC_Mod_c6 (1ULL << 41) +#define BIC_Totl_c0 (1ULL << 42) +#define BIC_Any_c0 (1ULL << 43) +#define BIC_GFX_c0 (1ULL << 44) +#define BIC_CPUGFX (1ULL << 45) +#define BIC_Core (1ULL << 46) +#define BIC_CPU (1ULL << 47) +#define BIC_APIC (1ULL << 48) +#define BIC_X2APIC (1ULL << 49) +#define BIC_Die (1ULL << 50) +#define BIC_GFXACTMHz (1ULL << 51) +#define BIC_IPC (1ULL << 52) +#define BIC_CORE_THROT_CNT (1ULL << 53) + +#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) +#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) +#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz ) +#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) + +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) + +unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); +unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; + +#define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME) +#define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME) +#define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME) +#define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT) +#define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) +#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) + char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; @@ -48,6 +213,7 @@ struct timespec interval_ts = { 5, 0 }; unsigned int model_orig; unsigned int num_iterations; +unsigned int header_iterations; unsigned int debug; unsigned int quiet; unsigned int shown; @@ -159,13 +325,6 @@ int ignore_stdin; #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) -/* - * buffer size used by sscanf() for added column names - * Usually truncated to 7 characters, but also handles 18 columns for raw 64-bit counters - */ -#define NAME_BYTES 20 -#define PATH_BYTES 128 - int backwards_count; char *progname; @@ -205,6 +364,7 @@ struct core_data { unsigned int core_temp_c; unsigned int core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; + unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_COUNTERS]; } *core_even, *core_odd; @@ -255,24 +415,6 @@ struct pkg_data { #define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no) -enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; -enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; -enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; - -struct msr_counter { - unsigned int msr_num; - char name[NAME_BYTES]; - char path[PATH_BYTES]; - unsigned int width; - enum counter_type type; - enum counter_format format; - struct msr_counter *next; - unsigned int flags; -#define FLAGS_HIDE (1 << 0) -#define FLAGS_SHOW (1 << 1) -#define SYSFS_PERCPU (1 << 1) -}; - /* * The accumulated sum of MSR is defined as a monotonic * increasing MSR, it will be accumulated periodically, @@ -522,8 +664,10 @@ static int perf_instr_count_open(int cpu_num) /* counter for cpu_num, including user + kernel and all processes */ fd = perf_event_open(&pea, -1, cpu_num, -1, 0); - if (fd == -1) - err(-1, "cpu%d: perf instruction counter\n", cpu_num); + if (fd == -1) { + warn("cpu%d: perf instruction counter", cpu_num); + BIC_NOT_PRESENT(BIC_IPC); + } return fd; } @@ -550,143 +694,10 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) return 0; } -/* - * This list matches the column headers, except - * 1. built-in only, the sysfs counters are not here -- we learn of those at run-time - * 2. Core and CPU are moved to the end, we can't have strings that contain them - * matching on them for --show and --hide. 
- */ -struct msr_counter bic[] = { - { 0x0, "usec" }, - { 0x0, "Time_Of_Day_Seconds" }, - { 0x0, "Package" }, - { 0x0, "Node" }, - { 0x0, "Avg_MHz" }, - { 0x0, "Busy%" }, - { 0x0, "Bzy_MHz" }, - { 0x0, "TSC_MHz" }, - { 0x0, "IRQ" }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL }, - { 0x0, "sysfs" }, - { 0x0, "CPU%c1" }, - { 0x0, "CPU%c3" }, - { 0x0, "CPU%c6" }, - { 0x0, "CPU%c7" }, - { 0x0, "ThreadC" }, - { 0x0, "CoreTmp" }, - { 0x0, "CoreCnt" }, - { 0x0, "PkgTmp" }, - { 0x0, "GFX%rc6" }, - { 0x0, "GFXMHz" }, - { 0x0, "Pkg%pc2" }, - { 0x0, "Pkg%pc3" }, - { 0x0, "Pkg%pc6" }, - { 0x0, "Pkg%pc7" }, - { 0x0, "Pkg%pc8" }, - { 0x0, "Pkg%pc9" }, - { 0x0, "Pk%pc10" }, - { 0x0, "CPU%LPI" }, - { 0x0, "SYS%LPI" }, - { 0x0, "PkgWatt" }, - { 0x0, "CorWatt" }, - { 0x0, "GFXWatt" }, - { 0x0, "PkgCnt" }, - { 0x0, "RAMWatt" }, - { 0x0, "PKG_%" }, - { 0x0, "RAM_%" }, - { 0x0, "Pkg_J" }, - { 0x0, "Cor_J" }, - { 0x0, "GFX_J" }, - { 0x0, "RAM_J" }, - { 0x0, "Mod%c6" }, - { 0x0, "Totl%C0" }, - { 0x0, "Any%C0" }, - { 0x0, "GFX%C0" }, - { 0x0, "CPUGFX%" }, - { 0x0, "Core" }, - { 0x0, "CPU" }, - { 0x0, "APIC" }, - { 0x0, "X2APIC" }, - { 0x0, "Die" }, - { 0x0, "GFXAMHz" }, - { 0x0, "IPC" }, -}; - -#define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) -#define BIC_USEC (1ULL << 0) -#define BIC_TOD (1ULL << 1) -#define BIC_Package (1ULL << 2) -#define BIC_Node (1ULL << 3) -#define BIC_Avg_MHz (1ULL << 4) -#define BIC_Busy (1ULL << 5) -#define BIC_Bzy_MHz (1ULL << 6) -#define BIC_TSC_MHz (1ULL << 7) -#define BIC_IRQ (1ULL << 8) -#define BIC_SMI (1ULL << 9) -#define BIC_sysfs (1ULL << 10) -#define BIC_CPU_c1 (1ULL << 11) -#define BIC_CPU_c3 (1ULL << 12) -#define BIC_CPU_c6 (1ULL << 13) -#define BIC_CPU_c7 (1ULL << 14) -#define BIC_ThreadC (1ULL << 15) -#define BIC_CoreTmp (1ULL << 16) -#define BIC_CoreCnt (1ULL << 17) -#define BIC_PkgTmp (1ULL << 18) -#define BIC_GFX_rc6 (1ULL << 19) -#define BIC_GFXMHz (1ULL << 20) -#define BIC_Pkgpc2 (1ULL << 21) -#define BIC_Pkgpc3 (1ULL << 22) -#define BIC_Pkgpc6 (1ULL << 23) -#define BIC_Pkgpc7 (1ULL << 24) -#define BIC_Pkgpc8 (1ULL << 25) -#define BIC_Pkgpc9 (1ULL << 26) -#define BIC_Pkgpc10 (1ULL << 27) -#define BIC_CPU_LPI (1ULL << 28) -#define BIC_SYS_LPI (1ULL << 29) -#define BIC_PkgWatt (1ULL << 30) -#define BIC_CorWatt (1ULL << 31) -#define BIC_GFXWatt (1ULL << 32) -#define BIC_PkgCnt (1ULL << 33) -#define BIC_RAMWatt (1ULL << 34) -#define BIC_PKG__ (1ULL << 35) -#define BIC_RAM__ (1ULL << 36) -#define BIC_Pkg_J (1ULL << 37) -#define BIC_Cor_J (1ULL << 38) -#define BIC_GFX_J (1ULL << 39) -#define BIC_RAM_J (1ULL << 40) -#define BIC_Mod_c6 (1ULL << 41) -#define BIC_Totl_c0 (1ULL << 42) -#define BIC_Any_c0 (1ULL << 43) -#define BIC_GFX_c0 (1ULL << 44) -#define BIC_CPUGFX (1ULL << 45) -#define BIC_Core (1ULL << 46) -#define BIC_CPU (1ULL << 47) -#define BIC_APIC (1ULL << 48) -#define BIC_X2APIC (1ULL << 49) -#define BIC_Die (1ULL << 50) -#define BIC_GFXACTMHz (1ULL << 51) -#define BIC_IPC (1ULL << 52) - -#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) -#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) -#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz ) -#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | 
BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) -#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) - -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) - -unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); -unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; - -#define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME) -#define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME) -#define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME) -#define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT) -#define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) -#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) - #define MAX_DEFERRED 16 +char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; +int deferred_add_index; int deferred_skip_index; /* @@ -720,6 +731,8 @@ void help(void) " -l, --list list column headers only\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" + " -N, --header_iterations num\n" + " print header every num iterations\n" " -o, --out file\n" " create or truncate \"file\" for all output\n" " -q, --quiet skip decoding system configuration header\n" @@ -741,7 +754,7 @@ void help(void) */ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) { - int i; + unsigned int i; unsigned long long retval = 0; while (name_list) { @@ -752,40 +765,51 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) if (comma) *comma = '\0'; - if (!strcmp(name_list, "all")) - return ~0; - if (!strcmp(name_list, "topology")) - return BIC_TOPOLOGY; - if (!strcmp(name_list, "power")) - return BIC_THERMAL_PWR; - if (!strcmp(name_list, "idle")) - return BIC_IDLE; - if (!strcmp(name_list, "frequency")) - return BIC_FREQUENCY; - if (!strcmp(name_list, "other")) - return BIC_OTHER; - if (!strcmp(name_list, "all")) - return 0; - for (i = 0; i < MAX_BIC; ++i) { if (!strcmp(name_list, bic[i].name)) { retval |= (1ULL << i); break; } + if (!strcmp(name_list, "all")) { + retval |= ~0; + break; + } else if (!strcmp(name_list, "topology")) { + retval |= BIC_TOPOLOGY; + break; + } else if (!strcmp(name_list, "power")) { + retval |= BIC_THERMAL_PWR; + break; + } else if (!strcmp(name_list, "idle")) { + retval |= BIC_IDLE; + break; + } else if (!strcmp(name_list, "frequency")) { + retval |= BIC_FREQUENCY; + break; + } else if (!strcmp(name_list, "other")) { + retval |= BIC_OTHER; + break; + } + } if (i == MAX_BIC) { if (mode == SHOW_LIST) { - fprintf(stderr, "Invalid counter name: %s\n", name_list); - exit(-1); - } - deferred_skip_names[deferred_skip_index++] = name_list; - if (debug) - fprintf(stderr, "deferred \"%s\"\n", name_list); - if (deferred_skip_index >= MAX_DEFERRED) { - fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n", - MAX_DEFERRED, name_list); - help(); - exit(1); + deferred_add_names[deferred_add_index++] = name_list; + if (deferred_add_index >= MAX_DEFERRED) { + fprintf(stderr, "More than max %d un-recognized --add options '%s'\n", + MAX_DEFERRED, name_list); + help(); + exit(1); + } + } else { + deferred_skip_names[deferred_skip_index++] = name_list; + if (debug) + fprintf(stderr, "deferred \"%s\"\n", name_list); + if (deferred_skip_index >= MAX_DEFERRED) { + fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n", + MAX_DEFERRED, name_list); + help(); + exit(1); + } } } @@ -872,6 +896,9 @@ void 
@@ -872,6 +896,9 @@ void print_header(char *delim)
 	if (DO_BIC(BIC_CoreTmp))
 		outp += sprintf(outp, "%sCoreTmp", (printed++ ? delim : ""));
 
+	if (DO_BIC(BIC_CORE_THROT_CNT))
+		outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : ""));
+
 	if (do_rapl && !rapl_joules) {
 		if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY))
 			outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
@@ -1011,6 +1038,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p
 		outp += sprintf(outp, "c6: %016llX\n", c->c6);
 		outp += sprintf(outp, "c7: %016llX\n", c->c7);
 		outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c);
+		outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt);
 		outp += sprintf(outp, "Joules: %0X\n", c->core_energy);
 
 		for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
@@ -1225,6 +1253,10 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	if (DO_BIC(BIC_CoreTmp))
 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c);
 
+	/* Core throttle count */
+	if (DO_BIC(BIC_CORE_THROT_CNT))
+		outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt);
+
 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW) {
 			if (mp->width == 32)
@@ -1311,6 +1343,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 
 	if (DO_BIC(BIC_PkgWatt))
 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float);
+
 	if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY))
 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float);
@@ -1386,14 +1419,14 @@ void flush_output_stderr(void)
 
 void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
-	static int printed;
+	static int count;
 
-	if (!printed || !summary_only)
+	if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only)
 		print_header("\t");
 
 	format_counters(&average.threads, &average.cores, &average.packages);
 
-	printed = 1;
+	count++;
 
 	if (summary_only)
 		return;
@@ -1467,6 +1500,7 @@ void delta_core(struct core_data *new, struct core_data *old)
 	old->c6 = new->c6 - old->c6;
 	old->c7 = new->c7 - old->c7;
 	old->core_temp_c = new->core_temp_c;
+	old->core_throt_cnt = new->core_throt_cnt;
 	old->mc6_us = new->mc6_us - old->mc6_us;
 
 	DELTA_WRAP32(new->core_energy, old->core_energy);
@@ -1626,6 +1660,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	c->mc6_us = 0;
 	c->core_temp_c = 0;
 	c->core_energy = 0;
+	c->core_throt_cnt = 0;
 
 	p->pkg_wtd_core_c0 = 0;
 	p->pkg_any_core_c0 = 0;
@@ -1710,6 +1745,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	average.cores.mc6_us += c->mc6_us;
 
 	average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
+	average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);
 
 	average.cores.core_energy += c->core_energy;
@@ -1987,6 +2023,26 @@ void get_apic_id(struct thread_data *t)
 		fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id);
 }
 
+int get_core_throt_cnt(int cpu, unsigned long long *cnt)
+{
+	char path[128 + PATH_BYTES];
+	unsigned long long tmp;
+	FILE *fp;
+	int ret;
+
+	sprintf(path, "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count", cpu);
+	fp = fopen(path, "r");
+	if (!fp)
+		return -1;
+	ret = fscanf(fp, "%lld", &tmp);
+	if (ret != 1)
+		return -1;
+	fclose(fp);
+	*cnt = tmp;
+
+	return 0;
+}
+
 /*
  * get_counters(...)
  * migrate to cpu
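[Reviewer note, not part of the patch] get_core_throt_cnt() above scrapes the new CoreThr column from sysfs; note its fscanf() failure path returns before fclose(fp), leaking the handle. A standalone variant (hypothetical helper name) that closes the file on every exit path:

	#include <stdio.h>

	/* Sketch: read a CPU's core throttle count from sysfs, closing
	 * the FILE handle on both the success and short-read paths. */
	static int read_core_throt_cnt(int cpu, unsigned long long *cnt)
	{
		char path[128];
		unsigned long long tmp;
		FILE *fp;
		int ret;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count", cpu);
		fp = fopen(path, "r");
		if (!fp)
			return -1;
		ret = fscanf(fp, "%llu", &tmp);
		fclose(fp);
		if (ret != 1)
			return -1;
		*cnt = tmp;
		return 0;
	}

	int main(void)
	{
		unsigned long long cnt;

		if (!read_core_throt_cnt(0, &cnt))
			printf("cpu0 core throttle count: %llu\n", cnt);
		return 0;
	}

The leak is harmless in turbostat's usage pattern (one read per interval per CPU would still exhaust descriptors only on persistent fscanf failure), but the ordering above is the conventional shape.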
@@ -2129,6 +2185,9 @@ retry:
 			c->core_temp_c = tj_max - ((msr >> 16) & 0x7F);
 	}
 
+	if (DO_BIC(BIC_CORE_THROT_CNT))
+		get_core_throt_cnt(cpu, &c->core_throt_cnt);
+
 	if (do_rapl & RAPL_AMD_F17H) {
 		if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr))
 			return -14;
@@ -2428,6 +2487,9 @@ int has_turbo_ratio_group_limits(int family, int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_ATOM_GOLDMONT:
 	case INTEL_FAM6_SKYLAKE_X:
@@ -2435,8 +2497,9 @@ int has_turbo_ratio_group_limits(int family, int model)
 	case INTEL_FAM6_ATOM_GOLDMONT_D:
 	case INTEL_FAM6_ATOM_TREMONT_D:
 		return 1;
+	default:
+		return 0;
 	}
-	return 0;
 }
 
 static void dump_turbo_ratio_limits(int family, int model)
@@ -3027,6 +3090,8 @@ void set_max_cpu_num(void)
  */
 int count_cpus(int cpu)
 {
+	UNUSED(cpu);
+
 	topo.num_cpus++;
 	return 0;
 }
@@ -3361,6 +3426,9 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
 	int i, ret;
 	int cpu = t->cpu_id;
 
+	UNUSED(c);
+	UNUSED(p);
+
 	for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
 		unsigned long long msr_cur, msr_last;
 		off_t offset;
@@ -3387,6 +3455,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
 
 static void msr_record_handler(union sigval v)
 {
+	UNUSED(v);
+
 	for_all_cpus(update_msr_sum, EVEN_COUNTERS);
 }
 
@@ -3439,6 +3509,9 @@ release_msr:
 /*
  * set_my_sched_priority(pri)
  * return previous
+ *
+ * if non-root, do this:
+ * # /sbin/setcap cap_sys_rawio,cap_sys_nice=+ep /usr/bin/turbostat
  */
 int set_my_sched_priority(int priority)
 {
@@ -3457,7 +3530,7 @@ int set_my_sched_priority(int priority)
 	errno = 0;
 	retval = getpriority(PRIO_PROCESS, 0);
 	if (retval != priority)
-		err(-1, "getpriority(%d) != setpriority(%d)", retval, priority);
+		err(retval, "getpriority(%d) != setpriority(%d)", retval, priority);
 
 	return original_priority;
 }
@@ -3466,7 +3539,7 @@ void turbostat_loop()
 {
 	int retval;
 	int restarted = 0;
-	int done_iters = 0;
+	unsigned int done_iters = 0;
 
 	setup_signal_handler();
 
@@ -3678,6 +3751,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model)
 		break;
 	case INTEL_FAM6_ATOM_SILVERMONT:	/* BYT */
 		no_MSR_MISC_PWR_MGMT = 1;
+		/* FALLTHRU */
 	case INTEL_FAM6_ATOM_SILVERMONT_D:	/* AVN */
 		pkg_cstate_limits = slv_pkg_cstate_limits;
 		break;
@@ -3721,6 +3795,9 @@ int has_slv_msrs(unsigned int family, unsigned int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_ATOM_SILVERMONT:
 	case INTEL_FAM6_ATOM_SILVERMONT_MID:
@@ -3736,6 +3813,9 @@ int is_dnv(unsigned int family, unsigned int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_ATOM_GOLDMONT_D:
 		return 1;
@@ -3749,6 +3829,9 @@ int is_bdx(unsigned int family, unsigned int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_BROADWELL_X:
 		return 1;
@@ -3762,6 +3845,9 @@ int is_skx(unsigned int family, unsigned int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_SKYLAKE_X:
 		return 1;
@@ -3775,6 +3861,9 @@ int is_icx(unsigned int family, unsigned int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_ICELAKE_X:
 		return 1;
@@ -3787,6 +3876,9 @@ int is_ehl(unsigned int family, unsigned int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_ATOM_TREMONT:
 		return 1;
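[Reviewer note, not part of the patch] The many `if (family != 6) return 0;` guards added above all encode one rule: INTEL_FAM6_* model numbers are only defined within CPUID family 6, so a model switch must never execute for another family, where the same numeric model means something unrelated. Condensed into a sketch (hypothetical helper, stand-in numbers):

	#include <stdio.h>

	/* Sketch of the guard pattern: bail out before comparing model
	 * numbers, because model namespaces collide across families. */
	static int fam6_model_is(unsigned int genuine_intel, unsigned int family,
				 unsigned int model, unsigned int target)
	{
		if (!genuine_intel)
			return 0;
		if (family != 6)
			return 0;	/* FAM6 models mean nothing elsewhere */
		return model == target;
	}

	int main(void)
	{
		/* 0x97 is a stand-in model number for illustration */
		printf("family 6:  %d\n", fam6_model_is(1, 6, 0x97, 0x97));	/* 1 */
		printf("family 25: %d\n", fam6_model_is(1, 25, 0x97, 0x97));	/* 0 */
		return 0;
	}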
@@ -3799,6 +3891,9 @@ int is_jvl(unsigned int family, unsigned int model)
 	if (!genuine_intel)
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 	case INTEL_FAM6_ATOM_TREMONT_D:
 		return 1;
@@ -3811,6 +3906,9 @@ int has_turbo_ratio_limit(unsigned int family, unsigned int model)
 	if (has_slv_msrs(family, model))
 		return 0;
 
+	if (family != 6)
+		return 0;
+
 	switch (model) {
 		/* Nehalem compatible, but do not include turbo-ratio limit support */
 	case INTEL_FAM6_NEHALEM_EX:	/* Nehalem-EX Xeon - Beckton */
@@ -4125,6 +4223,9 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	char *epb_string;
 	int cpu, epb;
 
+	UNUSED(c);
+	UNUSED(p);
+
 	if (!has_epb)
 		return 0;
@@ -4171,6 +4272,9 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	unsigned long long msr;
 	int cpu;
 
+	UNUSED(c);
+	UNUSED(p);
+
 	if (!has_hwp)
 		return 0;
@@ -4254,6 +4358,9 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
 	unsigned long long msr;
 	int cpu;
 
+	UNUSED(c);
+	UNUSED(p);
+
 	cpu = t->cpu_id;
 
 	/* per-package */
@@ -4359,6 +4466,8 @@ double get_tdp_intel(unsigned int model)
 
 double get_tdp_amd(unsigned int family)
 {
+	UNUSED(family);
+
 	/* This is the max stock TDP of HEDT/Server Fam17h+ chips */
 	return 280.0;
 }
@@ -4376,6 +4485,7 @@ static double rapl_dram_energy_units_probe(int model, double rapl_energy_units)
 	case INTEL_FAM6_BROADWELL_X:	/* BDX */
 	case INTEL_FAM6_SKYLAKE_X:	/* SKX */
 	case INTEL_FAM6_XEON_PHI_KNL:	/* KNL */
+	case INTEL_FAM6_ICELAKE_X:	/* ICX */
 		return (rapl_dram_energy_units = 15.3 / 1000000);
 	default:
 		return (rapl_energy_units);
@@ -4559,6 +4669,8 @@ void rapl_probe_amd(unsigned int family, unsigned int model)
 	unsigned int has_rapl = 0;
 	double tdp;
 
+	UNUSED(model);
+
 	if (max_extended_level >= 0x80000007) {
 		__cpuid(0x80000007, eax, ebx, ecx, edx);
 		/* RAPL (Fam 17h+) */
@@ -4617,6 +4729,7 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model)
 	case INTEL_FAM6_HASWELL_L:	/* HSW */
 	case INTEL_FAM6_HASWELL_G:	/* HSW */
 		do_gfx_perf_limit_reasons = 1;
+		/* FALLTHRU */
 	case INTEL_FAM6_HASWELL_X:	/* HSX */
 		do_core_perf_limit_reasons = 1;
 		do_ring_perf_limit_reasons = 1;
@@ -4643,6 +4756,9 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
 	unsigned int dts, dts2;
 	int cpu;
 
+	UNUSED(c);
+	UNUSED(p);
+
 	if (!(do_dts || do_ptm))
 		return 0;
@@ -4698,7 +4814,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p
 
 void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
 {
-	fprintf(outf, "cpu%d: %s: %sabled (%f Watts, %f sec, clamp %sabled)\n",
+	fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n",
 		cpu, label,
 		((msr >> 15) & 1) ? "EN" : "DIS",
 		((msr >> 0) & 0x7FFF) * rapl_power_units,
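[Reviewer note, not part of the patch] The print_power_limit_msr() change above only tightens the watts format to three decimals; the field decoding is unchanged. As a worked example of that decoding (the register value and unit below are invented for illustration): with rapl_power_units = 1/8 W, a power field of 0x3C0 (960) is 960/8 = 120 W.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long msr = 0x80083C0ULL;	/* hypothetical MSR value */
		double rapl_power_units = 1.0 / 8;	/* hypothetical: 1/8 Watt */

		/* Same extraction as print_power_limit_msr() above:
		 * bit 15 = enable, bits 14:0 = power, bit 16 = clamp. */
		printf("PKG Limit #1: %sabled (%0.3f Watts, clamp %sabled)\n",
		       ((msr >> 15) & 1) ? "EN" : "DIS",
		       ((msr >> 0) & 0x7FFF) * rapl_power_units,
		       ((msr >> 16) & 1) ? "EN" : "DIS");
		return 0;
	}

This prints "PKG Limit #1: ENabled (120.000 Watts, clamp DISabled)", which also shows why the %0.3f change matters: with sub-watt units, %f's default six decimals added noise without precision.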
"EN" : "DIS"); + + if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr)) + return -9; + + fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr); + fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n", + cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN"); } if (do_rapl & RAPL_DRAM_POWER_INFO) { @@ -4830,6 +4956,9 @@ int has_snb_msrs(unsigned int family, unsigned int model) if (!genuine_intel) return 0; + if (family != 6) + return 0; + switch (model) { case INTEL_FAM6_SANDYBRIDGE: case INTEL_FAM6_SANDYBRIDGE_X: @@ -4873,6 +5002,9 @@ int has_c8910_msrs(unsigned int family, unsigned int model) if (!genuine_intel) return 0; + if (family != 6) + return 0; + switch (model) { case INTEL_FAM6_HASWELL_L: /* HSW */ case INTEL_FAM6_BROADWELL: /* BDW */ @@ -4899,6 +5031,9 @@ int has_skl_msrs(unsigned int family, unsigned int model) if (!genuine_intel) return 0; + if (family != 6) + return 0; + switch (model) { case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ @@ -4911,6 +5046,10 @@ int is_slm(unsigned int family, unsigned int model) { if (!genuine_intel) return 0; + + if (family != 6) + return 0; + switch (model) { case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ @@ -4923,6 +5062,10 @@ int is_knl(unsigned int family, unsigned int model) { if (!genuine_intel) return 0; + + if (family != 6) + return 0; + switch (model) { case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ return 1; @@ -4935,6 +5078,9 @@ int is_cnl(unsigned int family, unsigned int model) if (!genuine_intel) return 0; + if (family != 6) + return 0; + switch (model) { case INTEL_FAM6_CANNONLAKE_L: /* CNL */ return 1; @@ -4989,6 +5135,9 @@ int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned int eax, ebx, ecx, edx; + UNUSED(c); + UNUSED(p); + if (!genuine_intel) return 0; @@ -5025,6 +5174,9 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk unsigned int tcc_default, tcc_offset; int cpu; + UNUSED(c); + UNUSED(p); + /* tj_max is used only for dts or ptm */ if (!(do_dts || do_ptm)) return 0; @@ -5572,6 +5724,11 @@ void process_cpuid() else BIC_NOT_PRESENT(BIC_CPU_LPI); + if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) + BIC_PRESENT(BIC_CORE_THROT_CNT); + else + BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); + if (!access(sys_lpi_file_sysfs, R_OK)) { sys_lpi_file = sys_lpi_file_sysfs; BIC_PRESENT(BIC_SYS_LPI); @@ -5601,11 +5758,6 @@ int dir_filter(const struct dirent *dirp) return 0; } -int open_dev_cpu_msr(int dummy1) -{ - return 0; -} - void topology_probe() { int i; @@ -5896,6 +6048,9 @@ void turbostat_init() if (!quiet && do_irtl_snb) print_irtl(); + + if (DO_BIC(BIC_IPC)) + (void)get_instr_count_fd(base_cpu); } int fork_it(char **argv) @@ -5973,7 +6128,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 21.05.04" " - Len Brown <lenb@kernel.org>\n"); + fprintf(outf, "turbostat version 2022.04.16 - Len Brown <lenb@kernel.org>\n"); } int add_counter(unsigned int msr_num, char *path, char *name, @@ -6138,6 +6293,16 @@ next: } } +int is_deferred_add(char *name) +{ + int i; + + for (i = 0; i < deferred_add_index; ++i) + if (!strcmp(name, deferred_add_names[i])) + return 1; + return 0; +} + int is_deferred_skip(char *name) { int i; @@ -6156,9 +6321,6 @@ void probe_sysfs(void) int state; char *sp; - if (!DO_BIC(BIC_sysfs)) - return; - for (state = 10; state >= 0; --state) { sprintf(path, 
"/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); @@ -6181,6 +6343,9 @@ void probe_sysfs(void) sprintf(path, "cpuidle/state%d/time", state); + if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + continue; + if (is_deferred_skip(name_buf)) continue; @@ -6206,6 +6371,9 @@ void probe_sysfs(void) sprintf(path, "cpuidle/state%d/usage", state); + if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf)) + continue; + if (is_deferred_skip(name_buf)) continue; @@ -6313,6 +6481,7 @@ void cmdline(int argc, char **argv) { "interval", required_argument, 0, 'i' }, { "IPC", no_argument, 0, 'I' }, { "num_iterations", required_argument, 0, 'n' }, + { "header_iterations", required_argument, 0, 'N' }, { "help", no_argument, 0, 'h' }, { "hide", required_argument, 0, 'H' }, // meh, -h taken by --help { "Joules", no_argument, 0, 'J' }, @@ -6394,6 +6563,14 @@ void cmdline(int argc, char **argv) exit(2); } break; + case 'N': + header_iterations = strtod(optarg, NULL); + + if (header_iterations <= 0) { + fprintf(outf, "iterations %d should be positive number\n", header_iterations); + exit(2); + } + break; case 's': /* * --show: show only those specified @@ -6432,6 +6609,8 @@ int main(int argc, char **argv) turbostat_init(); + msr_sum_record(); + /* dump counters and exit */ if (dump_only) return get_and_dump_counters(); @@ -6443,7 +6622,6 @@ int main(int argc, char **argv) return 0; } - msr_sum_record(); /* * if any params left, it must be a command to fork */ |