diff options
Diffstat (limited to 'arch')
161 files changed, 4479 insertions, 1604 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 54b2bb817a7f..173159e93c99 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -117,7 +117,7 @@ config ARM select HAVE_KERNEL_XZ select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M select HAVE_KRETPROBES if HAVE_KPROBES - select HAVE_LD_DEAD_CODE_DATA_ELIMINATION + select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD) select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPTPROBES if !THUMB2_KERNEL diff --git a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi index 52a0f6ee426f..bcf4d9c870ec 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi @@ -274,24 +274,24 @@ led@0 { chan-name = "R"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0x6e>; + max-cur = /bits/ 8 <0xc8>; reg = <0>; color = <LED_COLOR_ID_RED>; }; led@1 { chan-name = "G"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0xbe>; + max-cur = /bits/ 8 <0xc8>; reg = <1>; color = <LED_COLOR_ID_GREEN>; }; led@2 { chan-name = "B"; - led-cur = /bits/ 8 <0x20>; - max-cur = /bits/ 8 <0x60>; + led-cur = /bits/ 8 <0xbe>; + max-cur = /bits/ 8 <0xc8>; reg = <2>; color = <LED_COLOR_ID_BLUE>; }; diff --git a/arch/arm/boot/dts/ti/omap/omap3-n900.dts b/arch/arm/boot/dts/ti/omap/omap3-n900.dts index 07c5b963af78..4bde3342bb95 100644 --- a/arch/arm/boot/dts/ti/omap/omap3-n900.dts +++ b/arch/arm/boot/dts/ti/omap/omap3-n900.dts @@ -781,7 +781,7 @@ mount-matrix = "-1", "0", "0", "0", "1", "0", - "0", "0", "1"; + "0", "0", "-1"; }; cam1: camera@3e { diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index f01d23a220e6..1dfae1af8e31 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -29,6 +29,12 @@ #include "entry-header.S" #include <asm/probes.h> +#ifdef CONFIG_HAVE_LD_DEAD_CODE_DATA_ELIMINATION +#define RELOC_TEXT_NONE .reloc .text, R_ARM_NONE, . +#else +#define RELOC_TEXT_NONE +#endif + /* * Interrupt handling. */ @@ -1065,7 +1071,7 @@ vector_addrexcptn: .globl vector_fiq .section .vectors, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi ) @@ -1079,7 +1085,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi ) #ifdef CONFIG_HARDEN_BRANCH_HISTORY .section .vectors.bhb.loop8, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_bhb_loop8_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi ) @@ -1092,7 +1098,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi ) W(b) vector_bhb_loop8_fiq .section .vectors.bhb.bpiall, "ax", %progbits - .reloc .text, R_ARM_NONE, . + RELOC_TEXT_NONE W(b) vector_rst W(b) vector_bhb_bpiall_und ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi ) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi index 6b6e3ee950e5..acf293310f7a 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi @@ -175,7 +175,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 1>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 17f4e3171120..ab4c919e3e16 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -214,7 +214,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 3>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index 200e52622f99..55019866d6a2 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -182,7 +182,7 @@ }; }; - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 3>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index 8ce4b6aae79d..e3a7db21fe29 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -131,7 +131,7 @@ }; thermal-zones { - core-cluster-thermal { + cluster-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 0>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index bde89de2576e..1b306d6802ce 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -122,7 +122,7 @@ }; }; - core-cluster1-thermal { + cluster1-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 4>; @@ -151,7 +151,7 @@ }; }; - core-cluster2-thermal { + cluster2-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 5>; @@ -180,7 +180,7 @@ }; }; - core-cluster3-thermal { + cluster3-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 6>; @@ -209,7 +209,7 @@ }; }; - core-cluster4-thermal { + cluster4-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 7>; diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi index 26c7ca31e22e..bd75a658767d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi @@ -492,7 +492,7 @@ }; }; - ddr-cluster5-thermal { + ddr-ctrl5-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tmu 1>; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso index bf3e04651ba0..353ace3601dc 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso @@ -21,7 +21,7 @@ &gpio3 { pinctrl-names = "default"; - pinctrcl-0 = <&pinctrl_gpio3_hog>; + pinctrl-0 = <&pinctrl_gpio3_hog>; uart4_rs485_en { gpio-hog; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso index f4448cde0407..8a75d6783ad2 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso +++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso @@ -22,7 +22,7 @@ &gpio3 { pinctrl-names = "default"; - pinctrcl-0 = <&pinctrl_gpio3_hog>; + pinctrl-0 = <&pinctrl_gpio3_hog>; uart4_rs485_en { gpio-hog; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts index 17e2c19d8455..cc9b81d46188 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts @@ -211,13 +211,12 @@ simple-audio-card,cpu { sound-dai = <&sai3>; + frame-master; + bitclock-master; }; simple-audio-card,codec { sound-dai = <&wm8962>; - clocks = <&clk IMX8MP_CLK_IPP_DO_CLKO1>; - frame-master; - bitclock-master; }; }; }; @@ -507,10 +506,9 @@ &sai3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_sai3>; - assigned-clocks = <&clk IMX8MP_CLK_SAI3>, - <&clk IMX8MP_AUDIO_PLL2> ; - assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>; - assigned-clock-rates = <12288000>, <361267200>; + assigned-clocks = <&clk IMX8MP_CLK_SAI3>; + assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL1_OUT>; + assigned-clock-rates = <12288000>; fsl,sai-mclk-direction-output; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts index da8f19a646a9..e2ee9f5a042c 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts @@ -499,7 +499,7 @@ pinctrl-0 = <&pinctrl_usdhc2_hs>, <&pinctrl_usdhc2_gpio>; pinctrl-1 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>; pinctrl-2 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>; - cd-gpios = <&gpio3 00 GPIO_ACTIVE_LOW>; + cd-gpios = <&gpio3 0 GPIO_ACTIVE_LOW>; vmmc-supply = <®_usdhc2_vmmc>; bus-width = <4>; no-sdio; diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi index edbd8cad35bc..72a9a5d4e27a 100644 --- a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi @@ -19,7 +19,7 @@ linux,cma { compatible = "shared-dma-pool"; reusable; - alloc-ranges = <0 0x60000000 0 0x40000000>; + alloc-ranges = <0 0x80000000 0 0x40000000>; size = <0 0x10000000>; linux,cma-default; }; @@ -156,6 +156,7 @@ &wdog3 { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_wdog>; + fsl,ext-reset-output; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx93.dtsi b/arch/arm64/boot/dts/freescale/imx93.dtsi index 4a3f42355cb8..a0993022c102 100644 --- a/arch/arm64/boot/dts/freescale/imx93.dtsi +++ b/arch/arm64/boot/dts/freescale/imx93.dtsi @@ -1105,7 +1105,7 @@ <&clk IMX93_CLK_SYS_PLL_PFD0_DIV2>; assigned-clock-rates = <100000000>, <250000000>; intf_mode = <&wakeupmix_gpr 0x28>; - snps,clk-csr = <0>; + snps,clk-csr = <6>; nvmem-cells = <ð_mac2>; nvmem-cell-names = "mac-address"; status = "disabled"; diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 1bbf9a0468f6..425272aa5a81 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -27,7 +27,7 @@ reg = <0x0>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -44,7 +44,7 @@ reg = <0x100>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -61,7 +61,7 @@ reg = <0x200>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -78,7 +78,7 @@ reg = <0x300>; enable-method = "psci"; #cooling-cells = <2>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; i-cache-size = <32768>; i-cache-line-size = <64>; @@ -93,7 +93,7 @@ device_type = "cpu"; compatible = "arm,cortex-a55"; reg = <0x400>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; enable-method = "psci"; #cooling-cells = <2>; @@ -110,7 +110,7 @@ device_type = "cpu"; compatible = "arm,cortex-a55"; reg = <0x500>; - power-domains = <&scmi_devpd IMX95_PERF_A55>; + power-domains = <&scmi_perf IMX95_PERF_A55>; power-domain-names = "perf"; enable-method = "psci"; #cooling-cells = <2>; @@ -187,7 +187,7 @@ compatible = "cache"; cache-size = <524288>; cache-line-size = <64>; - cache-sets = <1024>; + cache-sets = <512>; cache-level = <3>; cache-unified; }; diff --git a/arch/arm64/boot/dts/qcom/ipq5332.dtsi b/arch/arm64/boot/dts/qcom/ipq5332.dtsi index 573656587c0d..0a74ed4f72cc 100644 --- a/arch/arm64/boot/dts/qcom/ipq5332.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq5332.dtsi @@ -320,8 +320,8 @@ reg = <0x08af8800 0x400>; interrupts = <GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>, - <GIC_SPI 53 IRQ_TYPE_EDGE_BOTH>, - <GIC_SPI 52 IRQ_TYPE_EDGE_BOTH>; + <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "pwr_event", "dp_hs_phy_irq", "dm_hs_phy_irq"; diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts index 7fb980fcb307..9caa14dda585 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts @@ -278,6 +278,13 @@ vdd-l3-supply = <&vreg_s1f_0p7>; vdd-s1-supply = <&vph_pwr>; vdd-s2-supply = <&vph_pwr>; + + vreg_l3i_0p8: ldo3 { + regulator-name = "vreg_l3i_0p8"; + regulator-min-microvolt = <880000>; + regulator-max-microvolt = <920000>; + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>; + }; }; regulators-7 { @@ -423,11 +430,17 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; @@ -517,7 +530,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -529,7 +565,7 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts index 6152bcd0bc1f..e17ab8251e2a 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts @@ -268,7 +268,6 @@ pinctrl-0 = <&edp_reg_en>; pinctrl-names = "default"; - regulator-always-on; regulator-boot-on; }; @@ -637,6 +636,14 @@ }; }; +&gpu { + status = "okay"; + + zap-shader { + firmware-name = "qcom/x1e80100/gen70500_zap.mbn"; + }; +}; + &i2c0 { clock-frequency = <400000>; @@ -724,9 +731,13 @@ aux-bus { panel { - compatible = "edp-panel"; + compatible = "samsung,atna45af01", "samsung,atna33xc20"; + enable-gpios = <&pmc8380_3_gpios 4 GPIO_ACTIVE_HIGH>; power-supply = <&vreg_edp_3p3>; + pinctrl-0 = <&edp_bl_en>; + pinctrl-names = "default"; + port { edp_panel_in: endpoint { remote-endpoint = <&mdss_dp3_out>; @@ -756,11 +767,17 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; @@ -785,6 +802,16 @@ status = "okay"; }; +&pmc8380_3_gpios { + edp_bl_en: edp-bl-en-state { + pins = "gpio4"; + function = "normal"; + power-source = <1>; /* 1.8V */ + input-disable; + output-enable; + }; +}; + &qupv3_0 { status = "okay"; }; @@ -931,7 +958,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -943,15 +993,15 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; tpad_default: tpad-default-state { diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts index fbff558f5b07..1943bdbfb8c0 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts @@ -625,16 +625,31 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; }; +&pcie4_port0 { + wifi@0 { + compatible = "pci17cb,1107"; + reg = <0x10000 0x0 0x0 0x0 0x0>; + + qcom,ath12k-calibration-variant = "LES790"; + }; +}; + &pcie6a { perst-gpios = <&tlmm 152 GPIO_ACTIVE_LOW>; wake-gpios = <&tlmm 154 GPIO_ACTIVE_LOW>; @@ -782,7 +797,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -794,15 +832,15 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; tpad_default: tpad-default-state { diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts index 72a4f4138616..8098e6730ae5 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts +++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts @@ -606,6 +606,14 @@ }; }; +&gpu { + status = "okay"; + + zap-shader { + firmware-name = "qcom/x1e80100/gen70500_zap.mbn"; + }; +}; + &lpass_tlmm { spkr_01_sd_n_active: spkr-01-sd-n-active-state { pins = "gpio12"; @@ -660,11 +668,17 @@ }; &pcie4 { + perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>; + wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>; + + pinctrl-0 = <&pcie4_default>; + pinctrl-names = "default"; + status = "okay"; }; &pcie4_phy { - vdda-phy-supply = <&vreg_l3j_0p8>; + vdda-phy-supply = <&vreg_l3i_0p8>; vdda-pll-supply = <&vreg_l3e_1p2>; status = "okay"; @@ -804,7 +818,30 @@ bias-disable; }; - pcie6a_default: pcie2a-default-state { + pcie4_default: pcie4-default-state { + clkreq-n-pins { + pins = "gpio147"; + function = "pcie4_clk"; + drive-strength = <2>; + bias-pull-up; + }; + + perst-n-pins { + pins = "gpio146"; + function = "gpio"; + drive-strength = <2>; + bias-disable; + }; + + wake-n-pins { + pins = "gpio148"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; + }; + + pcie6a_default: pcie6a-default-state { clkreq-n-pins { pins = "gpio153"; function = "pcie6a_clk"; @@ -816,15 +853,15 @@ pins = "gpio152"; function = "gpio"; drive-strength = <2>; - bias-pull-down; + bias-disable; }; wake-n-pins { - pins = "gpio154"; - function = "gpio"; - drive-strength = <2>; - bias-pull-up; - }; + pins = "gpio154"; + function = "gpio"; + drive-strength = <2>; + bias-pull-up; + }; }; wcd_default: wcd-reset-n-active-state { diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 7bca5fcd7d52..cd732ef88cd8 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -2901,7 +2901,7 @@ dma-coherent; - linux,pci-domain = <7>; + linux,pci-domain = <6>; num-lanes = <2>; interrupts = <GIC_SPI 773 IRQ_TYPE_LEVEL_HIGH>, @@ -2959,6 +2959,7 @@ "link_down"; power-domains = <&gcc GCC_PCIE_6A_GDSC>; + required-opps = <&rpmhpd_opp_nom>; phys = <&pcie6a_phy>; phy-names = "pciephy"; @@ -3022,7 +3023,7 @@ dma-coherent; - linux,pci-domain = <5>; + linux,pci-domain = <4>; num-lanes = <2>; interrupts = <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>, @@ -3080,11 +3081,22 @@ "link_down"; power-domains = <&gcc GCC_PCIE_4_GDSC>; + required-opps = <&rpmhpd_opp_nom>; phys = <&pcie4_phy>; phy-names = "pciephy"; status = "disabled"; + + pcie4_port0: pcie@0 { + device_type = "pci"; + reg = <0x0 0x0 0x0 0x0 0x0>; + bus-range = <0x01 0xff>; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + }; }; pcie4_phy: phy@1c0e000 { @@ -3155,9 +3167,10 @@ interconnects = <&gem_noc MASTER_GFX3D 0 &mc_virt SLAVE_EBI1 0>; interconnect-names = "gfx-mem"; + status = "disabled"; + zap-shader { memory-region = <&gpu_microcode_mem>; - firmware-name = "qcom/gen70500_zap.mbn"; }; gpu_opp_table: opp-table { @@ -3288,7 +3301,7 @@ reg = <0x0 0x03da0000 0x0 0x40000>; #iommu-cells = <2>; #global-interrupts = <1>; - interrupts = <GIC_SPI 673 IRQ_TYPE_LEVEL_HIGH>, + interrupts = <GIC_SPI 674 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 678 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 679 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 680 IRQ_TYPE_LEVEL_HIGH>, diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 7d32fca64996..362df9390263 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -887,6 +887,7 @@ CONFIG_DRM_PANEL_KHADAS_TS050=m CONFIG_DRM_PANEL_MANTIX_MLAF057WE51=m CONFIG_DRM_PANEL_NOVATEK_NT36672E=m CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20=m CONFIG_DRM_PANEL_SITRONIX_ST7703=m CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m CONFIG_DRM_PANEL_VISIONOX_VTDR6130=m diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 56c148890daf..d79308c23ddb 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -122,8 +122,8 @@ #define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) /* Status codes for individual page table levels */ -#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + n) -#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + n) +#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n)) +#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n)) #define ESR_ELx_FSC_FAULT_nL (0x2C) #define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \ @@ -161,6 +161,7 @@ /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_FSC_ADDRSZ (0x00) +#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n)) #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index d81cc746e0eb..109a85ee6910 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -107,6 +107,7 @@ /* TCR_EL2 Registers bits */ #define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) +#define TCR_EL2_HPD (1 << 24) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2181a11b9d92..b36a3b6cc011 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -10,6 +10,7 @@ #include <asm/hyp_image.h> #include <asm/insn.h> #include <asm/virt.h> +#include <asm/sysreg.h> #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) @@ -235,6 +236,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding); extern void __kvm_timer_set_cntvoff(u64 cntvoff); +extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); +extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); @@ -259,7 +263,7 @@ extern u64 __kvm_get_mdcr_el2(void); asm volatile( \ " mrs %1, spsr_el2\n" \ " mrs %2, elr_el2\n" \ - "1: at "at_op", %3\n" \ + "1: " __msr_s(at_op, "%3") "\n" \ " isb\n" \ " b 9f\n" \ "2: msr spsr_el2, %1\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a33f5996ca9f..b9ca899041db 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -446,6 +446,10 @@ enum vcpu_sysreg { GCR_EL1, /* Tag Control Register */ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ + /* FP/SIMD/SVE */ + SVCR, + FPMR, + /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ @@ -530,6 +534,8 @@ enum vcpu_sysreg { VNCR(CNTP_CVAL_EL0), VNCR(CNTP_CTL_EL0), + VNCR(ICH_HCR_EL2), + NR_SYS_REGS /* Nothing after this line! */ }; @@ -595,6 +601,16 @@ struct kvm_host_data { struct cpu_sve_state *sve_state; }; + union { + /* HYP VA pointer to the host storage for FPMR */ + u64 *fpmr_ptr; + /* + * Used by pKVM only, as it needs to provide storage + * for the host + */ + u64 fpmr; + }; + /* Ownership of the FP regs */ enum { FP_STATE_FREE, @@ -664,8 +680,6 @@ struct kvm_vcpu_arch { void *sve_state; enum fp_type fp_type; unsigned int sve_max_vl; - u64 svcr; - u64 fpmr; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; @@ -1473,4 +1487,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (pa + pi + pa3) == 1; \ }) +#define kvm_has_fpmr(k) \ + (system_supports_fpmr() && \ + kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 216ca424bb16..cd4087fbda9a 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -352,5 +352,11 @@ static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return &kvm->arch.mmu != mmu; } +#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); +#else +static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {} +#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */ + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 5b06c31035a2..e8bc6d67aba2 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -85,7 +85,7 @@ struct kvm_s2_trans { bool readable; int level; u32 esr; - u64 upper_attr; + u64 desc; }; static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans) @@ -115,7 +115,7 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans) static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans) { - return !(trans->upper_attr & BIT(54)); + return !(trans->desc & BIT(54)); } extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa, @@ -205,4 +205,40 @@ static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans) return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level); } +/* Adjust alignment for the contiguous bit as per StageOA() */ +#define contiguous_bit_shift(d, wi, l) \ + ({ \ + u8 shift = 0; \ + \ + if ((d) & PTE_CONT) { \ + switch (BIT((wi)->pgshift)) { \ + case SZ_4K: \ + shift = 4; \ + break; \ + case SZ_16K: \ + shift = (l) == 2 ? 5 : 7; \ + break; \ + case SZ_64K: \ + shift = 5; \ + break; \ + } \ + } \ + \ + shift; \ + }) + +static inline unsigned int ps_to_output_size(unsigned int ps) +{ + switch (ps) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: + default: + return 48; + } +} + #endif /* __ARM64_KVM_NESTED_H */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 19278dfe7978..03f4c3d7839c 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -59,6 +59,48 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) + +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) + +#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +#define KVM_MAX_OWNER_ID 1 + +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. + */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1f60aa1bc750..07dfbdb14bab 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -205,6 +205,11 @@ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) /* + * Hierarchical permission for Stage-1 tables + */ +#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61) + +/* * Highest possible physical address supported. */ #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) @@ -298,6 +303,10 @@ #define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_HD (UL(1) << 40) +#define TCR_HPD0_SHIFT 41 +#define TCR_HPD0 (UL(1) << TCR_HPD0_SHIFT) +#define TCR_HPD1_SHIFT 42 +#define TCR_HPD1 (UL(1) << TCR_HPD1_SHIFT) #define TCR_TBID0 (UL(1) << 51) #define TCR_TBID1 (UL(1) << 52) #define TCR_NFD0 (UL(1) << 53) diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 5b1701c76d1c..6cf4aae05219 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -5,6 +5,8 @@ #ifndef __ASM_PTDUMP_H #define __ASM_PTDUMP_H +#include <linux/ptdump.h> + #ifdef CONFIG_PTDUMP_CORE #include <linux/mm_types.h> @@ -21,14 +23,53 @@ struct ptdump_info { unsigned long base_addr; }; +struct ptdump_prot_bits { + u64 mask; + u64 val; + const char *set; + const char *clear; +}; + +struct ptdump_pg_level { + const struct ptdump_prot_bits *bits; + char name[4]; + int num; + u64 mask; +}; + +/* + * The page dumper groups page table entries of the same type into a single + * description. It uses pg_state to track the range information while + * iterating over the pte entries. When the continuity is broken it then + * dumps out a description of the range. + */ +struct ptdump_pg_state { + struct ptdump_state ptdump; + struct ptdump_pg_level *pg_level; + struct seq_file *seq; + const struct addr_marker *marker; + const struct mm_struct *mm; + unsigned long start_address; + int level; + u64 current_prot; + bool check_wx; + unsigned long wx_pages; + unsigned long uxn_pages; +}; + void ptdump_walk(struct seq_file *s, struct ptdump_info *info); +void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + u64 val); #ifdef CONFIG_PTDUMP_DEBUGFS #define EFI_RUNTIME_MAP_END DEFAULT_MAP_WINDOW_64 void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); #else static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } -#endif +#endif /* CONFIG_PTDUMP_DEBUGFS */ +#else +static inline void note_page(struct ptdump_state *pt_st, unsigned long addr, + int level, u64 val) { } #endif /* CONFIG_PTDUMP_CORE */ #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4a9ea103817e..bc161f160854 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -109,6 +109,9 @@ #define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x)) #define set_pstate_dit(x) asm volatile(SET_PSTATE_DIT(x)) +/* Register-based PAN access, for save/restore purposes */ +#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3) + #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) @@ -325,7 +328,25 @@ #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) #define SYS_PAR_EL1_F BIT(0) +/* When PAR_EL1.F == 1 */ #define SYS_PAR_EL1_FST GENMASK(6, 1) +#define SYS_PAR_EL1_PTW BIT(8) +#define SYS_PAR_EL1_S BIT(9) +#define SYS_PAR_EL1_AssuredOnly BIT(12) +#define SYS_PAR_EL1_TopLevel BIT(13) +#define SYS_PAR_EL1_Overlay BIT(14) +#define SYS_PAR_EL1_DirtyBit BIT(15) +#define SYS_PAR_EL1_F1_IMPDEF GENMASK_ULL(63, 48) +#define SYS_PAR_EL1_F1_RES0 (BIT(7) | BIT(10) | GENMASK_ULL(47, 16)) +#define SYS_PAR_EL1_RES1 BIT(11) +/* When PAR_EL1.F == 0 */ +#define SYS_PAR_EL1_SH GENMASK_ULL(8, 7) +#define SYS_PAR_EL1_NS BIT(9) +#define SYS_PAR_EL1_F0_IMPDEF BIT(10) +#define SYS_PAR_EL1_NSE BIT(11) +#define SYS_PAR_EL1_PA GENMASK_ULL(51, 12) +#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) +#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) /*** Statistical Profiling Extension ***/ #define PMSEVFR_EL1_RES0_IMP \ @@ -652,6 +673,7 @@ #define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5) #define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6) #define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7) +#define OP_AT_S1E2A sys_insn(AT_Op0, 4, AT_CRn, 9, 2) /* TLBI instructions */ #define TLBI_Op0 1 diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 6b3258860377..2729faaee4b4 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -25,6 +25,7 @@ * * @common: Common unwind state. * @task: The task being unwound. + * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding. * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance * associated with the most recently encountered replacement lr * value. @@ -32,6 +33,7 @@ struct kunwind_state { struct unwind_state common; struct task_struct *task; + int graph_idx; #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif @@ -106,7 +108,7 @@ kunwind_recover_return_address(struct kunwind_state *state) if (state->task->ret_stack && (state->common.pc == (unsigned long)return_to_handler)) { unsigned long orig_pc; - orig_pc = ftrace_graph_ret_addr(state->task, NULL, + orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->common.pc, (void *)state->common.fp); if (WARN_ON_ONCE(state->common.pc == orig_pc)) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8304eb342be9..ead632ad01b4 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -66,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE If unsure, or not using protected nVHE (pKVM), say N. +config PTDUMP_STAGE2_DEBUGFS + bool "Present the stage-2 pagetables to debugfs" + depends on KVM + depends on DEBUG_KERNEL + depends on DEBUG_FS + depends on GENERIC_PTDUMP + select PTDUMP_CORE + default n + help + Say Y here if you want to show the stage-2 kernel pagetables + layout in a debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. + It is probably not a good idea to enable this feature in a production + kernel. + + If in doubt, say N. + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 86a629aaf0a1..3cf7adb2b503 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -17,7 +17,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ - arch_timer.o trng.o vmid.o emulate-nested.o nested.o \ + arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ @@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o +kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o always-y := hyp_constants.h hyp-constants.s diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9bef7638342e..a0d01c46e408 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,6 +46,8 @@ #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> +#include "sys_regs.h" + static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; enum kvm_wfx_trap_policy { @@ -228,6 +230,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) void kvm_arch_create_vm_debugfs(struct kvm *kvm) { kvm_sys_regs_create_debugfs(kvm); + kvm_s2_ptdump_create_debugfs(kvm); } static void kvm_destroy_mpidr_data(struct kvm *kvm) @@ -821,15 +824,13 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - if (vcpu_has_nv(vcpu)) { - ret = kvm_init_nv_sysregs(vcpu->kvm); - if (ret) - return ret; - } + ret = kvm_finalize_sys_regs(vcpu); + if (ret) + return ret; /* - * This needs to happen after NV has imposed its own restrictions on - * the feature set + * This needs to happen after any restriction has been applied + * to the feature set. */ kvm_calculate_traps(vcpu); @@ -2163,7 +2164,7 @@ static void cpu_hyp_uninit(void *discard) } } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { /* * Most calls to this function are made with migration @@ -2183,7 +2184,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { kvm_timer_cpu_down(); kvm_vgic_cpu_down(); @@ -2379,7 +2380,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits) /* * The stub hypercalls are now disabled, so set our local flag to - * prevent a later re-init attempt in kvm_arch_hardware_enable(). + * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu(). */ __this_cpu_write(kvm_hyp_initialized, 1); preempt_enable(); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c new file mode 100644 index 000000000000..39f0e87a340e --- /dev/null +++ b/arch/arm64/kvm/at.c @@ -0,0 +1,1101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 - Linaro Ltd + * Author: Jintack Lim <jintack.lim@linaro.org> + */ + +#include <linux/kvm_host.h> + +#include <asm/esr.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +enum trans_regime { + TR_EL10, + TR_EL20, + TR_EL2, +}; + +struct s1_walk_info { + u64 baddr; + enum trans_regime regime; + unsigned int max_oa_bits; + unsigned int pgshift; + unsigned int txsz; + int sl; + bool hpd; + bool be; + bool s2; +}; + +struct s1_walk_result { + union { + struct { + u64 desc; + u64 pa; + s8 level; + u8 APTable; + bool UXNTable; + bool PXNTable; + }; + struct { + u8 fst; + bool ptw; + bool s2; + }; + }; + bool failed; +}; + +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) +{ + wr->fst = fst; + wr->ptw = ptw; + wr->s2 = s2; + wr->failed = true; +} + +#define S1_MMU_DISABLED (-127) + +static int get_ia_size(struct s1_walk_info *wi) +{ + return 64 - wi->txsz; +} + +/* Return true if the IPA is out of the OA range */ +static bool check_output_size(u64 ipa, struct s1_walk_info *wi) +{ + return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); +} + +/* Return the translation regime that applies to an AT instruction */ +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op) +{ + /* + * We only get here from guest EL2, so the translation + * regime AT applies to is solely defined by {E2H,TGE}. + */ + switch (op) { + case OP_AT_S1E2R: + case OP_AT_S1E2W: + case OP_AT_S1E2A: + return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2; + break; + default: + return (vcpu_el2_e2h_is_set(vcpu) && + vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10; + } +} + +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr; + unsigned int stride, x; + bool va55, tbi, lva, as_el0; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + wi->regime = compute_translation_regime(vcpu, op); + as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + + va55 = va & BIT(55); + + if (wi->regime == TR_EL2 && va55) + goto addrsz; + + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); + + switch (wi->regime) { + case TR_EL10: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); + break; + case TR_EL2: + case TR_EL20: + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr = (va55 ? + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); + break; + default: + BUG(); + } + + tbi = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_TBI, tcr) : + (va55 ? + FIELD_GET(TCR_TBI1, tcr) : + FIELD_GET(TCR_TBI0, tcr))); + + if (!tbi && (u64)sign_extend64(va, 55) != va) + goto addrsz; + + va = (u64)sign_extend64(va, 55); + + /* Let's put the MMU disabled case aside immediately */ + switch (wi->regime) { + case TR_EL10: + /* + * If dealing with the EL1&0 translation regime, 3 things + * can disable the S1 translation: + * + * - HCR_EL2.DC = 1 + * - HCR_EL2.{E2H,TGE} = {0,1} + * - SCTLR_EL1.M = 0 + * + * The TGE part is interesting. If we have decided that this + * is EL1&0, then it means that either {E2H,TGE} == {1,0} or + * {0,x}, and we only need to test for TGE == 1. + */ + if (hcr & (HCR_DC | HCR_TGE)) { + wr->level = S1_MMU_DISABLED; + break; + } + fallthrough; + case TR_EL2: + case TR_EL20: + if (!(sctlr & SCTLR_ELx_M)) + wr->level = S1_MMU_DISABLED; + break; + } + + if (wr->level == S1_MMU_DISABLED) { + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) + goto addrsz; + + wr->pa = va; + return 0; + } + + wi->be = sctlr & SCTLR_ELx_EE; + + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); + wi->hpd &= (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_HPD, tcr) : + (va55 ? + FIELD_GET(TCR_HPD1, tcr) : + FIELD_GET(TCR_HPD0, tcr))); + + /* Someone was silly enough to encode TG0/TG1 differently */ + if (va55) { + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG1_MASK, tcr); + + switch (tg << TCR_TG1_SHIFT) { + case TCR_TG1_4K: + wi->pgshift = 12; break; + case TCR_TG1_16K: + wi->pgshift = 14; break; + case TCR_TG1_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } else { + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); + tg = FIELD_GET(TCR_TG0_MASK, tcr); + + switch (tg << TCR_TG0_SHIFT) { + case TCR_TG0_4K: + wi->pgshift = 12; break; + case TCR_TG0_16K: + wi->pgshift = 14; break; + case TCR_TG0_64K: + default: /* IMPDEF: treat any other value as 64k */ + wi->pgshift = 16; break; + } + } + + /* R_PLCGL, R_YXNYW */ + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { + if (wi->txsz > 39) + goto transfault_l0; + } else { + if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) + goto transfault_l0; + } + + /* R_GTJBY, R_SXWGM */ + switch (BIT(wi->pgshift)) { + case SZ_4K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_16K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); + lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); + break; + case SZ_64K: + lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); + break; + } + + if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) + goto transfault_l0; + + ia_bits = get_ia_size(wi); + + /* R_YYVYV, I_THCZK */ + if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || + (va55 && va < GENMASK(63, ia_bits))) + goto transfault_l0; + + /* I_ZFSYQ */ + if (wi->regime != TR_EL2 && + (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) + goto transfault_l0; + + /* R_BNDVG and following statements */ + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && + as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) + goto transfault_l0; + + /* AArch64.S1StartLevel() */ + stride = wi->pgshift - 3; + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); + + ps = (wi->regime == TR_EL2 ? + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); + + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); + + /* Compute minimal alignment */ + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); + + wi->baddr = ttbr & TTBRx_EL1_BADDR; + + /* R_VPBBF */ + if (check_output_size(wi->baddr, wi)) + goto addrsz; + + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); + + return 0; + +addrsz: /* Address Size Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false); + return -EFAULT; + +transfault_l0: /* Translation Fault level 0 */ + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false); + return -EFAULT; +} + +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, + struct s1_walk_result *wr, u64 va) +{ + u64 va_top, va_bottom, baddr, desc; + int level, stride, ret; + + level = wi->sl; + stride = wi->pgshift - 3; + baddr = wi->baddr; + + va_top = get_ia_size(wi) - 1; + + while (1) { + u64 index, ipa; + + va_bottom = (3 - level) * stride + wi->pgshift; + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); + + ipa = baddr | index; + + if (wi->s2) { + struct kvm_s2_trans s2_trans = {}; + + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); + if (ret) { + fail_s1_walk(wr, + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, + true, true); + return ret; + } + + if (!kvm_s2_trans_readable(&s2_trans)) { + fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), + true, true); + + return -EPERM; + } + + ipa = kvm_s2_trans_output(&s2_trans); + } + + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); + if (ret) { + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), + true, false); + return ret; + } + + if (wi->be) + desc = be64_to_cpu((__force __be64)desc); + else + desc = le64_to_cpu((__force __le64)desc); + + /* Invalid descriptor */ + if (!(desc & BIT(0))) + goto transfault; + + /* Block mapping, check validity down the line */ + if (!(desc & BIT(1))) + break; + + /* Page mapping */ + if (level == 3) + break; + + /* Table handling */ + if (!wi->hpd) { + wr->APTable |= FIELD_GET(S1_TABLE_AP, desc); + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); + } + + baddr = desc & GENMASK_ULL(47, wi->pgshift); + + /* Check for out-of-range OA */ + if (check_output_size(baddr, wi)) + goto addrsz; + + /* Prepare for next round */ + va_top = va_bottom - 1; + level++; + } + + /* Block mapping, check the validity of the level */ + if (!(desc & BIT(1))) { + bool valid_block = false; + + switch (BIT(wi->pgshift)) { + case SZ_4K: + valid_block = level == 1 || level == 2; + break; + case SZ_16K: + case SZ_64K: + valid_block = level == 2; + break; + } + + if (!valid_block) + goto transfault; + } + + if (check_output_size(desc & GENMASK(47, va_bottom), wi)) + goto addrsz; + + va_bottom += contiguous_bit_shift(desc, wi, level); + + wr->failed = false; + wr->level = level; + wr->desc = desc; + wr->pa = desc & GENMASK(47, va_bottom); + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); + + return 0; + +addrsz: + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false); + return -EINVAL; +transfault: + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false); + return -ENOENT; +} + +struct mmu_config { + u64 ttbr0; + u64 ttbr1; + u64 tcr; + u64 mair; + u64 sctlr; + u64 vttbr; + u64 vtcr; + u64 hcr; +}; + +static void __mmu_config_save(struct mmu_config *config) +{ + config->ttbr0 = read_sysreg_el1(SYS_TTBR0); + config->ttbr1 = read_sysreg_el1(SYS_TTBR1); + config->tcr = read_sysreg_el1(SYS_TCR); + config->mair = read_sysreg_el1(SYS_MAIR); + config->sctlr = read_sysreg_el1(SYS_SCTLR); + config->vttbr = read_sysreg(vttbr_el2); + config->vtcr = read_sysreg(vtcr_el2); + config->hcr = read_sysreg(hcr_el2); +} + +static void __mmu_config_restore(struct mmu_config *config) +{ + write_sysreg(config->hcr, hcr_el2); + + /* + * ARM errata 1165522 and 1530923 require TGE to be 1 before + * we update the guest state. + */ + asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); + + write_sysreg_el1(config->ttbr0, SYS_TTBR0); + write_sysreg_el1(config->ttbr1, SYS_TTBR1); + write_sysreg_el1(config->tcr, SYS_TCR); + write_sysreg_el1(config->mair, SYS_MAIR); + write_sysreg_el1(config->sctlr, SYS_SCTLR); + write_sysreg(config->vttbr, vttbr_el2); + write_sysreg(config->vtcr, vtcr_el2); +} + +static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 host_pan; + bool fail; + + host_pan = read_sysreg_s(SYS_PSTATE_PAN); + write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN); + + switch (op) { + case OP_AT_S1E1RP: + fail = __kvm_at(OP_AT_S1E1RP, vaddr); + break; + case OP_AT_S1E1WP: + fail = __kvm_at(OP_AT_S1E1WP, vaddr); + break; + } + + write_sysreg_s(host_pan, SYS_PSTATE_PAN); + + return fail; +} + +#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic) +#define MEMATTR_NC 0b0100 +#define MEMATTR_Wt 0b1000 +#define MEMATTR_Wb 0b1100 +#define MEMATTR_WbRaWa 0b1111 + +#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0) + +static u8 s2_memattr_to_attr(u8 memattr) +{ + memattr &= 0b1111; + + switch (memattr) { + case 0b0000: + case 0b0001: + case 0b0010: + case 0b0011: + return memattr << 2; + case 0b0100: + return MEMATTR(Wb, Wb); + case 0b0101: + return MEMATTR(NC, NC); + case 0b0110: + return MEMATTR(Wt, NC); + case 0b0111: + return MEMATTR(Wb, NC); + case 0b1000: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1001: + return MEMATTR(NC, Wt); + case 0b1010: + return MEMATTR(Wt, Wt); + case 0b1011: + return MEMATTR(Wb, Wt); + case 0b1100: + /* Reserved, assume NC */ + return MEMATTR(NC, NC); + case 0b1101: + return MEMATTR(NC, Wb); + case 0b1110: + return MEMATTR(Wt, Wb); + case 0b1111: + return MEMATTR(Wb, Wb); + default: + unreachable(); + } +} + +static u8 combine_s1_s2_attr(u8 s1, u8 s2) +{ + bool transient; + u8 final = 0; + + /* Upgrade transient s1 to non-transient to simplify things */ + switch (s1) { + case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */ + transient = true; + s1 = MEMATTR_Wt | (s1 & GENMASK(1,0)); + break; + case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */ + transient = true; + s1 = MEMATTR_Wb | (s1 & GENMASK(1,0)); + break; + default: + transient = false; + } + + /* S2CombineS1AttrHints() */ + if ((s1 & GENMASK(3, 2)) == MEMATTR_NC || + (s2 & GENMASK(3, 2)) == MEMATTR_NC) + final = MEMATTR_NC; + else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt || + (s2 & GENMASK(3, 2)) == MEMATTR_Wt) + final = MEMATTR_Wt; + else + final = MEMATTR_Wb; + + if (final != MEMATTR_NC) { + /* Inherit RaWa hints form S1 */ + if (transient) { + switch (s1 & GENMASK(3, 2)) { + case MEMATTR_Wt: + final = 0; + break; + case MEMATTR_Wb: + final = MEMATTR_NC; + break; + } + } + + final |= s1 & GENMASK(1, 0); + } + + return final; +} + +#define ATTR_NSH 0b00 +#define ATTR_RSV 0b01 +#define ATTR_OSH 0b10 +#define ATTR_ISH 0b11 + +static u8 compute_sh(u8 attr, u64 desc) +{ + u8 sh; + + /* Any form of device, as well as NC has SH[1:0]=0b10 */ + if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) + return ATTR_OSH; + + sh = FIELD_GET(PTE_SHARED, desc); + if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ + sh = ATTR_NSH; + + return sh; +} + +static u8 combine_sh(u8 s1_sh, u8 s2_sh) +{ + if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH) + return ATTR_OSH; + if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH) + return ATTR_ISH; + + return ATTR_NSH; +} + +static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, + struct kvm_s2_trans *tr) +{ + u8 s1_parattr, s2_memattr, final_attr; + u64 par; + + /* If S2 has failed to translate, report the damage */ + if (tr->esr) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= SYS_PAR_EL1_S; + par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr); + return par; + } + + s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par); + s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc); + + if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) { + if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP)) + s2_memattr &= ~BIT(3); + + /* Combination of R_VRJSW and R_RHWZM */ + switch (s2_memattr) { + case 0b0101: + if (MEMATTR_IS_DEVICE(s1_parattr)) + final_attr = s1_parattr; + else + final_attr = MEMATTR(NC, NC); + break; + case 0b0110: + case 0b1110: + final_attr = MEMATTR(WbRaWa, WbRaWa); + break; + case 0b0111: + case 0b1111: + /* Preserve S1 attribute */ + final_attr = s1_parattr; + break; + case 0b0100: + case 0b1100: + case 0b1101: + /* Reserved, do something non-silly */ + final_attr = s1_parattr; + break; + default: + /* MemAttr[2]=0, Device from S2 */ + final_attr = s2_memattr & GENMASK(1,0) << 2; + } + } else { + /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */ + u8 s2_parattr = s2_memattr_to_attr(s2_memattr); + + if (MEMATTR_IS_DEVICE(s1_parattr) || + MEMATTR_IS_DEVICE(s2_parattr)) { + final_attr = min(s1_parattr, s2_parattr); + } else { + /* At this stage, this is memory vs memory */ + final_attr = combine_s1_s2_attr(s1_parattr & 0xf, + s2_parattr & 0xf); + final_attr |= combine_s1_s2_attr(s1_parattr >> 4, + s2_parattr >> 4) << 4; + } + } + + if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) && + !MEMATTR_IS_DEVICE(final_attr)) + final_attr = MEMATTR(NC, NC); + + par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); + par |= tr->output & GENMASK(47, 12); + par |= FIELD_PREP(SYS_PAR_EL1_SH, + combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), + compute_sh(final_attr, tr->desc))); + + return par; +} + +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, + enum trans_regime regime) +{ + u64 par; + + if (wr->failed) { + par = SYS_PAR_EL1_RES1; + par |= SYS_PAR_EL1_F; + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; + par |= wr->s2 ? SYS_PAR_EL1_S : 0; + } else if (wr->level == S1_MMU_DISABLED) { + /* MMU off or HCR_EL2.DC == 1 */ + par = SYS_PAR_EL1_NSE; + par |= wr->pa & GENMASK_ULL(47, 12); + + if (regime == TR_EL10 && + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, + MEMATTR(WbRaWa, WbRaWa)); + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH); + } else { + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ + par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH); + } + } else { + u64 mair, sctlr; + u8 sh; + + par = SYS_PAR_EL1_NSE; + + mair = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, MAIR_EL1) : + vcpu_read_sys_reg(vcpu, MAIR_EL2)); + + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; + mair &= 0xff; + + sctlr = (regime == TR_EL10 ? + vcpu_read_sys_reg(vcpu, SCTLR_EL1) : + vcpu_read_sys_reg(vcpu, SCTLR_EL2)); + + /* Force NC for memory if SCTLR_ELx.C is clear */ + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) + mair = MEMATTR(NC, NC); + + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); + par |= wr->pa & GENMASK_ULL(47, 12); + + sh = compute_sh(mair, wr->desc); + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); + } + + return par; +} + +static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + u64 sctlr; + + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + return false; + + if (regime == TR_EL10) + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + else + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); + + return sctlr & SCTLR_EL1_EPAN; +} + +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + bool perm_fail, ur, uw, ux, pr, pw, px; + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + int ret, idx; + + ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (ret) + goto compute_par; + + /* FIXME: revisit when adding indirect permission support */ + /* AArch64.S1DirectBasePermissions() */ + if (wi.regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) { + case 0b00: + pr = pw = true; + ur = uw = false; + break; + case 0b01: + pr = pw = ur = uw = true; + break; + case 0b10: + pr = true; + pw = ur = uw = false; + break; + case 0b11: + pr = ur = true; + pw = uw = false; + break; + } + + switch (wr.APTable) { + case 0b00: + break; + case 0b01: + ur = uw = false; + break; + case 0b10: + pw = uw = false; + break; + case 0b11: + pw = ur = uw = false; + break; + } + + /* We don't use px for anything yet, but hey... */ + px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw); + ux = !((wr.desc & PTE_UXN) || wr.UXNTable); + + if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { + bool pan; + + pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; + pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux); + pw &= !pan; + pr &= !pan; + } + } else { + ur = uw = ux = false; + + if (!(wr.desc & PTE_RDONLY)) { + pr = pw = true; + } else { + pr = true; + pw = false; + } + + if (wr.APTable & BIT(1)) + pw = false; + + /* XN maps to UXN */ + px = !((wr.desc & PTE_UXN) || wr.UXNTable); + } + + perm_fail = false; + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1R: + case OP_AT_S1E2R: + perm_fail = !pr; + break; + case OP_AT_S1E1WP: + case OP_AT_S1E1W: + case OP_AT_S1E2W: + perm_fail = !pw; + break; + case OP_AT_S1E0R: + perm_fail = !ur; + break; + case OP_AT_S1E0W: + perm_fail = !uw; + break; + case OP_AT_S1E1A: + case OP_AT_S1E2A: + break; + default: + BUG(); + } + + if (perm_fail) + fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false); + +compute_par: + return compute_par_s1(vcpu, &wr, wi.regime); +} + +/* + * Return the PAR_EL1 value as the result of a valid translation. + * + * If the translation is unsuccessful, the value may only contain + * PAR_EL1.F, and cannot be taken at face value. It isn't an + * indication of the translation having failed, only that the fast + * path did not succeed, *unless* it indicates a S1 permission fault. + */ +static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct mmu_config config; + struct kvm_s2_mmu *mmu; + bool fail; + u64 par; + + par = SYS_PAR_EL1_F; + + /* + * We've trapped, so everything is live on the CPU. As we will + * be switching contexts behind everybody's back, disable + * interrupts while holding the mmu lock. + */ + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock); + + /* + * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already + * the right one (as we trapped from vEL2). If not, save the + * full MMU context. + */ + if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) + goto skip_mmu_switch; + + /* + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not + * find it (recycled by another vcpu, for example). When this + * happens, admit defeat immediately and use the SW (slow) path. + */ + mmu = lookup_s2_mmu(vcpu); + if (!mmu) + return par; + + __mmu_config_save(&config); + + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); + __load_stage2(mmu, mmu->arch); + +skip_mmu_switch: + /* Clear TGE, enable S2 translation, we're rolling */ + write_sysreg((config.hcr & ~HCR_TGE) | HCR_VM, hcr_el2); + isb(); + + switch (op) { + case OP_AT_S1E1RP: + case OP_AT_S1E1WP: + fail = at_s1e1p_fast(vcpu, op, vaddr); + break; + case OP_AT_S1E1R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E1W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E0R: + fail = __kvm_at(OP_AT_S1E0R, vaddr); + break; + case OP_AT_S1E0W: + fail = __kvm_at(OP_AT_S1E0W, vaddr); + break; + case OP_AT_S1E1A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + break; + } + + if (!fail) + par = read_sysreg_par(); + + if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) + __mmu_config_restore(&config); + + return par; +} + +static bool par_check_s1_perm_fault(u64 par) +{ + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); + + return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM && + !(par & SYS_PAR_EL1_S)); +} + +void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr); + + /* + * If PAR_EL1 reports that AT failed on a S1 permission fault, we + * know for sure that the PTW was able to walk the S1 tables and + * there's nothing else to do. + * + * If AT failed for any other reason, then we must walk the guest S1 + * to emulate the instruction. + */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + u64 par; + + /* + * We've trapped, so everything is live on the CPU. As we will be + * switching context behind everybody's back, disable interrupts... + */ + scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { + struct kvm_s2_mmu *mmu; + u64 val, hcr; + bool fail; + + mmu = &vcpu->kvm->arch.mmu; + + val = hcr = read_sysreg(hcr_el2); + val &= ~HCR_TGE; + val |= HCR_VM; + + if (!vcpu_el2_e2h_is_set(vcpu)) + val |= HCR_NV | HCR_NV1; + + write_sysreg(val, hcr_el2); + isb(); + + par = SYS_PAR_EL1_F; + + switch (op) { + case OP_AT_S1E2R: + fail = __kvm_at(OP_AT_S1E1R, vaddr); + break; + case OP_AT_S1E2W: + fail = __kvm_at(OP_AT_S1E1W, vaddr); + break; + case OP_AT_S1E2A: + fail = __kvm_at(OP_AT_S1E1A, vaddr); + break; + default: + WARN_ON_ONCE(1); + fail = true; + } + + isb(); + + if (!fail) + par = read_sysreg_par(); + + write_sysreg(hcr, hcr_el2); + isb(); + } + + /* We failed the translation, let's replay it in slow motion */ + if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) + par = handle_at_slow(vcpu, op, vaddr); + + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} + +void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct kvm_s2_trans out = {}; + u64 ipa, par; + bool write; + int ret; + + /* Do the stage-1 translation */ + switch (op) { + case OP_AT_S12E1R: + op = OP_AT_S1E1R; + write = false; + break; + case OP_AT_S12E1W: + op = OP_AT_S1E1W; + write = true; + break; + case OP_AT_S12E0R: + op = OP_AT_S1E0R; + write = false; + break; + case OP_AT_S12E0W: + op = OP_AT_S1E0W; + write = true; + break; + default: + WARN_ON_ONCE(1); + return; + } + + __kvm_at_s1e01(vcpu, op, vaddr); + par = vcpu_read_sys_reg(vcpu, PAR_EL1); + if (par & SYS_PAR_EL1_F) + return; + + /* + * If we only have a single stage of translation (E2H=0 or + * TGE=1), exit early. Same thing if {VM,DC}=={0,0}. + */ + if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) || + !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC))) + return; + + /* Do the stage-2 translation */ + ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); + out.esr = 0; + ret = kvm_walk_nested_s2(vcpu, ipa, &out); + if (ret < 0) + return; + + /* Check the access permission */ + if (!out.esr && + ((!write && !out.readable) || (write && !out.writable))) + out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3); + + par = compute_par_s12(vcpu, par, &out); + vcpu_write_sys_reg(vcpu, par, PAR_EL1); +} diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 05166eccea0a..05b6435d02a9 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -83,14 +83,20 @@ enum cgt_group_id { CGT_CPTR_TAM, CGT_CPTR_TCPAC, + CGT_HCRX_EnFPM, CGT_HCRX_TCR2En, + CGT_ICH_HCR_TC, + CGT_ICH_HCR_TALL0, + CGT_ICH_HCR_TALL1, + CGT_ICH_HCR_TDIR, + /* * Anything after this point is a combination of coarse trap * controls, which must all be evaluated to decide what to do. */ __MULTIPLE_CONTROL_BITS__, - CGT_HCR_IMO_FMO = __MULTIPLE_CONTROL_BITS__, + CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__, CGT_HCR_TID2_TID4, CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB_TTLBOS, @@ -105,6 +111,8 @@ enum cgt_group_id { CGT_MDCR_TDE_TDRA, CGT_MDCR_TDCC_TDE_TDA, + CGT_ICH_HCR_TC_TDIR, + /* * Anything after this point requires a callback evaluating a * complex trap condition. Ugly stuff. @@ -372,12 +380,42 @@ static const struct trap_bits coarse_trap_bits[] = { .mask = CPTR_EL2_TCPAC, .behaviour = BEHAVE_FORWARD_ANY, }, + [CGT_HCRX_EnFPM] = { + .index = HCRX_EL2, + .value = 0, + .mask = HCRX_EL2_EnFPM, + .behaviour = BEHAVE_FORWARD_ANY, + }, [CGT_HCRX_TCR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_TCR2En, .behaviour = BEHAVE_FORWARD_ANY, }, + [CGT_ICH_HCR_TC] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TC, + .mask = ICH_HCR_TC, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TALL0] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL0, + .mask = ICH_HCR_TALL0, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TALL1] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TALL1, + .mask = ICH_HCR_TALL1, + .behaviour = BEHAVE_FORWARD_ANY, + }, + [CGT_ICH_HCR_TDIR] = { + .index = ICH_HCR_EL2, + .value = ICH_HCR_TDIR, + .mask = ICH_HCR_TDIR, + .behaviour = BEHAVE_FORWARD_ANY, + }, }; #define MCB(id, ...) \ @@ -387,7 +425,6 @@ static const struct trap_bits coarse_trap_bits[] = { } static const enum cgt_group_id *coarse_control_combo[] = { - MCB(CGT_HCR_IMO_FMO, CGT_HCR_IMO, CGT_HCR_FMO), MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4), MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS), MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS), @@ -402,6 +439,9 @@ static const enum cgt_group_id *coarse_control_combo[] = { MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA), + + MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC), + MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR), }; typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); @@ -536,9 +576,9 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4), SR_RANGE_TRAP(SYS_ID_PFR0_EL1, sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3), - SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO), - SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO), + SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), + SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC), SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0), sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP), SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0), @@ -786,6 +826,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV), SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV), + SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV), SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV), @@ -867,6 +908,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT), SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT), + SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT), SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN), SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN), @@ -1108,6 +1150,35 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), + /* + * IMPDEF choice: + * We treat ICC_SRE_EL2.{SRE,Enable) and ICV_SRE_EL1.SRE as + * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for + * ICC_SRE_EL1 access, and always handle it locally. + */ + SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR), + SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0), + SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1), + SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC), + SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC), }; static DEFINE_XARRAY(sr_forward_xa); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index c53e5b14038d..ea5484ce1f3b 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -63,6 +63,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) */ *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); + *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) @@ -134,8 +135,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; - fp_state.svcr = &vcpu->arch.svcr; - fp_state.fpmr = &vcpu->arch.fpmr; + fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR); + fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 11098eb7eb44..962f985977c2 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1045,6 +1045,11 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, mutex_lock(&kvm->slots_lock); + if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) { + ret = -EBUSY; + goto out; + } + while (length > 0) { kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); void *maddr; @@ -1059,6 +1064,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, page = pfn_to_online_page(pfn); if (!page) { /* Reject ZONE_DEVICE memory */ + kvm_release_pfn_clean(pfn); ret = -EFAULT; goto out; } diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9e13c1bc2ad5..487c06099d6f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -27,7 +27,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) * saved the guest context yet, and we may return early... */ par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) + if (!__kvm_at(OP_AT_S1E1R, far)) tmp = read_sysreg_par(); else tmp = SYS_PAR_EL1_F; /* back to the guest */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 37ff87d782b6..46d52e8a3df3 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -403,6 +403,9 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) else __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR); + /* Skip restoring fpexc32 for AArch64 guests */ if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index e715c157c2c4..e433dfab882a 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -426,9 +426,9 @@ out: return; } -static __always_inline void do_ffa_mem_xfer(const u64 func_id, - struct arm_smccc_res *res, - struct kvm_cpu_context *ctxt) +static void __do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); DECLARE_REG(u32, fraglen, ctxt, 2); @@ -440,9 +440,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, u32 offset, nr_ranges; int ret = 0; - BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && - func_id != FFA_FN64_MEM_LEND); - if (addr_mbz || npages_mbz || fraglen > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { ret = FFA_RET_INVALID_PARAMETERS; @@ -461,6 +458,11 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, goto out_unlock; } + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + buf = hyp_buffers.tx; memcpy(buf, host_buffers.tx, fraglen); @@ -512,6 +514,13 @@ err_unshare: goto out_unlock; } +#define do_ffa_mem_xfer(fid, res, ctxt) \ + do { \ + BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \ + (fid) != FFA_FN64_MEM_LEND); \ + __do_ffa_mem_xfer((fid), (res), (ctxt)); \ + } while (0); + static void do_ffa_mem_reclaim(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 07120b37da35..401af1835be6 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -130,7 +130,7 @@ alternative_else_nop_endif /* Invalidate the stale TLBs from Bootloader */ tlbi alle2 - tlbi vmalls12e1 + tlbi alle1 dsb sy mov_q x0, INIT_SCTLR_EL2_MMU_ON diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f43d845f3c4e..87692b566d90 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -62,6 +62,8 @@ static void fpsimd_sve_flush(void) static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) { + bool has_fpmr; + if (!guest_owns_fp_regs()) return; @@ -73,11 +75,18 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) else __fpsimd_save_state(&vcpu->arch.ctxt.fp_regs); + has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); + if (has_fpmr) + __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR); + if (system_supports_sve()) __hyp_sve_restore_host(); else __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + if (has_fpmr) + write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); + *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 8f5c56d5b1cd..cc69106734ca 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -197,6 +197,15 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) } else { __fpsimd_save_state(*host_data_ptr(fpsimd_state)); } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { + u64 val = read_sysreg_s(SYS_FPMR); + + if (unlikely(is_protected_kvm_enabled())) + *host_data_ptr(fpmr) = val; + else + **host_data_ptr(fpmr_ptr) = val; + } } static const exit_handler_fn hyp_exit_handlers[] = { diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index ca3c09df8d7c..48da9ca9763f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -132,10 +132,10 @@ static void exit_vmid_context(struct tlb_inv_context *cxt) else __load_host_stage2(); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the old VMID */ - isb(); + /* Ensure write of the old VMID */ + isb(); + if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { if (!(cxt->sctlr & SCTLR_ELx_M)) { write_sysreg_el1(cxt->sctlr, SYS_SCTLR); isb(); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 9e2bbee77491..b11bcebac908 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -17,48 +17,6 @@ #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ - ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) - -#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_HI_S2_XN) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 - -/* - * Used to indicate a pte for which a 'break-before-make' sequence is in - * progress. - */ -#define KVM_INVALID_PTE_LOCKED BIT(10) - struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; @@ -1547,7 +1505,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); - dsb(ishst); return 0; } @@ -1559,8 +1516,11 @@ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, .flags = KVM_PGTABLE_WALK_LEAF, .arg = mc, }; + int ret; - return kvm_pgtable_walk(pgt, addr, size, &walker); + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; } int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 7b397fad26f2..18d4677002b1 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -268,8 +268,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) * starting to mess with the rest of the GIC, and VMCR_EL2 in * particular. This logic must be called before * __vgic_v3_restore_state(). + * + * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is + * provisioned at all. In order to prevent illegal accesses to the + * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1 + * so that the trap bits can take effect. Yes, we *loves* the GIC. */ - if (!cpu_if->vgic_sre) { + if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) { + write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1); + isb(); + } else if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); @@ -288,8 +296,9 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) } /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. + * Prevent the guest from touching the ICC_SRE_EL1 system + * register. Note that this may not have any effect, as + * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation. */ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); @@ -297,10 +306,11 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if) /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, + * injected. Note that this also applies if we don't expect + * any system register access (no vgic at all). */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } @@ -326,7 +336,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) * no interrupts were being injected, and we disable it again here. */ if (static_branch_unlikely(&vgic_v3_cpuif_trap) || - cpu_if->its_vpe.its_vm) + cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre) write_gicreg(0, ICH_HCR_EL2); } @@ -1032,6 +1042,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) write_gicreg(vmcr, ICH_VMCR_EL2); } +static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu, + u32 sysreg, bool is_read) +{ + u64 ich_hcr; + + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + return false; + + ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); + + switch (sysreg) { + case SYS_ICC_IGRPEN0_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP0Rn_EL1(0): + case SYS_ICC_AP0Rn_EL1(1): + case SYS_ICC_AP0Rn_EL1(2): + case SYS_ICC_AP0Rn_EL1(3): + case SYS_ICC_BPR0_EL1: + case SYS_ICC_EOIR0_EL1: + case SYS_ICC_HPPIR0_EL1: + case SYS_ICC_IAR0_EL1: + return ich_hcr & ICH_HCR_TALL0; + + case SYS_ICC_IGRPEN1_EL1: + if (is_read && + (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + if (!is_read && + (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1)) + return true; + + fallthrough; + + case SYS_ICC_AP1Rn_EL1(0): + case SYS_ICC_AP1Rn_EL1(1): + case SYS_ICC_AP1Rn_EL1(2): + case SYS_ICC_AP1Rn_EL1(3): + case SYS_ICC_BPR1_EL1: + case SYS_ICC_EOIR1_EL1: + case SYS_ICC_HPPIR1_EL1: + case SYS_ICC_IAR1_EL1: + return ich_hcr & ICH_HCR_TALL1; + + case SYS_ICC_DIR_EL1: + if (ich_hcr & ICH_HCR_TDIR) + return true; + + fallthrough; + + case SYS_ICC_RPR_EL1: + case SYS_ICC_CTLR_EL1: + case SYS_ICC_PMR_EL1: + return ich_hcr & ICH_HCR_TC; + + default: + return false; + } +} + int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) { int rt; @@ -1041,6 +1120,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) bool is_read; u32 sysreg; + if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return 0; + esr = kvm_vcpu_get_esr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { if (!kvm_condition_valid(vcpu)) { @@ -1055,6 +1137,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; + if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read)) + return 0; + switch (sysreg) { case SYS_ICC_IAR0_EL1: case SYS_ICC_IAR1_EL1: diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 77010b76c150..80581b1c3995 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -312,6 +312,9 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) { __fpsimd_save_state(*host_data_ptr(fpsimd_state)); + + if (kvm_has_fpmr(vcpu->kvm)) + **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); } static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 6981b1bc0946..a509b63bd4dd 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1540,8 +1540,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = min(vma_pagesize, (long)max_map_size); } - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + /* + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. + */ + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); + } gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index bab27f9d8cc6..638b6205526f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -103,20 +103,6 @@ struct s2_walk_info { bool be; }; -static unsigned int ps_to_output_size(unsigned int ps) -{ - switch (ps) { - case 0: return 32; - case 1: return 36; - case 2: return 40; - case 3: return 42; - case 4: return 44; - case 5: - default: - return 48; - } -} - static u32 compute_fsc(int level, u32 fsc) { return fsc | (level & 0x3); @@ -256,7 +242,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, /* Check for valid descriptor at this point */ if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); - out->upper_attr = desc; + out->desc = desc; return 1; } @@ -266,7 +252,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, if (check_output_size(wi, desc)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); - out->upper_attr = desc; + out->desc = desc; return 1; } @@ -278,27 +264,24 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, if (level < first_block_level) { out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT); - out->upper_attr = desc; + out->desc = desc; return 1; } - /* - * We don't use the contiguous bit in the stage-2 ptes, so skip check - * for misprogramming of the contiguous bit. - */ - if (check_output_size(wi, desc)) { out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ); - out->upper_attr = desc; + out->desc = desc; return 1; } if (!(desc & BIT(10))) { out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS); - out->upper_attr = desc; + out->desc = desc; return 1; } + addr_bottom += contiguous_bit_shift(desc, wi, level); + /* Calculate and return the result */ paddr = (desc & GENMASK_ULL(47, addr_bottom)) | (ipa & GENMASK_ULL(addr_bottom - 1, 0)); @@ -307,7 +290,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa, out->readable = desc & (0b01 << 6); out->writable = desc & (0b10 << 6); out->level = level; - out->upper_attr = desc & GENMASK_ULL(63, 52); + out->desc = desc; return 0; } @@ -954,19 +937,16 @@ static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) int kvm_init_nv_sysregs(struct kvm *kvm) { u64 res0, res1; - int ret = 0; - mutex_lock(&kvm->arch.config_lock); + lockdep_assert_held(&kvm->arch.config_lock); if (kvm->arch.sysreg_masks) - goto out; + return 0; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), GFP_KERNEL_ACCOUNT); - if (!kvm->arch.sysreg_masks) { - ret = -ENOMEM; - goto out; - } + if (!kvm->arch.sysreg_masks) + return -ENOMEM; limit_nv_id_regs(kvm); @@ -1195,8 +1175,13 @@ int kvm_init_nv_sysregs(struct kvm *kvm) if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1)) res0 |= ~(res0 | res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); -out: - mutex_unlock(&kvm->arch.config_lock); - return ret; + /* SCTLR_EL1 */ + res0 = SCTLR_EL1_RES0; + res1 = SCTLR_EL1_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) + res0 |= SCTLR_EL1_EPAN; + set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + + return 0; } diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c new file mode 100644 index 000000000000..e4a342e903e2 --- /dev/null +++ b/arch/arm64/kvm/ptdump.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug helper used to dump the stage-2 pagetables of the system and their + * associated permissions. + * + * Copyright (C) Google, 2024 + * Author: Sebastian Ene <sebastianene@google.com> + */ +#include <linux/debugfs.h> +#include <linux/kvm_host.h> +#include <linux/seq_file.h> + +#include <asm/kvm_mmu.h> +#include <asm/kvm_pgtable.h> +#include <asm/ptdump.h> + +#define MARKERS_LEN 2 +#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1) + +struct kvm_ptdump_guest_state { + struct kvm *kvm; + struct ptdump_pg_state parser_state; + struct addr_marker ipa_marker[MARKERS_LEN]; + struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS]; + struct ptdump_range range[MARKERS_LEN]; +}; + +static const struct ptdump_prot_bits stage2_pte_bits[] = { + { + .mask = PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "F", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID, + .set = "R", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID, + .set = "W", + .clear = " ", + }, { + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID, + .val = PTE_VALID, + .set = " ", + .clear = "X", + }, { + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID, + .set = "AF", + .clear = " ", + }, { + .mask = PTE_TABLE_BIT | PTE_VALID, + .val = PTE_VALID, + .set = "BLK", + .clear = " ", + }, +}; + +static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct ptdump_pg_state *st = ctx->arg; + struct ptdump_state *pt_st = &st->ptdump; + + note_page(pt_st, ctx->addr, ctx->level, ctx->old); + + return 0; +} + +static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl) +{ + u32 i; + u64 mask; + + if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL)) + return -EINVAL; + + mask = 0; + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++) + mask |= stage2_pte_bits[i].mask; + + for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) { + snprintf(level[i].name, sizeof(level[i].name), "%u", i); + + level[i].num = ARRAY_SIZE(stage2_pte_bits); + level[i].bits = stage2_pte_bits; + level[i].mask = mask; + } + + return 0; +} + +static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm) +{ + struct kvm_ptdump_guest_state *st; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct kvm_pgtable *pgtable = mmu->pgt; + int ret; + + st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT); + if (!st) + return ERR_PTR(-ENOMEM); + + ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level); + if (ret) { + kfree(st); + return ERR_PTR(ret); + } + + st->ipa_marker[0].name = "Guest IPA"; + st->ipa_marker[1].start_address = BIT(pgtable->ia_bits); + st->range[0].end = BIT(pgtable->ia_bits); + + st->kvm = kvm; + st->parser_state = (struct ptdump_pg_state) { + .marker = &st->ipa_marker[0], + .level = -1, + .pg_level = &st->level[0], + .ptdump.range = &st->range[0], + .start_address = 0, + }; + + return st; +} + +static int kvm_ptdump_guest_show(struct seq_file *m, void *unused) +{ + int ret; + struct kvm_ptdump_guest_state *st = m->private; + struct kvm *kvm = st->kvm; + struct kvm_s2_mmu *mmu = &kvm->arch.mmu; + struct ptdump_pg_state *parser_state = &st->parser_state; + struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) { + .cb = kvm_ptdump_visitor, + .arg = parser_state, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + parser_state->seq = m; + + write_lock(&kvm->mmu_lock); + ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker); + write_unlock(&kvm->mmu_lock); + + return ret; +} + +static int kvm_ptdump_guest_open(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + struct kvm_ptdump_guest_state *st; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + st = kvm_ptdump_parser_create(kvm); + if (IS_ERR(st)) { + ret = PTR_ERR(st); + goto err_with_kvm_ref; + } + + ret = single_open(file, kvm_ptdump_guest_show, st); + if (!ret) + return 0; + + kfree(st); +err_with_kvm_ref: + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_ptdump_guest_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + void *st = ((struct seq_file *)file->private_data)->private; + + kfree(st); + kvm_put_kvm(kvm); + + return single_release(m, file); +} + +static const struct file_operations kvm_ptdump_guest_fops = { + .open = kvm_ptdump_guest_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_ptdump_guest_close, +}; + +static int kvm_pgtable_range_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%2u\n", pgtable->ia_bits); + return 0; +} + +static int kvm_pgtable_levels_show(struct seq_file *m, void *unused) +{ + struct kvm_pgtable *pgtable = m->private; + + seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level); + return 0; +} + +static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + struct kvm *kvm = m->i_private; + struct kvm_pgtable *pgtable; + int ret; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + pgtable = kvm->arch.mmu.pgt; + + ret = single_open(file, show, pgtable); + if (ret < 0) + kvm_put_kvm(kvm); + return ret; +} + +static int kvm_pgtable_range_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show); +} + +static int kvm_pgtable_levels_open(struct inode *m, struct file *file) +{ + return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show); +} + +static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file) +{ + struct kvm *kvm = m->i_private; + + kvm_put_kvm(kvm); + return single_release(m, file); +} + +static const struct file_operations kvm_pgtable_range_fops = { + .open = kvm_pgtable_range_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +static const struct file_operations kvm_pgtable_levels_fops = { + .open = kvm_pgtable_levels_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_pgtable_debugfs_close, +}; + +void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) +{ + debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry, + kvm, &kvm_ptdump_guest_fops); + debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm, + &kvm_pgtable_range_fops); + debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry, + kvm, &kvm_pgtable_levels_fops); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c90324060436..09a6a45efb49 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -33,6 +33,7 @@ #include <trace/events/kvm.h> #include "sys_regs.h" +#include "vgic/vgic.h" #include "trace.h" @@ -46,6 +47,13 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val); +static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + static bool bad_trap(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r, @@ -53,8 +61,7 @@ static bool bad_trap(struct kvm_vcpu *vcpu, { WARN_ONCE(1, "Unexpected %s\n", msg); print_sys_reg_instr(params); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, params, r); } static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -345,10 +352,8 @@ static bool access_dcgsw(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - if (!kvm_has_mte(vcpu->kvm)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_mte(vcpu->kvm)) + return undef_access(vcpu, p, r); /* Treat MTE S/W ops as we treat the classic ones: with contempt */ return access_dcsw(vcpu, p, r); @@ -385,10 +390,8 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, u64 val, mask, shift; if (reg_to_encoding(r) == SYS_TCR2_EL1 && - !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + return undef_access(vcpu, p, r); BUG_ON(!p->is_write); @@ -435,6 +438,9 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu, { bool g1; + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (!p->is_write) return read_from_write_only(vcpu, p, r); @@ -478,6 +484,9 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + if (!kvm_has_gicv3(vcpu->kvm)) + return undef_access(vcpu, p, r); + if (p->is_write) return ignore_write(vcpu, p); @@ -495,14 +504,6 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu, return read_zero(vcpu, p); } -static bool trap_undef(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - return false; -} - /* * ARMv8.1 mandates at least a trivial LORegion implementation, where all the * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 @@ -515,10 +516,8 @@ static bool trap_loregion(struct kvm_vcpu *vcpu, { u32 sr = reg_to_encoding(r); - if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) + return undef_access(vcpu, p, r); if (p->is_write && sr == SYS_LORID_EL1) return write_to_read_only(vcpu, p, r); @@ -1251,10 +1250,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!vcpu_mode_priv(vcpu)) + return undef_access(vcpu, p, r); __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; @@ -1338,14 +1335,6 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } -static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) -{ - kvm_inject_undefined(vcpu); - - return false; -} - /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access } #define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access } @@ -1404,8 +1393,7 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, break; default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); - kvm_inject_undefined(vcpu); - return false; + return undef_access(vcpu, p, r); } if (p->is_write) @@ -1539,6 +1527,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); break; + case SYS_ID_AA64PFR2_EL1: + /* We only expose FPMR */ + val &= ID_AA64PFR2_EL1_FPMR; + break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | @@ -1669,6 +1661,24 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static unsigned int sme_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_fpmr(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -2085,26 +2095,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) /* - * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when - * HCR_EL2.E2H==1, and only in the sysreg table for convenience of - * handling traps. Given that, they are always hidden from userspace. - */ -static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - return REG_HIDDEN_USER; -} - -#define EL12_REG(name, acc, rst, v) { \ - SYS_DESC(SYS_##name##_EL12), \ - .access = acc, \ - .reset = rst, \ - .reg = name##_EL1, \ - .val = v, \ - .visibility = hidden_user_visibility, \ -} - -/* * Since reset() callback and field val are not used for idregs, they will be * used for specific purposes for idregs. * The reset() would return KVM sanitised register value. The value would be the @@ -2211,6 +2201,18 @@ static bool access_spsr(struct kvm_vcpu *vcpu, return true; } +static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) + __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; + else + p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); + + return true; +} + static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 val = r->val; @@ -2301,7 +2303,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { // DBGDTR[TR]X_EL0 share the same encoding { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, - { SYS_DESC(SYS_DBGVCR32_EL2), trap_undef, reset_val, DBGVCR32_EL2, 0 }, + { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 }, { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, @@ -2359,16 +2361,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_MPAM | ID_AA64PFR0_EL1_SVE | ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP), }, ID_SANITISED(ID_AA64PFR1_EL1), - ID_UNALLOCATED(4,2), + ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ID_HIDDEN(ID_AA64SMFR0_EL1), ID_UNALLOCATED(4,6), - ID_UNALLOCATED(4,7), + ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ { SYS_DESC(SYS_ID_AA64DFR0_EL1), @@ -2449,6 +2450,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SPSR_EL1), access_spsr}, { SYS_DESC(SYS_ELR_EL1), access_elr}, + { SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, @@ -2503,18 +2506,31 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, - { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi }, { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi }, - { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only }, - { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only }, - { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only }, + { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, @@ -2535,7 +2551,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { CTR_EL0_IDC_MASK | CTR_EL0_DminLine_MASK | CTR_EL0_IminLine_MASK), - { SYS_DESC(SYS_SVCR), undef_access }, + { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, + { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, @@ -2758,7 +2775,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), - { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 }, + { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 }, EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0), EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0), EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0), @@ -2767,20 +2784,16 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SP_EL1), access_sp_el1}, /* AArch32 SPSR_* are RES0 if trapped from a NV guest */ - { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi, - .visibility = hidden_user_visibility }, - - { SYS_DESC(SYS_IFSR32_EL2), trap_undef, reset_unknown, IFSR32_EL2 }, + { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi }, + { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi }, + + { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 }, EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), EL2_REG_REDIR(ESR_EL2, reset_val, 0), - { SYS_DESC(SYS_FPEXC32_EL2), trap_undef, reset_val, FPEXC32_EL2, 0x700 }, + { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 }, EL2_REG_REDIR(FAR_EL2, reset_val, 0), EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), @@ -2790,7 +2803,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), - { SYS_DESC(SYS_RMR_EL2), trap_undef }, + { SYS_DESC(SYS_RMR_EL2), undef_access }, + + EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0), EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), @@ -2798,11 +2813,48 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), - EL12_REG(CNTKCTL, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; +static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s1e01(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + /* There is no FGT associated with AT S1E2A :-( */ + if (op == OP_AT_S1E2A && + !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) { + kvm_inject_undefined(vcpu); + return false; + } + + __kvm_at_s1e2(vcpu, op, p->regval); + + return true; +} + +static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); + + __kvm_at_s12(vcpu, op, p->regval); + + return true; +} + static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr) { struct kvm *kvm = vpcu->kvm; @@ -2824,10 +2876,8 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); - if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); write_lock(&vcpu->kvm->mmu_lock); @@ -2896,10 +2946,8 @@ static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 limit, vttbr; - if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm)); @@ -2924,10 +2972,8 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u64 base, range, tg, num, scale; int shift; - if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); /* * Because the shadow S2 structure doesn't necessarily reflect that @@ -2995,10 +3041,8 @@ static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2); u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); - if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { @@ -3038,10 +3082,8 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, WARN_ON(!vcpu_is_el2(vcpu)); - if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) { - kvm_inject_undefined(vcpu); - return false; - } + if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) + return undef_access(vcpu, p, r); kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr), &(union tlbi_info) { @@ -3065,6 +3107,14 @@ static struct sys_reg_desc sys_insn_descs[] = { { SYS_DESC(SYS_DC_ISW), access_dcsw }, { SYS_DESC(SYS_DC_IGSW), access_dcgsw }, { SYS_DESC(SYS_DC_IGDSW), access_dcgsw }, + + SYS_INSN(AT_S1E1R, handle_at_s1e01), + SYS_INSN(AT_S1E1W, handle_at_s1e01), + SYS_INSN(AT_S1E0R, handle_at_s1e01), + SYS_INSN(AT_S1E0W, handle_at_s1e01), + SYS_INSN(AT_S1E1RP, handle_at_s1e01), + SYS_INSN(AT_S1E1WP, handle_at_s1e01), + { SYS_DESC(SYS_DC_CSW), access_dcsw }, { SYS_DESC(SYS_DC_CGSW), access_dcgsw }, { SYS_DESC(SYS_DC_CGDSW), access_dcgsw }, @@ -3144,19 +3194,27 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1), SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1), + SYS_INSN(AT_S1E2R, handle_at_s1e2), + SYS_INSN(AT_S1E2W, handle_at_s1e2), + SYS_INSN(AT_S12E1R, handle_at_s12), + SYS_INSN(AT_S12E1W, handle_at_s12), + SYS_INSN(AT_S12E0R, handle_at_s12), + SYS_INSN(AT_S12E0W, handle_at_s12), + SYS_INSN(AT_S1E2A, handle_at_s1e2), + SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is), SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OS, trap_undef), - SYS_INSN(TLBI_VAE2OS, trap_undef), + SYS_INSN(TLBI_ALLE2OS, undef_access), + SYS_INSN(TLBI_VAE2OS, undef_access), SYS_INSN(TLBI_ALLE1OS, handle_alle1is), - SYS_INSN(TLBI_VALE2OS, trap_undef), + SYS_INSN(TLBI_VALE2OS, undef_access), SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2IS, trap_undef), - SYS_INSN(TLBI_RVALE2IS, trap_undef), + SYS_INSN(TLBI_RVAE2IS, undef_access), + SYS_INSN(TLBI_RVALE2IS, undef_access), SYS_INSN(TLBI_ALLE1IS, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is), @@ -3168,10 +3226,10 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OS, trap_undef), - SYS_INSN(TLBI_RVALE2OS, trap_undef), - SYS_INSN(TLBI_RVAE2, trap_undef), - SYS_INSN(TLBI_RVALE2, trap_undef), + SYS_INSN(TLBI_RVAE2OS, undef_access), + SYS_INSN(TLBI_RVALE2OS, undef_access), + SYS_INSN(TLBI_RVAE2, undef_access), + SYS_INSN(TLBI_RVALE2, undef_access), SYS_INSN(TLBI_ALLE1, handle_alle1is), SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is), @@ -3180,19 +3238,19 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is), - SYS_INSN(TLBI_ALLE2OSNXS, trap_undef), - SYS_INSN(TLBI_VAE2OSNXS, trap_undef), + SYS_INSN(TLBI_ALLE2OSNXS, undef_access), + SYS_INSN(TLBI_VAE2OSNXS, undef_access), SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2OSNXS, trap_undef), + SYS_INSN(TLBI_VALE2OSNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is), - SYS_INSN(TLBI_RVAE2ISNXS, trap_undef), - SYS_INSN(TLBI_RVALE2ISNXS, trap_undef), - SYS_INSN(TLBI_ALLE2ISNXS, trap_undef), - SYS_INSN(TLBI_VAE2ISNXS, trap_undef), + SYS_INSN(TLBI_RVAE2ISNXS, undef_access), + SYS_INSN(TLBI_RVALE2ISNXS, undef_access), + SYS_INSN(TLBI_ALLE2ISNXS, undef_access), + SYS_INSN(TLBI_VAE2ISNXS, undef_access), SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is), - SYS_INSN(TLBI_VALE2ISNXS, trap_undef), + SYS_INSN(TLBI_VALE2ISNXS, undef_access), SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is), SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is), SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is), @@ -3202,14 +3260,14 @@ static struct sys_reg_desc sys_insn_descs[] = { SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is), SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is), SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is), - SYS_INSN(TLBI_RVAE2OSNXS, trap_undef), - SYS_INSN(TLBI_RVALE2OSNXS, trap_undef), - SYS_INSN(TLBI_RVAE2NXS, trap_undef), - SYS_INSN(TLBI_RVALE2NXS, trap_undef), - SYS_INSN(TLBI_ALLE2NXS, trap_undef), - SYS_INSN(TLBI_VAE2NXS, trap_undef), + SYS_INSN(TLBI_RVAE2OSNXS, undef_access), + SYS_INSN(TLBI_RVALE2OSNXS, undef_access), + SYS_INSN(TLBI_RVAE2NXS, undef_access), + SYS_INSN(TLBI_RVALE2NXS, undef_access), + SYS_INSN(TLBI_ALLE2NXS, undef_access), + SYS_INSN(TLBI_VAE2NXS, undef_access), SYS_INSN(TLBI_ALLE1NXS, handle_alle1is), - SYS_INSN(TLBI_VALE2NXS, trap_undef), + SYS_INSN(TLBI_VALE2NXS, undef_access), SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is), }; @@ -3387,6 +3445,7 @@ static const struct sys_reg_desc cp15_regs[] = { /* TTBCR2 */ { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 }, { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 }, + { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access }, /* DFSR */ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 }, { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 }, @@ -3436,8 +3495,28 @@ static const struct sys_reg_desc cp15_regs[] = { /* AMAIR1 */ { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 }, - /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access }, + { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 }, @@ -4274,7 +4353,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) { @@ -4318,7 +4397,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); - if (!r || sysreg_hidden_user(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (sysreg_user_write_ignore(vcpu, r)) @@ -4404,7 +4483,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, if (!(rd->reg || rd->get_user)) return 0; - if (sysreg_hidden_user(vcpu, rd)) + if (sysreg_hidden(vcpu, rd)) return 0; if (!copy_reg_to_user(rd, uind)) @@ -4545,6 +4624,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) mutex_lock(&kvm->arch.config_lock); vcpu_set_hcr(vcpu); + vcpu_set_ich_hcr(vcpu); if (cpus_have_final_cap(ARM64_HAS_HCX)) { /* @@ -4560,6 +4640,9 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; + + if (kvm_has_fpmr(kvm)) + vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM; } if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) @@ -4600,6 +4683,13 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) HFGITR_EL2_TLBIRVAAE1OS | HFGITR_EL2_TLBIRVAE1OS); + if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) + kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A; + + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2)) + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | + HFGITR_EL2_ATS1E1WP); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); @@ -4613,6 +4703,36 @@ out: mutex_unlock(&kvm->arch.config_lock); } +/* + * Perform last adjustments to the ID registers that are implied by the + * configuration outside of the ID regs themselves, as well as any + * initialisation that directly depend on these ID registers (such as + * RES0/RES1 behaviours). This is not the place to configure traps though. + * + * Because this can be called once per CPU, changes must be idempotent. + */ +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + guard(mutex)(&kvm->arch.config_lock); + + if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && + irqchip_in_kernel(kvm) && + kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { + kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; + kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; + } + + if (vcpu_has_nv(vcpu)) { + int ret = kvm_init_nv_sysregs(kvm); + if (ret) + return ret; + } + + return 0; +} + int __init kvm_sys_reg_table_init(void) { bool valid = true; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 997eea21ba2a..1d94ed6efad2 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -95,9 +95,8 @@ struct sys_reg_desc { }; #define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */ -#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */ -#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */ -#define REG_USER_WI (1 << 3) /* WI from userspace only */ +#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */ +#define REG_USER_WI (1 << 2) /* WI from userspace only */ static __printf(2, 3) inline void print_sys_reg_msg(const struct sys_reg_params *p, @@ -165,15 +164,6 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, return sysreg_visibility(vcpu, r) & REG_HIDDEN; } -static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *r) -{ - if (likely(!r->visibility)) - return false; - - return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER); -} - static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -235,6 +225,8 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); +int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x @@ -248,4 +240,11 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) +#define CP15_SYS_DESC(reg) \ + .name = #reg, \ + .aarch32_map = AA32_DIRECT, \ + Op0(0), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index bc74d06398ef..e1397ab2072a 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -85,7 +85,7 @@ static void iter_unmark_lpis(struct kvm *kvm) struct vgic_irq *irq; unsigned long intid; - xa_for_each(&dist->lpi_xa, intid, irq) { + xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); vgic_put_irq(kvm, irq); } diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 41feb858ff9a..e7c53e8af3d1 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -417,10 +417,8 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vgic_cpu->private_irqs); vgic_cpu->private_irqs = NULL; - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { - vgic_unregister_redist_iodev(vcpu); + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; - } } void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -448,6 +446,11 @@ void kvm_vgic_destroy(struct kvm *kvm) kvm_vgic_dist_destroy(kvm); mutex_unlock(&kvm->arch.config_lock); + + if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + kvm_for_each_vcpu(i, vcpu, kvm) + vgic_unregister_redist_iodev(vcpu); + mutex_unlock(&kvm->slots_lock); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3eecdd2f4b8f..b217b256853c 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -292,6 +292,18 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EN; +} + +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; + + /* Hide GICv3 sysreg if necessary */ + if (!kvm_has_gicv3(vcpu->kvm)) { + vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC; + return; + } + if (group0_trap) vgic_v3->vgic_hcr |= ICH_HCR_TALL0; if (group1_trap) diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 974849ea7101..f50274fd5581 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -36,6 +36,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * we have to disable IRQs before taking this lock and everything lower * than it. * + * The config_lock has additional ordering requirements: + * kvm->slots_lock + * kvm->srcu + * kvm->arch.config_lock + * * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. * If you are already holding a lock and need to take a higher one, you @@ -917,10 +922,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -928,10 +936,13 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index ba8f790431bd..f2486b4d9f95 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -346,4 +346,11 @@ void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); +void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); + +static inline bool kvm_has_gicv3(struct kvm *kvm) +{ + return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); +} + #endif diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 6986827e0d64..264c5f9b97d8 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -38,33 +38,7 @@ seq_printf(m, fmt); \ }) -/* - * The page dumper groups page table entries of the same type into a single - * description. It uses pg_state to track the range information while - * iterating over the pte entries. When the continuity is broken it then - * dumps out a description of the range. - */ -struct pg_state { - struct ptdump_state ptdump; - struct seq_file *seq; - const struct addr_marker *marker; - const struct mm_struct *mm; - unsigned long start_address; - int level; - u64 current_prot; - bool check_wx; - unsigned long wx_pages; - unsigned long uxn_pages; -}; - -struct prot_bits { - u64 mask; - u64 val; - const char *set; - const char *clear; -}; - -static const struct prot_bits pte_bits[] = { +static const struct ptdump_prot_bits pte_bits[] = { { .mask = PTE_VALID, .val = PTE_VALID, @@ -143,14 +117,7 @@ static const struct prot_bits pte_bits[] = { } }; -struct pg_level { - const struct prot_bits *bits; - char name[4]; - int num; - u64 mask; -}; - -static struct pg_level pg_level[] __ro_after_init = { +static struct ptdump_pg_level kernel_pg_levels[] __ro_after_init = { { /* pgd */ .name = "PGD", .bits = pte_bits, @@ -174,7 +141,7 @@ static struct pg_level pg_level[] __ro_after_init = { }, }; -static void dump_prot(struct pg_state *st, const struct prot_bits *bits, +static void dump_prot(struct ptdump_pg_state *st, const struct ptdump_prot_bits *bits, size_t num) { unsigned i; @@ -192,7 +159,7 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits, } } -static void note_prot_uxn(struct pg_state *st, unsigned long addr) +static void note_prot_uxn(struct ptdump_pg_state *st, unsigned long addr) { if (!st->check_wx) return; @@ -206,7 +173,7 @@ static void note_prot_uxn(struct pg_state *st, unsigned long addr) st->uxn_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_prot_wx(struct pg_state *st, unsigned long addr) +static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr) { if (!st->check_wx) return; @@ -221,16 +188,17 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - u64 val) +void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + u64 val) { - struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump); + struct ptdump_pg_level *pg_level = st->pg_level; static const char units[] = "KMGTPE"; u64 prot = 0; /* check if the current level has been folded dynamically */ - if ((level == 1 && mm_p4d_folded(st->mm)) || - (level == 2 && mm_pud_folded(st->mm))) + if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) || + (level == 2 && mm_pud_folded(st->mm)))) level = 0; if (level >= 0) @@ -286,15 +254,16 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, void ptdump_walk(struct seq_file *s, struct ptdump_info *info) { unsigned long end = ~0UL; - struct pg_state st; + struct ptdump_pg_state st; if (info->base_addr < TASK_SIZE_64) end = TASK_SIZE_64; - st = (struct pg_state){ + st = (struct ptdump_pg_state){ .seq = s, .marker = info->markers, .mm = info->mm, + .pg_level = &kernel_pg_levels[0], .level = -1, .ptdump = { .note_page = note_page, @@ -312,10 +281,10 @@ static void __init ptdump_initialize(void) { unsigned i, j; - for (i = 0; i < ARRAY_SIZE(pg_level); i++) - if (pg_level[i].bits) - for (j = 0; j < pg_level[i].num; j++) - pg_level[i].mask |= pg_level[i].bits[j].mask; + for (i = 0; i < ARRAY_SIZE(kernel_pg_levels); i++) + if (kernel_pg_levels[i].bits) + for (j = 0; j < kernel_pg_levels[i].num; j++) + kernel_pg_levels[i].mask |= kernel_pg_levels[i].bits[j].mask; } static struct ptdump_info kernel_ptdump_info __ro_after_init = { @@ -324,12 +293,13 @@ static struct ptdump_info kernel_ptdump_info __ro_after_init = { bool ptdump_check_wx(void) { - struct pg_state st = { + struct ptdump_pg_state st = { .seq = NULL, .marker = (struct addr_marker[]) { { 0, NULL}, { -1, NULL}, }, + .pg_level = &kernel_pg_levels[0], .level = -1, .check_wx = true, .ptdump = { diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 2bb3676429c0..4635b755b2b4 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -6,7 +6,6 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += early_ioremap.h generic-y += qrwlock.h -generic-y += qspinlock.h generic-y += user.h generic-y += ioctl.h generic-y += statfs.h diff --git a/arch/loongarch/include/asm/dma-direct.h b/arch/loongarch/include/asm/dma-direct.h deleted file mode 100644 index 75ccd808a2af..000000000000 --- a/arch/loongarch/include/asm/dma-direct.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2020-2022 Loongson Technology Corporation Limited - */ -#ifndef _LOONGARCH_DMA_DIRECT_H -#define _LOONGARCH_DMA_DIRECT_H - -dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); -phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); - -#endif /* _LOONGARCH_DMA_DIRECT_H */ diff --git a/arch/loongarch/include/asm/hw_irq.h b/arch/loongarch/include/asm/hw_irq.h index af4f4e8fbd85..8156ffb67415 100644 --- a/arch/loongarch/include/asm/hw_irq.h +++ b/arch/loongarch/include/asm/hw_irq.h @@ -9,6 +9,8 @@ extern atomic_t irq_err_count; +#define ARCH_IRQ_INIT_FLAGS IRQ_NOPROBE + /* * interrupt-retrigger: NOP for now. This may not be appropriate for all * machines, we'll see ... diff --git a/arch/loongarch/include/asm/kvm_csr.h b/arch/loongarch/include/asm/kvm_csr.h index 724ca8b7b401..4a76ce796f1f 100644 --- a/arch/loongarch/include/asm/kvm_csr.h +++ b/arch/loongarch/include/asm/kvm_csr.h @@ -30,6 +30,7 @@ : [val] "+r" (__v) \ : [reg] "i" (csr) \ : "memory"); \ + __v; \ }) #define gcsr_xchg(v, m, csr) \ @@ -181,6 +182,8 @@ __BUILD_GCSR_OP(tlbidx) #define kvm_save_hw_gcsr(csr, gid) (csr->csrs[gid] = gcsr_read(gid)) #define kvm_restore_hw_gcsr(csr, gid) (gcsr_write(csr->csrs[gid], gid)) +#define kvm_read_clear_hw_gcsr(csr, gid) (csr->csrs[gid] = gcsr_write(0, gid)) + int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu); static __always_inline unsigned long kvm_read_sw_gcsr(struct loongarch_csrs *csr, int gid) @@ -208,4 +211,7 @@ static __always_inline void kvm_change_sw_gcsr(struct loongarch_csrs *csr, csr->csrs[gid] |= val & _mask; } +#define KVM_PMU_EVENT_ENABLED (CSR_PERFCTRL_PLV0 | CSR_PERFCTRL_PLV1 | \ + CSR_PERFCTRL_PLV2 | CSR_PERFCTRL_PLV3) + #endif /* __ASM_LOONGARCH_KVM_CSR_H__ */ diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 5f0677e03817..d6bb72424027 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -30,6 +30,7 @@ #define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_REQ_TLB_FLUSH_GPA KVM_ARCH_REQ(0) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(1) +#define KVM_REQ_PMU KVM_ARCH_REQ(2) #define KVM_GUESTDBG_SW_BP_MASK \ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP) @@ -60,9 +61,13 @@ struct kvm_arch_memory_slot { unsigned long flags; }; +#define HOST_MAX_PMNUM 16 struct kvm_context { unsigned long vpid_cache; struct kvm_vcpu *last_vcpu; + /* Host PMU CSR */ + u64 perf_ctrl[HOST_MAX_PMNUM]; + u64 perf_cntr[HOST_MAX_PMNUM]; }; struct kvm_world_switch { @@ -107,6 +112,8 @@ struct kvm_arch { unsigned int root_level; spinlock_t phyid_map_lock; struct kvm_phyid_map *phyid_map; + /* Enabled PV features */ + unsigned long pv_features; s64 time_offset; struct kvm_context __percpu *vmcs; @@ -133,8 +140,15 @@ enum emulation_result { #define KVM_LARCH_FPU (0x1 << 0) #define KVM_LARCH_LSX (0x1 << 1) #define KVM_LARCH_LASX (0x1 << 2) -#define KVM_LARCH_SWCSR_LATEST (0x1 << 3) -#define KVM_LARCH_HWCSR_USABLE (0x1 << 4) +#define KVM_LARCH_LBT (0x1 << 3) +#define KVM_LARCH_PMU (0x1 << 4) +#define KVM_LARCH_SWCSR_LATEST (0x1 << 5) +#define KVM_LARCH_HWCSR_USABLE (0x1 << 6) + +#define LOONGARCH_PV_FEAT_UPDATED BIT_ULL(63) +#define LOONGARCH_PV_FEAT_MASK (BIT(KVM_FEATURE_IPI) | \ + BIT(KVM_FEATURE_STEAL_TIME) | \ + BIT(KVM_FEATURE_VIRT_EXTIOI)) struct kvm_vcpu_arch { /* @@ -168,10 +182,14 @@ struct kvm_vcpu_arch { /* FPU state */ struct loongarch_fpu fpu FPU_ALIGN; + struct loongarch_lbt lbt; /* CSR state */ struct loongarch_csrs *csr; + /* Guest max PMU CSR id */ + int max_pmu_csrid; + /* GPR used as IO source/target */ u32 io_gpr; @@ -239,6 +257,21 @@ static inline bool kvm_guest_has_lasx(struct kvm_vcpu_arch *arch) return arch->cpucfg[2] & CPUCFG2_LASX; } +static inline bool kvm_guest_has_lbt(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[2] & (CPUCFG2_X86BT | CPUCFG2_ARMBT | CPUCFG2_MIPSBT); +} + +static inline bool kvm_guest_has_pmu(struct kvm_vcpu_arch *arch) +{ + return arch->cpucfg[6] & CPUCFG6_PMP; +} + +static inline int kvm_get_pmu_num(struct kvm_vcpu_arch *arch) +{ + return (arch->cpucfg[6] & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT; +} + /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); diff --git a/arch/loongarch/include/asm/kvm_para.h b/arch/loongarch/include/asm/kvm_para.h index 43ec61589e6c..c4e84227280d 100644 --- a/arch/loongarch/include/asm/kvm_para.h +++ b/arch/loongarch/include/asm/kvm_para.h @@ -2,6 +2,8 @@ #ifndef _ASM_LOONGARCH_KVM_PARA_H #define _ASM_LOONGARCH_KVM_PARA_H +#include <uapi/asm/kvm_para.h> + /* * Hypercall code field */ @@ -154,10 +156,20 @@ static __always_inline long kvm_hypercall5(u64 fid, return ret; } +#ifdef CONFIG_PARAVIRT +bool kvm_para_available(void); +unsigned int kvm_arch_para_features(void); +#else +static inline bool kvm_para_available(void) +{ + return false; +} + static inline unsigned int kvm_arch_para_features(void) { return 0; } +#endif static inline unsigned int kvm_arch_para_hints(void) { diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index c416cb7125c0..d7e8f7d50ee0 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -75,8 +75,13 @@ static inline void kvm_save_lasx(struct loongarch_fpu *fpu) { } static inline void kvm_restore_lasx(struct loongarch_fpu *fpu) { } #endif +#ifdef CONFIG_CPU_HAS_LBT +int kvm_own_lbt(struct kvm_vcpu *vcpu); +#else +static inline int kvm_own_lbt(struct kvm_vcpu *vcpu) { return -EINVAL; } +#endif + void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz); -void kvm_reset_timer(struct kvm_vcpu *vcpu); void kvm_save_timer(struct kvm_vcpu *vcpu); void kvm_restore_timer(struct kvm_vcpu *vcpu); @@ -125,4 +130,9 @@ static inline bool kvm_pvtime_supported(void) return !!sched_info_on(); } +static inline bool kvm_guest_has_pv_feature(struct kvm_vcpu *vcpu, unsigned int feature) +{ + return vcpu->kvm->arch.pv_features & BIT(feature); +} + #endif /* __ASM_LOONGARCH_KVM_VCPU_H__ */ diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 04a78010fc72..24a3f4925cfb 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -119,6 +119,7 @@ #define CPUCFG6_PMP BIT(0) #define CPUCFG6_PAMVER GENMASK(3, 1) #define CPUCFG6_PMNUM GENMASK(7, 4) +#define CPUCFG6_PMNUM_SHIFT 4 #define CPUCFG6_PMBITS GENMASK(13, 8) #define CPUCFG6_UPM BIT(14) @@ -160,16 +161,8 @@ /* * CPUCFG index area: 0x40000000 -- 0x400000ff - * SW emulation for KVM hypervirsor + * SW emulation for KVM hypervirsor, see arch/loongarch/include/uapi/asm/kvm_para.h */ -#define CPUCFG_KVM_BASE 0x40000000 -#define CPUCFG_KVM_SIZE 0x100 - -#define CPUCFG_KVM_SIG (CPUCFG_KVM_BASE + 0) -#define KVM_SIGNATURE "KVM\0" -#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4) -#define KVM_FEATURE_IPI BIT(1) -#define KVM_FEATURE_STEAL_TIME BIT(2) #ifndef __ASSEMBLY__ diff --git a/arch/loongarch/include/asm/paravirt.h b/arch/loongarch/include/asm/paravirt.h index dddec49671ae..3f4323603e6a 100644 --- a/arch/loongarch/include/asm/paravirt.h +++ b/arch/loongarch/include/asm/paravirt.h @@ -19,6 +19,7 @@ static inline u64 paravirt_steal_clock(int cpu) int __init pv_ipi_init(void); int __init pv_time_init(void); +int __init pv_spinlock_init(void); #else @@ -31,5 +32,11 @@ static inline int pv_time_init(void) { return 0; } + +static inline int pv_spinlock_init(void) +{ + return 0; +} + #endif // CONFIG_PARAVIRT #endif diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h new file mode 100644 index 000000000000..e76d3aa1e1eb --- /dev/null +++ b/arch/loongarch/include/asm/qspinlock.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_LOONGARCH_QSPINLOCK_H +#define _ASM_LOONGARCH_QSPINLOCK_H + +#include <linux/jump_label.h> + +#ifdef CONFIG_PARAVIRT + +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); + +#define virt_spin_lock virt_spin_lock + +static inline bool virt_spin_lock(struct qspinlock *lock) +{ + int val; + + if (!static_branch_unlikely(&virt_spin_lock_key)) + return false; + + /* + * On hypervisors without PARAVIRT_SPINLOCKS support we fall + * back to a Test-and-Set spinlock, because fair locks have + * horrible lock 'holder' preemption issues. + */ + +__retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } + + return true; +} + +#endif /* CONFIG_PARAVIRT */ + +#include <asm-generic/qspinlock.h> + +#endif // _ASM_LOONGARCH_QSPINLOCK_H diff --git a/arch/loongarch/include/uapi/asm/Kbuild b/arch/loongarch/include/uapi/asm/Kbuild index c6d141d7b7d7..517761419999 100644 --- a/arch/loongarch/include/uapi/asm/Kbuild +++ b/arch/loongarch/include/uapi/asm/Kbuild @@ -1,4 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 syscall-y += unistd_64.h - -generic-y += kvm_para.h diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h index ddc5cab0ffd0..70d89070bfeb 100644 --- a/arch/loongarch/include/uapi/asm/kvm.h +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -64,6 +64,7 @@ struct kvm_fpu { #define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) #define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) #define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_LBT (KVM_REG_LOONGARCH | 0x50000ULL) #define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) #define KVM_CSR_IDX_MASK 0x7fff #define KVM_CPUCFG_IDX_MASK 0x7fff @@ -77,11 +78,30 @@ struct kvm_fpu { /* Debugging: Special instruction for software breakpoint */ #define KVM_REG_LOONGARCH_DEBUG_INST (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 3) +/* LBT registers */ +#define KVM_REG_LOONGARCH_LBT_SCR0 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_LBT_SCR1 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 2) +#define KVM_REG_LOONGARCH_LBT_SCR2 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 3) +#define KVM_REG_LOONGARCH_LBT_SCR3 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 4) +#define KVM_REG_LOONGARCH_LBT_EFLAGS (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 5) +#define KVM_REG_LOONGARCH_LBT_FTOP (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 6) + #define LOONGARCH_REG_SHIFT 3 #define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) #define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) #define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) +/* Device Control API on vm fd */ +#define KVM_LOONGARCH_VM_FEAT_CTRL 0 +#define KVM_LOONGARCH_VM_FEAT_LSX 0 +#define KVM_LOONGARCH_VM_FEAT_LASX 1 +#define KVM_LOONGARCH_VM_FEAT_X86BT 2 +#define KVM_LOONGARCH_VM_FEAT_ARMBT 3 +#define KVM_LOONGARCH_VM_FEAT_MIPSBT 4 +#define KVM_LOONGARCH_VM_FEAT_PMU 5 +#define KVM_LOONGARCH_VM_FEAT_PV_IPI 6 +#define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 + /* Device Control API on vcpu fd */ #define KVM_LOONGARCH_VCPU_CPUCFG 0 #define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1 diff --git a/arch/loongarch/include/uapi/asm/kvm_para.h b/arch/loongarch/include/uapi/asm/kvm_para.h new file mode 100644 index 000000000000..b0604aa9b4bb --- /dev/null +++ b/arch/loongarch/include/uapi/asm/kvm_para.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_KVM_PARA_H +#define _UAPI_ASM_KVM_PARA_H + +#include <linux/types.h> + +/* + * CPUCFG index area: 0x40000000 -- 0x400000ff + * SW emulation for KVM hypervirsor + */ +#define CPUCFG_KVM_BASE 0x40000000 +#define CPUCFG_KVM_SIZE 0x100 +#define CPUCFG_KVM_SIG (CPUCFG_KVM_BASE + 0) +#define KVM_SIGNATURE "KVM\0" +#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4) +#define KVM_FEATURE_IPI 1 +#define KVM_FEATURE_STEAL_TIME 2 +/* BIT 24 - 31 are features configurable by user space vmm */ +#define KVM_FEATURE_VIRT_EXTIOI 24 + +#endif /* _UAPI_ASM_KVM_PARA_H */ diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 69a85f2479fb..6ab640101457 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -530,6 +530,10 @@ SYM_FUNC_END(_restore_lasx_context) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD _restore_fp +#ifdef CONFIG_CPU_HAS_LSX STACK_FRAME_NON_STANDARD _restore_lsx +#endif +#ifdef CONFIG_CPU_HAS_LASX STACK_FRAME_NON_STANDARD _restore_lasx #endif +#endif diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c index f4991c03514f..adac8fcbb2ac 100644 --- a/arch/loongarch/kernel/irq.c +++ b/arch/loongarch/kernel/irq.c @@ -102,9 +102,6 @@ void __init init_IRQ(void) mp_ops.init_ipi(); #endif - for (i = 0; i < NR_IRQS; i++) - irq_set_noprobe(i); - for_each_possible_cpu(i) { page = alloc_pages_node(cpu_to_node(i), GFP_KERNEL, order); diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c index 9c9b75b76f62..708eda025ed8 100644 --- a/arch/loongarch/kernel/paravirt.c +++ b/arch/loongarch/kernel/paravirt.c @@ -13,6 +13,7 @@ static int has_steal_clock; struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); static u64 native_steal_clock(int cpu) { @@ -151,11 +152,14 @@ static void pv_init_ipi(void) } #endif -static bool kvm_para_available(void) +bool kvm_para_available(void) { int config; static int hypervisor_type; + if (!cpu_has_hypervisor) + return false; + if (!hypervisor_type) { config = read_cpucfg(CPUCFG_KVM_SIG); if (!memcmp(&config, KVM_SIGNATURE, 4)) @@ -165,17 +169,22 @@ static bool kvm_para_available(void) return hypervisor_type == HYPERVISOR_KVM; } -int __init pv_ipi_init(void) +unsigned int kvm_arch_para_features(void) { - int feature; + static unsigned int feature; - if (!cpu_has_hypervisor) - return 0; if (!kvm_para_available()) return 0; - feature = read_cpucfg(CPUCFG_KVM_FEATURE); - if (!(feature & KVM_FEATURE_IPI)) + if (!feature) + feature = read_cpucfg(CPUCFG_KVM_FEATURE); + + return feature; +} + +int __init pv_ipi_init(void) +{ + if (!kvm_para_has_feature(KVM_FEATURE_IPI)) return 0; #ifdef CONFIG_SMP @@ -206,7 +215,7 @@ static int pv_enable_steal_time(void) } addr |= KVM_STEAL_PHYS_VALID; - kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, addr); + kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, BIT(KVM_FEATURE_STEAL_TIME), addr); return 0; } @@ -214,7 +223,7 @@ static int pv_enable_steal_time(void) static void pv_disable_steal_time(void) { if (has_steal_clock) - kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, 0); + kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, BIT(KVM_FEATURE_STEAL_TIME), 0); } #ifdef CONFIG_SMP @@ -258,15 +267,9 @@ static struct notifier_block pv_reboot_nb = { int __init pv_time_init(void) { - int r, feature; + int r; - if (!cpu_has_hypervisor) - return 0; - if (!kvm_para_available()) - return 0; - - feature = read_cpucfg(CPUCFG_KVM_FEATURE); - if (!(feature & KVM_FEATURE_STEAL_TIME)) + if (!kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) return 0; has_steal_clock = 1; @@ -300,3 +303,13 @@ int __init pv_time_init(void) return 0; } + +int __init pv_spinlock_init(void) +{ + if (!cpu_has_hypervisor) + return 0; + + static_branch_enable(&virt_spin_lock_key); + + return 0; +} diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 0f0740f0be27..00e307203ddb 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -603,6 +603,8 @@ void __init setup_arch(char **cmdline_p) arch_mem_init(cmdline_p); resource_init(); + jump_label_init(); /* Initialise the static keys for paravirtualization */ + #ifdef CONFIG_SMP plat_smp_setup(); prefill_possible_map(); diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index ca405ab86aae..482b3c7e3042 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -476,7 +476,7 @@ core_initcall(ipi_pm_init); #endif /* Preload SMP state for boot cpu */ -void smp_prepare_boot_cpu(void) +void __init smp_prepare_boot_cpu(void) { unsigned int cpu, node, rr_node; @@ -509,6 +509,8 @@ void smp_prepare_boot_cpu(void) rr_node = next_node_in(rr_node, node_online_map); } } + + pv_spinlock_init(); } /* called from main before smp_init() */ diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index ea73f9dc2cc6..90894f70ff4a 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -50,9 +50,7 @@ static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst) vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE; break; case CPUCFG_KVM_FEATURE: - ret = KVM_FEATURE_IPI; - if (kvm_pvtime_supported()) - ret |= KVM_FEATURE_STEAL_TIME; + ret = vcpu->kvm->arch.pv_features & LOONGARCH_PV_FEAT_MASK; vcpu->arch.gprs[rd] = ret; break; default: @@ -127,6 +125,14 @@ static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst) rj = inst.reg2csr_format.rj; csrid = inst.reg2csr_format.csr; + if (csrid >= LOONGARCH_CSR_PERFCTRL0 && csrid <= vcpu->arch.max_pmu_csrid) { + if (kvm_guest_has_pmu(&vcpu->arch)) { + vcpu->arch.pc -= 4; + kvm_make_request(KVM_REQ_PMU, vcpu); + return EMULATE_DONE; + } + } + /* Process CSR ops */ switch (rj) { case 0: /* process csrrd */ @@ -697,25 +703,22 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu) id = kvm_read_reg(vcpu, LOONGARCH_GPR_A1); data = kvm_read_reg(vcpu, LOONGARCH_GPR_A2); switch (id) { - case KVM_FEATURE_STEAL_TIME: - if (!kvm_pvtime_supported()) - return KVM_HCALL_INVALID_CODE; - + case BIT(KVM_FEATURE_STEAL_TIME): if (data & ~(KVM_STEAL_PHYS_MASK | KVM_STEAL_PHYS_VALID)) return KVM_HCALL_INVALID_PARAMETER; vcpu->arch.st.guest_addr = data; if (!(data & KVM_STEAL_PHYS_VALID)) - break; + return 0; vcpu->arch.st.last_steal = current->sched_info.run_delay; kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - break; + return 0; default: - break; + return KVM_HCALL_INVALID_CODE; }; - return 0; + return KVM_HCALL_INVALID_CODE; }; /* @@ -748,6 +751,14 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu) +{ + if (kvm_own_lbt(vcpu)) + kvm_queue_exception(vcpu, EXCCODE_INE, 0); + + return RESUME_GUEST; +} + static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu) { unsigned int min, cpu, i; @@ -781,19 +792,21 @@ static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu) */ static void kvm_handle_service(struct kvm_vcpu *vcpu) { + long ret = KVM_HCALL_INVALID_CODE; unsigned long func = kvm_read_reg(vcpu, LOONGARCH_GPR_A0); - long ret; switch (func) { case KVM_HCALL_FUNC_IPI: - kvm_send_pv_ipi(vcpu); - ret = KVM_HCALL_SUCCESS; + if (kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_IPI)) { + kvm_send_pv_ipi(vcpu); + ret = KVM_HCALL_SUCCESS; + } break; case KVM_HCALL_FUNC_NOTIFY: - ret = kvm_save_notify(vcpu); + if (kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME)) + ret = kvm_save_notify(vcpu); break; default: - ret = KVM_HCALL_INVALID_CODE; break; } @@ -865,6 +878,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = { [EXCCODE_FPDIS] = kvm_handle_fpu_disabled, [EXCCODE_LSXDIS] = kvm_handle_lsx_disabled, [EXCCODE_LASXDIS] = kvm_handle_lasx_disabled, + [EXCCODE_BTDIS] = kvm_handle_lbt_disabled, [EXCCODE_GSPR] = kvm_handle_gspr, [EXCCODE_HVC] = kvm_handle_hypercall, }; diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 844736b99d38..27e9b94c0a0b 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -261,7 +261,7 @@ long kvm_arch_dev_ioctl(struct file *filp, return -ENOIOCTLCMD; } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { unsigned long env, gcfg = 0; @@ -300,7 +300,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { write_csr_gcfg(0); write_csr_gstat(0); diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 80e988985a6a..0c292f818492 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -277,6 +277,10 @@ SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD kvm_restore_fpu +#ifdef CONFIG_CPU_HAS_LSX STACK_FRAME_NON_STANDARD kvm_restore_lsx +#endif +#ifdef CONFIG_CPU_HAS_LASX STACK_FRAME_NON_STANDARD kvm_restore_lasx #endif +#endif diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index bcc6b6d063d9..74a4b5c272d6 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -188,10 +188,3 @@ void kvm_save_timer(struct kvm_vcpu *vcpu) kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT); preempt_enable(); } - -void kvm_reset_timer(struct kvm_vcpu *vcpu) -{ - write_gcsr_timercfg(0); - kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG, 0); - hrtimer_cancel(&vcpu->arch.swtimer); -} diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 16756ffb55e8..0697b1064251 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -6,6 +6,7 @@ #include <linux/kvm_host.h> #include <linux/entry-kvm.h> #include <asm/fpu.h> +#include <asm/lbt.h> #include <asm/loongarch.h> #include <asm/setup.h> #include <asm/time.h> @@ -31,6 +32,126 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; +static inline void kvm_save_host_pmu(struct kvm_vcpu *vcpu) +{ + struct kvm_context *context; + + context = this_cpu_ptr(vcpu->kvm->arch.vmcs); + context->perf_cntr[0] = read_csr_perfcntr0(); + context->perf_cntr[1] = read_csr_perfcntr1(); + context->perf_cntr[2] = read_csr_perfcntr2(); + context->perf_cntr[3] = read_csr_perfcntr3(); + context->perf_ctrl[0] = write_csr_perfctrl0(0); + context->perf_ctrl[1] = write_csr_perfctrl1(0); + context->perf_ctrl[2] = write_csr_perfctrl2(0); + context->perf_ctrl[3] = write_csr_perfctrl3(0); +} + +static inline void kvm_restore_host_pmu(struct kvm_vcpu *vcpu) +{ + struct kvm_context *context; + + context = this_cpu_ptr(vcpu->kvm->arch.vmcs); + write_csr_perfcntr0(context->perf_cntr[0]); + write_csr_perfcntr1(context->perf_cntr[1]); + write_csr_perfcntr2(context->perf_cntr[2]); + write_csr_perfcntr3(context->perf_cntr[3]); + write_csr_perfctrl0(context->perf_ctrl[0]); + write_csr_perfctrl1(context->perf_ctrl[1]); + write_csr_perfctrl2(context->perf_ctrl[2]); + write_csr_perfctrl3(context->perf_ctrl[3]); +} + + +static inline void kvm_save_guest_pmu(struct kvm_vcpu *vcpu) +{ + struct loongarch_csrs *csr = vcpu->arch.csr; + + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR3); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2); + kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); +} + +static inline void kvm_restore_guest_pmu(struct kvm_vcpu *vcpu) +{ + struct loongarch_csrs *csr = vcpu->arch.csr; + + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR3); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); +} + +static int kvm_own_pmu(struct kvm_vcpu *vcpu) +{ + unsigned long val; + + if (!kvm_guest_has_pmu(&vcpu->arch)) + return -EINVAL; + + kvm_save_host_pmu(vcpu); + + /* Set PM0-PM(num) to guest */ + val = read_csr_gcfg() & ~CSR_GCFG_GPERF; + val |= (kvm_get_pmu_num(&vcpu->arch) + 1) << CSR_GCFG_GPERF_SHIFT; + write_csr_gcfg(val); + + kvm_restore_guest_pmu(vcpu); + + return 0; +} + +static void kvm_lose_pmu(struct kvm_vcpu *vcpu) +{ + unsigned long val; + struct loongarch_csrs *csr = vcpu->arch.csr; + + if (!(vcpu->arch.aux_inuse & KVM_LARCH_PMU)) + return; + + kvm_save_guest_pmu(vcpu); + + /* Disable pmu access from guest */ + write_csr_gcfg(read_csr_gcfg() & ~CSR_GCFG_GPERF); + + /* + * Clear KVM_LARCH_PMU if the guest is not using PMU CSRs when + * exiting the guest, so that the next time trap into the guest. + * We don't need to deal with PMU CSRs contexts. + */ + val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2); + val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); + if (!(val & KVM_PMU_EVENT_ENABLED)) + vcpu->arch.aux_inuse &= ~KVM_LARCH_PMU; + + kvm_restore_host_pmu(vcpu); +} + +static void kvm_restore_pmu(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.aux_inuse & KVM_LARCH_PMU)) + kvm_make_request(KVM_REQ_PMU, vcpu); +} + +static void kvm_check_pmu(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_PMU, vcpu)) { + kvm_own_pmu(vcpu); + vcpu->arch.aux_inuse |= KVM_LARCH_PMU; + } +} + static void kvm_update_stolen_time(struct kvm_vcpu *vcpu) { u32 version; @@ -158,6 +279,7 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) /* Make sure the vcpu mode has been written */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); kvm_check_vpid(vcpu); + kvm_check_pmu(vcpu); /* * Called after function kvm_check_vpid() @@ -195,6 +317,8 @@ static int kvm_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Set a default exit reason */ run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_lose_pmu(vcpu); + guest_timing_exit_irqoff(); guest_state_exit_irqoff(); local_irq_enable(); @@ -468,6 +592,22 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val) kvm_write_sw_gcsr(csr, id, val); + /* + * After modifying the PMU CSR register value of the vcpu. + * If the PMU CSRs are used, we need to set KVM_REQ_PMU. + */ + if (id >= LOONGARCH_CSR_PERFCTRL0 && id <= LOONGARCH_CSR_PERFCNTR3) { + unsigned long val; + + val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0) | + kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1) | + kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2) | + kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); + + if (val & KVM_PMU_EVENT_ENABLED) + kvm_make_request(KVM_REQ_PMU, vcpu); + } + return ret; } @@ -497,6 +637,12 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) *v |= CPUCFG2_LSX; if (cpu_has_lasx) *v |= CPUCFG2_LASX; + if (cpu_has_lbt_x86) + *v |= CPUCFG2_X86BT; + if (cpu_has_lbt_arm) + *v |= CPUCFG2_ARMBT; + if (cpu_has_lbt_mips) + *v |= CPUCFG2_MIPSBT; return 0; case LOONGARCH_CPUCFG3: @@ -506,6 +652,12 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) case LOONGARCH_CPUCFG5: *v = GENMASK(31, 0); return 0; + case LOONGARCH_CPUCFG6: + if (cpu_has_pmp) + *v = GENMASK(14, 0); + else + *v = 0; + return 0; case LOONGARCH_CPUCFG16: *v = GENMASK(16, 0); return 0; @@ -550,6 +702,17 @@ static int kvm_check_cpucfg(int id, u64 val) /* LASX architecturally implies LSX and FP but val does not satisfy that */ return -EINVAL; return 0; + case LOONGARCH_CPUCFG6: + if (val & CPUCFG6_PMP) { + u32 host = read_cpucfg(LOONGARCH_CPUCFG6); + if ((val & CPUCFG6_PMBITS) != (host & CPUCFG6_PMBITS)) + return -EINVAL; + if ((val & CPUCFG6_PMNUM) > (host & CPUCFG6_PMNUM)) + return -EINVAL; + if ((val & CPUCFG6_UPM) && !(host & CPUCFG6_UPM)) + return -EINVAL; + } + return 0; default: /* * Values for the other CPUCFG IDs are not being further validated @@ -577,6 +740,34 @@ static int kvm_get_one_reg(struct kvm_vcpu *vcpu, else ret = -EINVAL; break; + case KVM_REG_LOONGARCH_LBT: + if (!kvm_guest_has_lbt(&vcpu->arch)) + return -ENXIO; + + switch (reg->id) { + case KVM_REG_LOONGARCH_LBT_SCR0: + *v = vcpu->arch.lbt.scr0; + break; + case KVM_REG_LOONGARCH_LBT_SCR1: + *v = vcpu->arch.lbt.scr1; + break; + case KVM_REG_LOONGARCH_LBT_SCR2: + *v = vcpu->arch.lbt.scr2; + break; + case KVM_REG_LOONGARCH_LBT_SCR3: + *v = vcpu->arch.lbt.scr3; + break; + case KVM_REG_LOONGARCH_LBT_EFLAGS: + *v = vcpu->arch.lbt.eflags; + break; + case KVM_REG_LOONGARCH_LBT_FTOP: + *v = vcpu->arch.fpu.ftop; + break; + default: + ret = -EINVAL; + break; + } + break; case KVM_REG_LOONGARCH_KVM: switch (reg->id) { case KVM_REG_LOONGARCH_COUNTER: @@ -635,6 +826,37 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, if (ret) break; vcpu->arch.cpucfg[id] = (u32)v; + if (id == LOONGARCH_CPUCFG6) + vcpu->arch.max_pmu_csrid = + LOONGARCH_CSR_PERFCTRL0 + 2 * kvm_get_pmu_num(&vcpu->arch) + 1; + break; + case KVM_REG_LOONGARCH_LBT: + if (!kvm_guest_has_lbt(&vcpu->arch)) + return -ENXIO; + + switch (reg->id) { + case KVM_REG_LOONGARCH_LBT_SCR0: + vcpu->arch.lbt.scr0 = v; + break; + case KVM_REG_LOONGARCH_LBT_SCR1: + vcpu->arch.lbt.scr1 = v; + break; + case KVM_REG_LOONGARCH_LBT_SCR2: + vcpu->arch.lbt.scr2 = v; + break; + case KVM_REG_LOONGARCH_LBT_SCR3: + vcpu->arch.lbt.scr3 = v; + break; + case KVM_REG_LOONGARCH_LBT_EFLAGS: + vcpu->arch.lbt.eflags = v; + break; + case KVM_REG_LOONGARCH_LBT_FTOP: + vcpu->arch.fpu.ftop = v; + break; + default: + ret = -EINVAL; + break; + } break; case KVM_REG_LOONGARCH_KVM: switch (reg->id) { @@ -647,7 +869,7 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, vcpu->kvm->arch.time_offset = (signed long)(v - drdtime()); break; case KVM_REG_LOONGARCH_VCPU_RESET: - kvm_reset_timer(vcpu); + vcpu->arch.st.guest_addr = 0; memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending)); memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear)); break; @@ -728,7 +950,10 @@ static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { switch (attr->attr) { - case 2: + case LOONGARCH_CPUCFG2: + case LOONGARCH_CPUCFG6: + return 0; + case CPUCFG_KVM_FEATURE: return 0; default: return -ENXIO; @@ -740,8 +965,8 @@ static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu, static int kvm_loongarch_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - if (!kvm_pvtime_supported() || - attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) + if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME) + || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) return -ENXIO; return 0; @@ -773,9 +998,18 @@ static int kvm_loongarch_cpucfg_get_attr(struct kvm_vcpu *vcpu, uint64_t val; uint64_t __user *uaddr = (uint64_t __user *)attr->addr; - ret = _kvm_get_cpucfg_mask(attr->attr, &val); - if (ret) - return ret; + switch (attr->attr) { + case 0 ... (KVM_MAX_CPUCFG_REGS - 1): + ret = _kvm_get_cpucfg_mask(attr->attr, &val); + if (ret) + return ret; + break; + case CPUCFG_KVM_FEATURE: + val = vcpu->kvm->arch.pv_features & LOONGARCH_PV_FEAT_MASK; + break; + default: + return -ENXIO; + } put_user(val, uaddr); @@ -788,8 +1022,8 @@ static int kvm_loongarch_pvtime_get_attr(struct kvm_vcpu *vcpu, u64 gpa; u64 __user *user = (u64 __user *)attr->addr; - if (!kvm_pvtime_supported() || - attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) + if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME) + || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) return -ENXIO; gpa = vcpu->arch.st.guest_addr; @@ -821,7 +1055,28 @@ static int kvm_loongarch_vcpu_get_attr(struct kvm_vcpu *vcpu, static int kvm_loongarch_cpucfg_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - return -ENXIO; + u64 val, valid; + u64 __user *user = (u64 __user *)attr->addr; + struct kvm *kvm = vcpu->kvm; + + switch (attr->attr) { + case CPUCFG_KVM_FEATURE: + if (get_user(val, user)) + return -EFAULT; + + valid = LOONGARCH_PV_FEAT_MASK; + if (val & ~valid) + return -EINVAL; + + /* All vCPUs need set the same PV features */ + if ((kvm->arch.pv_features & LOONGARCH_PV_FEAT_UPDATED) + && ((kvm->arch.pv_features & valid) != val)) + return -EINVAL; + kvm->arch.pv_features = val | LOONGARCH_PV_FEAT_UPDATED; + return 0; + default: + return -ENXIO; + } } static int kvm_loongarch_pvtime_set_attr(struct kvm_vcpu *vcpu, @@ -831,8 +1086,8 @@ static int kvm_loongarch_pvtime_set_attr(struct kvm_vcpu *vcpu, u64 gpa, __user *user = (u64 __user *)attr->addr; struct kvm *kvm = vcpu->kvm; - if (!kvm_pvtime_supported() || - attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) + if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME) + || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA) return -ENXIO; if (get_user(gpa, user)) @@ -977,12 +1232,66 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } +#ifdef CONFIG_CPU_HAS_LBT +int kvm_own_lbt(struct kvm_vcpu *vcpu) +{ + if (!kvm_guest_has_lbt(&vcpu->arch)) + return -EINVAL; + + preempt_disable(); + set_csr_euen(CSR_EUEN_LBTEN); + _restore_lbt(&vcpu->arch.lbt); + vcpu->arch.aux_inuse |= KVM_LARCH_LBT; + preempt_enable(); + + return 0; +} + +static void kvm_lose_lbt(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + if (vcpu->arch.aux_inuse & KVM_LARCH_LBT) { + _save_lbt(&vcpu->arch.lbt); + clear_csr_euen(CSR_EUEN_LBTEN); + vcpu->arch.aux_inuse &= ~KVM_LARCH_LBT; + } + preempt_enable(); +} + +static void kvm_check_fcsr(struct kvm_vcpu *vcpu, unsigned long fcsr) +{ + /* + * If TM is enabled, top register save/restore will + * cause lbt exception, here enable lbt in advance + */ + if (fcsr & FPU_CSR_TM) + kvm_own_lbt(vcpu); +} + +static void kvm_check_fcsr_alive(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { + if (vcpu->arch.aux_inuse & KVM_LARCH_LBT) + return; + kvm_check_fcsr(vcpu, read_fcsr(LOONGARCH_FCSR0)); + } +} +#else +static inline void kvm_lose_lbt(struct kvm_vcpu *vcpu) { } +static inline void kvm_check_fcsr(struct kvm_vcpu *vcpu, unsigned long fcsr) { } +static inline void kvm_check_fcsr_alive(struct kvm_vcpu *vcpu) { } +#endif + /* Enable FPU and restore context */ void kvm_own_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - /* Enable FPU */ + /* + * Enable FPU for guest + * Set FR and FRE according to guest context + */ + kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_FPEN); kvm_restore_fpu(&vcpu->arch.fpu); @@ -1002,6 +1311,7 @@ int kvm_own_lsx(struct kvm_vcpu *vcpu) preempt_disable(); /* Enable LSX for guest */ + kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_LSXEN | CSR_EUEN_FPEN); switch (vcpu->arch.aux_inuse & KVM_LARCH_FPU) { case KVM_LARCH_FPU: @@ -1036,6 +1346,7 @@ int kvm_own_lasx(struct kvm_vcpu *vcpu) preempt_disable(); + kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr); set_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN); switch (vcpu->arch.aux_inuse & (KVM_LARCH_FPU | KVM_LARCH_LSX)) { case KVM_LARCH_LSX: @@ -1067,6 +1378,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); + kvm_check_fcsr_alive(vcpu); if (vcpu->arch.aux_inuse & KVM_LARCH_LASX) { kvm_save_lasx(&vcpu->arch.fpu); vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU | KVM_LARCH_LASX); @@ -1089,6 +1401,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) /* Disable FPU */ clear_csr_euen(CSR_EUEN_FPEN); } + kvm_lose_lbt(vcpu); preempt_enable(); } @@ -1235,6 +1548,9 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + /* Restore hardware PMU CSRs */ + kvm_restore_pmu(vcpu); + /* Don't bother restoring registers multiple times unless necessary */ if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE) return 0; diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 6b2e4f66ad26..4ba734aaef87 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -5,6 +5,7 @@ #include <linux/kvm_host.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_vcpu.h> const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -39,6 +40,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) spin_lock_init(&kvm->arch.phyid_map_lock); kvm_init_vmcs(kvm); + + /* Enable all PV features by default */ + kvm->arch.pv_features = BIT(KVM_FEATURE_IPI); + if (kvm_pvtime_supported()) + kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); + kvm->arch.gpa_size = BIT(cpu_vabits - 1); kvm->arch.root_level = CONFIG_PGTABLE_LEVELS - 1; kvm->arch.invalid_ptes[0] = 0; @@ -99,7 +106,67 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } +static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_LOONGARCH_VM_FEAT_LSX: + if (cpu_has_lsx) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_LASX: + if (cpu_has_lasx) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_X86BT: + if (cpu_has_lbt_x86) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_ARMBT: + if (cpu_has_lbt_arm) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_MIPSBT: + if (cpu_has_lbt_mips) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_PMU: + if (cpu_has_pmp) + return 0; + return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_PV_IPI: + return 0; + case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME: + if (kvm_pvtime_supported()) + return 0; + return -ENXIO; + default: + return -ENXIO; + } +} + +static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_LOONGARCH_VM_FEAT_CTRL: + return kvm_vm_feature_has_attr(kvm, attr); + default: + return -ENXIO; + } +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { - return -ENOIOCTLCMD; + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + struct kvm_device_attr attr; + + switch (ioctl) { + case KVM_HAS_DEVICE_ATTR: + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + return kvm_vm_has_attr(kvm, &attr); + default: + return -ENOIOCTLCMD; + } } diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 3827dc76edd8..4520c5741579 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -193,11 +193,6 @@ asmlinkage void __init mmu_init(void) { unsigned int kstart, ksize; - if (!memblock.reserved.cnt) { - pr_emerg("Error memory count\n"); - machine_restart(NULL); - } - if ((u32) memblock.memory.regions[0].size < 0x400000) { pr_emerg("Memory must be greater than 4MB\n"); machine_restart(NULL); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 6743a57c1ab4..f7222eb594ea 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -728,8 +728,8 @@ struct kvm_mips_callbacks { int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); int (*handle_guest_exit)(struct kvm_vcpu *vcpu); - int (*hardware_enable)(void); - void (*hardware_disable)(void); + int (*enable_virtualization_cpu)(void); + void (*disable_virtualization_cpu)(void); int (*check_extension)(struct kvm *kvm, long ext); int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c index 368e8475870f..5f6e9e2ebbdb 100644 --- a/arch/mips/kernel/cevt-r4k.c +++ b/arch/mips/kernel/cevt-r4k.c @@ -303,13 +303,6 @@ int r4k_clockevent_init(void) if (!c0_compare_int_usable()) return -ENXIO; - /* - * With vectored interrupts things are getting platform specific. - * get_c0_compare_int is a hook to allow a platform to return the - * interrupt number of its liking. - */ - irq = get_c0_compare_int(); - cd = &per_cpu(mips_clockevent_device, cpu); cd->name = "MIPS"; @@ -320,7 +313,6 @@ int r4k_clockevent_init(void) min_delta = calculate_min_delta(); cd->rating = 300; - cd->irq = irq; cd->cpumask = cpumask_of(cpu); cd->set_next_event = mips_next_event; cd->event_handler = mips_event_handler; @@ -332,6 +324,13 @@ int r4k_clockevent_init(void) cp0_timer_irq_installed = 1; + /* + * With vectored interrupts things are getting platform specific. + * get_c0_compare_int is a hook to allow a platform to return the + * interrupt number of its liking. + */ + irq = get_c0_compare_int(); + if (request_irq(irq, c0_compare_interrupt, flags, "timer", c0_compare_interrupt)) pr_err("Failed to request irq %d (timer)\n", irq); diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index bda7f193baab..af7412549e6e 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1724,12 +1724,16 @@ static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu) c->ases |= (MIPS_ASE_LOONGSON_MMI | MIPS_ASE_LOONGSON_CAM | MIPS_ASE_LOONGSON_EXT | MIPS_ASE_LOONGSON_EXT2); c->ases &= ~MIPS_ASE_VZ; /* VZ of Loongson-3A2000/3000 is incomplete */ + change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER, + LOONGSON_CONF6_INTIMER); break; case PRID_IMP_LOONGSON_64G: __cpu_name[cpu] = "ICT Loongson-3"; set_elf_platform(cpu, "loongson3a"); set_isa(c, MIPS_CPU_ISA_M64R2); decode_cpucfg(c); + change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER, + LOONGSON_CONF6_INTIMER); break; default: panic("Unknown Loongson Processor ID!"); diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c index bdb1fa8931f4..59eca397f297 100644 --- a/arch/mips/kernel/csrc-r4k.c +++ b/arch/mips/kernel/csrc-r4k.c @@ -21,9 +21,7 @@ static struct clocksource clocksource_mips = { .name = "MIPS", .read = c0_hpt_read, .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | - CLOCK_SOURCE_MUST_VERIFY | - CLOCK_SOURCE_VERIFY_PERCPU, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static u64 __maybe_unused notrace r4k_read_sched_clock(void) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b5de770b092e..60b43ea85c12 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -125,14 +125,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { - return kvm_mips_callbacks->hardware_enable(); + return kvm_mips_callbacks->enable_virtualization_cpu(); } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { - kvm_mips_callbacks->hardware_disable(); + kvm_mips_callbacks->disable_virtualization_cpu(); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 99d5a71e4300..ccab4d76b126 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -2869,7 +2869,7 @@ static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size) return ret + 1; } -static int kvm_vz_hardware_enable(void) +static int kvm_vz_enable_virtualization_cpu(void) { unsigned int mmu_size, guest_mmu_size, ftlb_size; u64 guest_cvmctl, cvmvmconfig; @@ -2983,7 +2983,7 @@ static int kvm_vz_hardware_enable(void) return 0; } -static void kvm_vz_hardware_disable(void) +static void kvm_vz_disable_virtualization_cpu(void) { u64 cvmvmconfig; unsigned int mmu_size; @@ -3280,8 +3280,8 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { .handle_msa_disabled = kvm_trap_vz_handle_msa_disabled, .handle_guest_exit = kvm_trap_vz_handle_guest_exit, - .hardware_enable = kvm_vz_hardware_enable, - .hardware_disable = kvm_vz_hardware_disable, + .enable_virtualization_cpu = kvm_vz_enable_virtualization_cpu, + .disable_virtualization_cpu = kvm_vz_disable_virtualization_cpu, .check_extension = kvm_vz_check_extension, .vcpu_init = kvm_vz_vcpu_init, .vcpu_uninit = kvm_vz_vcpu_uninit, diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 34d91cb8b259..96970fa75e4a 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -459,7 +459,6 @@ void free_initmem(void) unsigned long kernel_end = (unsigned long)&_end; /* Remap kernel text and data, but do not touch init section yet. */ - kernel_set_to_readonly = true; map_pages(init_end, __pa(init_end), kernel_end - init_end, PAGE_KERNEL, 0); @@ -493,11 +492,18 @@ void free_initmem(void) #ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void) { - /* rodata memory was already mapped with KERNEL_RO access rights by - pagetable_init() and map_pages(). No need to do additional stuff here */ - unsigned long roai_size = __end_ro_after_init - __start_ro_after_init; + unsigned long start = (unsigned long) &__start_rodata; + unsigned long end = (unsigned long) &__end_rodata; + + pr_info("Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + + kernel_set_to_readonly = true; + map_pages(start, __pa(start), end - start, PAGE_KERNEL, 0); - pr_info("Write protected read-only-after-init data: %luk\n", roai_size >> 10); + /* force the kernel to see the new page table entries */ + flush_cache_all(); + flush_tlb_all(); } #endif diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 9508399dd036..b481738c4bb5 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -52,7 +52,7 @@ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) #define pgd_ERROR(e) \ - pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + pr_err("%s:%d: bad pgd %08llx.\n", __FILE__, __LINE__, (unsigned long long)pgd_val(e)) /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary @@ -170,7 +170,7 @@ static inline void pmd_clear(pmd_t *pmdp) #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #else #define pmd_page_vaddr(pmd) \ - ((const void *)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1))) + ((const void *)((unsigned long)pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1))) #define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT) #endif diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 7b3d4c592a10..f3086e39e7d2 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -49,16 +49,22 @@ static inline unsigned long pud_val(pud_t x) #endif /* CONFIG_PPC64 */ /* PGD level */ -#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT) +#if defined(CONFIG_PPC_85xx) && defined(CONFIG_PTE_64BIT) typedef struct { unsigned long long pgd; } pgd_t; + +static inline unsigned long long pgd_val(pgd_t x) +{ + return x.pgd; +} #else typedef struct { unsigned long pgd; } pgd_t; -#endif -#define __pgd(x) ((pgd_t) { (x) }) + static inline unsigned long pgd_val(pgd_t x) { return x.pgd; } +#endif +#define __pgd(x) ((pgd_t) { (x) }) /* Page protection bits */ typedef struct { unsigned long pgprot; } pgprot_t; diff --git a/arch/powerpc/kernel/vdso/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S index 426e1ccc6971..8f57107000a2 100644 --- a/arch/powerpc/kernel/vdso/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso/vdso32.lds.S @@ -74,6 +74,8 @@ SECTIONS .got : { *(.got) } :text .plt : { *(.plt) } + .rela.dyn : { *(.rela .rela*) } + _end = .; __end = .; PROVIDE(end = .); @@ -87,7 +89,7 @@ SECTIONS *(.branch_lt) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) - *(.got1 .glink .iplt .rela*) + *(.got1 .glink .iplt) } } diff --git a/arch/powerpc/kernel/vdso/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S index bda6c8cdd459..400819258c06 100644 --- a/arch/powerpc/kernel/vdso/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso/vdso64.lds.S @@ -69,7 +69,7 @@ SECTIONS .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .gcc_except_table : { *(.gcc_except_table) } - .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .rela.dyn ALIGN(8) : { *(.rela .rela*) } .got ALIGN(8) : { *(.got .toc) } @@ -86,7 +86,7 @@ SECTIONS *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) *(.opd) - *(.glink .iplt .plt .rela*) + *(.glink .iplt .plt) } } diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 5de4dd549f6e..bcc7e4dff8c3 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -697,7 +697,15 @@ again: } release: - qnodesp->count--; /* release the node */ + /* + * Clear the lock before releasing the node, as another CPU might see stale + * values if an interrupt occurs after we increment qnodesp->count + * but before node->lock is initialized. The barrier ensures that + * there are no further stores to the node after it has been released. + */ + node->lock = NULL; + barrier(); + qnodesp->count--; } void queued_spin_lock_slowpath(struct qspinlock *lock) diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c index 113edf76d3ce..d26656b07b72 100644 --- a/arch/powerpc/mm/nohash/tlb_64e.c +++ b/arch/powerpc/mm/nohash/tlb_64e.c @@ -33,7 +33,7 @@ * though this will probably be made common with other nohash * implementations at some point */ -int mmu_pte_psize; /* Page size used for PTE pages */ +static int mmu_pte_psize; /* Page size used for PTE pages */ int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ unsigned long linear_map_top; /* Top of linear mapping */ diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 0f3cd7c3a436..939ea7f6a228 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -552,8 +552,8 @@ config RISCV_ISA_SVPBMT config TOOLCHAIN_HAS_V bool default y - depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64iv) - depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32iv) + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64imv) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32imv) depends on LLD_VERSION >= 140000 || LD_VERSION >= 23800 depends on AS_HAS_OPTION_ARCH diff --git a/arch/riscv/include/asm/kvm_vcpu_pmu.h b/arch/riscv/include/asm/kvm_vcpu_pmu.h index fa0f535bbbf0..1d85b6617508 100644 --- a/arch/riscv/include/asm/kvm_vcpu_pmu.h +++ b/arch/riscv/include/asm/kvm_vcpu_pmu.h @@ -10,6 +10,7 @@ #define __KVM_VCPU_RISCV_PMU_H #include <linux/perf/riscv_pmu.h> +#include <asm/kvm_vcpu_insn.h> #include <asm/sbi.h> #ifdef CONFIG_RISCV_PMU_SBI @@ -64,11 +65,11 @@ struct kvm_pmu { #if defined(CONFIG_32BIT) #define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ -{.base = CSR_CYCLEH, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, \ -{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, +{.base = CSR_CYCLEH, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm }, \ +{.base = CSR_CYCLE, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm }, #else #define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ -{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, +{.base = CSR_CYCLE, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm }, #endif int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid); @@ -104,8 +105,20 @@ void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu); struct kvm_pmu { }; +static inline int kvm_riscv_vcpu_pmu_read_legacy(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) { + *val = 0; + return KVM_INSN_CONTINUE_NEXT_SEPC; + } else { + return KVM_INSN_ILLEGAL_TRAP; + } +} + #define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \ -{.base = 0, .count = 0, .func = NULL }, +{.base = CSR_CYCLE, .count = 3, .func = kvm_riscv_vcpu_pmu_read_legacy }, static inline void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) {} static inline int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 8702b8721a27..efa1b3519b23 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -14,36 +14,14 @@ #include <asm/ptrace.h> -/* - * addr is a hint to the maximum userspace address that mmap should provide, so - * this macro needs to return the largest address space available so that - * mmap_end < addr, being mmap_end the top of that address space. - * See Documentation/arch/riscv/vm-layout.rst for more details. - */ #define arch_get_mmap_end(addr, len, flags) \ ({ \ - unsigned long mmap_end; \ - typeof(addr) _addr = (addr); \ - if ((_addr) == 0 || is_compat_task() || \ - ((_addr + len) > BIT(VA_BITS - 1))) \ - mmap_end = STACK_TOP_MAX; \ - else \ - mmap_end = (_addr + len); \ - mmap_end; \ + STACK_TOP_MAX; \ }) #define arch_get_mmap_base(addr, base) \ ({ \ - unsigned long mmap_base; \ - typeof(addr) _addr = (addr); \ - typeof(base) _base = (base); \ - unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \ - if ((_addr) == 0 || is_compat_task() || \ - ((_addr + len) > BIT(VA_BITS - 1))) \ - mmap_base = (_base); \ - else \ - mmap_base = (_addr + len) - rnd_gap; \ - mmap_base; \ + base; \ }) #ifdef CONFIG_64BIT diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 7cffd4ffecd0..7bd3746028c9 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/cpumask.h> +#include <linux/jump_label.h> #ifdef CONFIG_RISCV_SBI enum sbi_ext_id { @@ -304,6 +305,7 @@ struct sbiret { }; void sbi_init(void); +long __sbi_base_ecall(int fid); struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, @@ -373,7 +375,23 @@ static inline unsigned long sbi_mk_version(unsigned long major, | (minor & SBI_SPEC_VERSION_MINOR_MASK); } -int sbi_err_map_linux_errno(int err); +static inline int sbi_err_map_linux_errno(int err) +{ + switch (err) { + case SBI_SUCCESS: + return 0; + case SBI_ERR_DENIED: + return -EPERM; + case SBI_ERR_INVALID_PARAM: + return -EINVAL; + case SBI_ERR_INVALID_ADDRESS: + return -EFAULT; + case SBI_ERR_NOT_SUPPORTED: + case SBI_ERR_FAILURE: + default: + return -ENOTSUPP; + }; +} extern bool sbi_debug_console_available; int sbi_debug_console_write(const char *bytes, unsigned int num_bytes); diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 06d407f1b30b..7f88cc4931f5 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -20,17 +20,21 @@ endif ifdef CONFIG_RISCV_ALTERNATIVE_EARLY CFLAGS_alternative.o := -mcmodel=medany CFLAGS_cpufeature.o := -mcmodel=medany +CFLAGS_sbi_ecall.o := -mcmodel=medany ifdef CONFIG_FTRACE CFLAGS_REMOVE_alternative.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_cpufeature.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sbi_ecall.o = $(CC_FLAGS_FTRACE) endif ifdef CONFIG_RELOCATABLE CFLAGS_alternative.o += -fno-pie CFLAGS_cpufeature.o += -fno-pie +CFLAGS_sbi_ecall.o += -fno-pie endif ifdef CONFIG_KASAN KASAN_SANITIZE_alternative.o := n KASAN_SANITIZE_cpufeature.o := n +KASAN_SANITIZE_sbi_ecall.o := n endif endif @@ -88,7 +92,7 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += mcount-dyn.o obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o -obj-$(CONFIG_RISCV_SBI) += sbi.o +obj-$(CONFIG_RISCV_SBI) += sbi.o sbi_ecall.o ifeq ($(CONFIG_RISCV_SBI), y) obj-$(CONFIG_SMP) += sbi-ipi.o obj-$(CONFIG_SMP) += cpu_ops_sbi.o diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 837bdab2601b..1989b8cade1b 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -14,9 +14,6 @@ #include <asm/smp.h> #include <asm/tlbflush.h> -#define CREATE_TRACE_POINTS -#include <asm/trace.h> - /* default SBI version is 0.1 */ unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); @@ -27,55 +24,6 @@ static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) __ro_after_init; -struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5, - int fid, int ext) -{ - struct sbiret ret; - - trace_sbi_call(ext, fid); - - register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); - register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); - register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); - register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); - register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); - register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); - register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); - register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); - asm volatile ("ecall" - : "+r" (a0), "+r" (a1) - : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) - : "memory"); - ret.error = a0; - ret.value = a1; - - trace_sbi_return(ext, ret.error, ret.value); - - return ret; -} -EXPORT_SYMBOL(__sbi_ecall); - -int sbi_err_map_linux_errno(int err) -{ - switch (err) { - case SBI_SUCCESS: - return 0; - case SBI_ERR_DENIED: - return -EPERM; - case SBI_ERR_INVALID_PARAM: - return -EINVAL; - case SBI_ERR_INVALID_ADDRESS: - return -EFAULT; - case SBI_ERR_NOT_SUPPORTED: - case SBI_ERR_FAILURE: - default: - return -ENOTSUPP; - }; -} -EXPORT_SYMBOL(sbi_err_map_linux_errno); - #ifdef CONFIG_RISCV_SBI_V01 static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask) { @@ -535,17 +483,6 @@ long sbi_probe_extension(int extid) } EXPORT_SYMBOL(sbi_probe_extension); -static long __sbi_base_ecall(int fid) -{ - struct sbiret ret; - - ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0); - if (!ret.error) - return ret.value; - else - return sbi_err_map_linux_errno(ret.error); -} - static inline long sbi_get_spec_version(void) { return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION); diff --git a/arch/riscv/kernel/sbi_ecall.c b/arch/riscv/kernel/sbi_ecall.c new file mode 100644 index 000000000000..24aabb4fbde3 --- /dev/null +++ b/arch/riscv/kernel/sbi_ecall.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Rivos Inc. */ + +#include <asm/sbi.h> +#define CREATE_TRACE_POINTS +#include <asm/trace.h> + +long __sbi_base_ecall(int fid) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0); + if (!ret.error) + return ret.value; + else + return sbi_err_map_linux_errno(ret.error); +} +EXPORT_SYMBOL(__sbi_base_ecall); + +struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, + int fid, int ext) +{ + struct sbiret ret; + + trace_sbi_call(ext, fid); + + register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); + register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); + register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); + register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); + register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); + register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); + register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); + register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); + asm volatile ("ecall" + : "+r" (a0), "+r" (a1) + : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) + : "memory"); + ret.error = a0; + ret.value = a1; + + trace_sbi_return(ext, ret.error, ret.value); + + return ret; +} +EXPORT_SYMBOL(__sbi_ecall); diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 192cd5603e95..d4fd8af7aaf5 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -417,7 +417,7 @@ int handle_misaligned_load(struct pt_regs *regs) val.data_u64 = 0; if (user_mode(regs)) { - if (raw_copy_from_user(&val, (u8 __user *)addr, len)) + if (copy_from_user(&val, (u8 __user *)addr, len)) return -1; } else { memcpy(&val, (u8 *)addr, len); @@ -515,7 +515,7 @@ int handle_misaligned_store(struct pt_regs *regs) return -EOPNOTSUPP; if (user_mode(regs)) { - if (raw_copy_to_user((u8 __user *)addr, &val, len)) + if (copy_to_user((u8 __user *)addr, &val, len)) return -1; } else { memcpy((u8 *)addr, &val, len); diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index bab2ec34cd87..f3427f6de608 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -20,7 +20,7 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_hardware_enable(void) +int kvm_arch_enable_virtualization_cpu(void) { csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); @@ -35,7 +35,7 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { kvm_riscv_aia_disable(); diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index bcf41d6e0df0..2707a51b082c 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -391,19 +391,9 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num, static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu) { struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu); - int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data); - if (kvpmu->sdata) { - if (kvpmu->snapshot_addr != INVALID_GPA) { - memset(kvpmu->sdata, 0, snapshot_area_size); - kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr, - kvpmu->sdata, snapshot_area_size); - } else { - pr_warn("snapshot address invalid\n"); - } - kfree(kvpmu->sdata); - kvpmu->sdata = NULL; - } + kfree(kvpmu->sdata); + kvpmu->sdata = NULL; kvpmu->snapshot_addr = INVALID_GPA; } diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 62f409d4176e..7de128be8db9 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -127,8 +127,8 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run) run->riscv_sbi.args[3] = cp->a3; run->riscv_sbi.args[4] = cp->a4; run->riscv_sbi.args[5] = cp->a5; - run->riscv_sbi.ret[0] = cp->a0; - run->riscv_sbi.ret[1] = cp->a1; + run->riscv_sbi.ret[0] = SBI_ERR_NOT_SUPPORTED; + run->riscv_sbi.ret[1] = 0; } void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index eb0649a61b4c..1785782c2e55 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -252,7 +252,7 @@ static void __init setup_bootmem(void) * The size of the linear page mapping may restrict the amount of * usable RAM. */ - if (IS_ENABLED(CONFIG_64BIT)) { + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) { max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE; memblock_cap_memory_range(phys_ram_base, max_mapped_addr - phys_ram_base); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a822f952f64a..c60e699e99f5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -604,6 +604,19 @@ config RANDOMIZE_BASE as a security feature that deters exploit attempts relying on knowledge of the location of kernel internals. +config RANDOMIZE_IDENTITY_BASE + bool "Randomize the address of the identity mapping base" + depends on RANDOMIZE_BASE + default DEBUG_VM + help + The identity mapping base address is pinned to zero by default. + Allow randomization of that base to expose otherwise missed + notion of physical and virtual addresses of data structures. + That does not have any impact on the base address at which the + kernel image is loaded. + + If unsure, say N + config KERNEL_IMAGE_BASE hex "Kernel image base address" range 0x100000 0x1FFFFFE0000000 if !KASAN diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index ce232552bc1c..c73b5118ad42 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -162,7 +162,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) error("64-bit relocation outside of kernel!\n"); - *(u64 *)loc += offset - __START_KERNEL; + *(u64 *)loc += offset; } } @@ -177,7 +177,7 @@ static void kaslr_adjust_got(unsigned long offset) */ for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) { if (*entry) - *entry += offset - __START_KERNEL; + *entry += offset; } } @@ -252,7 +252,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ - BUILD_BUG_ON(!IS_ALIGNED(__START_KERNEL, THREAD_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(TEXT_OFFSET, THREAD_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); @@ -341,7 +341,8 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); max_mappable = max(ident_map_size, MAX_DCSS_ADDR); max_mappable = min(max_mappable, vmemmap_start); - __identity_base = round_down(vmemmap_start - max_mappable, rte_size); + if (IS_ENABLED(CONFIG_RANDOMIZE_IDENTITY_BASE)) + __identity_base = round_down(vmemmap_start - max_mappable, rte_size); return asce_limit; } @@ -388,31 +389,25 @@ static void kaslr_adjust_vmlinux_info(long offset) #endif } -static void fixup_vmlinux_info(void) -{ - vmlinux.entry -= __START_KERNEL; - kaslr_adjust_vmlinux_info(-__START_KERNEL); -} - void startup_kernel(void) { - unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size; - unsigned long nokaslr_offset_phys, kaslr_large_page_offset; - unsigned long amode31_lma = 0; + unsigned long vmlinux_size = vmlinux.image_size + vmlinux.bss_size; + unsigned long nokaslr_text_lma, text_lma = 0, amode31_lma = 0; + unsigned long kernel_size = TEXT_OFFSET + vmlinux_size; + unsigned long kaslr_large_page_offset; unsigned long max_physmem_end; unsigned long asce_limit; unsigned long safe_addr; psw_t psw; - fixup_vmlinux_info(); setup_lpp(); /* * Non-randomized kernel physical start address must be _SEGMENT_SIZE * aligned (see blow). */ - nokaslr_offset_phys = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); - safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size); + nokaslr_text_lma = ALIGN(mem_safe_offset(), _SEGMENT_SIZE); + safe_addr = PAGE_ALIGN(nokaslr_text_lma + vmlinux_size); /* * Reserve decompressor memory together with decompression heap, @@ -456,16 +451,27 @@ void startup_kernel(void) */ kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK; if (kaslr_enabled()) { - unsigned long end = ident_map_size - kaslr_large_page_offset; + unsigned long size = vmlinux_size + kaslr_large_page_offset; - __kaslr_offset_phys = randomize_within_range(kernel_size, _SEGMENT_SIZE, 0, end); + text_lma = randomize_within_range(size, _SEGMENT_SIZE, TEXT_OFFSET, ident_map_size); } - if (!__kaslr_offset_phys) - __kaslr_offset_phys = nokaslr_offset_phys; - __kaslr_offset_phys |= kaslr_large_page_offset; + if (!text_lma) + text_lma = nokaslr_text_lma; + text_lma |= kaslr_large_page_offset; + + /* + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region is + * never accessed via the kernel image mapping as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, this region could be used for something else and does + * not need to be reserved. See how it is skipped in setup_vmem(). + */ + __kaslr_offset_phys = text_lma - TEXT_OFFSET; kaslr_adjust_vmlinux_info(__kaslr_offset_phys); - physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size); - deploy_kernel((void *)__kaslr_offset_phys); + physmem_reserve(RR_VMLINUX, text_lma, vmlinux_size); + deploy_kernel((void *)text_lma); /* vmlinux decompression is done, shrink reserved low memory */ physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); @@ -488,7 +494,7 @@ void startup_kernel(void) amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, amode31_min, SZ_2G); } if (!amode31_lma) - amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size; + amode31_lma = text_lma - vmlinux.amode31_size; physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); /* @@ -504,8 +510,8 @@ void startup_kernel(void) * - copy_bootdata() must follow setup_vmem() to propagate changes * to bootdata made by setup_vmem() */ - clear_bss_section(__kaslr_offset_phys); - kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size, + clear_bss_section(text_lma); + kaslr_adjust_relocs(text_lma, text_lma + vmlinux.image_size, __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 2847cc059ab7..145035f84a0e 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -90,7 +90,7 @@ static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kern } memgap_start = end; } - kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW); + kasan_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KASAN_MAP_SHADOW); kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW); kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { @@ -475,7 +475,17 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l (unsigned long)__identity_va(end), POPULATE_IDENTITY); } - pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL); + + /* + * [kernel_start..kernel_start + TEXT_OFFSET] region is never + * accessed as per the linker script: + * + * . = TEXT_OFFSET; + * + * Therefore, skip mapping TEXT_OFFSET bytes to prevent access to + * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region. + */ + pgtable_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KERNEL); pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT); pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), POPULATE_ABS_LOWCORE); diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index a750711d44c8..66670212a361 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -109,7 +109,12 @@ SECTIONS #ifdef CONFIG_KERNEL_UNCOMPRESSED . = ALIGN(PAGE_SIZE); . += AMODE31_SIZE; /* .amode31 section */ - . = ALIGN(1 << 20); /* _SEGMENT_SIZE */ + + /* + * Make sure the location counter is not less than TEXT_OFFSET. + * _SEGMENT_SIZE is not available, use ALIGN(1 << 20) instead. + */ + . = MAX(TEXT_OFFSET, ALIGN(1 << 20)); #else . = ALIGN(8); #endif diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index ea63a7342f5f..0c989caed19a 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -59,6 +59,7 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_S390_HYPFS_FS=y CONFIG_KVM=m +CONFIG_KVM_S390_UCONTROL=y CONFIG_S390_UNWIND_SELFTEST=m CONFIG_S390_KPROBES_SANITY_TEST=m CONFIG_S390_MODULES_SANITY_TEST=m diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 06416b3f94f5..16e4caa931f1 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -279,8 +279,9 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define AMODE31_SIZE (3 * PAGE_SIZE) #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -#define __START_KERNEL 0x100000 #define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE #define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) +#define TEXT_OFFSET 0x100000 + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 4ec99f73fa27..a3fea683b227 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -734,7 +734,23 @@ static void __init memblock_add_physmem_info(void) } /* - * Reserve memory used for lowcore/command line/kernel image. + * Reserve memory used for lowcore. + */ +static void __init reserve_lowcore(void) +{ + void *lowcore_start = get_lowcore(); + void *lowcore_end = lowcore_start + sizeof(struct lowcore); + void *start, *end; + + if ((void *)__identity_base < lowcore_end) { + start = max(lowcore_start, (void *)__identity_base); + end = min(lowcore_end, (void *)(__identity_base + ident_map_size)); + memblock_reserve(__pa(start), __pa(end)); + } +} + +/* + * Reserve memory used for absolute lowcore/command line/kernel image. */ static void __init reserve_kernel(void) { @@ -918,6 +934,7 @@ void __init setup_arch(char **cmdline_p) /* Do some memory reservations *before* memory is added to memblock */ reserve_pgtables(); + reserve_lowcore(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index e67cd409b858..ae5d0a9d6911 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -39,7 +39,7 @@ PHDRS { SECTIONS { - . = __START_KERNEL; + . = TEXT_OFFSET; .text : { _stext = .; /* Start of text section */ _text = .; /* Text and read-only data */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0fd96860fc45..bb7134faaebf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -348,20 +348,29 @@ static inline int plo_test_bit(unsigned char nr) return cc == 0; } -static __always_inline void __insn32_query(unsigned int opcode, u8 *query) +static __always_inline void __sortl_query(u8 (*query)[32]) { asm volatile( " lghi 0,0\n" - " lgr 1,%[query]\n" + " la 1,%[query]\n" /* Parameter registers are ignored */ - " .insn rrf,%[opc] << 16,2,4,6,0\n" + " .insn rre,0xb9380000,2,4\n" + : [query] "=R" (*query) : - : [query] "d" ((unsigned long)query), [opc] "i" (opcode) - : "cc", "memory", "0", "1"); + : "cc", "0", "1"); } -#define INSN_SORTL 0xb938 -#define INSN_DFLTCC 0xb939 +static __always_inline void __dfltcc_query(u8 (*query)[32]) +{ + asm volatile( + " lghi 0,0\n" + " la 1,%[query]\n" + /* Parameter registers are ignored */ + " .insn rrf,0xb9390000,2,4,6,0\n" + : [query] "=R" (*query) + : + : "cc", "0", "1"); +} static void __init kvm_s390_cpu_feat_init(void) { @@ -415,10 +424,10 @@ static void __init kvm_s390_cpu_feat_init(void) kvm_s390_available_subfunc.kdsa); if (test_facility(150)) /* SORTL */ - __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl); + __sortl_query(&kvm_s390_available_subfunc.sortl); if (test_facility(151)) /* DFLTCC */ - __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc); + __dfltcc_query(&kvm_s390_available_subfunc.dfltcc); if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c index a74dbd5c9896..30a732c808f3 100644 --- a/arch/s390/tools/relocs.c +++ b/arch/s390/tools/relocs.c @@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel) case R_390_GOTOFF64: break; case R_390_64: - add_reloc(&relocs64, offset - ehdr.e_entry); + add_reloc(&relocs64, offset); break; default: die("Unsupported relocation type: %d\n", r_type); diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 078e2bac2553..da8b66dce0da 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -389,7 +389,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val) .r12 = size, .r13 = EPT_READ, .r14 = addr, - .r15 = *val, }; if (__tdx_hypercall(&args)) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0c9c2706d4ec..9e519d8a810a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) return HYBRID_INTEL_CORE; } +static inline bool erratum_hsw11(struct perf_event *event) +{ + return (event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01); +} + +/* + * The HSW11 requires a period larger than 100 which is the same as the BDM11. + * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL. + * + * The message 'interrupt took too long' can be observed on any counter which + * was armed with a period < 32 and two events expired in the same NMI. + * A minimum period of 32 is enforced for the rest of the events. + */ +static void hsw_limit_period(struct perf_event *event, s64 *left) +{ + *left = max(*left, erratum_hsw11(event) ? 128 : 32); +} + /* * Broadwell: * @@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) */ static void bdw_limit_period(struct perf_event *event, s64 *left) { - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (erratum_hsw11(event)) { if (*left < 128) *left = 128; *left &= ~0x3fULL; @@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.limit_period = hsw_limit_period; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 6b122a31da06..aa21c105eef1 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -179,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function) case 0x1d: case 0x1e: case 0x1f: + case 0x24: case 0x8000001d: return true; } diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb17f31b06d2..de16862bf230 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -591,6 +591,13 @@ struct fpu_state_config { * even without XSAVE support, i.e. legacy features FP + SSE */ u64 legacy_features; + /* + * @independent_features: + * + * Features that are supported by XSAVES, but not managed as part of + * the FPU core, such as LBR + */ + u64 independent_features; }; /* FPU state configuration information */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 68ad4f923664..861d080ed4c6 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -14,8 +14,8 @@ BUILD_BUG_ON(1) * be __static_call_return0. */ KVM_X86_OP(check_processor_compatibility) -KVM_X86_OP(hardware_enable) -KVM_X86_OP(hardware_disable) +KVM_X86_OP(enable_virtualization_cpu) +KVM_X86_OP(disable_virtualization_cpu) KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) @@ -125,7 +125,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) -KVM_X86_OP(get_msr_feature) +KVM_X86_OP(get_feature_msr) KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4a68cb3eba78..5f794814226f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -36,6 +36,7 @@ #include <asm/kvm_page_track.h> #include <asm/kvm_vcpu_regs.h> #include <asm/hyperv-tlfs.h> +#include <asm/reboot.h> #define __KVM_HAVE_ARCH_VCPU_DEBUGFS @@ -211,6 +212,7 @@ enum exit_fastpath_completion { EXIT_FASTPATH_NONE, EXIT_FASTPATH_REENTER_GUEST, EXIT_FASTPATH_EXIT_HANDLED, + EXIT_FASTPATH_EXIT_USERSPACE, }; typedef enum exit_fastpath_completion fastpath_t; @@ -280,10 +282,6 @@ enum x86_intercept_stage; #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ - PFERR_WRITE_MASK | \ - PFERR_PRESENT_MASK) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -1629,8 +1627,10 @@ struct kvm_x86_ops { int (*check_processor_compatibility)(void); - int (*hardware_enable)(void); - void (*hardware_disable)(void); + int (*enable_virtualization_cpu)(void); + void (*disable_virtualization_cpu)(void); + cpu_emergency_virt_cb *emergency_disable_virtualization_cpu; + void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); @@ -1727,6 +1727,8 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + + const bool x2apic_icr_is_split; const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); @@ -1806,7 +1808,7 @@ struct kvm_x86_ops { int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); - int (*get_msr_feature)(struct kvm_msr_entry *entry); + int (*get_feature_msr)(u32 msr, u64 *data); int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len); @@ -2136,7 +2138,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry); + +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa) +{ + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); +} + void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); @@ -2345,7 +2355,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ + KVM_X86_QUIRK_SLOT_ZAP_ALL) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index af4302d79b59..f3d257c45225 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,6 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; +extern unsigned long physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 9053dfe9fa03..a98e53491a4e 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ +#ifdef CONFIG_RANDOMIZE_MEMORY +# define PHYSMEM_END physmem_end +#endif + /* * End of the region for which vmalloc page tables are pre-allocated. * For non-KMSAN builds, this is the same as VMALLOC_END. diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 6536873f8fc0..d0ef2a678d66 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,8 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) typedef void (cpu_emergency_virt_cb)(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 12dbd2588ca7..8b1b6ce1e51b 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk) __resctrl_sched_in(tsk); } -static inline u32 resctrl_arch_system_num_rmid_idx(void) -{ - /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return boot_cpu_data.x86_cache_max_rmid + 1; -} - static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) { *rmid = idx; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index bf57a824f722..a8debbf2f702 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -439,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) +#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 66fd4b2a37a3..373638691cd4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { - u32 x2apic_id, state = x2apic_state; + u32 x2apic_id; - x2apic_mode = 0; - x2apic_state = X2APIC_DISABLED; - - if (state != X2APIC_ON) + if (x2apic_state < X2APIC_ON) return; x2apic_id = read_apic_id(); @@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void) } __x2apic_disable(); + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + /* * Don't reread the APIC ID as it was already done from * check_x2apic() and the APIC driver still is a x2APIC variant, diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 1930fce9dfe9..8591d53c144b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = { }, }; +u32 resctrl_arch_system_num_rmid_idx(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return r->num_rmid; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c5a026fee5e0..1339f8328db5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & + XFEATURE_MASK_INDEPENDENT; + /* * Clear XSAVE features that are disabled in the normal CPUID. */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 2ee0b9c53dcc..afb404cd2059 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void) static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_INDEPENDENT; + return fpu_kernel_cfg.independent_features; } /* XSAVE/XRSTOR wrapper functions */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 472a1537b7a9..730c2f34d347 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -19,7 +19,6 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER @@ -144,8 +143,10 @@ config KVM_AMD_SEV select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE help - Provides support for launching Encrypted VMs (SEV) and Encrypted VMs - with Encrypted State (SEV-ES) on AMD processors. + Provides support for launching encrypted VMs which use Secure + Encrypted Virtualization (SEV), Secure Encrypted Virtualization with + Encrypted State (SEV-ES), and Secure Encrypted Virtualization with + Secure Nested Paging (SEV-SNP) technologies on AMD processors. config KVM_SMM bool "System Management Mode emulation" diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2617be544480..41786b834b16 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -705,7 +705,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | - F(AMX_COMPLEX) + F(AMX_COMPLEX) | F(AVX10) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, @@ -721,6 +721,10 @@ void kvm_set_cpu_caps(void) SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) ); + kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX, + F(AVX10_128) | F(AVX10_256) | F(AVX10_512) + ); + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | @@ -949,7 +953,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ - entry->eax = min(entry->eax, 0x1fU); + entry->eax = min(entry->eax, 0x24U); break; case 1: cpuid_entry_override(entry, CPUID_1_EDX); @@ -1174,6 +1178,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } break; + case 0x24: { + u8 avx10_version; + + if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * The AVX10 version is encoded in EBX[7:0]. Note, the version + * is guaranteed to be >=1 if AVX10 is supported. Note #2, the + * version needs to be captured before overriding EBX features! + */ + avx10_version = min_t(u8, entry->ebx & 0xff, 1); + cpuid_entry_override(entry, CPUID_24_0_EBX); + entry->ebx |= avx10_version; + + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5bb481aefcbc..c7180cb5f464 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1944,7 +1944,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; + u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; ktime_t now; @@ -2453,6 +2453,43 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) + +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +{ + if (data & X2APIC_ICR_RESERVED_BITS) + return 1; + + /* + * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but + * only AMD requires it to be zero, Intel essentially just ignores the + * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, + * the CPU performs the reserved bits checks, i.e. the underlying CPU + * behavior will "win". Arbitrarily clear the BUSY bit, as there is no + * sane way to provide consistent behavior with respect to hardware. + */ + data &= ~APIC_ICR_BUSY; + + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + if (kvm_x86_ops.x2apic_icr_is_split) { + kvm_lapic_set_reg(apic, APIC_ICR, data); + kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, data); + } + trace_kvm_apic_write(APIC_ICR, data); + return 0; +} + +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) +{ + if (kvm_x86_ops.x2apic_icr_is_split) + return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | + (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; + + return kvm_lapic_get_reg64(apic, APIC_ICR); +} + /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { @@ -2470,7 +2507,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) * maybe-unecessary write, and both are in the noise anyways. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) - kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)); + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } @@ -2990,18 +3027,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, /* * In x2APIC mode, the LDR is fixed and based on the id. And - * ICR is internally a single 64-bit register, but needs to be - * split to ICR+ICR2 in userspace for backwards compatibility. + * if the ICR is _not_ split, ICR is internally a single 64-bit + * register, but needs to be split to ICR+ICR2 in userspace for + * backwards compatibility. */ - if (set) { + if (set) *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); - } else { - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + if (!kvm_x86_ops.x2apic_icr_is_split) { + if (set) { + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } } @@ -3194,22 +3235,12 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) -{ - data &= ~APIC_ICR_BUSY; - - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg64(apic, APIC_ICR, data); - trace_kvm_apic_write(APIC_ICR, data); - return 0; -} - static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) { u32 low; if (reg == APIC_ICR) { - *data = kvm_lapic_get_reg64(apic, APIC_ICR); + *data = kvm_x2apic_icr_read(apic); return 0; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7ef8ae73e82d..7c95eedd771e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -96,7 +96,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 4341e0e28571..9dc5dd43ae7f 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -223,8 +223,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool kvm_mmu_may_ignore_guest_pat(void); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); - int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 928cf84778b0..e081f785fb23 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -614,32 +614,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Returns the Accessed status of the PTE and resets it at the same time. */ -static bool mmu_spte_age(u64 *sptep) -{ - u64 spte = mmu_spte_get_lockless(sptep); - - if (!is_accessed_spte(spte)) - return false; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); - - spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); - } - - return true; -} - static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; @@ -938,6 +912,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. */ +#define KVM_RMAP_MANY BIT(0) /* * Returns the number of pointers in the rmap chain, not counting the new one. @@ -950,16 +925,16 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, if (!rmap_head->val) { rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & 1)) { + } else if (!(rmap_head->val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -968,10 +943,10 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; } desc->sptes[desc->spte_count++] = spte; } @@ -982,7 +957,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -1011,7 +986,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, if (!head_desc->more) rmap_head->val = 0; else - rmap_head->val = (unsigned long)head_desc->more | 1; + rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -1024,13 +999,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) return; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) return; rmap_head->val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { @@ -1063,12 +1038,12 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, if (!rmap_head->val) return false; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -1088,10 +1063,10 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) if (!rmap_head->val) return 0; - else if (!(rmap_head->val & 1)) + else if (!(rmap_head->val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1153,13 +1128,13 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, if (!rmap_head->val) return NULL; - if (!(rmap_head->val & 1)) { + if (!(rmap_head->val & KVM_RMAP_MANY)) { iter->desc = NULL; sptep = (u64 *)rmap_head->val; goto out; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: @@ -1307,15 +1282,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return flush; } -/** - * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages - * @kvm: kvm instance - * @slot: slot to protect - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should protect - * - * Used when we do not need to care about huge page mappings. - */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1339,16 +1305,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -/** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write - * protect the page if the D-bit isn't supported. - * @kvm: kvm instance - * @slot: slot to clear D-bit - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should clear D-bit - * - * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. - */ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1372,24 +1328,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, } } -/** - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * PT level pages. - * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. - * - * We need to care about huge page mappings: e.g. during dirty logging we may - * have such mappings. - */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* - * Huge pages are NOT write protected when we start dirty logging in - * initially-all-set mode; must write protect them here so that they - * are split to 4K on the first write. + * If the slot was assumed to be "initially all dirty", write-protect + * huge pages to ensure they are split to 4KiB on the first write (KVM + * dirty logs at 4KiB granularity). If eager page splitting is enabled, + * immediately try to split huge pages, e.g. so that vCPUs don't get + * saddled with the cost of splitting. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large @@ -1411,7 +1359,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, PG_LEVEL_2M); } - /* Now handle 4K PTEs. */ + /* + * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in + * mask. If PML is enabled and the GFN doesn't need to be write- + * protected for other reasons, e.g. shadow paging, clear the Dirty bit. + * Otherwise clear the Writable bit. + * + * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is + * enabled but it chooses between clearing the Dirty bit and Writeable + * bit based on the context. + */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else @@ -1453,16 +1410,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) -{ - return kvm_zap_all_rmap_sptes(kvm, rmap_head); -} - static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) + const struct kvm_memory_slot *slot) { - return __kvm_zap_rmap(kvm, rmap_head, slot); + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } struct slot_rmap_walk_iterator { @@ -1513,7 +1464,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); if (iterator->rmap->val) return; @@ -1534,23 +1485,71 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level); +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_rmaps_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); -static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, - rmap_handler_t handler) +static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, + bool can_yield, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; - bool ret = false; - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) - ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level); + lockdep_assert_held_write(&kvm->mmu_lock); - return ret; + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap, slot); + + if (!can_yield) + continue; + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush && flush_on_yield) { + kvm_flush_remote_tlbs_range(kvm, start_gfn, + iterator.gfn - start_gfn + 1); + flush = false; + } + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + + return flush; +} + +static __always_inline bool walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + bool flush_on_yield) +{ + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, + slot->base_gfn, slot->base_gfn + slot->npages - 1, + true, flush_on_yield, false); +} + +static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + bool flush_on_yield) +{ + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); +} + +static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, bool can_yield, + bool flush) +{ + return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, can_yield, true, flush); } bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1558,7 +1557,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); + flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, + range->start, range->end, + range->may_block, flush); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1570,31 +1571,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) -{ - u64 *sptep; - struct rmap_iterator iter; - int young = 0; - - for_each_rmap_spte(rmap_head, &iter, sptep) - young |= mmu_spte_age(sptep); - - return young; -} - -static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level) -{ - u64 *sptep; - struct rmap_iterator iter; - - for_each_rmap_spte(rmap_head, &iter, sptep) - if (is_accessed_spte(*sptep)) - return true; - return false; -} - #define RMAP_RECYCLE_THRESHOLD 1000 static void __rmap_add(struct kvm *kvm, @@ -1629,12 +1605,52 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } +static bool kvm_rmap_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, bool test_only) +{ + struct slot_rmap_walk_iterator iterator; + struct rmap_iterator iter; + bool young = false; + u64 *sptep; + + for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + range->start, range->end - 1, &iterator) { + for_each_rmap_spte(iterator.rmap, &iter, sptep) { + u64 spte = *sptep; + + if (!is_accessed_spte(spte)) + continue; + + if (test_only) + return true; + + if (spte_ad_enabled(spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that + * it doesn't get lost when the SPTE is marked + * for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + young = true; + } + } + return young; +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); + young = kvm_rmap_age_gfn_range(kvm, range, false); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1647,7 +1663,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); + young = kvm_rmap_age_gfn_range(kvm, range, true); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -2713,36 +2729,49 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) write_unlock(&kvm->mmu_lock); } -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry) { - struct kvm_mmu_page *sp; + struct kvm *kvm = vcpu->kvm; LIST_HEAD(invalid_list); - int r; + struct kvm_mmu_page *sp; + gpa_t gpa = cr2_or_gpa; + bool r = false; + + /* + * Bail early if there aren't any write-protected shadow pages to avoid + * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked + * by a third party. Reading indirect_shadow_pages without holding + * mmu_lock is safe, as this is purely an optimization, i.e. a false + * positive is benign, and a false negative will simply result in KVM + * skipping the unprotect+retry path, which is also an optimization. + */ + if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) + goto out; + + if (!vcpu->arch.mmu->root_role.direct) { + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + if (gpa == INVALID_GPA) + goto out; + } - r = 0; write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - r = 1; + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } + + /* + * Snapshot the result before zapping, as zapping will remove all list + * entries, i.e. checking the list later would yield a false negative. + */ + r = !list_empty(&invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); - return r; -} - -static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu->root_role.direct) - return 0; - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - +out: + if (r || always_retry) { + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); + vcpu->arch.last_retry_addr = cr2_or_gpa; + } return r; } @@ -2914,10 +2943,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, trace_kvm_mmu_set_spte(level, gfn, sptep); } - if (wrprot) { - if (write_fault) - ret = RET_PF_EMULATE; - } + if (wrprot && write_fault) + ret = RET_PF_WRITE_PROTECTED; if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); @@ -4549,7 +4576,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4618,8 +4645,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { @@ -4642,7 +4667,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, int r; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4721,6 +4746,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, switch (r) { case RET_PF_FIXED: case RET_PF_SPURIOUS: + case RET_PF_WRITE_PROTECTED: return 0; case RET_PF_EMULATE: @@ -4750,7 +4776,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, * reload is efficient when called repeatedly, so we can do it on * every iteration. */ - kvm_mmu_reload(vcpu); + r = kvm_mmu_reload(vcpu); + if (r) + return r; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) @@ -5963,6 +5991,106 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, write_unlock(&vcpu->kvm->mmu_lock); } +static bool is_write_to_guest_page_table(u64 error_code) +{ + const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; + + return (error_code & mask) == mask; +} + +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u64 error_code, int *emulation_type) +{ + bool direct = vcpu->arch.mmu->root_role.direct; + + /* + * Do not try to unprotect and retry if the vCPU re-faulted on the same + * RIP with the same address that was previously unprotected, as doing + * so will likely put the vCPU into an infinite. E.g. if the vCPU uses + * a non-page-table modifying instruction on the PDE that points to the + * instruction, then unprotecting the gfn will unmap the instruction's + * code, i.e. make it impossible for the instruction to ever complete. + */ + if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && + vcpu->arch.last_retry_addr == cr2_or_gpa) + return RET_PF_EMULATE; + + /* + * Reset the unprotect+retry values that guard against infinite loops. + * The values will be refreshed if KVM explicitly unprotects a gfn and + * retries, in all other cases it's safe to retry in the future even if + * the next page fault happens on the same RIP+address. + */ + vcpu->arch.last_retry_eip = 0; + vcpu->arch.last_retry_addr = 0; + + /* + * It should be impossible to reach this point with an MMIO cache hit, + * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, + * writable memslot, and creating a memslot should invalidate the MMIO + * cache by way of changing the memslot generation. WARN and disallow + * retry if MMIO is detected, as retrying MMIO emulation is pointless + * and could put the vCPU into an infinite loop because the processor + * will keep faulting on the non-existent MMIO address. + */ + if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) + return RET_PF_EMULATE; + + /* + * Before emulating the instruction, check to see if the access was due + * to a read-only violation while the CPU was walking non-nested NPT + * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. + * If L1 is sharing (a subset of) its page tables with L2, e.g. by + * having nCR3 share lower level page tables with hCR3, then when KVM + * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also + * unknowingly write-protecting L1's guest page tables, which KVM isn't + * shadowing. + * + * Because the CPU (by default) walks NPT page tables using a write + * access (to ensure the CPU can do A/D updates), page walks in L1 can + * trigger write faults for the above case even when L1 isn't modifying + * PTEs. As a result, KVM will unnecessarily emulate (or at least, try + * to emulate) an excessive number of L1 instructions; because L1's MMU + * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs + * and thus no need to emulate in order to guarantee forward progress. + * + * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can + * proceed without triggering emulation. If one or more shadow pages + * was zapped, skip emulation and resume L1 to let it natively execute + * the instruction. If no shadow pages were zapped, then the write- + * fault is due to something else entirely, i.e. KVM needs to emulate, + * as resuming the guest will put it into an infinite loop. + * + * Note, this code also applies to Intel CPUs, even though it is *very* + * unlikely that an L1 will share its page tables (IA32/PAE/paging64 + * format) with L2's page tables (EPT format). + * + * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to + * unprotect the gfn and retry if an event is awaiting reinjection. If + * KVM emulates multiple instructions before completing event injection, + * the event could be delayed beyond what is architecturally allowed, + * e.g. KVM could inject an IRQ after the TPR has been raised. + */ + if (((direct && is_write_to_guest_page_table(error_code)) || + (!direct && kvm_event_needs_reinjection(vcpu))) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) + return RET_PF_RETRY; + + /* + * The gfn is write-protected, but if KVM detects its emulating an + * instruction that is unlikely to be used to modify page tables, or if + * emulation fails, KVM can try to unprotect the gfn and let the CPU + * re-execute the instruction that caused the page fault. Do not allow + * retrying an instruction from a nested guest as KVM is only explicitly + * shadowing L1's page tables, i.e. unprotecting something for L1 isn't + * going to magically fix whatever issue caused L2 to fail. + */ + if (!is_guest_mode(vcpu)) + *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; + + return RET_PF_EMULATE; +} + int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { @@ -6008,6 +6136,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r < 0) return r; + if (r == RET_PF_WRITE_PROTECTED) + r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, + &emulation_type); + if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; else if (r == RET_PF_EMULATE) @@ -6018,32 +6150,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err if (r != RET_PF_EMULATE) return 1; - /* - * Before emulating the instruction, check if the error code - * was due to a RO violation while translating the guest page. - * This can occur when using nested virtualization with nested - * paging in both guests. If true, we simply unprotect the page - * and resume the guest. - */ - if (vcpu->arch.mmu->root_role.direct && - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); - return 1; - } - - /* - * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still - * optimistically try to just unprotect the page and let the processor - * re-execute the instruction that caused the page fault. Do not allow - * retrying MMIO emulation, as it's not only pointless but could also - * cause us to enter an infinite loop because the processor will keep - * faulting on the non-existent MMIO address. Retrying an instruction - * from a nested guest is also pointless and dangerous as we are only - * explicitly shadowing L1's page tables, i.e. unprotecting something - * for L1 isn't going to magically fix whatever issue cause L2 to fail. - */ - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) - emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); @@ -6202,59 +6308,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, } EXPORT_SYMBOL_GPL(kvm_configure_mmu); -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_rmaps_handler) (struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot); - -static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, - bool flush_on_yield, bool flush) -{ - struct slot_rmap_walk_iterator iterator; - - lockdep_assert_held_write(&kvm->mmu_lock); - - for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap, slot); - - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush && flush_on_yield) { - kvm_flush_remote_tlbs_range(kvm, start_gfn, - iterator.gfn - start_gfn + 1); - flush = false; - } - cond_resched_rwlock_write(&kvm->mmu_lock); - } - } - - return flush; -} - -static __always_inline bool walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - bool flush_on_yield) -{ - return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, - slot->base_gfn, slot->base_gfn + slot->npages - 1, - flush_on_yield, false); -} - -static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - bool flush_on_yield) -{ - return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); -} - static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) @@ -6528,9 +6581,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (WARN_ON_ONCE(start >= end)) continue; - flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, + end, true, flush); } } @@ -6818,7 +6870,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, */ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, - level, level, start, end - 1, true, false); + level, level, start, end - 1, true, true, false); } /* Must be called with the mmu_lock held in write-mode. */ @@ -6997,10 +7049,42 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) kvm_mmu_zap_all(kvm); } +/* + * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted. + * + * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst + * case scenario we'll have unused shadow pages lying around until they + * are recycled due to age or when the VM is destroyed. + */ +static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_gfn_range range = { + .slot = slot, + .start = slot->base_gfn, + .end = slot->base_gfn + slot->npages, + .may_block = true, + }; + + write_lock(&kvm->mmu_lock); + if (kvm_unmap_gfn_range(kvm, &range)) + kvm_flush_remote_tlbs_memslot(kvm, slot); + + write_unlock(&kvm->mmu_lock); +} + +static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); +} + void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_zap_all_fast(kvm); + if (kvm_memslot_flush_zap_all(kvm)) + kvm_mmu_zap_all_fast(kvm); + else + kvm_mmu_zap_memslot_leafs(kvm, slot); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1721d97743e9..c98827840e07 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_CONTINUE: So far, so good, keep handling the page fault. * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the + * gfn and retry, or emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. @@ -274,6 +276,7 @@ enum { RET_PF_CONTINUE = 0, RET_PF_RETRY, RET_PF_EMULATE, + RET_PF_WRITE_PROTECTED, RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, @@ -349,8 +352,6 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); -void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); - void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 195d98bc8de8..f35a830ce469 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -57,6 +57,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED); TRACE_DEFINE_ENUM(RET_PF_INVALID); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 69941cebb3a8..ae7d39ff2d07 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -646,10 +646,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * really care if it changes underneath us after this point). */ if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; + return RET_PF_RETRY; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) - goto out_gpte_changed; + return RET_PF_RETRY; /* * Load a new root and retry the faulting instruction in the extremely @@ -659,7 +659,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, */ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); - goto out_gpte_changed; + return RET_PF_RETRY; } for_each_shadow_entry(vcpu, fault->addr, it) { @@ -674,34 +674,38 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, false, access); - if (sp != ERR_PTR(-EEXIST)) { - /* - * We must synchronize the pagetable before linking it - * because the guest doesn't need to flush tlb when - * the gpte is changed from non-present to present. - * Otherwise, the guest may use the wrong mapping. - * - * For PG_LEVEL_4K, kvm_mmu_get_page() has already - * synchronized it transiently via kvm_sync_page(). - * - * For higher level pagetable, we synchronize it via - * the slower mmu_sync_children(). If it needs to - * break, some progress has been made; return - * RET_PF_RETRY and retry on the next #PF. - * KVM_REQ_MMU_SYNC is not necessary but it - * expedites the process. - */ - if (sp->unsync_children && - mmu_sync_children(vcpu, sp, false)) - return RET_PF_RETRY; - } + /* + * Synchronize the new page before linking it, as the CPU (KVM) + * is architecturally disallowed from inserting non-present + * entries into the TLB, i.e. the guest isn't required to flush + * the TLB when changing the gPTE from non-present to present. + * + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already + * synchronized the page via kvm_sync_page(). + * + * For higher level pages, which cannot be unsync themselves + * but can have unsync children, synchronize via the slower + * mmu_sync_children(). If KVM needs to drop mmu_lock due to + * contention or to reschedule, instruct the caller to retry + * the #PF (mmu_sync_children() ensures forward progress will + * be made). + */ + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; /* - * Verify that the gpte in the page we've just write - * protected is still there. + * Verify that the gpte in the page, which is now either + * write-protected or unsync, wasn't modified between the fault + * and acquiring mmu_lock. This needs to be done even when + * reusing an existing shadow page to ensure the information + * gathered by the walker matches the information stored in the + * shadow page (which could have been modified by a different + * vCPU even if the page was already linked). Holding mmu_lock + * prevents the shadow page from changing after this point. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; + return RET_PF_RETRY; if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); @@ -755,9 +759,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, FNAME(pte_prefetch)(vcpu, gw, it.sptep); return ret; - -out_gpte_changed: - return RET_PF_RETRY; } /* @@ -805,7 +806,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (page_fault_handle_page_track(vcpu, fault)) { shadow_page_table_clear_flood(vcpu, fault->addr); - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; } r = mmu_topup_memory_caches(vcpu, true); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index d4527965e48c..8f7eb3ad88fc 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -391,9 +391,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) mmio_value = 0; /* - * The masked MMIO value must obviously match itself and a removed SPTE - * must not get a false positive. Removed SPTEs and MMIO SPTEs should - * never collide as MMIO must set some RWX bits, and removed SPTEs must + * The masked MMIO value must obviously match itself and a frozen SPTE + * must not get a false positive. Frozen SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and frozen SPTEs must * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index ef793c459b05..2cb816ea2430 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -214,7 +214,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; */ #define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) -/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ +/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_frozen_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c7dc49ee7388..3b996c1fdaab 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -359,10 +359,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) /* * Set the SPTE to a nonpresent value that other * threads will not overwrite. If the SPTE was - * already marked as removed then another thread + * already marked as frozen then another thread * handling a page fault could overwrite it, so * set the SPTE until it is set from some other - * value to the removed SPTE value. + * value to the frozen SPTE value. */ for (;;) { old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); @@ -536,8 +536,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 *sptep = rcu_dereference(iter->sptep); /* - * The caller is responsible for ensuring the old SPTE is not a REMOVED - * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, + * The caller is responsible for ensuring the old SPTE is not a FROZEN + * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE, * and pre-checking before inserting a new SPTE is advantageous as it * avoids unnecessary work. */ @@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (wrprot) { - if (fault->write) - ret = RET_PF_EMULATE; - } + if (wrprot && fault->write) + ret = RET_PF_WRITE_PROTECTED; /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) { diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 2f4e155080ba..0d17d6b70639 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -17,6 +17,7 @@ enum kvm_only_cpuid_leafs { CPUID_8000_0007_EDX, CPUID_8000_0022_EAX, CPUID_7_2_EDX, + CPUID_24_0_EBX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -46,6 +47,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) @@ -55,6 +57,11 @@ enum kvm_only_cpuid_leafs { #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ +#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) +#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) +#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -90,6 +97,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, + [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, }; /* diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 00e3c27d2a87..85241c0c7f56 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -624,17 +624,31 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) #endif /* - * Give leave_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. enter guest mode) before loading state from the SMM - * state-save area. + * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest + * mode should happen _after_ loading state from SMRAM. However, KVM + * piggybacks the nested VM-Enter flows (which is wrong for many other + * reasons), and so nSVM/nVMX would clobber state that is loaded from + * SMRAM and from the VMCS/VMCB. */ if (kvm_x86_call(leave_smm)(vcpu, &smram)) return X86EMUL_UNHANDLEABLE; #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - return rsm_load_state_64(ctxt, &smram.smram64); + ret = rsm_load_state_64(ctxt, &smram.smram64); else #endif - return rsm_load_state_32(ctxt, &smram.smram32); + ret = rsm_load_state_32(ctxt, &smram.smram32); + + /* + * If RSM fails and triggers shutdown, architecturally the shutdown + * occurs *before* the transition to guest mode. But due to KVM's + * flawed handling of RSM to L2 (see above), the vCPU may already be + * in_guest_mode(). Force the vCPU out of guest mode before delivering + * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit + * that architecturally shouldn't be possible. + */ + if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu)) + kvm_leave_nested(vcpu); + return ret; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d6f252555ab3..c1062e8c3f50 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -592,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void) } } -static void svm_emergency_disable(void) +static void svm_emergency_disable_virtualization_cpu(void) { kvm_rebooting = true; kvm_cpu_svm_disable(); } -static void svm_hardware_disable(void) +static void svm_disable_virtualization_cpu(void) { /* Make sure we clean up behind us */ if (tsc_scaling) @@ -610,7 +610,7 @@ static void svm_hardware_disable(void) amd_pmu_disable_virt(); } -static int svm_hardware_enable(void) +static int svm_enable_virtualization_cpu(void) { struct svm_cpu_data *sd; @@ -1533,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * TSC_AUX is always virtualized for SEV-ES guests when the feature is * available. The user return MSR support is not required in this case * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_hardware_enable()). + * (which has been initialized in svm_enable_virtualization_cpu()). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) @@ -2825,17 +2825,17 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(struct kvm_msr_entry *msr) +static int svm_get_feature_msr(u32 msr, u64 *data) { - msr->data = 0; + *data = 0; - switch (msr->index) { + switch (msr) { case MSR_AMD64_DE_CFG: if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; @@ -2876,6 +2876,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CSTAR: msr_info->data = svm->vmcb01.ptr->save.cstar; break; + case MSR_GS_BASE: + msr_info->data = svm->vmcb01.ptr->save.gs.base; + break; + case MSR_FS_BASE: + msr_info->data = svm->vmcb01.ptr->save.fs.base; + break; case MSR_KERNEL_GS_BASE: msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; @@ -3101,6 +3107,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_CSTAR: svm->vmcb01.ptr->save.cstar = data; break; + case MSR_GS_BASE: + svm->vmcb01.ptr->save.gs.base = data; + break; + case MSR_FS_BASE: + svm->vmcb01.ptr->save.fs.base = data; + break; case MSR_KERNEL_GS_BASE: svm->vmcb01.ptr->save.kernel_gs_base = data; break; @@ -3132,7 +3144,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * feature is available. The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT * from the host save area (which has been initialized in - * svm_hardware_enable()). + * svm_enable_virtualization_cpu()). */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; @@ -3179,18 +3191,21 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { - struct kvm_msr_entry msr_entry; + u64 supported_de_cfg; - msr_entry.index = msr->index; - if (svm_get_msr_feature(&msr_entry)) + if (svm_get_feature_msr(ecx, &supported_de_cfg)) return 1; - /* Check the supported bits */ - if (data & ~msr_entry.data) + if (data & ~supported_de_cfg) return 1; - /* Don't allow the guest to change a bit, #GP */ - if (!msr->host_initiated && (data ^ msr_entry.data)) + /* + * Don't let the guest change the host-programmed value. The + * MSR is very model specific, i.e. contains multiple bits that + * are completely unknown to KVM, and the one bit known to KVM + * is simply a reflection of hardware capabilities. + */ + if (!msr->host_initiated && data != svm->msr_decfg) return 1; svm->msr_decfg = data; @@ -4144,12 +4159,21 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + if (!svm->vmcb->control.exit_info_1) + break; return handle_fastpath_set_msr_irqoff(vcpu); + case SVM_EXIT_HLT: + return handle_fastpath_hlt(vcpu); + default: + break; + } return EXIT_FASTPATH_NONE; } @@ -4980,8 +5004,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, + .enable_virtualization_cpu = svm_enable_virtualization_cpu, + .disable_virtualization_cpu = svm_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_vcpu_create, @@ -4999,7 +5024,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, - .get_msr_feature = svm_get_msr_feature, + .get_feature_msr = svm_get_feature_msr, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, @@ -5050,6 +5075,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + + .x2apic_icr_is_split = true, .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .apicv_post_state_restore = avic_apicv_post_state_restore, @@ -5224,6 +5251,9 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); + + /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); } static __init int svm_hardware_setup(void) @@ -5410,8 +5440,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static void __svm_exit(void) { kvm_x86_vendor_exit(); - - cpu_emergency_unregister_virt_callback(svm_emergency_disable); } static int __init svm_init(void) @@ -5427,8 +5455,6 @@ static int __init svm_init(void) if (r) return r; - cpu_emergency_register_virt_callback(svm_emergency_disable); - /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0bf35ebe8a1b..7668e2fb8043 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -23,8 +23,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, - .hardware_enable = vmx_hardware_enable, - .hardware_disable = vmx_hardware_disable, + .enable_virtualization_cpu = vmx_enable_virtualization_cpu, + .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, + .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), @@ -41,7 +43,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .vcpu_put = vmx_vcpu_put, .update_exception_bitmap = vmx_update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, + .get_feature_msr = vmx_get_feature_msr, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -89,6 +91,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_nmi_window = vmx_enable_nmi_window, .enable_irq_window = vmx_enable_irq_window, .update_cr8_intercept = vmx_update_cr8_intercept, + + .x2apic_icr_is_split = false, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 39a26ecf87ea..9cfcfebd5f99 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -755,7 +755,7 @@ fault: return -EIO; } -static void vmx_emergency_disable(void) +void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -1998,15 +1998,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -int vmx_get_msr_feature(struct kvm_msr_entry *msr) +int vmx_get_feature_msr(u32 msr, u64 *data) { - switch (msr->index) { + switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } } @@ -2844,7 +2844,7 @@ fault: return -EFAULT; } -int vmx_hardware_enable(void) +int vmx_enable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2881,7 +2881,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -void vmx_hardware_disable(void) +void vmx_disable_virtualization_cpu(void) { vmclear_local_loaded_vmcss(); @@ -7265,6 +7265,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); + case EXIT_REASON_HLT: + return handle_fastpath_hlt(vcpu); default: return EXIT_FASTPATH_NONE; } @@ -8584,8 +8586,6 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(vmx_emergency_disable); - vmx_cleanup_l1d_flush(); } @@ -8632,8 +8632,6 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(vmx_emergency_disable); - vmx_check_vmcs12_offsets(); /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 42498fa63abb..3839afb921e2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -17,10 +17,6 @@ #include "run_flags.h" #include "../mmu.h" -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 - #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3221cd1d01..a55981c5216e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -13,8 +13,9 @@ extern struct kvm_x86_init_ops vt_init_ops __initdata; void vmx_hardware_unsetup(void); int vmx_check_processor_compat(void); -int vmx_hardware_enable(void); -void vmx_hardware_disable(void); +int vmx_enable_virtualization_cpu(void); +void vmx_disable_virtualization_cpu(void); +void vmx_emergency_disable_virtualization_cpu(void); int vmx_vm_init(struct kvm *kvm); void vmx_vm_destroy(struct kvm *kvm); int vmx_vcpu_precreate(struct kvm *kvm); @@ -56,7 +57,7 @@ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -int vmx_get_msr_feature(struct kvm_msr_entry *msr); +int vmx_get_feature_msr(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ef6299ef823..0c1d54d9ef45 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -305,24 +305,237 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { static struct kmem_cache *x86_emulator_cache; /* - * When called, it means the previous get/set msr reached an invalid msr. - * Return true if we want to ignore/silent this failed msr access. + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. + */ + +static const u32 msrs_to_save_base[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, +}; + +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; +static unsigned num_msrs_to_save; + +static const u32 emulated_msrs_all[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + +#ifdef CONFIG_KVM_HYPERV + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_STIMER0_CONFIG, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, +#endif + + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, + + MSR_IA32_TSC_ADJUST, + MSR_IA32_TSC_DEADLINE, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, + MSR_IA32_SMBASE, + MSR_SMI_COUNT, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, + MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, + MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, + + /* + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, +}; + +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; +static unsigned num_emulated_msrs; + +/* + * List of MSRs that control the existence of MSR-based features, i.e. MSRs + * that are effectively CPUID leafs. VMX MSRs are also included in the set of + * feature MSRs, but are handled separately to allow expedited lookups. + */ +static const u32 msr_based_features_all_except_vmx[] = { + MSR_AMD64_DE_CFG, + MSR_IA32_UCODE_REV, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, +}; + +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; +static unsigned int num_msr_based_features; + +/* + * All feature MSRs except uCode revID, which tracks the currently loaded uCode + * patch, are immutable once the vCPU model is defined. */ -static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) +static bool kvm_is_immutable_feature_msr(u32 msr) { - const char *op = write ? "wrmsr" : "rdmsr"; + int i; - if (ignore_msrs) { - if (report_ignored_msrs) - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", - op, msr, data); - /* Mask the error */ + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) return true; - } else { + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { + if (msr == msr_based_features_all_except_vmx[i]) + return msr != MSR_IA32_UCODE_REV; + } + + return false; +} + +static bool kvm_is_advertised_msr(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + for (i = 0; i < num_emulated_msrs; i++) { + if (emulated_msrs[i] == msr_index) + return true; + } + + return false; +} + +typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated); + +static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, + u64 *data, bool host_initiated, + enum kvm_msr_access rw, + msr_access_t msr_access_fn) +{ + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; + int ret; + + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); + + /* + * Zero the data on read failures to avoid leaking stack data to the + * guest and/or userspace, e.g. if the failure is ignored below. + */ + ret = msr_access_fn(vcpu, msr, data, host_initiated); + if (ret && rw == MSR_TYPE_R) + *data = 0; + + if (ret != KVM_MSR_RET_UNSUPPORTED) + return ret; + + /* + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM + * advertises to userspace, even if an MSR isn't fully supported. + * Simply check that @data is '0', which covers both the write '0' case + * and all reads (in which case @data is zeroed on failure; see above). + */ + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) + return 0; + + if (!ignore_msrs) { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", - op, msr, data); - return false; + op, msr, *data); + return ret; } + + if (report_ignored_msrs) + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); + + return 0; } static struct kmem_cache *kvm_alloc_emulator_cache(void) @@ -355,7 +568,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) /* * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_hardware_disable() + * interrupted and executed through kvm_arch_disable_virtualization_cpu() */ local_irq_save(flags); if (msrs->registered) { @@ -413,8 +626,7 @@ EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); u64 value; int i; @@ -621,12 +833,6 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } -/* Forcibly leave the nested mode in cases like a vCPU reset */ -static void kvm_leave_nested(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops.nested_ops->leave_nested(vcpu); -} - static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject) @@ -1412,178 +1618,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that - * require host support, i.e. should be probed via RDMSR. emulated_msrs holds - * MSRs that KVM emulates without strictly requiring host support. - * msr_based_features holds MSRs that enumerate features, i.e. are effectively - * CPUID leafs. Note, msr_based_features isn't mutually exclusive with - * msrs_to_save and emulated_msrs. - */ - -static const u32 msrs_to_save_base[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, - MSR_IA32_UMWAIT_CONTROL, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, -}; - -static const u32 msrs_to_save_pmu[] = { - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, - MSR_CORE_PERF_GLOBAL_CTRL, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, - - /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - - MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, - MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, - - /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ - MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, - MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, - MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, - MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_AMD64_PERF_CNTR_GLOBAL_CTL, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, -}; - -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + - ARRAY_SIZE(msrs_to_save_pmu)]; -static unsigned num_msrs_to_save; - -static const u32 emulated_msrs_all[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - -#ifdef CONFIG_KVM_HYPERV - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, - HV_X64_MSR_RESET, - HV_X64_MSR_VP_INDEX, - HV_X64_MSR_VP_RUNTIME, - HV_X64_MSR_SCONTROL, - HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_VP_ASSIST_PAGE, - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, - HV_X64_MSR_SYNDBG_OPTIONS, - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, - HV_X64_MSR_SYNDBG_PENDING_BUFFER, -#endif - - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, - - MSR_IA32_TSC_ADJUST, - MSR_IA32_TSC_DEADLINE, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, - MSR_IA32_MCG_EXT_CTL, - MSR_IA32_SMBASE, - MSR_SMI_COUNT, - MSR_PLATFORM_INFO, - MSR_MISC_FEATURES_ENABLES, - MSR_AMD64_VIRT_SPEC_CTRL, - MSR_AMD64_TSC_RATIO, - MSR_IA32_POWER_CTL, - MSR_IA32_UCODE_REV, - - /* - * KVM always supports the "true" VMX control MSRs, even if the host - * does not. The VMX MSRs as a whole are considered "emulated" as KVM - * doesn't strictly require them to exist in the host (ignoring that - * KVM would refuse to load in the first place if the core set of MSRs - * aren't supported). - */ - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - - MSR_K7_HWCR, - MSR_KVM_POLL_CONTROL, -}; - -static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; -static unsigned num_emulated_msrs; - -/* - * List of MSRs that control the existence of MSR-based features, i.e. MSRs - * that are effectively CPUID leafs. VMX MSRs are also included in the set of - * feature MSRs, but are handled separately to allow expedited lookups. - */ -static const u32 msr_based_features_all_except_vmx[] = { - MSR_AMD64_DE_CFG, - MSR_IA32_UCODE_REV, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, -}; - -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + - (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; -static unsigned int num_msr_based_features; - -/* - * All feature MSRs except uCode revID, which tracks the currently loaded uCode - * patch, are immutable once the vCPU model is defined. - */ -static bool kvm_is_immutable_feature_msr(u32 msr) -{ - int i; - - if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) - return true; - - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { - if (msr == msr_based_features_all_except_vmx[i]) - return msr != MSR_IA32_UCODE_REV; - } - - return false; -} - -/* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM * does not yet virtualize. These include: * 10 - MISC_PACKAGE_CTRLS @@ -1660,40 +1694,31 @@ static u64 kvm_get_arch_capabilities(void) return data; } -static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { - switch (msr->index) { + WARN_ON_ONCE(!host_initiated); + + switch (index) { case MSR_IA32_ARCH_CAPABILITIES: - msr->data = kvm_get_arch_capabilities(); + *data = kvm_get_arch_capabilities(); break; case MSR_IA32_PERF_CAPABILITIES: - msr->data = kvm_caps.supported_perf_cap; + *data = kvm_caps.supported_perf_cap; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(msr->index, &msr->data); + rdmsrl_safe(index, data); break; default: - return kvm_x86_call(get_msr_feature)(msr); + return kvm_x86_call(get_feature_msr)(index, data); } return 0; } -static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - struct kvm_msr_entry msr; - int r; - - /* Unconditionally clear the output for simplicity */ - msr.data = 0; - msr.index = index; - r = kvm_get_msr_feature(&msr); - - if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) - r = 0; - - *data = msr.data; - - return r; + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, + kvm_get_feature_msr); } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -1880,16 +1905,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return kvm_x86_call(set_msr)(vcpu, &msr); } +static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + return __kvm_set_msr(vcpu, index, *data, host_initiated); +} + static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated) { - int ret = __kvm_set_msr(vcpu, index, data, host_initiated); - - if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(index, data, true)) - ret = 0; - - return ret; + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, + _kvm_set_msr); } /* @@ -1928,16 +1954,8 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { - int ret = __kvm_get_msr(vcpu, index, data, host_initiated); - - if (ret == KVM_MSR_RET_INVALID) { - /* Unconditionally clear *data for simplicity */ - *data = 0; - if (kvm_msr_ignored_check(index, 0, false)) - ret = 0; - } - - return ret; + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, + __kvm_get_msr); } static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) @@ -1999,7 +2017,7 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) static u64 kvm_msr_reason(int r) { switch (r) { - case KVM_MSR_RET_INVALID: + case KVM_MSR_RET_UNSUPPORTED: return KVM_MSR_EXIT_REASON_UNKNOWN; case KVM_MSR_RET_FILTERED: return KVM_MSR_EXIT_REASON_FILTER; @@ -2162,31 +2180,34 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - fastpath_t ret = EXIT_FASTPATH_NONE; + fastpath_t ret; + bool handled; kvm_vcpu_srcu_read_lock(vcpu); switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_EXIT_HANDLED; - } + handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_tscdeadline(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_REENTER_GUEST; - } + handled = !handle_fastpath_set_tscdeadline(vcpu, data); break; default: + handled = false; break; } - if (ret != EXIT_FASTPATH_NONE) + if (handled) { + if (!kvm_skip_emulated_instruction(vcpu)) + ret = EXIT_FASTPATH_EXIT_USERSPACE; + else + ret = EXIT_FASTPATH_REENTER_GUEST; trace_kvm_msr_write(msr, data); + } else { + ret = EXIT_FASTPATH_NONE; + } kvm_vcpu_srcu_read_unlock(vcpu); @@ -3746,18 +3767,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } -static bool kvm_is_msr_to_save(u32 msr_index) -{ - unsigned int i; - - for (i = 0; i < num_msrs_to_save; i++) { - if (msrs_to_save[i] == msr_index) - return true; - } - - return false; -} - int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; @@ -4139,15 +4148,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - /* - * Userspace is allowed to write '0' to MSRs that KVM reports - * as to-be-saved, even if an MSRs isn't fully supported. - */ - if (msr_info->host_initiated && !data && - kvm_is_msr_to_save(msr)) - break; - - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } @@ -4498,17 +4499,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); - /* - * Userspace is allowed to read MSRs that KVM reports as - * to-be-saved, even if an MSR isn't fully supported. - */ - if (msr_info->host_initiated && - kvm_is_msr_to_save(msr_info->index)) { - msr_info->data = 0; - break; - } - - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } @@ -4656,7 +4647,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: - case KVM_CAP_READONLY_MEM: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: @@ -4815,6 +4805,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_TYPES: r = kvm_caps.supported_vm_types; break; + case KVM_CAP_READONLY_MEM: + r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1; + break; default: break; } @@ -4944,7 +4937,7 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(NULL, argp, do_get_msr_feature, 1); + r = msr_io(NULL, argp, do_get_feature_msr, 1); break; #ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: @@ -6040,7 +6033,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; + kvm_vcpu_srcu_read_lock(vcpu); r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + kvm_vcpu_srcu_read_unlock(vcpu); break; } case KVM_GET_DEBUGREGS: { @@ -7379,11 +7374,9 @@ out: static void kvm_probe_feature_msr(u32 msr_index) { - struct kvm_msr_entry msr = { - .index = msr_index, - }; + u64 data; - if (kvm_get_msr_feature(&msr)) + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) return; msr_based_features[num_msr_based_features++] = msr_index; @@ -8861,60 +8854,13 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - int emulation_type) +static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa, + int emulation_type) { - gpa_t gpa = cr2_or_gpa; - kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - - if (!vcpu->arch.mmu->root_role.direct) { - /* - * Write permission should be allowed since only - * write access need to be emulated. - */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - /* - * If the mapping is invalid in guest, let cpu retry - * it to generate fault. - */ - if (gpa == INVALID_GPA) - return true; - } - - /* - * Do not retry the unhandleable instruction if it faults on the - * readonly host memory, otherwise it will goto a infinite loop: - * retry instruction -> write #PF -> emulation fail -> retry - * instruction -> ... - */ - pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - - /* - * If the instruction failed on the error pfn, it can not be fixed, - * report the error to userspace. - */ - if (is_error_noslot_pfn(pfn)) - return false; - - kvm_release_pfn_clean(pfn); - - /* - * If emulation may have been triggered by a write to a shadowed page - * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the - * guest to let the CPU re-execute the instruction in the hope that the - * CPU can cleanly execute the instruction that KVM failed to emulate. - */ - if (vcpu->kvm->arch.indirect_shadow_pages) - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - /* * If the failed instruction faulted on an access to page tables that * are used to translate any part of the instruction, KVM can't resolve @@ -8925,54 +8871,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * then zap the SPTE to unprotect the gfn, and then do it all over * again. Report the error to userspace. */ - return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); -} - -static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - gpa_t cr2_or_gpa, int emulation_type) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; - - last_retry_eip = vcpu->arch.last_retry_eip; - last_retry_addr = vcpu->arch.last_retry_addr; + if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) + return false; /* - * If the emulation is caused by #PF and it is non-page_table - * writing instruction, it means the VM-EXIT is caused by shadow - * page protected, we can zap the shadow page and retry this - * instruction directly. - * - * Note: if the guest uses a non-page-table modifying instruction - * on the PDE that points to the instruction, then we will unmap - * the instruction and go to an infinite loop. So, we cache the - * last retried eip and the last fault address, if we meet the eip - * and the address again, we can break out of the potential infinite - * loop. + * If emulation may have been triggered by a write to a shadowed page + * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the + * guest to let the CPU re-execute the instruction in the hope that the + * CPU can cleanly execute the instruction that KVM failed to emulate. */ - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) - return false; - - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - - if (x86_page_table_writing_insn(ctxt)) - return false; - - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) - return false; - - vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2_or_gpa; - - if (!vcpu->arch.mmu->root_role.direct) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); + /* + * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible + * all SPTEs were already zapped by a different task. The alternative + * is to report the error to userspace and likely terminate the guest, + * and the last_retry_{eip,addr} checks will prevent retrying the page + * fault indefinitely, i.e. there's nothing to lose by retrying. + */ return true; } @@ -9172,6 +9088,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))) + emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF; + r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); if (r != X86EMUL_CONTINUE) { if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) @@ -9202,8 +9123,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2_or_gpa, - emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; if (ctxt->have_exception && @@ -9250,7 +9171,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; } - if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) + /* + * If emulation was caused by a write-protection #PF on a non-page_table + * writing instruction, try to unprotect the gfn, i.e. zap shadow pages, + * and retry the instruction, as the vCPU is likely no longer using the + * gfn as a page table. + */ + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + !x86_page_table_writing_insn(ctxt) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return 1; /* this is needed for vmware backdoor interface to work since it @@ -9281,7 +9210,8 @@ restart: return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; return handle_emulation_failure(vcpu, emulation_type); @@ -9749,7 +9679,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) guard(mutex)(&vendor_module_lock); - if (kvm_x86_ops.hardware_enable) { + if (kvm_x86_ops.enable_virtualization_cpu) { pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; } @@ -9876,7 +9806,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return 0; out_unwind_ops: - kvm_x86_ops.hardware_enable = NULL; + kvm_x86_ops.enable_virtualization_cpu = NULL; kvm_x86_call(hardware_unsetup)(); out_mmu_exit: kvm_mmu_vendor_module_exit(); @@ -9917,56 +9847,11 @@ void kvm_x86_vendor_exit(void) WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif mutex_lock(&vendor_module_lock); - kvm_x86_ops.hardware_enable = NULL; + kvm_x86_ops.enable_virtualization_cpu = NULL; mutex_unlock(&vendor_module_lock); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); -static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) -{ - /* - * The vCPU has halted, e.g. executed HLT. Update the run state if the - * local APIC is in-kernel, the run loop will detect the non-runnable - * state and halt the vCPU. Exit to userspace if the local APIC is - * managed by userspace, in which case userspace is responsible for - * handling wake events. - */ - ++vcpu->stat.halt_exits; - if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = state; - return 1; - } else { - vcpu->run->exit_reason = reason; - return 0; - } -} - -int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) -{ - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); - -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - /* - * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. - */ - return kvm_emulate_halt_noskip(vcpu) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, - KVM_EXIT_AP_RESET_HOLD) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); - #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) @@ -11203,6 +11088,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) + return 0; + r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); return r; @@ -11216,6 +11104,67 @@ out: return r; } +static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); +} + +static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +{ + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + + if (kvm_apic_has_pending_init_or_sipi(vcpu) && + kvm_apic_init_sipi_allowed(vcpu)) + return true; + + if (vcpu->arch.pv.pv_unhalted) + return true; + + if (kvm_is_exception_pending(vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + (vcpu->arch.nmi_pending && + kvm_x86_call(nmi_allowed)(vcpu, false))) + return true; + +#ifdef CONFIG_KVM_SMM + if (kvm_test_request(KVM_REQ_SMI, vcpu) || + (vcpu->arch.smi_pending && + kvm_x86_call(smi_allowed)(vcpu, false))) + return true; +#endif + + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) + return true; + + if (kvm_hv_has_stimer_pending(vcpu)) + return true; + + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu, false)) + return true; + + if (kvm_xen_has_pending_events(vcpu)) + return true; + + return false; +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); +} + /* Called within kvm->srcu read side. */ static inline int vcpu_block(struct kvm_vcpu *vcpu) { @@ -11287,12 +11236,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) return 1; } -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted); -} - /* Called within kvm->srcu read side. */ static int vcpu_run(struct kvm_vcpu *vcpu) { @@ -11344,6 +11287,98 @@ static int vcpu_run(struct kvm_vcpu *vcpu) return r; } +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) +{ + /* + * The vCPU has halted, e.g. executed HLT. Update the run state if the + * local APIC is in-kernel, the run loop will detect the non-runnable + * state and halt the vCPU. Exit to userspace if the local APIC is + * managed by userspace, in which case userspace is responsible for + * handling wake events. + */ + ++vcpu->stat.halt_exits; + if (lapic_in_kernel(vcpu)) { + if (kvm_vcpu_has_events(vcpu)) + vcpu->arch.pv.pv_unhalted = false; + else + vcpu->arch.mp_state = state; + return 1; + } else { + vcpu->run->exit_reason = reason; + return 0; + } +} + +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_emulate_halt_noskip(vcpu) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) +{ + int ret; + + kvm_vcpu_srcu_read_lock(vcpu); + ret = kvm_emulate_halt(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + + if (!ret) + return EXIT_FASTPATH_EXIT_USERSPACE; + + if (kvm_vcpu_running(vcpu)) + return EXIT_FASTPATH_REENTER_GUEST; + + return EXIT_FASTPATH_EXIT_HANDLED; +} +EXPORT_SYMBOL_GPL(handle_fastpath_hlt); + +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, + KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); + +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_apicv_active(vcpu) && + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); +} + +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.preempted_in_kernel; +} + +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || +#ifdef CONFIG_KVM_SMM + kvm_test_request(KVM_REQ_SMI, vcpu) || +#endif + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + return kvm_arch_dy_has_pending_interrupt(vcpu); +} + static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); @@ -12512,7 +12547,17 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) } EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); -int kvm_arch_hardware_enable(void) +void kvm_arch_enable_virtualization(void) +{ + cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + +void kvm_arch_disable_virtualization(void) +{ + cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + +int kvm_arch_enable_virtualization_cpu(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; @@ -12528,7 +12573,7 @@ int kvm_arch_hardware_enable(void) if (ret) return ret; - ret = kvm_x86_call(hardware_enable)(); + ret = kvm_x86_call(enable_virtualization_cpu)(); if (ret != 0) return ret; @@ -12608,9 +12653,9 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { - kvm_x86_call(hardware_disable)(); + kvm_x86_call(disable_virtualization_cpu)(); drop_user_return_notifiers(); } @@ -13158,87 +13203,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_arch_free_memslot(kvm, old); } -static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) -{ - if (!list_empty_careful(&vcpu->async_pf.done)) - return true; - - if (kvm_apic_has_pending_init_or_sipi(vcpu) && - kvm_apic_init_sipi_allowed(vcpu)) - return true; - - if (vcpu->arch.pv.pv_unhalted) - return true; - - if (kvm_is_exception_pending(vcpu)) - return true; - - if (kvm_test_request(KVM_REQ_NMI, vcpu) || - (vcpu->arch.nmi_pending && - kvm_x86_call(nmi_allowed)(vcpu, false))) - return true; - -#ifdef CONFIG_KVM_SMM - if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && - kvm_x86_call(smi_allowed)(vcpu, false))) - return true; -#endif - - if (kvm_test_request(KVM_REQ_PMI, vcpu)) - return true; - - if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) - return true; - - if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) - return true; - - if (kvm_hv_has_stimer_pending(vcpu)) - return true; - - if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->has_events && - kvm_x86_ops.nested_ops->has_events(vcpu, false)) - return true; - - if (kvm_xen_has_pending_events(vcpu)) - return true; - - return false; -} - -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); -} - -bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_apicv_active(vcpu) && - kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); -} - -bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.preempted_in_kernel; -} - -bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) -{ - if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) - return true; - - if (kvm_test_request(KVM_REQ_NMI, vcpu) || -#ifdef CONFIG_KVM_SMM - kvm_test_request(KVM_REQ_SMI, vcpu) || -#endif - kvm_test_request(KVM_REQ_EVENT, vcpu)) - return true; - - return kvm_arch_dy_has_pending_interrupt(vcpu); -} - bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5585c2ef147a..a84c48ef5278 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -109,6 +109,12 @@ static inline unsigned int __shrink_ple_window(unsigned int val, void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +/* Forcibly leave the nested mode in cases like a vCPU reset */ +static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops.nested_ops->leave_nested(vcpu); +} + static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) { return vcpu->arch.last_vmentry_cpu != -1; @@ -335,6 +341,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; @@ -505,13 +512,26 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); +enum kvm_msr_access { + MSR_TYPE_R = BIT(0), + MSR_TYPE_W = BIT(1), + MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, +}; + /* * Internal error codes that are used to indicate that MSR emulation encountered - * an error that should result in #GP in the guest, unless userspace - * handles it. + * an error that should result in #GP in the guest, unless userspace handles it. + * Note, '1', '0', and negative numbers are off limits, as they are used by KVM + * as part of KVM's lightly documented internal KVM_RUN return codes. + * + * UNSUPPORTED - The MSR isn't supported, either because it is completely + * unknown to KVM, or because the MSR should not exist according + * to the vCPU model. + * + * FILTERED - Access to the MSR is denied by a userspace MSR filter. */ -#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ -#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ +#define KVM_MSR_RET_UNSUPPORTED 2 +#define KVM_MSR_RET_FILTERED 3 #define __cr4_reserved_bits(__cpu_has, __c) \ ({ \ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d8dbeac8b206..ff253648706f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size) int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params) { + unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; + if (WARN_ON_ONCE(end > PHYSMEM_END)) + return -ERANGE; + ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 37db264866b6..230f1dee4f09 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; */ static __initdata struct kaslr_memory_region { unsigned long *base; + unsigned long *end; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 0 }, - { &vmalloc_base, 0 }, - { &vmemmap_base, 0 }, + { + .base = &page_offset_base, + .end = &physmem_end, + }, + { + .base = &vmalloc_base, + }, + { + .base = &vmemmap_base, + }, }; +/* The end of the possible address space for physical memory */ +unsigned long physmem_end __ro_after_init; + /* Get size in bytes used by the memory region */ static inline unsigned long get_padding(struct kaslr_memory_region *region) { @@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void) BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + /* Preset the end of the possible address space for physical memory */ + physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); if (!kaslr_memory_enabled()) return; @@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void) vaddr += entropy; *kaslr_regions[i].base = vaddr; + /* Calculate the end of the region */ + vaddr += get_padding(&kaslr_regions[i]); /* - * Jump the region and add a minimum padding based on - * randomization alignment. + * KASLR trims the maximum possible size of the + * direct-map. Update the physmem_end boundary. + * No rounding required as the region starts + * PUD aligned and size is in units of TB. */ - vaddr += get_padding(&kaslr_regions[i]); + if (kaslr_regions[i].end) + *kaslr_regions[i].end = __pa_nodebug(vaddr - 1); + + /* Add a minimum padding based on randomization alignment. */ vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } |