Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/Kconfig | 2
-rw-r--r--  arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi | 12
-rw-r--r--  arch/arm/boot/dts/ti/omap/omap3-n900.dts | 2
-rw-r--r--  arch/arm/kernel/entry-armv.S | 12
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 8
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts | 12
-rw-r--r--  arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi | 3
-rw-r--r--  arch/arm64/boot/dts/freescale/imx93.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx95.dtsi | 14
-rw-r--r--  arch/arm64/boot/dts/qcom/ipq5332.dtsi | 4
-rw-r--r--  arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts | 42
-rw-r--r--  arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 70
-rw-r--r--  arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts | 54
-rw-r--r--  arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 53
-rw-r--r--  arch/arm64/boot/dts/qcom/x1e80100.dtsi | 21
-rw-r--r--  arch/arm64/configs/defconfig | 1
-rw-r--r--  arch/arm64/include/asm/esr.h | 5
-rw-r--r--  arch/arm64/include/asm/kvm_arm.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 6
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 22
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 6
-rw-r--r--  arch/arm64/include/asm/kvm_nested.h | 40
-rw-r--r--  arch/arm64/include/asm/kvm_pgtable.h | 42
-rw-r--r--  arch/arm64/include/asm/pgtable-hwdef.h | 9
-rw-r--r--  arch/arm64/include/asm/ptdump.h | 43
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 22
-rw-r--r--  arch/arm64/kernel/stacktrace.c | 4
-rw-r--r--  arch/arm64/kvm/Kconfig | 17
-rw-r--r--  arch/arm64/kvm/Makefile | 3
-rw-r--r--  arch/arm64/kvm/arm.c | 21
-rw-r--r--  arch/arm64/kvm/at.c | 1101
-rw-r--r--  arch/arm64/kvm/emulate-nested.c | 81
-rw-r--r--  arch/arm64/kvm/fpsimd.c | 5
-rw-r--r--  arch/arm64/kvm/guest.c | 6
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/fault.h | 2
-rw-r--r--  arch/arm64/kvm/hyp/include/hyp/switch.h | 3
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/ffa.c | 21
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-init.S | 2
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 9
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/switch.c | 9
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/tlb.c | 6
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c | 48
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v3-sr.c | 97
-rw-r--r--  arch/arm64/kvm/hyp/vhe/switch.c | 3
-rw-r--r--  arch/arm64/kvm/mmu.c | 9
-rw-r--r--  arch/arm64/kvm/nested.c | 55
-rw-r--r--  arch/arm64/kvm/ptdump.c | 268
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 384
-rw-r--r--  arch/arm64/kvm/sys_regs.h | 23
-rw-r--r--  arch/arm64/kvm/vgic/vgic-debug.c | 2
-rw-r--r--  arch/arm64/kvm/vgic/vgic-init.c | 9
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3.c | 12
-rw-r--r--  arch/arm64/kvm/vgic/vgic.c | 19
-rw-r--r--  arch/arm64/kvm/vgic/vgic.h | 7
-rw-r--r--  arch/arm64/mm/ptdump.c | 70
-rw-r--r--  arch/loongarch/include/asm/Kbuild | 1
-rw-r--r--  arch/loongarch/include/asm/dma-direct.h | 11
-rw-r--r--  arch/loongarch/include/asm/hw_irq.h | 2
-rw-r--r--  arch/loongarch/include/asm/kvm_csr.h | 6
-rw-r--r--  arch/loongarch/include/asm/kvm_host.h | 37
-rw-r--r--  arch/loongarch/include/asm/kvm_para.h | 12
-rw-r--r--  arch/loongarch/include/asm/kvm_vcpu.h | 12
-rw-r--r--  arch/loongarch/include/asm/loongarch.h | 11
-rw-r--r--  arch/loongarch/include/asm/paravirt.h | 7
-rw-r--r--  arch/loongarch/include/asm/qspinlock.h | 41
-rw-r--r--  arch/loongarch/include/uapi/asm/Kbuild | 2
-rw-r--r--  arch/loongarch/include/uapi/asm/kvm.h | 20
-rw-r--r--  arch/loongarch/include/uapi/asm/kvm_para.h | 21
-rw-r--r--  arch/loongarch/kernel/fpu.S | 4
-rw-r--r--  arch/loongarch/kernel/irq.c | 3
-rw-r--r--  arch/loongarch/kernel/paravirt.c | 47
-rw-r--r--  arch/loongarch/kernel/setup.c | 2
-rw-r--r--  arch/loongarch/kernel/smp.c | 4
-rw-r--r--  arch/loongarch/kvm/exit.c | 46
-rw-r--r--  arch/loongarch/kvm/main.c | 4
-rw-r--r--  arch/loongarch/kvm/switch.S | 4
-rw-r--r--  arch/loongarch/kvm/timer.c | 7
-rw-r--r--  arch/loongarch/kvm/vcpu.c | 342
-rw-r--r--  arch/loongarch/kvm/vm.c | 69
-rw-r--r--  arch/microblaze/mm/init.c | 5
-rw-r--r--  arch/mips/include/asm/kvm_host.h | 4
-rw-r--r--  arch/mips/kernel/cevt-r4k.c | 15
-rw-r--r--  arch/mips/kernel/cpu-probe.c | 4
-rw-r--r--  arch/mips/kernel/csrc-r4k.c | 4
-rw-r--r--  arch/mips/kvm/mips.c | 8
-rw-r--r--  arch/mips/kvm/vz.c | 8
-rw-r--r--  arch/parisc/mm/init.c | 16
-rw-r--r--  arch/powerpc/include/asm/nohash/32/pgtable.h | 4
-rw-r--r--  arch/powerpc/include/asm/pgtable-types.h | 12
-rw-r--r--  arch/powerpc/kernel/vdso/vdso32.lds.S | 4
-rw-r--r--  arch/powerpc/kernel/vdso/vdso64.lds.S | 4
-rw-r--r--  arch/powerpc/lib/qspinlock.c | 10
-rw-r--r--  arch/powerpc/mm/nohash/tlb_64e.c | 2
-rw-r--r--  arch/riscv/Kconfig | 4
-rw-r--r--  arch/riscv/include/asm/kvm_vcpu_pmu.h | 21
-rw-r--r--  arch/riscv/include/asm/processor.h | 26
-rw-r--r--  arch/riscv/include/asm/sbi.h | 20
-rw-r--r--  arch/riscv/kernel/Makefile | 6
-rw-r--r--  arch/riscv/kernel/sbi.c | 63
-rw-r--r--  arch/riscv/kernel/sbi_ecall.c | 48
-rw-r--r--  arch/riscv/kernel/traps_misaligned.c | 4
-rw-r--r--  arch/riscv/kvm/main.c | 4
-rw-r--r--  arch/riscv/kvm/vcpu_pmu.c | 14
-rw-r--r--  arch/riscv/kvm/vcpu_sbi.c | 4
-rw-r--r--  arch/riscv/mm/init.c | 2
-rw-r--r--  arch/s390/Kconfig | 13
-rw-r--r--  arch/s390/boot/startup.c | 58
-rw-r--r--  arch/s390/boot/vmem.c | 14
-rw-r--r--  arch/s390/boot/vmlinux.lds.S | 7
-rw-r--r--  arch/s390/configs/debug_defconfig | 1
-rw-r--r--  arch/s390/include/asm/page.h | 3
-rw-r--r--  arch/s390/kernel/setup.c | 19
-rw-r--r--  arch/s390/kernel/vmlinux.lds.S | 2
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 27
-rw-r--r--  arch/s390/tools/relocs.c | 2
-rw-r--r--  arch/x86/coco/tdx/tdx.c | 1
-rw-r--r--  arch/x86/events/intel/core.c | 23
-rw-r--r--  arch/x86/include/asm/cpuid.h | 1
-rw-r--r--  arch/x86/include/asm/fpu/types.h | 7
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 6
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 29
-rw-r--r--  arch/x86/include/asm/page_64.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 4
-rw-r--r--  arch/x86/include/asm/reboot.h | 2
-rw-r--r--  arch/x86/include/asm/resctrl.h | 6
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/x86/kernel/apic/apic.c | 11
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 8
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 3
-rw-r--r--  arch/x86/kernel/fpu/xstate.h | 4
-rw-r--r--  arch/x86/kvm/Kconfig | 7
-rw-r--r--  arch/x86/kvm/cpuid.c | 30
-rw-r--r--  arch/x86/kvm/lapic.c | 75
-rw-r--r--  arch/x86/kvm/lapic.h | 1
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 560
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 5
-rw-r--r--  arch/x86/kvm/mmu/mmutrace.h | 1
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 63
-rw-r--r--  arch/x86/kvm/mmu/spte.c | 6
-rw-r--r--  arch/x86/kvm/mmu/spte.h | 2
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 14
-rw-r--r--  arch/x86/kvm/reverse_cpuid.h | 8
-rw-r--r--  arch/x86/kvm/smm.c | 24
-rw-r--r--  arch/x86/kvm/svm/svm.c | 78
-rw-r--r--  arch/x86/kvm/vmx/main.c | 10
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 20
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 4
-rw-r--r--  arch/x86/kvm/vmx/x86_ops.h | 7
-rw-r--r--  arch/x86/kvm/x86.c | 998
-rw-r--r--  arch/x86/kvm/x86.h | 28
-rw-r--r--  arch/x86/mm/init_64.c | 4
-rw-r--r--  arch/x86/mm/kaslr.c | 32
161 files changed, 4479 insertions, 1604 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 54b2bb817a7f..173159e93c99 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -117,7 +117,7 @@ config ARM
select HAVE_KERNEL_XZ
select HAVE_KPROBES if !XIP_KERNEL && !CPU_ENDIAN_BE32 && !CPU_V7M
select HAVE_KRETPROBES if HAVE_KPROBES
- select HAVE_LD_DEAD_CODE_DATA_ELIMINATION
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if (LD_VERSION >= 23600 || LD_IS_LLD)
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_OPTPROBES if !THUMB2_KERNEL
diff --git a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi
index 52a0f6ee426f..bcf4d9c870ec 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi
+++ b/arch/arm/boot/dts/nxp/imx/imx6dl-yapp43-common.dtsi
@@ -274,24 +274,24 @@
led@0 {
chan-name = "R";
- led-cur = /bits/ 8 <0x20>;
- max-cur = /bits/ 8 <0x60>;
+ led-cur = /bits/ 8 <0x6e>;
+ max-cur = /bits/ 8 <0xc8>;
reg = <0>;
color = <LED_COLOR_ID_RED>;
};
led@1 {
chan-name = "G";
- led-cur = /bits/ 8 <0x20>;
- max-cur = /bits/ 8 <0x60>;
+ led-cur = /bits/ 8 <0xbe>;
+ max-cur = /bits/ 8 <0xc8>;
reg = <1>;
color = <LED_COLOR_ID_GREEN>;
};
led@2 {
chan-name = "B";
- led-cur = /bits/ 8 <0x20>;
- max-cur = /bits/ 8 <0x60>;
+ led-cur = /bits/ 8 <0xbe>;
+ max-cur = /bits/ 8 <0xc8>;
reg = <2>;
color = <LED_COLOR_ID_BLUE>;
};
diff --git a/arch/arm/boot/dts/ti/omap/omap3-n900.dts b/arch/arm/boot/dts/ti/omap/omap3-n900.dts
index 07c5b963af78..4bde3342bb95 100644
--- a/arch/arm/boot/dts/ti/omap/omap3-n900.dts
+++ b/arch/arm/boot/dts/ti/omap/omap3-n900.dts
@@ -781,7 +781,7 @@
mount-matrix = "-1", "0", "0",
"0", "1", "0",
- "0", "0", "1";
+ "0", "0", "-1";
};
cam1: camera@3e {
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index f01d23a220e6..1dfae1af8e31 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -29,6 +29,12 @@
#include "entry-header.S"
#include <asm/probes.h>
+#ifdef CONFIG_HAVE_LD_DEAD_CODE_DATA_ELIMINATION
+#define RELOC_TEXT_NONE .reloc .text, R_ARM_NONE, .
+#else
+#define RELOC_TEXT_NONE
+#endif
+
/*
* Interrupt handling.
*/
@@ -1065,7 +1071,7 @@ vector_addrexcptn:
.globl vector_fiq
.section .vectors, "ax", %progbits
- .reloc .text, R_ARM_NONE, .
+ RELOC_TEXT_NONE
W(b) vector_rst
W(b) vector_und
ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi )
@@ -1079,7 +1085,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi )
#ifdef CONFIG_HARDEN_BRANCH_HISTORY
.section .vectors.bhb.loop8, "ax", %progbits
- .reloc .text, R_ARM_NONE, .
+ RELOC_TEXT_NONE
W(b) vector_rst
W(b) vector_bhb_loop8_und
ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi )
@@ -1092,7 +1098,7 @@ THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi )
W(b) vector_bhb_loop8_fiq
.section .vectors.bhb.bpiall, "ax", %progbits
- .reloc .text, R_ARM_NONE, .
+ RELOC_TEXT_NONE
W(b) vector_rst
W(b) vector_bhb_bpiall_und
ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi )
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
index 6b6e3ee950e5..acf293310f7a 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
@@ -175,7 +175,7 @@
};
};
- core-cluster-thermal {
+ cluster-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 1>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
index 17f4e3171120..ab4c919e3e16 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
@@ -214,7 +214,7 @@
};
};
- core-cluster-thermal {
+ cluster-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 3>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index 200e52622f99..55019866d6a2 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -182,7 +182,7 @@
};
};
- core-cluster-thermal {
+ cluster-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 3>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
index 8ce4b6aae79d..e3a7db21fe29 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
@@ -131,7 +131,7 @@
};
thermal-zones {
- core-cluster-thermal {
+ cluster-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 0>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
index bde89de2576e..1b306d6802ce 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
@@ -122,7 +122,7 @@
};
};
- core-cluster1-thermal {
+ cluster1-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 4>;
@@ -151,7 +151,7 @@
};
};
- core-cluster2-thermal {
+ cluster2-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 5>;
@@ -180,7 +180,7 @@
};
};
- core-cluster3-thermal {
+ cluster3-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 6>;
@@ -209,7 +209,7 @@
};
};
- core-cluster4-thermal {
+ cluster4-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 7>;
diff --git a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
index 26c7ca31e22e..bd75a658767d 100644
--- a/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-lx2160a.dtsi
@@ -492,7 +492,7 @@
};
};
- ddr-cluster5-thermal {
+ ddr-ctrl5-thermal {
polling-delay-passive = <1000>;
polling-delay = <5000>;
thermal-sensors = <&tmu 1>;
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso
index bf3e04651ba0..353ace3601dc 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso
+++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs232.dtso
@@ -21,7 +21,7 @@
&gpio3 {
pinctrl-names = "default";
- pinctrcl-0 = <&pinctrl_gpio3_hog>;
+ pinctrl-0 = <&pinctrl_gpio3_hog>;
uart4_rs485_en {
gpio-hog;
diff --git a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso
index f4448cde0407..8a75d6783ad2 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso
+++ b/arch/arm64/boot/dts/freescale/imx8mm-phygate-tauri-l-rs232-rs485.dtso
@@ -22,7 +22,7 @@
&gpio3 {
pinctrl-names = "default";
- pinctrcl-0 = <&pinctrl_gpio3_hog>;
+ pinctrl-0 = <&pinctrl_gpio3_hog>;
uart4_rs485_en {
gpio-hog;
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts
index 17e2c19d8455..cc9b81d46188 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-beacon-kit.dts
@@ -211,13 +211,12 @@
simple-audio-card,cpu {
sound-dai = <&sai3>;
+ frame-master;
+ bitclock-master;
};
simple-audio-card,codec {
sound-dai = <&wm8962>;
- clocks = <&clk IMX8MP_CLK_IPP_DO_CLKO1>;
- frame-master;
- bitclock-master;
};
};
};
@@ -507,10 +506,9 @@
&sai3 {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_sai3>;
- assigned-clocks = <&clk IMX8MP_CLK_SAI3>,
- <&clk IMX8MP_AUDIO_PLL2> ;
- assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL2_OUT>;
- assigned-clock-rates = <12288000>, <361267200>;
+ assigned-clocks = <&clk IMX8MP_CLK_SAI3>;
+ assigned-clock-parents = <&clk IMX8MP_AUDIO_PLL1_OUT>;
+ assigned-clock-rates = <12288000>;
fsl,sai-mclk-direction-output;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts
index da8f19a646a9..e2ee9f5a042c 100644
--- a/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts
+++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352-mba93xxla.dts
@@ -499,7 +499,7 @@
pinctrl-0 = <&pinctrl_usdhc2_hs>, <&pinctrl_usdhc2_gpio>;
pinctrl-1 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>;
pinctrl-2 = <&pinctrl_usdhc2_uhs>, <&pinctrl_usdhc2_gpio>;
- cd-gpios = <&gpio3 00 GPIO_ACTIVE_LOW>;
+ cd-gpios = <&gpio3 0 GPIO_ACTIVE_LOW>;
vmmc-supply = <&reg_usdhc2_vmmc>;
bus-width = <4>;
no-sdio;
diff --git a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi
index edbd8cad35bc..72a9a5d4e27a 100644
--- a/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx93-tqma9352.dtsi
@@ -19,7 +19,7 @@
linux,cma {
compatible = "shared-dma-pool";
reusable;
- alloc-ranges = <0 0x60000000 0 0x40000000>;
+ alloc-ranges = <0 0x80000000 0 0x40000000>;
size = <0 0x10000000>;
linux,cma-default;
};
@@ -156,6 +156,7 @@
&wdog3 {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_wdog>;
+ fsl,ext-reset-output;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/imx93.dtsi b/arch/arm64/boot/dts/freescale/imx93.dtsi
index 4a3f42355cb8..a0993022c102 100644
--- a/arch/arm64/boot/dts/freescale/imx93.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx93.dtsi
@@ -1105,7 +1105,7 @@
<&clk IMX93_CLK_SYS_PLL_PFD0_DIV2>;
assigned-clock-rates = <100000000>, <250000000>;
intf_mode = <&wakeupmix_gpr 0x28>;
- snps,clk-csr = <0>;
+ snps,clk-csr = <6>;
nvmem-cells = <&eth_mac2>;
nvmem-cell-names = "mac-address";
status = "disabled";
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 1bbf9a0468f6..425272aa5a81 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -27,7 +27,7 @@
reg = <0x0>;
enable-method = "psci";
#cooling-cells = <2>;
- power-domains = <&scmi_devpd IMX95_PERF_A55>;
+ power-domains = <&scmi_perf IMX95_PERF_A55>;
power-domain-names = "perf";
i-cache-size = <32768>;
i-cache-line-size = <64>;
@@ -44,7 +44,7 @@
reg = <0x100>;
enable-method = "psci";
#cooling-cells = <2>;
- power-domains = <&scmi_devpd IMX95_PERF_A55>;
+ power-domains = <&scmi_perf IMX95_PERF_A55>;
power-domain-names = "perf";
i-cache-size = <32768>;
i-cache-line-size = <64>;
@@ -61,7 +61,7 @@
reg = <0x200>;
enable-method = "psci";
#cooling-cells = <2>;
- power-domains = <&scmi_devpd IMX95_PERF_A55>;
+ power-domains = <&scmi_perf IMX95_PERF_A55>;
power-domain-names = "perf";
i-cache-size = <32768>;
i-cache-line-size = <64>;
@@ -78,7 +78,7 @@
reg = <0x300>;
enable-method = "psci";
#cooling-cells = <2>;
- power-domains = <&scmi_devpd IMX95_PERF_A55>;
+ power-domains = <&scmi_perf IMX95_PERF_A55>;
power-domain-names = "perf";
i-cache-size = <32768>;
i-cache-line-size = <64>;
@@ -93,7 +93,7 @@
device_type = "cpu";
compatible = "arm,cortex-a55";
reg = <0x400>;
- power-domains = <&scmi_devpd IMX95_PERF_A55>;
+ power-domains = <&scmi_perf IMX95_PERF_A55>;
power-domain-names = "perf";
enable-method = "psci";
#cooling-cells = <2>;
@@ -110,7 +110,7 @@
device_type = "cpu";
compatible = "arm,cortex-a55";
reg = <0x500>;
- power-domains = <&scmi_devpd IMX95_PERF_A55>;
+ power-domains = <&scmi_perf IMX95_PERF_A55>;
power-domain-names = "perf";
enable-method = "psci";
#cooling-cells = <2>;
@@ -187,7 +187,7 @@
compatible = "cache";
cache-size = <524288>;
cache-line-size = <64>;
- cache-sets = <1024>;
+ cache-sets = <512>;
cache-level = <3>;
cache-unified;
};
diff --git a/arch/arm64/boot/dts/qcom/ipq5332.dtsi b/arch/arm64/boot/dts/qcom/ipq5332.dtsi
index 573656587c0d..0a74ed4f72cc 100644
--- a/arch/arm64/boot/dts/qcom/ipq5332.dtsi
+++ b/arch/arm64/boot/dts/qcom/ipq5332.dtsi
@@ -320,8 +320,8 @@
reg = <0x08af8800 0x400>;
interrupts = <GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 53 IRQ_TYPE_EDGE_BOTH>,
- <GIC_SPI 52 IRQ_TYPE_EDGE_BOTH>;
+ <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "pwr_event",
"dp_hs_phy_irq",
"dm_hs_phy_irq";
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts
index 7fb980fcb307..9caa14dda585 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts
+++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts
@@ -278,6 +278,13 @@
vdd-l3-supply = <&vreg_s1f_0p7>;
vdd-s1-supply = <&vph_pwr>;
vdd-s2-supply = <&vph_pwr>;
+
+ vreg_l3i_0p8: ldo3 {
+ regulator-name = "vreg_l3i_0p8";
+ regulator-min-microvolt = <880000>;
+ regulator-max-microvolt = <920000>;
+ regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
+ };
};
regulators-7 {
@@ -423,11 +430,17 @@
};
&pcie4 {
+ perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>;
+ wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>;
+
+ pinctrl-0 = <&pcie4_default>;
+ pinctrl-names = "default";
+
status = "okay";
};
&pcie4_phy {
- vdda-phy-supply = <&vreg_l3j_0p8>;
+ vdda-phy-supply = <&vreg_l3i_0p8>;
vdda-pll-supply = <&vreg_l3e_1p2>;
status = "okay";
@@ -517,7 +530,30 @@
bias-disable;
};
- pcie6a_default: pcie2a-default-state {
+ pcie4_default: pcie4-default-state {
+ clkreq-n-pins {
+ pins = "gpio147";
+ function = "pcie4_clk";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+
+ perst-n-pins {
+ pins = "gpio146";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-disable;
+ };
+
+ wake-n-pins {
+ pins = "gpio148";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+ };
+
+ pcie6a_default: pcie6a-default-state {
clkreq-n-pins {
pins = "gpio153";
function = "pcie6a_clk";
@@ -529,7 +565,7 @@
pins = "gpio152";
function = "gpio";
drive-strength = <2>;
- bias-pull-down;
+ bias-disable;
};
wake-n-pins {
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts
index 6152bcd0bc1f..e17ab8251e2a 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts
+++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts
@@ -268,7 +268,6 @@
pinctrl-0 = <&edp_reg_en>;
pinctrl-names = "default";
- regulator-always-on;
regulator-boot-on;
};
@@ -637,6 +636,14 @@
};
};
+&gpu {
+ status = "okay";
+
+ zap-shader {
+ firmware-name = "qcom/x1e80100/gen70500_zap.mbn";
+ };
+};
+
&i2c0 {
clock-frequency = <400000>;
@@ -724,9 +731,13 @@
aux-bus {
panel {
- compatible = "edp-panel";
+ compatible = "samsung,atna45af01", "samsung,atna33xc20";
+ enable-gpios = <&pmc8380_3_gpios 4 GPIO_ACTIVE_HIGH>;
power-supply = <&vreg_edp_3p3>;
+ pinctrl-0 = <&edp_bl_en>;
+ pinctrl-names = "default";
+
port {
edp_panel_in: endpoint {
remote-endpoint = <&mdss_dp3_out>;
@@ -756,11 +767,17 @@
};
&pcie4 {
+ perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>;
+ wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>;
+
+ pinctrl-0 = <&pcie4_default>;
+ pinctrl-names = "default";
+
status = "okay";
};
&pcie4_phy {
- vdda-phy-supply = <&vreg_l3j_0p8>;
+ vdda-phy-supply = <&vreg_l3i_0p8>;
vdda-pll-supply = <&vreg_l3e_1p2>;
status = "okay";
@@ -785,6 +802,16 @@
status = "okay";
};
+&pmc8380_3_gpios {
+ edp_bl_en: edp-bl-en-state {
+ pins = "gpio4";
+ function = "normal";
+ power-source = <1>; /* 1.8V */
+ input-disable;
+ output-enable;
+ };
+};
+
&qupv3_0 {
status = "okay";
};
@@ -931,7 +958,30 @@
bias-disable;
};
- pcie6a_default: pcie2a-default-state {
+ pcie4_default: pcie4-default-state {
+ clkreq-n-pins {
+ pins = "gpio147";
+ function = "pcie4_clk";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+
+ perst-n-pins {
+ pins = "gpio146";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-disable;
+ };
+
+ wake-n-pins {
+ pins = "gpio148";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+ };
+
+ pcie6a_default: pcie6a-default-state {
clkreq-n-pins {
pins = "gpio153";
function = "pcie6a_clk";
@@ -943,15 +993,15 @@
pins = "gpio152";
function = "gpio";
drive-strength = <2>;
- bias-pull-down;
+ bias-disable;
};
wake-n-pins {
- pins = "gpio154";
- function = "gpio";
- drive-strength = <2>;
- bias-pull-up;
- };
+ pins = "gpio154";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
};
tpad_default: tpad-default-state {
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts
index fbff558f5b07..1943bdbfb8c0 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts
+++ b/arch/arm64/boot/dts/qcom/x1e80100-lenovo-yoga-slim7x.dts
@@ -625,16 +625,31 @@
};
&pcie4 {
+ perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>;
+ wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>;
+
+ pinctrl-0 = <&pcie4_default>;
+ pinctrl-names = "default";
+
status = "okay";
};
&pcie4_phy {
- vdda-phy-supply = <&vreg_l3j_0p8>;
+ vdda-phy-supply = <&vreg_l3i_0p8>;
vdda-pll-supply = <&vreg_l3e_1p2>;
status = "okay";
};
+&pcie4_port0 {
+ wifi@0 {
+ compatible = "pci17cb,1107";
+ reg = <0x10000 0x0 0x0 0x0 0x0>;
+
+ qcom,ath12k-calibration-variant = "LES790";
+ };
+};
+
&pcie6a {
perst-gpios = <&tlmm 152 GPIO_ACTIVE_LOW>;
wake-gpios = <&tlmm 154 GPIO_ACTIVE_LOW>;
@@ -782,7 +797,30 @@
bias-disable;
};
- pcie6a_default: pcie2a-default-state {
+ pcie4_default: pcie4-default-state {
+ clkreq-n-pins {
+ pins = "gpio147";
+ function = "pcie4_clk";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+
+ perst-n-pins {
+ pins = "gpio146";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-disable;
+ };
+
+ wake-n-pins {
+ pins = "gpio148";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+ };
+
+ pcie6a_default: pcie6a-default-state {
clkreq-n-pins {
pins = "gpio153";
function = "pcie6a_clk";
@@ -794,15 +832,15 @@
pins = "gpio152";
function = "gpio";
drive-strength = <2>;
- bias-pull-down;
+ bias-disable;
};
wake-n-pins {
- pins = "gpio154";
- function = "gpio";
- drive-strength = <2>;
- bias-pull-up;
- };
+ pins = "gpio154";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
};
tpad_default: tpad-default-state {
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts
index 72a4f4138616..8098e6730ae5 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts
+++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts
@@ -606,6 +606,14 @@
};
};
+&gpu {
+ status = "okay";
+
+ zap-shader {
+ firmware-name = "qcom/x1e80100/gen70500_zap.mbn";
+ };
+};
+
&lpass_tlmm {
spkr_01_sd_n_active: spkr-01-sd-n-active-state {
pins = "gpio12";
@@ -660,11 +668,17 @@
};
&pcie4 {
+ perst-gpios = <&tlmm 146 GPIO_ACTIVE_LOW>;
+ wake-gpios = <&tlmm 148 GPIO_ACTIVE_LOW>;
+
+ pinctrl-0 = <&pcie4_default>;
+ pinctrl-names = "default";
+
status = "okay";
};
&pcie4_phy {
- vdda-phy-supply = <&vreg_l3j_0p8>;
+ vdda-phy-supply = <&vreg_l3i_0p8>;
vdda-pll-supply = <&vreg_l3e_1p2>;
status = "okay";
@@ -804,7 +818,30 @@
bias-disable;
};
- pcie6a_default: pcie2a-default-state {
+ pcie4_default: pcie4-default-state {
+ clkreq-n-pins {
+ pins = "gpio147";
+ function = "pcie4_clk";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+
+ perst-n-pins {
+ pins = "gpio146";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-disable;
+ };
+
+ wake-n-pins {
+ pins = "gpio148";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
+ };
+
+ pcie6a_default: pcie6a-default-state {
clkreq-n-pins {
pins = "gpio153";
function = "pcie6a_clk";
@@ -816,15 +853,15 @@
pins = "gpio152";
function = "gpio";
drive-strength = <2>;
- bias-pull-down;
+ bias-disable;
};
wake-n-pins {
- pins = "gpio154";
- function = "gpio";
- drive-strength = <2>;
- bias-pull-up;
- };
+ pins = "gpio154";
+ function = "gpio";
+ drive-strength = <2>;
+ bias-pull-up;
+ };
};
wcd_default: wcd-reset-n-active-state {
diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi
index 7bca5fcd7d52..cd732ef88cd8 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi
+++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi
@@ -2901,7 +2901,7 @@
dma-coherent;
- linux,pci-domain = <7>;
+ linux,pci-domain = <6>;
num-lanes = <2>;
interrupts = <GIC_SPI 773 IRQ_TYPE_LEVEL_HIGH>,
@@ -2959,6 +2959,7 @@
"link_down";
power-domains = <&gcc GCC_PCIE_6A_GDSC>;
+ required-opps = <&rpmhpd_opp_nom>;
phys = <&pcie6a_phy>;
phy-names = "pciephy";
@@ -3022,7 +3023,7 @@
dma-coherent;
- linux,pci-domain = <5>;
+ linux,pci-domain = <4>;
num-lanes = <2>;
interrupts = <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>,
@@ -3080,11 +3081,22 @@
"link_down";
power-domains = <&gcc GCC_PCIE_4_GDSC>;
+ required-opps = <&rpmhpd_opp_nom>;
phys = <&pcie4_phy>;
phy-names = "pciephy";
status = "disabled";
+
+ pcie4_port0: pcie@0 {
+ device_type = "pci";
+ reg = <0x0 0x0 0x0 0x0 0x0>;
+ bus-range = <0x01 0xff>;
+
+ #address-cells = <3>;
+ #size-cells = <2>;
+ ranges;
+ };
};
pcie4_phy: phy@1c0e000 {
@@ -3155,9 +3167,10 @@
interconnects = <&gem_noc MASTER_GFX3D 0 &mc_virt SLAVE_EBI1 0>;
interconnect-names = "gfx-mem";
+ status = "disabled";
+
zap-shader {
memory-region = <&gpu_microcode_mem>;
- firmware-name = "qcom/gen70500_zap.mbn";
};
gpu_opp_table: opp-table {
@@ -3288,7 +3301,7 @@
reg = <0x0 0x03da0000 0x0 0x40000>;
#iommu-cells = <2>;
#global-interrupts = <1>;
- interrupts = <GIC_SPI 673 IRQ_TYPE_LEVEL_HIGH>,
+ interrupts = <GIC_SPI 674 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 678 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 679 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 680 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 7d32fca64996..362df9390263 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -887,6 +887,7 @@ CONFIG_DRM_PANEL_KHADAS_TS050=m
CONFIG_DRM_PANEL_MANTIX_MLAF057WE51=m
CONFIG_DRM_PANEL_NOVATEK_NT36672E=m
CONFIG_DRM_PANEL_RAYDIUM_RM67191=m
+CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20=m
CONFIG_DRM_PANEL_SITRONIX_ST7703=m
CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m
CONFIG_DRM_PANEL_VISIONOX_VTDR6130=m
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 56c148890daf..d79308c23ddb 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -122,8 +122,8 @@
#define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n))
/* Status codes for individual page table levels */
-#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + n)
-#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + n)
+#define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n))
+#define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n))
#define ESR_ELx_FSC_FAULT_nL (0x2C)
#define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \
@@ -161,6 +161,7 @@
/* ISS field definitions for exceptions taken in to Hyp */
#define ESR_ELx_FSC_ADDRSZ (0x00)
+#define ESR_ELx_FSC_ADDRSZ_L(n) (ESR_ELx_FSC_ADDRSZ + (n))
#define ESR_ELx_CV (UL(1) << 24)
#define ESR_ELx_COND_SHIFT (20)
#define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT)
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index d81cc746e0eb..109a85ee6910 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -107,6 +107,7 @@
/* TCR_EL2 Registers bits */
#define TCR_EL2_DS (1UL << 32)
#define TCR_EL2_RES1 ((1U << 31) | (1 << 23))
+#define TCR_EL2_HPD (1 << 24)
#define TCR_EL2_TBI (1 << 20)
#define TCR_EL2_PS_SHIFT 16
#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 2181a11b9d92..b36a3b6cc011 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -10,6 +10,7 @@
#include <asm/hyp_image.h>
#include <asm/insn.h>
#include <asm/virt.h>
+#include <asm/sysreg.h>
#define ARM_EXIT_WITH_SERROR_BIT 31
#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
@@ -235,6 +236,9 @@ extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
extern int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding);
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
+extern void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
+extern void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
+extern void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -259,7 +263,7 @@ extern u64 __kvm_get_mdcr_el2(void);
asm volatile( \
" mrs %1, spsr_el2\n" \
" mrs %2, elr_el2\n" \
- "1: at "at_op", %3\n" \
+ "1: " __msr_s(at_op, "%3") "\n" \
" isb\n" \
" b 9f\n" \
"2: msr spsr_el2, %1\n" \
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a33f5996ca9f..b9ca899041db 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -446,6 +446,10 @@ enum vcpu_sysreg {
GCR_EL1, /* Tag Control Register */
TFSRE0_EL1, /* Tag Fault Status Register (EL0) */
+ /* FP/SIMD/SVE */
+ SVCR,
+ FPMR,
+
/* 32bit specific registers. */
DACR32_EL2, /* Domain Access Control Register */
IFSR32_EL2, /* Instruction Fault Status Register */
@@ -530,6 +534,8 @@ enum vcpu_sysreg {
VNCR(CNTP_CVAL_EL0),
VNCR(CNTP_CTL_EL0),
+ VNCR(ICH_HCR_EL2),
+
NR_SYS_REGS /* Nothing after this line! */
};
@@ -595,6 +601,16 @@ struct kvm_host_data {
struct cpu_sve_state *sve_state;
};
+ union {
+ /* HYP VA pointer to the host storage for FPMR */
+ u64 *fpmr_ptr;
+ /*
+ * Used by pKVM only, as it needs to provide storage
+ * for the host
+ */
+ u64 fpmr;
+ };
+
/* Ownership of the FP regs */
enum {
FP_STATE_FREE,
@@ -664,8 +680,6 @@ struct kvm_vcpu_arch {
void *sve_state;
enum fp_type fp_type;
unsigned int sve_max_vl;
- u64 svcr;
- u64 fpmr;
/* Stage 2 paging state used by the hardware on next switch */
struct kvm_s2_mmu *hw_mmu;
@@ -1473,4 +1487,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
(pa + pi + pa3) == 1; \
})
+#define kvm_has_fpmr(k) \
+ (system_supports_fpmr() && \
+ kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP))
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 216ca424bb16..cd4087fbda9a 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -352,5 +352,11 @@ static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
return &kvm->arch.mmu != mmu;
}
+#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
+void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
+#else
+static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
+#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 5b06c31035a2..e8bc6d67aba2 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -85,7 +85,7 @@ struct kvm_s2_trans {
bool readable;
int level;
u32 esr;
- u64 upper_attr;
+ u64 desc;
};
static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
@@ -115,7 +115,7 @@ static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans)
static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans)
{
- return !(trans->upper_attr & BIT(54));
+ return !(trans->desc & BIT(54));
}
extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
@@ -205,4 +205,40 @@ static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans)
return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, trans->level);
}
+/* Adjust alignment for the contiguous bit as per StageOA() */
+#define contiguous_bit_shift(d, wi, l) \
+ ({ \
+ u8 shift = 0; \
+ \
+ if ((d) & PTE_CONT) { \
+ switch (BIT((wi)->pgshift)) { \
+ case SZ_4K: \
+ shift = 4; \
+ break; \
+ case SZ_16K: \
+ shift = (l) == 2 ? 5 : 7; \
+ break; \
+ case SZ_64K: \
+ shift = 5; \
+ break; \
+ } \
+ } \
+ \
+ shift; \
+ })
+
+static inline unsigned int ps_to_output_size(unsigned int ps)
+{
+ switch (ps) {
+ case 0: return 32;
+ case 1: return 36;
+ case 2: return 40;
+ case 3: return 42;
+ case 4: return 44;
+ case 5:
+ default:
+ return 48;
+ }
+}
+
#endif /* __ARM64_KVM_NESTED_H */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 19278dfe7978..03f4c3d7839c 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -59,6 +59,48 @@ typedef u64 kvm_pte_t;
#define KVM_PHYS_INVALID (-1ULL)
+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
+ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
+ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
+
+#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
+
+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+ KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
+#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
+#define KVM_MAX_OWNER_ID 1
+
+/*
+ * Used to indicate a pte for which a 'break-before-make' sequence is in
+ * progress.
+ */
+#define KVM_INVALID_PTE_LOCKED BIT(10)
+
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
return pte & KVM_PTE_VALID;
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 1f60aa1bc750..07dfbdb14bab 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -205,6 +205,11 @@
#define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2)
/*
+ * Hierarchical permission for Stage-1 tables
+ */
+#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61)
+
+/*
* Highest possible physical address supported.
*/
#define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS)
@@ -298,6 +303,10 @@
#define TCR_TBI1 (UL(1) << 38)
#define TCR_HA (UL(1) << 39)
#define TCR_HD (UL(1) << 40)
+#define TCR_HPD0_SHIFT 41
+#define TCR_HPD0 (UL(1) << TCR_HPD0_SHIFT)
+#define TCR_HPD1_SHIFT 42
+#define TCR_HPD1 (UL(1) << TCR_HPD1_SHIFT)
#define TCR_TBID0 (UL(1) << 51)
#define TCR_TBID1 (UL(1) << 52)
#define TCR_NFD0 (UL(1) << 53)
diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index 5b1701c76d1c..6cf4aae05219 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -5,6 +5,8 @@
#ifndef __ASM_PTDUMP_H
#define __ASM_PTDUMP_H
+#include <linux/ptdump.h>
+
#ifdef CONFIG_PTDUMP_CORE
#include <linux/mm_types.h>
@@ -21,14 +23,53 @@ struct ptdump_info {
unsigned long base_addr;
};
+struct ptdump_prot_bits {
+ u64 mask;
+ u64 val;
+ const char *set;
+ const char *clear;
+};
+
+struct ptdump_pg_level {
+ const struct ptdump_prot_bits *bits;
+ char name[4];
+ int num;
+ u64 mask;
+};
+
+/*
+ * The page dumper groups page table entries of the same type into a single
+ * description. It uses pg_state to track the range information while
+ * iterating over the pte entries. When the continuity is broken it then
+ * dumps out a description of the range.
+ */
+struct ptdump_pg_state {
+ struct ptdump_state ptdump;
+ struct ptdump_pg_level *pg_level;
+ struct seq_file *seq;
+ const struct addr_marker *marker;
+ const struct mm_struct *mm;
+ unsigned long start_address;
+ int level;
+ u64 current_prot;
+ bool check_wx;
+ unsigned long wx_pages;
+ unsigned long uxn_pages;
+};
+
void ptdump_walk(struct seq_file *s, struct ptdump_info *info);
+void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+ u64 val);
#ifdef CONFIG_PTDUMP_DEBUGFS
#define EFI_RUNTIME_MAP_END DEFAULT_MAP_WINDOW_64
void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
#else
static inline void ptdump_debugfs_register(struct ptdump_info *info,
const char *name) { }
-#endif
+#endif /* CONFIG_PTDUMP_DEBUGFS */
+#else
+static inline void note_page(struct ptdump_state *pt_st, unsigned long addr,
+ int level, u64 val) { }
#endif /* CONFIG_PTDUMP_CORE */
#endif /* __ASM_PTDUMP_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 4a9ea103817e..bc161f160854 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -109,6 +109,9 @@
#define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x))
#define set_pstate_dit(x) asm volatile(SET_PSTATE_DIT(x))
+/* Register-based PAN access, for save/restore purposes */
+#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3)
+
#define __SYS_BARRIER_INSN(CRm, op2, Rt) \
__emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f))
@@ -325,7 +328,25 @@
#define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0)
#define SYS_PAR_EL1_F BIT(0)
+/* When PAR_EL1.F == 1 */
#define SYS_PAR_EL1_FST GENMASK(6, 1)
+#define SYS_PAR_EL1_PTW BIT(8)
+#define SYS_PAR_EL1_S BIT(9)
+#define SYS_PAR_EL1_AssuredOnly BIT(12)
+#define SYS_PAR_EL1_TopLevel BIT(13)
+#define SYS_PAR_EL1_Overlay BIT(14)
+#define SYS_PAR_EL1_DirtyBit BIT(15)
+#define SYS_PAR_EL1_F1_IMPDEF GENMASK_ULL(63, 48)
+#define SYS_PAR_EL1_F1_RES0 (BIT(7) | BIT(10) | GENMASK_ULL(47, 16))
+#define SYS_PAR_EL1_RES1 BIT(11)
+/* When PAR_EL1.F == 0 */
+#define SYS_PAR_EL1_SH GENMASK_ULL(8, 7)
+#define SYS_PAR_EL1_NS BIT(9)
+#define SYS_PAR_EL1_F0_IMPDEF BIT(10)
+#define SYS_PAR_EL1_NSE BIT(11)
+#define SYS_PAR_EL1_PA GENMASK_ULL(51, 12)
+#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56)
+#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52))
/*** Statistical Profiling Extension ***/
#define PMSEVFR_EL1_RES0_IMP \
@@ -652,6 +673,7 @@
#define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5)
#define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6)
#define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7)
+#define OP_AT_S1E2A sys_insn(AT_Op0, 4, AT_CRn, 9, 2)
/* TLBI instructions */
#define TLBI_Op0 1
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 6b3258860377..2729faaee4b4 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -25,6 +25,7 @@
*
* @common: Common unwind state.
* @task: The task being unwound.
+ * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding.
* @kr_cur: When KRETPROBES is selected, holds the kretprobe instance
* associated with the most recently encountered replacement lr
* value.
@@ -32,6 +33,7 @@
struct kunwind_state {
struct unwind_state common;
struct task_struct *task;
+ int graph_idx;
#ifdef CONFIG_KRETPROBES
struct llist_node *kr_cur;
#endif
@@ -106,7 +108,7 @@ kunwind_recover_return_address(struct kunwind_state *state)
if (state->task->ret_stack &&
(state->common.pc == (unsigned long)return_to_handler)) {
unsigned long orig_pc;
- orig_pc = ftrace_graph_ret_addr(state->task, NULL,
+ orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx,
state->common.pc,
(void *)state->common.fp);
if (WARN_ON_ONCE(state->common.pc == orig_pc))
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 8304eb342be9..ead632ad01b4 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -66,4 +66,21 @@ config PROTECTED_NVHE_STACKTRACE
If unsure, or not using protected nVHE (pKVM), say N.
+config PTDUMP_STAGE2_DEBUGFS
+ bool "Present the stage-2 pagetables to debugfs"
+ depends on KVM
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on GENERIC_PTDUMP
+ select PTDUMP_CORE
+ default n
+ help
+ Say Y here if you want to show the stage-2 kernel pagetables
+ layout in a debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
+
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 86a629aaf0a1..3cf7adb2b503 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -17,7 +17,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
- arch_timer.o trng.o vmid.o emulate-nested.o nested.o \
+ arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
@@ -27,6 +27,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
+kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
always-y := hyp_constants.h hyp-constants.s
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9bef7638342e..a0d01c46e408 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -46,6 +46,8 @@
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>
+#include "sys_regs.h"
+
static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
enum kvm_wfx_trap_policy {
@@ -228,6 +230,7 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
kvm_sys_regs_create_debugfs(kvm);
+ kvm_s2_ptdump_create_debugfs(kvm);
}
static void kvm_destroy_mpidr_data(struct kvm *kvm)
@@ -821,15 +824,13 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
return ret;
}
- if (vcpu_has_nv(vcpu)) {
- ret = kvm_init_nv_sysregs(vcpu->kvm);
- if (ret)
- return ret;
- }
+ ret = kvm_finalize_sys_regs(vcpu);
+ if (ret)
+ return ret;
/*
- * This needs to happen after NV has imposed its own restrictions on
- * the feature set
+ * This needs to happen after any restriction has been applied
+ * to the feature set.
*/
kvm_calculate_traps(vcpu);
@@ -2163,7 +2164,7 @@ static void cpu_hyp_uninit(void *discard)
}
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
/*
* Most calls to this function are made with migration
@@ -2183,7 +2184,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
kvm_timer_cpu_down();
kvm_vgic_cpu_down();
@@ -2379,7 +2380,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
/*
* The stub hypercalls are now disabled, so set our local flag to
- * prevent a later re-init attempt in kvm_arch_hardware_enable().
+ * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
*/
__this_cpu_write(kvm_hyp_initialized, 1);
preempt_enable();
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
new file mode 100644
index 000000000000..39f0e87a340e
--- /dev/null
+++ b/arch/arm64/kvm/at.c
@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 - Linaro Ltd
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/esr.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+enum trans_regime {
+ TR_EL10,
+ TR_EL20,
+ TR_EL2,
+};
+
+struct s1_walk_info {
+ u64 baddr;
+ enum trans_regime regime;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int txsz;
+ int sl;
+ bool hpd;
+ bool be;
+ bool s2;
+};
+
+struct s1_walk_result {
+ union {
+ struct {
+ u64 desc;
+ u64 pa;
+ s8 level;
+ u8 APTable;
+ bool UXNTable;
+ bool PXNTable;
+ };
+ struct {
+ u8 fst;
+ bool ptw;
+ bool s2;
+ };
+ };
+ bool failed;
+};
+
+static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
+{
+ wr->fst = fst;
+ wr->ptw = ptw;
+ wr->s2 = s2;
+ wr->failed = true;
+}
+
+#define S1_MMU_DISABLED (-127)
+
+static int get_ia_size(struct s1_walk_info *wi)
+{
+ return 64 - wi->txsz;
+}
+
+/* Return true if the IPA is out of the OA range */
+static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
+{
+ return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
+}
+
+/* Return the translation regime that applies to an AT instruction */
+static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
+{
+ /*
+ * We only get here from guest EL2, so the translation
+ * regime AT applies to is solely defined by {E2H,TGE}.
+ */
+ switch (op) {
+ case OP_AT_S1E2R:
+ case OP_AT_S1E2W:
+ case OP_AT_S1E2A:
+ return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
+ break;
+ default:
+ return (vcpu_el2_e2h_is_set(vcpu) &&
+ vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
+ }
+}
+
+static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 hcr, sctlr, tcr, tg, ps, ia_bits, ttbr;
+ unsigned int stride, x;
+ bool va55, tbi, lva, as_el0;
+
+ hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
+
+ wi->regime = compute_translation_regime(vcpu, op);
+ as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
+
+ va55 = va & BIT(55);
+
+ if (wi->regime == TR_EL2 && va55)
+ goto addrsz;
+
+ wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));
+
+ switch (wi->regime) {
+ case TR_EL10:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+ break;
+ case TR_EL2:
+ case TR_EL20:
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
+ ttbr = (va55 ?
+ vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+ vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+ break;
+ default:
+ BUG();
+ }
+
+ tbi = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_TBI, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_TBI1, tcr) :
+ FIELD_GET(TCR_TBI0, tcr)));
+
+ if (!tbi && (u64)sign_extend64(va, 55) != va)
+ goto addrsz;
+
+ va = (u64)sign_extend64(va, 55);
+
+ /* Let's put the MMU disabled case aside immediately */
+ switch (wi->regime) {
+ case TR_EL10:
+ /*
+ * If dealing with the EL1&0 translation regime, 3 things
+ * can disable the S1 translation:
+ *
+ * - HCR_EL2.DC = 1
+ * - HCR_EL2.{E2H,TGE} = {0,1}
+ * - SCTLR_EL1.M = 0
+ *
+ * The TGE part is interesting. If we have decided that this
+ * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
+ * {0,x}, and we only need to test for TGE == 1.
+ */
+ if (hcr & (HCR_DC | HCR_TGE)) {
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+ fallthrough;
+ case TR_EL2:
+ case TR_EL20:
+ if (!(sctlr & SCTLR_ELx_M))
+ wr->level = S1_MMU_DISABLED;
+ break;
+ }
+
+ if (wr->level == S1_MMU_DISABLED) {
+ if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
+ goto addrsz;
+
+ wr->pa = va;
+ return 0;
+ }
+
+ wi->be = sctlr & SCTLR_ELx_EE;
+
+ wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
+ wi->hpd &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HPD, tcr) :
+ (va55 ?
+ FIELD_GET(TCR_HPD1, tcr) :
+ FIELD_GET(TCR_HPD0, tcr)));
+
+ /* Someone was silly enough to encode TG0/TG1 differently */
+ if (va55) {
+ wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+ switch (tg << TCR_TG1_SHIFT) {
+ case TCR_TG1_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG1_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG1_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ } else {
+ wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+ tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+ switch (tg << TCR_TG0_SHIFT) {
+ case TCR_TG0_4K:
+ wi->pgshift = 12; break;
+ case TCR_TG0_16K:
+ wi->pgshift = 14; break;
+ case TCR_TG0_64K:
+ default: /* IMPDEF: treat any other value as 64k */
+ wi->pgshift = 16; break;
+ }
+ }
+
+ /* R_PLCGL, R_YXNYW */
+ if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
+ if (wi->txsz > 39)
+ goto transfault_l0;
+ } else {
+ if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
+ goto transfault_l0;
+ }
+
+ /* R_GTJBY, R_SXWGM */
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
+ lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+ break;
+ case SZ_16K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
+ lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+ break;
+ case SZ_64K:
+ lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
+ break;
+ }
+
+ if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
+ goto transfault_l0;
+
+ ia_bits = get_ia_size(wi);
+
+ /* R_YYVYV, I_THCZK */
+ if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
+ (va55 && va < GENMASK(63, ia_bits)))
+ goto transfault_l0;
+
+ /* I_ZFSYQ */
+ if (wi->regime != TR_EL2 &&
+ (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
+ goto transfault_l0;
+
+ /* R_BNDVG and following statements */
+ if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
+ as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0)))
+ goto transfault_l0;
+
+ /* AArch64.S1StartLevel() */
+ stride = wi->pgshift - 3;
+ wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+ ps = (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
+
+ wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
+
+ /* Compute minimal alignment */
+ x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
+
+ wi->baddr = ttbr & TTBRx_EL1_BADDR;
+
+ /* R_VPBBF */
+ if (check_output_size(wi->baddr, wi))
+ goto addrsz;
+
+ wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+
+ return 0;
+
+addrsz: /* Address Size Fault level 0 */
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
+ return -EFAULT;
+
+transfault_l0: /* Translation Fault level 0 */
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
+ return -EFAULT;
+}
+
+static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_result *wr, u64 va)
+{
+ u64 va_top, va_bottom, baddr, desc;
+ int level, stride, ret;
+
+ level = wi->sl;
+ stride = wi->pgshift - 3;
+ baddr = wi->baddr;
+
+ va_top = get_ia_size(wi) - 1;
+
+ while (1) {
+ u64 index, ipa;
+
+ va_bottom = (3 - level) * stride + wi->pgshift;
+ index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
+
+ ipa = baddr | index;
+
+ if (wi->s2) {
+ struct kvm_s2_trans s2_trans = {};
+
+ ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+ if (ret) {
+ fail_s1_walk(wr,
+ (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+ true, true);
+ return ret;
+ }
+
+ if (!kvm_s2_trans_readable(&s2_trans)) {
+ fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
+ true, true);
+
+ return -EPERM;
+ }
+
+ ipa = kvm_s2_trans_output(&s2_trans);
+ }
+
+ ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+ if (ret) {
+ fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
+ true, false);
+ return ret;
+ }
+
+ if (wi->be)
+ desc = be64_to_cpu((__force __be64)desc);
+ else
+ desc = le64_to_cpu((__force __le64)desc);
+
+ /* Invalid descriptor */
+ if (!(desc & BIT(0)))
+ goto transfault;
+
+ /* Block mapping, check validity down the line */
+ if (!(desc & BIT(1)))
+ break;
+
+ /* Page mapping */
+ if (level == 3)
+ break;
+
+ /* Table handling */
+ if (!wi->hpd) {
+ wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
+ wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
+ wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+ }
+
+ baddr = desc & GENMASK_ULL(47, wi->pgshift);
+
+ /* Check for out-of-range OA */
+ if (check_output_size(baddr, wi))
+ goto addrsz;
+
+ /* Prepare for next round */
+ va_top = va_bottom - 1;
+ level++;
+ }
+
+ /* Block mapping, check the validity of the level */
+ if (!(desc & BIT(1))) {
+ bool valid_block = false;
+
+ switch (BIT(wi->pgshift)) {
+ case SZ_4K:
+ valid_block = level == 1 || level == 2;
+ break;
+ case SZ_16K:
+ case SZ_64K:
+ valid_block = level == 2;
+ break;
+ }
+
+ if (!valid_block)
+ goto transfault;
+ }
+
+ if (check_output_size(desc & GENMASK(47, va_bottom), wi))
+ goto addrsz;
+
+ va_bottom += contiguous_bit_shift(desc, wi, level);
+
+ wr->failed = false;
+ wr->level = level;
+ wr->desc = desc;
+ wr->pa = desc & GENMASK(47, va_bottom);
+ wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
+
+ return 0;
+
+addrsz:
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
+ return -EINVAL;
+transfault:
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
+ return -ENOENT;
+}
+
+struct mmu_config {
+ u64 ttbr0;
+ u64 ttbr1;
+ u64 tcr;
+ u64 mair;
+ u64 sctlr;
+ u64 vttbr;
+ u64 vtcr;
+ u64 hcr;
+};
+
+static void __mmu_config_save(struct mmu_config *config)
+{
+ config->ttbr0 = read_sysreg_el1(SYS_TTBR0);
+ config->ttbr1 = read_sysreg_el1(SYS_TTBR1);
+ config->tcr = read_sysreg_el1(SYS_TCR);
+ config->mair = read_sysreg_el1(SYS_MAIR);
+ config->sctlr = read_sysreg_el1(SYS_SCTLR);
+ config->vttbr = read_sysreg(vttbr_el2);
+ config->vtcr = read_sysreg(vtcr_el2);
+ config->hcr = read_sysreg(hcr_el2);
+}
+
+static void __mmu_config_restore(struct mmu_config *config)
+{
+ write_sysreg(config->hcr, hcr_el2);
+
+ /*
+ * ARM errata 1165522 and 1530923 require TGE to be 1 before
+ * we update the guest state.
+ */
+ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
+
+ write_sysreg_el1(config->ttbr0, SYS_TTBR0);
+ write_sysreg_el1(config->ttbr1, SYS_TTBR1);
+ write_sysreg_el1(config->tcr, SYS_TCR);
+ write_sysreg_el1(config->mair, SYS_MAIR);
+ write_sysreg_el1(config->sctlr, SYS_SCTLR);
+ write_sysreg(config->vttbr, vttbr_el2);
+ write_sysreg(config->vtcr, vtcr_el2);
+}
+
+static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 host_pan;
+ bool fail;
+
+ host_pan = read_sysreg_s(SYS_PSTATE_PAN);
+ write_sysreg_s(*vcpu_cpsr(vcpu) & PSTATE_PAN, SYS_PSTATE_PAN);
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ fail = __kvm_at(OP_AT_S1E1RP, vaddr);
+ break;
+ case OP_AT_S1E1WP:
+ fail = __kvm_at(OP_AT_S1E1WP, vaddr);
+ break;
+ }
+
+ write_sysreg_s(host_pan, SYS_PSTATE_PAN);
+
+ return fail;
+}
+
+#define MEMATTR(ic, oc) (MEMATTR_##oc << 4 | MEMATTR_##ic)
+#define MEMATTR_NC 0b0100
+#define MEMATTR_Wt 0b1000
+#define MEMATTR_Wb 0b1100
+#define MEMATTR_WbRaWa 0b1111
+
+#define MEMATTR_IS_DEVICE(m) (((m) & GENMASK(7, 4)) == 0)
+
+static u8 s2_memattr_to_attr(u8 memattr)
+{
+ memattr &= 0b1111;
+
+ switch (memattr) {
+ case 0b0000:
+ case 0b0001:
+ case 0b0010:
+ case 0b0011:
+ return memattr << 2;
+ case 0b0100:
+ return MEMATTR(Wb, Wb);
+ case 0b0101:
+ return MEMATTR(NC, NC);
+ case 0b0110:
+ return MEMATTR(Wt, NC);
+ case 0b0111:
+ return MEMATTR(Wb, NC);
+ case 0b1000:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1001:
+ return MEMATTR(NC, Wt);
+ case 0b1010:
+ return MEMATTR(Wt, Wt);
+ case 0b1011:
+ return MEMATTR(Wb, Wt);
+ case 0b1100:
+ /* Reserved, assume NC */
+ return MEMATTR(NC, NC);
+ case 0b1101:
+ return MEMATTR(NC, Wb);
+ case 0b1110:
+ return MEMATTR(Wt, Wb);
+ case 0b1111:
+ return MEMATTR(Wb, Wb);
+ default:
+ unreachable();
+ }
+}
+
+static u8 combine_s1_s2_attr(u8 s1, u8 s2)
+{
+ bool transient;
+ u8 final = 0;
+
+ /* Upgrade transient s1 to non-transient to simplify things */
+ switch (s1) {
+ case 0b0001 ... 0b0011: /* Normal, Write-Through Transient */
+ transient = true;
+ s1 = MEMATTR_Wt | (s1 & GENMASK(1, 0));
+ break;
+ case 0b0101 ... 0b0111: /* Normal, Write-Back Transient */
+ transient = true;
+ s1 = MEMATTR_Wb | (s1 & GENMASK(1, 0));
+ break;
+ default:
+ transient = false;
+ }
+
+ /* S2CombineS1AttrHints() */
+ if ((s1 & GENMASK(3, 2)) == MEMATTR_NC ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_NC)
+ final = MEMATTR_NC;
+ else if ((s1 & GENMASK(3, 2)) == MEMATTR_Wt ||
+ (s2 & GENMASK(3, 2)) == MEMATTR_Wt)
+ final = MEMATTR_Wt;
+ else
+ final = MEMATTR_Wb;
+
+ if (final != MEMATTR_NC) {
+ /* Inherit RaWa hints from S1 */
+ if (transient) {
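+ /*
+ * A transient result reuses the 0b00xx (WT) / 0b01xx (WB)
+ * MAIR encodings for the upper attribute bits.
+ */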
+ switch (s1 & GENMASK(3, 2)) {
+ case MEMATTR_Wt:
+ final = 0;
+ break;
+ case MEMATTR_Wb:
+ final = MEMATTR_NC;
+ break;
+ }
+ }
+
+ final |= s1 & GENMASK(1, 0);
+ }
+
+ return final;
+}
+
+#define ATTR_NSH 0b00
+#define ATTR_RSV 0b01
+#define ATTR_OSH 0b10
+#define ATTR_ISH 0b11
+
+static u8 compute_sh(u8 attr, u64 desc)
+{
+ u8 sh;
+
+ /* Any form of device, as well as NC has SH[1:0]=0b10 */
+ if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC))
+ return ATTR_OSH;
+
+ sh = FIELD_GET(PTE_SHARED, desc);
+ if (sh == ATTR_RSV) /* Reserved, mapped to NSH */
+ sh = ATTR_NSH;
+
+ return sh;
+}
+
+static u8 combine_sh(u8 s1_sh, u8 s2_sh)
+{
+ if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
+ return ATTR_OSH;
+ if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
+ return ATTR_ISH;
+
+ return ATTR_NSH;
+}
+
+static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
+ struct kvm_s2_trans *tr)
+{
+ u8 s1_parattr, s2_memattr, final_attr;
+ u64 par;
+
+ /* If S2 has failed to translate, report the damage */
+ if (tr->esr) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= SYS_PAR_EL1_S;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, tr->esr);
+ return par;
+ }
+
+ s1_parattr = FIELD_GET(SYS_PAR_EL1_ATTR, s1_par);
+ s2_memattr = FIELD_GET(GENMASK(5, 2), tr->desc);
+
+ if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_FWB) {
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64PFR2_EL1, MTEPERM, IMP))
+ s2_memattr &= ~BIT(3);
+
+ /* Combination of R_VRJSW and R_RHWZM */
+ switch (s2_memattr) {
+ case 0b0101:
+ if (MEMATTR_IS_DEVICE(s1_parattr))
+ final_attr = s1_parattr;
+ else
+ final_attr = MEMATTR(NC, NC);
+ break;
+ case 0b0110:
+ case 0b1110:
+ final_attr = MEMATTR(WbRaWa, WbRaWa);
+ break;
+ case 0b0111:
+ case 0b1111:
+ /* Preserve S1 attribute */
+ final_attr = s1_parattr;
+ break;
+ case 0b0100:
+ case 0b1100:
+ case 0b1101:
+ /* Reserved, do something non-silly */
+ final_attr = s1_parattr;
+ break;
+ default:
+ /* MemAttr[2]=0, Device from S2 */
+ final_attr = (s2_memattr & GENMASK(1, 0)) << 2;
+ }
+ } else {
+ /* Combination of R_HMNDG, R_TNHFM and R_GQFSF */
+ u8 s2_parattr = s2_memattr_to_attr(s2_memattr);
+
+ if (MEMATTR_IS_DEVICE(s1_parattr) ||
+ MEMATTR_IS_DEVICE(s2_parattr)) {
+ final_attr = min(s1_parattr, s2_parattr);
+ } else {
+ /* At this stage, this is memory vs memory */
+ final_attr = combine_s1_s2_attr(s1_parattr & 0xf,
+ s2_parattr & 0xf);
+ final_attr |= combine_s1_s2_attr(s1_parattr >> 4,
+ s2_parattr >> 4) << 4;
+ }
+ }
+
+ if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_CD) &&
+ !MEMATTR_IS_DEVICE(final_attr))
+ final_attr = MEMATTR(NC, NC);
+
+ par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
+ par |= tr->output & GENMASK(47, 12);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH,
+ combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
+ compute_sh(final_attr, tr->desc)));
+
+ return par;
+}
+
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
+ enum trans_regime regime)
+{
+ u64 par;
+
+ if (wr->failed) {
+ par = SYS_PAR_EL1_RES1;
+ par |= SYS_PAR_EL1_F;
+ par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
+ par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
+ par |= wr->s2 ? SYS_PAR_EL1_S : 0;
+ } else if (wr->level == S1_MMU_DISABLED) {
+ /* MMU off or HCR_EL2.DC == 1 */
+ par = SYS_PAR_EL1_NSE;
+ par |= wr->pa & GENMASK_ULL(47, 12);
+
+ if (regime == TR_EL10 &&
+ (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
+ MEMATTR(WbRaWa, WbRaWa));
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
+ } else {
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
+ }
+ } else {
+ u64 mair, sctlr;
+ u8 sh;
+
+ par = SYS_PAR_EL1_NSE;
+
+ mair = (regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, MAIR_EL1) :
+ vcpu_read_sys_reg(vcpu, MAIR_EL2));
+
+ mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
+ mair &= 0xff;
+
+ sctlr = (regime == TR_EL10 ?
+ vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
+ vcpu_read_sys_reg(vcpu, SCTLR_EL2));
+
+ /* Force NC for memory if SCTLR_ELx.C is clear */
+ if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
+ mair = MEMATTR(NC, NC);
+
+ par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
+ par |= wr->pa & GENMASK_ULL(47, 12);
+
+ sh = compute_sh(mair, wr->desc);
+ par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
+ }
+
+ return par;
+}
+
+static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime)
+{
+ u64 sctlr;
+
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
+ return false;
+
+ if (regime == TR_EL10)
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ else
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+
+ return sctlr & SCTLR_EL1_EPAN;
+}
+
+static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ bool perm_fail, ur, uw, ux, pr, pw, px;
+ struct s1_walk_result wr = {};
+ struct s1_walk_info wi = {};
+ int ret, idx;
+
+ ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
+ if (ret)
+ goto compute_par;
+
+ if (wr.level == S1_MMU_DISABLED)
+ goto compute_par;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (ret)
+ goto compute_par;
+
+ /* FIXME: revisit when adding indirect permission support */
+ /* AArch64.S1DirectBasePermissions() */
+ if (wi.regime != TR_EL2) {
+ switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
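+ /* {AP[2],AP[1]}: 00 = priv RW, 01 = all RW, 10 = priv RO, 11 = all RO */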
+ case 0b00:
+ pr = pw = true;
+ ur = uw = false;
+ break;
+ case 0b01:
+ pr = pw = ur = uw = true;
+ break;
+ case 0b10:
+ pr = true;
+ pw = ur = uw = false;
+ break;
+ case 0b11:
+ pr = ur = true;
+ pw = uw = false;
+ break;
+ }
+
+ switch (wr.APTable) {
+ case 0b00:
+ break;
+ case 0b01:
+ ur = uw = false;
+ break;
+ case 0b10:
+ pw = uw = false;
+ break;
+ case 0b11:
+ pw = ur = uw = false;
+ break;
+ }
+
+ /* We don't use px for anything yet, but hey... */
+ px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw);
+ ux = !((wr.desc & PTE_UXN) || wr.UXNTable);
+
+ if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
+ bool pan;
+
+ pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
+ pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux);
+ pw &= !pan;
+ pr &= !pan;
+ }
+ } else {
+ ur = uw = ux = false;
+
+ if (!(wr.desc & PTE_RDONLY)) {
+ pr = pw = true;
+ } else {
+ pr = true;
+ pw = false;
+ }
+
+ if (wr.APTable & BIT(1))
+ pw = false;
+
+ /* XN maps to UXN */
+ px = !((wr.desc & PTE_UXN) || wr.UXNTable);
+ }
+
+ perm_fail = false;
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1R:
+ case OP_AT_S1E2R:
+ perm_fail = !pr;
+ break;
+ case OP_AT_S1E1WP:
+ case OP_AT_S1E1W:
+ case OP_AT_S1E2W:
+ perm_fail = !pw;
+ break;
+ case OP_AT_S1E0R:
+ perm_fail = !ur;
+ break;
+ case OP_AT_S1E0W:
+ perm_fail = !uw;
+ break;
+ case OP_AT_S1E1A:
+ case OP_AT_S1E2A:
+ break;
+ default:
+ BUG();
+ }
+
+ if (perm_fail)
+ fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
+
+compute_par:
+ return compute_par_s1(vcpu, &wr, wi.regime);
+}
+
+/*
+ * Return the PAR_EL1 value as the result of a valid translation.
+ *
+ * If the translation is unsuccessful, the value may only contain
+ * PAR_EL1.F, and cannot be taken at face value. It isn't an
+ * indication of the translation having failed, only that the fast
+ * path did not succeed, *unless* it indicates an S1 permission fault.
+ */
+static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct mmu_config config;
+ struct kvm_s2_mmu *mmu;
+ bool fail;
+ u64 par;
+
+ par = SYS_PAR_EL1_F;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will
+ * be switching contexts behind everybody's back, disable
+ * interrupts while holding the mmu lock.
+ */
+ guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
+
+ /*
+ * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already
+ * the right one (as we trapped from vEL2). If not, save the
+ * full MMU context.
+ */
+ if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))
+ goto skip_mmu_switch;
+
+ /*
+ * Obtaining the S2 MMU for an L2 is horribly racy, and we may not
+ * find it (recycled by another vcpu, for example). When this
+ * happens, admit defeat immediately and use the SW (slow) path.
+ */
+ mmu = lookup_s2_mmu(vcpu);
+ if (!mmu)
+ return par;
+
+ __mmu_config_save(&config);
+
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR);
+ write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR);
+ __load_stage2(mmu, mmu->arch);
+
+skip_mmu_switch:
+ /* Clear TGE, enable S2 translation, we're rolling */
+ write_sysreg((read_sysreg(hcr_el2) & ~HCR_TGE) | HCR_VM, hcr_el2);
+ isb();
+
+ switch (op) {
+ case OP_AT_S1E1RP:
+ case OP_AT_S1E1WP:
+ fail = at_s1e1p_fast(vcpu, op, vaddr);
+ break;
+ case OP_AT_S1E1R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E1W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E0R:
+ fail = __kvm_at(OP_AT_S1E0R, vaddr);
+ break;
+ case OP_AT_S1E0W:
+ fail = __kvm_at(OP_AT_S1E0W, vaddr);
+ break;
+ case OP_AT_S1E1A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ break;
+ }
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
+ __mmu_config_restore(&config);
+
+ return par;
+}
+
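+/*
+ * PAR_EL1 reports a stage-1 permission fault (as opposed to a stage-2
+ * fault taken during the stage-1 walk).
+ */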
+static bool par_check_s1_perm_fault(u64 par)
+{
+ u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+ return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
+ !(par & SYS_PAR_EL1_S));
+}
+
+void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
+
+ /*
+ * If PAR_EL1 reports that AT failed on a S1 permission fault, we
+ * know for sure that the PTW was able to walk the S1 tables and
+ * there's nothing else to do.
+ *
+ * If AT failed for any other reason, then we must walk the guest S1
+ * to emulate the instruction.
+ */
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+ par = handle_at_slow(vcpu, op, vaddr);
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
+
+void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ u64 par;
+
+ /*
+ * We've trapped, so everything is live on the CPU. As we will be
+ * switching context behind everybody's back, disable interrupts...
+ */
+ scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) {
+ struct kvm_s2_mmu *mmu;
+ u64 val, hcr;
+ bool fail;
+
+ mmu = &vcpu->kvm->arch.mmu;
+
+ val = hcr = read_sysreg(hcr_el2);
+ val &= ~HCR_TGE;
+ val |= HCR_VM;
+
+ if (!vcpu_el2_e2h_is_set(vcpu))
+ val |= HCR_NV | HCR_NV1;
+
+ write_sysreg(val, hcr_el2);
+ isb();
+
+ par = SYS_PAR_EL1_F;
+
+ switch (op) {
+ case OP_AT_S1E2R:
+ fail = __kvm_at(OP_AT_S1E1R, vaddr);
+ break;
+ case OP_AT_S1E2W:
+ fail = __kvm_at(OP_AT_S1E1W, vaddr);
+ break;
+ case OP_AT_S1E2A:
+ fail = __kvm_at(OP_AT_S1E1A, vaddr);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ fail = true;
+ }
+
+ isb();
+
+ if (!fail)
+ par = read_sysreg_par();
+
+ write_sysreg(hcr, hcr_el2);
+ isb();
+ }
+
+ /* We failed the translation, let's replay it in slow motion */
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+ par = handle_at_slow(vcpu, op, vaddr);
+
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
+
+void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+ struct kvm_s2_trans out = {};
+ u64 ipa, par;
+ bool write;
+ int ret;
+
+ /* Do the stage-1 translation */
+ switch (op) {
+ case OP_AT_S12E1R:
+ op = OP_AT_S1E1R;
+ write = false;
+ break;
+ case OP_AT_S12E1W:
+ op = OP_AT_S1E1W;
+ write = true;
+ break;
+ case OP_AT_S12E0R:
+ op = OP_AT_S1E0R;
+ write = false;
+ break;
+ case OP_AT_S12E0W:
+ op = OP_AT_S1E0W;
+ write = true;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ __kvm_at_s1e01(vcpu, op, vaddr);
+ par = vcpu_read_sys_reg(vcpu, PAR_EL1);
+ if (par & SYS_PAR_EL1_F)
+ return;
+
+ /*
+ * If we only have a single stage of translation (E2H=0 or
+ * TGE=1), exit early. Same thing if {VM,DC}=={0,0}.
+ */
+ if (!vcpu_el2_e2h_is_set(vcpu) || vcpu_el2_tge_is_set(vcpu) ||
+ !(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
+ return;
+
+ /* Do the stage-2 translation */
+ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
+ out.esr = 0;
+ ret = kvm_walk_nested_s2(vcpu, ipa, &out);
+ if (ret < 0)
+ return;
+
+ /* Check the access permission */
+ if (!out.esr &&
+ ((!write && !out.readable) || (write && !out.writable)))
+ out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
+
+ par = compute_par_s12(vcpu, par, &out);
+ vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+}
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index 05166eccea0a..05b6435d02a9 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -83,14 +83,20 @@ enum cgt_group_id {
CGT_CPTR_TAM,
CGT_CPTR_TCPAC,
+ CGT_HCRX_EnFPM,
CGT_HCRX_TCR2En,
+ CGT_ICH_HCR_TC,
+ CGT_ICH_HCR_TALL0,
+ CGT_ICH_HCR_TALL1,
+ CGT_ICH_HCR_TDIR,
+
/*
* Anything after this point is a combination of coarse trap
* controls, which must all be evaluated to decide what to do.
*/
__MULTIPLE_CONTROL_BITS__,
- CGT_HCR_IMO_FMO = __MULTIPLE_CONTROL_BITS__,
+ CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__,
CGT_HCR_TID2_TID4,
CGT_HCR_TTLB_TTLBIS,
CGT_HCR_TTLB_TTLBOS,
@@ -105,6 +111,8 @@ enum cgt_group_id {
CGT_MDCR_TDE_TDRA,
CGT_MDCR_TDCC_TDE_TDA,
+ CGT_ICH_HCR_TC_TDIR,
+
/*
* Anything after this point requires a callback evaluating a
* complex trap condition. Ugly stuff.
@@ -372,12 +380,42 @@ static const struct trap_bits coarse_trap_bits[] = {
.mask = CPTR_EL2_TCPAC,
.behaviour = BEHAVE_FORWARD_ANY,
},
+ [CGT_HCRX_EnFPM] = {
+ .index = HCRX_EL2,
+ .value = 0,
+ .mask = HCRX_EL2_EnFPM,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
[CGT_HCRX_TCR2En] = {
.index = HCRX_EL2,
.value = 0,
.mask = HCRX_EL2_TCR2En,
.behaviour = BEHAVE_FORWARD_ANY,
},
+ [CGT_ICH_HCR_TC] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_TC,
+ .mask = ICH_HCR_TC,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
+ [CGT_ICH_HCR_TALL0] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_TALL0,
+ .mask = ICH_HCR_TALL0,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
+ [CGT_ICH_HCR_TALL1] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_TALL1,
+ .mask = ICH_HCR_TALL1,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
+ [CGT_ICH_HCR_TDIR] = {
+ .index = ICH_HCR_EL2,
+ .value = ICH_HCR_TDIR,
+ .mask = ICH_HCR_TDIR,
+ .behaviour = BEHAVE_FORWARD_ANY,
+ },
};
#define MCB(id, ...) \
@@ -387,7 +425,6 @@ static const struct trap_bits coarse_trap_bits[] = {
}
static const enum cgt_group_id *coarse_control_combo[] = {
- MCB(CGT_HCR_IMO_FMO, CGT_HCR_IMO, CGT_HCR_FMO),
MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4),
MCB(CGT_HCR_TTLB_TTLBIS, CGT_HCR_TTLB, CGT_HCR_TTLBIS),
MCB(CGT_HCR_TTLB_TTLBOS, CGT_HCR_TTLB, CGT_HCR_TTLBOS),
@@ -402,6 +439,9 @@ static const enum cgt_group_id *coarse_control_combo[] = {
MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA),
MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA),
MCB(CGT_MDCR_TDCC_TDE_TDA, CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA),
+
+ MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC, CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC),
+ MCB(CGT_ICH_HCR_TC_TDIR, CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR),
};
typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *);
@@ -536,9 +576,9 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_CSSELR_EL1, CGT_HCR_TID2_TID4),
SR_RANGE_TRAP(SYS_ID_PFR0_EL1,
sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3),
- SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO),
- SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO),
- SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO),
+ SR_TRAP(SYS_ICC_SGI0R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_ASGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_SGI1R_EL1, CGT_HCR_IMO_FMO_ICH_HCR_TC),
SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0),
sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP),
SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0),
@@ -786,6 +826,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(OP_AT_S12E1W, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E0R, CGT_HCR_NV),
SR_TRAP(OP_AT_S12E0W, CGT_HCR_NV),
+ SR_TRAP(OP_AT_S1E2A, CGT_HCR_NV),
SR_TRAP(OP_TLBI_IPAS2E1, CGT_HCR_NV),
SR_TRAP(OP_TLBI_RIPAS2E1, CGT_HCR_NV),
SR_TRAP(OP_TLBI_IPAS2LE1, CGT_HCR_NV),
@@ -867,6 +908,7 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(OP_AT_S1E0W, CGT_HCR_AT),
SR_TRAP(OP_AT_S1E1RP, CGT_HCR_AT),
SR_TRAP(OP_AT_S1E1WP, CGT_HCR_AT),
+ SR_TRAP(OP_AT_S1E1A, CGT_HCR_AT),
SR_TRAP(SYS_ERXPFGF_EL1, CGT_HCR_nFIEN),
SR_TRAP(SYS_ERXPFGCTL_EL1, CGT_HCR_nFIEN),
SR_TRAP(SYS_ERXPFGCDN_EL1, CGT_HCR_nFIEN),
@@ -1108,6 +1150,35 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN),
SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN),
SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN),
+ SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM),
+ /*
+ * IMPDEF choice:
+ * We treat ICC_SRE_EL2.{SRE,Enable} and ICV_SRE_EL1.SRE as
+ * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for
+ * ICC_SRE_EL1 access, and always handle it locally.
+ */
+ SR_TRAP(SYS_ICC_AP0R0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R1_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R2_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP0R3_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_AP1R0_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R2_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_AP1R3_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_BPR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_BPR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_CTLR_EL1, CGT_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_DIR_EL1, CGT_ICH_HCR_TC_TDIR),
+ SR_TRAP(SYS_ICC_EOIR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_EOIR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_HPPIR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_HPPIR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_IAR0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_IAR1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_IGRPEN0_EL1, CGT_ICH_HCR_TALL0),
+ SR_TRAP(SYS_ICC_IGRPEN1_EL1, CGT_ICH_HCR_TALL1),
+ SR_TRAP(SYS_ICC_PMR_EL1, CGT_ICH_HCR_TC),
+ SR_TRAP(SYS_ICC_RPR_EL1, CGT_ICH_HCR_TC),
};
static DEFINE_XARRAY(sr_forward_xa);
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index c53e5b14038d..ea5484ce1f3b 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -63,6 +63,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
*/
*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
*host_data_ptr(fpsimd_state) = kern_hyp_va(&current->thread.uw.fpsimd_state);
+ *host_data_ptr(fpmr_ptr) = kern_hyp_va(&current->thread.uw.fpmr);
vcpu_clear_flag(vcpu, HOST_SVE_ENABLED);
if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
@@ -134,8 +135,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
fp_state.sve_state = vcpu->arch.sve_state;
fp_state.sve_vl = vcpu->arch.sve_max_vl;
fp_state.sme_state = NULL;
- fp_state.svcr = &vcpu->arch.svcr;
- fp_state.fpmr = &vcpu->arch.fpmr;
+ fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR);
+ fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR);
fp_state.fp_type = &vcpu->arch.fp_type;
if (vcpu_has_sve(vcpu))
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 11098eb7eb44..962f985977c2 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -1045,6 +1045,11 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
mutex_lock(&kvm->slots_lock);
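+ /*
+ * Tag writes aren't tracked by dirty logging, so refuse to copy
+ * tags into the guest while dirty logging is enabled.
+ */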
+ if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
while (length > 0) {
kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
void *maddr;
@@ -1059,6 +1064,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
page = pfn_to_online_page(pfn);
if (!page) {
/* Reject ZONE_DEVICE memory */
+ kvm_release_pfn_clean(pfn);
ret = -EFAULT;
goto out;
}
diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h
index 9e13c1bc2ad5..487c06099d6f 100644
--- a/arch/arm64/kvm/hyp/include/hyp/fault.h
+++ b/arch/arm64/kvm/hyp/include/hyp/fault.h
@@ -27,7 +27,7 @@ static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
* saved the guest context yet, and we may return early...
*/
par = read_sysreg_par();
- if (!__kvm_at("s1e1r", far))
+ if (!__kvm_at(OP_AT_S1E1R, far))
tmp = read_sysreg_par();
else
tmp = SYS_PAR_EL1_F; /* back to the guest */
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 37ff87d782b6..46d52e8a3df3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -403,6 +403,9 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
else
__fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);
+ if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
+ write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR);
+
/* Skip restoring fpexc32 for AArch64 guests */
if (!(read_sysreg(hcr_el2) & HCR_RW))
write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2);
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index e715c157c2c4..e433dfab882a 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -426,9 +426,9 @@ out:
return;
}
-static __always_inline void do_ffa_mem_xfer(const u64 func_id,
- struct arm_smccc_res *res,
- struct kvm_cpu_context *ctxt)
+static void __do_ffa_mem_xfer(const u64 func_id,
+ struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
{
DECLARE_REG(u32, len, ctxt, 1);
DECLARE_REG(u32, fraglen, ctxt, 2);
@@ -440,9 +440,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id,
u32 offset, nr_ranges;
int ret = 0;
- BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE &&
- func_id != FFA_FN64_MEM_LEND);
-
if (addr_mbz || npages_mbz || fraglen > len ||
fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
ret = FFA_RET_INVALID_PARAMETERS;
@@ -461,6 +458,11 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id,
goto out_unlock;
}
+ if (len > ffa_desc_buf.len) {
+ ret = FFA_RET_NO_MEMORY;
+ goto out_unlock;
+ }
+
buf = hyp_buffers.tx;
memcpy(buf, host_buffers.tx, fraglen);
@@ -512,6 +514,13 @@ err_unshare:
goto out_unlock;
}
+#define do_ffa_mem_xfer(fid, res, ctxt) \
+ do { \
+ BUILD_BUG_ON((fid) != FFA_FN64_MEM_SHARE && \
+ (fid) != FFA_FN64_MEM_LEND); \
+ __do_ffa_mem_xfer((fid), (res), (ctxt)); \
+ } while (0);
+
static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
struct kvm_cpu_context *ctxt)
{
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 07120b37da35..401af1835be6 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -130,7 +130,7 @@ alternative_else_nop_endif
/* Invalidate the stale TLBs from Bootloader */
tlbi alle2
- tlbi vmalls12e1
+ tlbi alle1
dsb sy
mov_q x0, INIT_SCTLR_EL2_MMU_ON
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index f43d845f3c4e..87692b566d90 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -62,6 +62,8 @@ static void fpsimd_sve_flush(void)
static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
{
+ bool has_fpmr;
+
if (!guest_owns_fp_regs())
return;
@@ -73,11 +75,18 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
else
__fpsimd_save_state(&vcpu->arch.ctxt.fp_regs);
+ has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm));
+ if (has_fpmr)
+ __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR);
+
if (system_supports_sve())
__hyp_sve_restore_host();
else
__fpsimd_restore_state(*host_data_ptr(fpsimd_state));
+ if (has_fpmr)
+ write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR);
+
*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 8f5c56d5b1cd..cc69106734ca 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -197,6 +197,15 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
} else {
__fpsimd_save_state(*host_data_ptr(fpsimd_state));
}
+
+ if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) {
+ u64 val = read_sysreg_s(SYS_FPMR);
+
+ if (unlikely(is_protected_kvm_enabled()))
+ *host_data_ptr(fpmr) = val;
+ else
+ **host_data_ptr(fpmr_ptr) = val;
+ }
}
static const exit_handler_fn hyp_exit_handlers[] = {
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index ca3c09df8d7c..48da9ca9763f 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -132,10 +132,10 @@ static void exit_vmid_context(struct tlb_inv_context *cxt)
else
__load_host_stage2();
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
- /* Ensure write of the old VMID */
- isb();
+ /* Ensure write of the old VMID */
+ isb();
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
if (!(cxt->sctlr & SCTLR_ELx_M)) {
write_sysreg_el1(cxt->sctlr, SYS_SCTLR);
isb();
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 9e2bbee77491..b11bcebac908 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -17,48 +17,6 @@
#define KVM_PTE_TYPE_PAGE 1
#define KVM_PTE_TYPE_TABLE 1
-#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
-
-#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
- ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
- ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
-#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
-#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
-#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
-
-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
-
-#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
-
-#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
-
-#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
-
-#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
- KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
- KVM_PTE_LEAF_ATTR_HI_S2_XN)
-
-#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2)
-#define KVM_MAX_OWNER_ID 1
-
-/*
- * Used to indicate a pte for which a 'break-before-make' sequence is in
- * progress.
- */
-#define KVM_INVALID_PTE_LOCKED BIT(10)
-
struct kvm_pgtable_walk_data {
struct kvm_pgtable_walker *walker;
@@ -1547,7 +1505,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
*/
new = kvm_init_table_pte(childp, mm_ops);
stage2_make_pte(ctx, new);
- dsb(ishst);
return 0;
}
@@ -1559,8 +1516,11 @@ int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
.flags = KVM_PGTABLE_WALK_LEAF,
.arg = mc,
};
+ int ret;
- return kvm_pgtable_walk(pgt, addr, size, &walker);
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
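+ /* Publish all the new tables installed by the walk in one go */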
+ dsb(ishst);
+ return ret;
}
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 7b397fad26f2..18d4677002b1 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -268,8 +268,16 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
* starting to mess with the rest of the GIC, and VMCR_EL2 in
* particular. This logic must be called before
* __vgic_v3_restore_state().
+ *
+ * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is
+ * provisioned at all. In order to keep illegal accesses to the
+ * system registers from trapping to EL1 (duh), force ICC_SRE_EL1.SRE to 1
+ * so that the trap bits can take effect. Yes, we *loves* the GIC.
*/
- if (!cpu_if->vgic_sre) {
+ if (!(cpu_if->vgic_hcr & ICH_HCR_EN)) {
+ write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1);
+ isb();
+ } else if (!cpu_if->vgic_sre) {
write_gicreg(0, ICC_SRE_EL1);
isb();
write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
@@ -288,8 +296,9 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
/*
- * Prevent the guest from touching the GIC system registers if
- * SRE isn't enabled for GICv3 emulation.
+ * Prevent the guest from touching the ICC_SRE_EL1 system
+ * register. Note that this may not have any effect, as
+ * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
*/
write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
ICC_SRE_EL2);
@@ -297,10 +306,11 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
/*
* If we need to trap system registers, we must write
* ICH_HCR_EL2 anyway, even if no interrupts are being
- * injected,
+ * injected. Note that this also applies if we don't expect
+ * any system register access (no vgic at all).
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
+ cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
}
@@ -326,7 +336,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
* no interrupts were being injected, and we disable it again here.
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
- cpu_if->its_vpe.its_vm)
+ cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
write_gicreg(0, ICH_HCR_EL2);
}
@@ -1032,6 +1042,75 @@ static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
write_gicreg(vmcr, ICH_VMCR_EL2);
}
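+/*
+ * Check whether the guest hypervisor has configured traps (ICH_HCR_EL2
+ * or the fine-grained trap registers) that require this GICv3 sysreg
+ * access to be forwarded to it rather than emulated here.
+ */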
+static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
+ u32 sysreg, bool is_read)
+{
+ u64 ich_hcr;
+
+ if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+ return false;
+
+ ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+
+ switch (sysreg) {
+ case SYS_ICC_IGRPEN0_EL1:
+ if (is_read &&
+ (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ if (!is_read &&
+ (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_AP0Rn_EL1(0):
+ case SYS_ICC_AP0Rn_EL1(1):
+ case SYS_ICC_AP0Rn_EL1(2):
+ case SYS_ICC_AP0Rn_EL1(3):
+ case SYS_ICC_BPR0_EL1:
+ case SYS_ICC_EOIR0_EL1:
+ case SYS_ICC_HPPIR0_EL1:
+ case SYS_ICC_IAR0_EL1:
+ return ich_hcr & ICH_HCR_TALL0;
+
+ case SYS_ICC_IGRPEN1_EL1:
+ if (is_read &&
+ (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ if (!is_read &&
+ (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGxTR_EL2_ICC_IGRPENn_EL1))
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_AP1Rn_EL1(0):
+ case SYS_ICC_AP1Rn_EL1(1):
+ case SYS_ICC_AP1Rn_EL1(2):
+ case SYS_ICC_AP1Rn_EL1(3):
+ case SYS_ICC_BPR1_EL1:
+ case SYS_ICC_EOIR1_EL1:
+ case SYS_ICC_HPPIR1_EL1:
+ case SYS_ICC_IAR1_EL1:
+ return ich_hcr & ICH_HCR_TALL1;
+
+ case SYS_ICC_DIR_EL1:
+ if (ich_hcr & ICH_HCR_TDIR)
+ return true;
+
+ fallthrough;
+
+ case SYS_ICC_RPR_EL1:
+ case SYS_ICC_CTLR_EL1:
+ case SYS_ICC_PMR_EL1:
+ return ich_hcr & ICH_HCR_TC;
+
+ default:
+ return false;
+ }
+}
+
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
{
int rt;
@@ -1041,6 +1120,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
bool is_read;
u32 sysreg;
+ if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+ return 0;
+
esr = kvm_vcpu_get_esr(vcpu);
if (vcpu_mode_is_32bit(vcpu)) {
if (!kvm_condition_valid(vcpu)) {
@@ -1055,6 +1137,9 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+ if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read))
+ return 0;
+
switch (sysreg) {
case SYS_ICC_IAR0_EL1:
case SYS_ICC_IAR1_EL1:
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 77010b76c150..80581b1c3995 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -312,6 +312,9 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code)
static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
{
__fpsimd_save_state(*host_data_ptr(fpsimd_state));
+
+ if (kvm_has_fpmr(vcpu->kvm))
+ **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR);
}
static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 6981b1bc0946..a509b63bd4dd 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1540,8 +1540,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_pagesize = min(vma_pagesize, (long)max_map_size);
}
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+ /*
+ * Both the canonical IPA and fault IPA must be hugepage-aligned to
+ * ensure we find the right PFN and lay down the mapping in the right
+ * place.
+ */
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
fault_ipa &= ~(vma_pagesize - 1);
+ ipa &= ~(vma_pagesize - 1);
+ }
gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index bab27f9d8cc6..638b6205526f 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -103,20 +103,6 @@ struct s2_walk_info {
bool be;
};
-static unsigned int ps_to_output_size(unsigned int ps)
-{
- switch (ps) {
- case 0: return 32;
- case 1: return 36;
- case 2: return 40;
- case 3: return 42;
- case 4: return 44;
- case 5:
- default:
- return 48;
- }
-}
-
static u32 compute_fsc(int level, u32 fsc)
{
return fsc | (level & 0x3);
@@ -256,7 +242,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
/* Check for valid descriptor at this point */
if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
@@ -266,7 +252,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
if (check_output_size(wi, desc)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
@@ -278,27 +264,24 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
if (level < first_block_level) {
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
- /*
- * We don't use the contiguous bit in the stage-2 ptes, so skip check
- * for misprogramming of the contiguous bit.
- */
-
if (check_output_size(wi, desc)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
if (!(desc & BIT(10))) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
- out->upper_attr = desc;
+ out->desc = desc;
return 1;
}
+ addr_bottom += contiguous_bit_shift(desc, wi, level);
+
/* Calculate and return the result */
paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
(ipa & GENMASK_ULL(addr_bottom - 1, 0));
@@ -307,7 +290,7 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
out->readable = desc & (0b01 << 6);
out->writable = desc & (0b10 << 6);
out->level = level;
- out->upper_attr = desc & GENMASK_ULL(63, 52);
+ out->desc = desc;
return 0;
}
@@ -954,19 +937,16 @@ static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1)
int kvm_init_nv_sysregs(struct kvm *kvm)
{
u64 res0, res1;
- int ret = 0;
- mutex_lock(&kvm->arch.config_lock);
+ lockdep_assert_held(&kvm->arch.config_lock);
if (kvm->arch.sysreg_masks)
- goto out;
+ return 0;
kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)),
GFP_KERNEL_ACCOUNT);
- if (!kvm->arch.sysreg_masks) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!kvm->arch.sysreg_masks)
+ return -ENOMEM;
limit_nv_id_regs(kvm);
@@ -1195,8 +1175,13 @@ int kvm_init_nv_sysregs(struct kvm *kvm)
if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, V1P1))
res0 |= ~(res0 | res1);
set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);
-out:
- mutex_unlock(&kvm->arch.config_lock);
- return ret;
+ /* SCTLR_EL1 */
+ res0 = SCTLR_EL1_RES0;
+ res1 = SCTLR_EL1_RES1;
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
+ res0 |= SCTLR_EL1_EPAN;
+ set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);
+
+ return 0;
}
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
new file mode 100644
index 000000000000..e4a342e903e2
--- /dev/null
+++ b/arch/arm64/kvm/ptdump.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Debug helper used to dump the stage-2 pagetables of the system and their
+ * associated permissions.
+ *
+ * Copyright (C) Google, 2024
+ * Author: Sebastian Ene <sebastianene@google.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/kvm_host.h>
+#include <linux/seq_file.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
+#include <asm/ptdump.h>
+
+#define MARKERS_LEN 2
+#define KVM_PGTABLE_MAX_LEVELS (KVM_PGTABLE_LAST_LEVEL + 1)
+
+struct kvm_ptdump_guest_state {
+ struct kvm *kvm;
+ struct ptdump_pg_state parser_state;
+ struct addr_marker ipa_marker[MARKERS_LEN];
+ struct ptdump_pg_level level[KVM_PGTABLE_MAX_LEVELS];
+ struct ptdump_range range[MARKERS_LEN];
+};
+
+static const struct ptdump_prot_bits stage2_pte_bits[] = {
+ {
+ .mask = PTE_VALID,
+ .val = PTE_VALID,
+ .set = " ",
+ .clear = "F",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
+ .set = "R",
+ .clear = " ",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
+ .set = "W",
+ .clear = " ",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
+ .val = PTE_VALID,
+ .set = " ",
+ .clear = "X",
+ }, {
+ .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+ .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
+ .set = "AF",
+ .clear = " ",
+ }, {
+ .mask = PTE_TABLE_BIT | PTE_VALID,
+ .val = PTE_VALID,
+ .set = "BLK",
+ .clear = " ",
+ },
+};
+
+static int kvm_ptdump_visitor(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct ptdump_pg_state *st = ctx->arg;
+ struct ptdump_state *pt_st = &st->ptdump;
+
+ note_page(pt_st, ctx->addr, ctx->level, ctx->old);
+
+ return 0;
+}
+
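+/*
+ * Describe, for each level, which attribute bits the dumper decodes
+ * when printing stage-2 entries.
+ */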
+static int kvm_ptdump_build_levels(struct ptdump_pg_level *level, u32 start_lvl)
+{
+ u32 i;
+ u64 mask;
+
+ if (WARN_ON_ONCE(start_lvl >= KVM_PGTABLE_LAST_LEVEL))
+ return -EINVAL;
+
+ mask = 0;
+ for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
+ mask |= stage2_pte_bits[i].mask;
+
+ for (i = start_lvl; i < KVM_PGTABLE_MAX_LEVELS; i++) {
+ snprintf(level[i].name, sizeof(level[i].name), "%u", i);
+
+ level[i].num = ARRAY_SIZE(stage2_pte_bits);
+ level[i].bits = stage2_pte_bits;
+ level[i].mask = mask;
+ }
+
+ return 0;
+}
+
+static struct kvm_ptdump_guest_state *kvm_ptdump_parser_create(struct kvm *kvm)
+{
+ struct kvm_ptdump_guest_state *st;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct kvm_pgtable *pgtable = mmu->pgt;
+ int ret;
+
+ st = kzalloc(sizeof(struct kvm_ptdump_guest_state), GFP_KERNEL_ACCOUNT);
+ if (!st)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kvm_ptdump_build_levels(&st->level[0], pgtable->start_level);
+ if (ret) {
+ kfree(st);
+ return ERR_PTR(ret);
+ }
+
+ st->ipa_marker[0].name = "Guest IPA";
+ st->ipa_marker[1].start_address = BIT(pgtable->ia_bits);
+ st->range[0].end = BIT(pgtable->ia_bits);
+
+ st->kvm = kvm;
+ st->parser_state = (struct ptdump_pg_state) {
+ .marker = &st->ipa_marker[0],
+ .level = -1,
+ .pg_level = &st->level[0],
+ .ptdump.range = &st->range[0],
+ .start_address = 0,
+ };
+
+ return st;
+}
+
+static int kvm_ptdump_guest_show(struct seq_file *m, void *unused)
+{
+ int ret;
+ struct kvm_ptdump_guest_state *st = m->private;
+ struct kvm *kvm = st->kvm;
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct ptdump_pg_state *parser_state = &st->parser_state;
+ struct kvm_pgtable_walker walker = (struct kvm_pgtable_walker) {
+ .cb = kvm_ptdump_visitor,
+ .arg = parser_state,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ parser_state->seq = m;
+
+ write_lock(&kvm->mmu_lock);
+ ret = kvm_pgtable_walk(mmu->pgt, 0, BIT(mmu->pgt->ia_bits), &walker);
+ write_unlock(&kvm->mmu_lock);
+
+ return ret;
+}
+
+static int kvm_ptdump_guest_open(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+ struct kvm_ptdump_guest_state *st;
+ int ret;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ st = kvm_ptdump_parser_create(kvm);
+ if (IS_ERR(st)) {
+ ret = PTR_ERR(st);
+ goto err_with_kvm_ref;
+ }
+
+ ret = single_open(file, kvm_ptdump_guest_show, st);
+ if (!ret)
+ return 0;
+
+ kfree(st);
+err_with_kvm_ref:
+ kvm_put_kvm(kvm);
+ return ret;
+}
+
+static int kvm_ptdump_guest_close(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+ void *st = ((struct seq_file *)file->private_data)->private;
+
+ kfree(st);
+ kvm_put_kvm(kvm);
+
+ return single_release(m, file);
+}
+
+static const struct file_operations kvm_ptdump_guest_fops = {
+ .open = kvm_ptdump_guest_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_ptdump_guest_close,
+};
+
+static int kvm_pgtable_range_show(struct seq_file *m, void *unused)
+{
+ struct kvm_pgtable *pgtable = m->private;
+
+ seq_printf(m, "%2u\n", pgtable->ia_bits);
+ return 0;
+}
+
+static int kvm_pgtable_levels_show(struct seq_file *m, void *unused)
+{
+ struct kvm_pgtable *pgtable = m->private;
+
+ seq_printf(m, "%1d\n", KVM_PGTABLE_MAX_LEVELS - pgtable->start_level);
+ return 0;
+}
+
+static int kvm_pgtable_debugfs_open(struct inode *m, struct file *file,
+ int (*show)(struct seq_file *, void *))
+{
+ struct kvm *kvm = m->i_private;
+ struct kvm_pgtable *pgtable;
+ int ret;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ pgtable = kvm->arch.mmu.pgt;
+
+ ret = single_open(file, show, pgtable);
+ if (ret < 0)
+ kvm_put_kvm(kvm);
+ return ret;
+}
+
+static int kvm_pgtable_range_open(struct inode *m, struct file *file)
+{
+ return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_range_show);
+}
+
+static int kvm_pgtable_levels_open(struct inode *m, struct file *file)
+{
+ return kvm_pgtable_debugfs_open(m, file, kvm_pgtable_levels_show);
+}
+
+static int kvm_pgtable_debugfs_close(struct inode *m, struct file *file)
+{
+ struct kvm *kvm = m->i_private;
+
+ kvm_put_kvm(kvm);
+ return single_release(m, file);
+}
+
+static const struct file_operations kvm_pgtable_range_fops = {
+ .open = kvm_pgtable_range_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_pgtable_debugfs_close,
+};
+
+static const struct file_operations kvm_pgtable_levels_fops = {
+ .open = kvm_pgtable_levels_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_pgtable_debugfs_close,
+};
+
+void kvm_s2_ptdump_create_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("stage2_page_tables", 0400, kvm->debugfs_dentry,
+ kvm, &kvm_ptdump_guest_fops);
+ debugfs_create_file("ipa_range", 0400, kvm->debugfs_dentry, kvm,
+ &kvm_pgtable_range_fops);
+ debugfs_create_file("stage2_levels", 0400, kvm->debugfs_dentry,
+ kvm, &kvm_pgtable_levels_fops);
+}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c90324060436..09a6a45efb49 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -33,6 +33,7 @@
#include <trace/events/kvm.h>
#include "sys_regs.h"
+#include "vgic/vgic.h"
#include "trace.h"
@@ -46,6 +47,13 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 val);
+static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ kvm_inject_undefined(vcpu);
+ return false;
+}
+
static bool bad_trap(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
const struct sys_reg_desc *r,
@@ -53,8 +61,7 @@ static bool bad_trap(struct kvm_vcpu *vcpu,
{
WARN_ONCE(1, "Unexpected %s\n", msg);
print_sys_reg_instr(params);
- kvm_inject_undefined(vcpu);
- return false;
+ return undef_access(vcpu, params, r);
}
static bool read_from_write_only(struct kvm_vcpu *vcpu,
@@ -345,10 +352,8 @@ static bool access_dcgsw(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- if (!kvm_has_mte(vcpu->kvm)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_has_mte(vcpu->kvm))
+ return undef_access(vcpu, p, r);
/* Treat MTE S/W ops as we treat the classic ones: with contempt */
return access_dcsw(vcpu, p, r);
@@ -385,10 +390,8 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
u64 val, mask, shift;
if (reg_to_encoding(r) == SYS_TCR2_EL1 &&
- !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP))
+ return undef_access(vcpu, p, r);
BUG_ON(!p->is_write);
@@ -435,6 +438,9 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
{
bool g1;
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
if (!p->is_write)
return read_from_write_only(vcpu, p, r);
@@ -478,6 +484,9 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
if (p->is_write)
return ignore_write(vcpu, p);
@@ -495,14 +504,6 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu,
return read_zero(vcpu, p);
}
-static bool trap_undef(struct kvm_vcpu *vcpu,
- struct sys_reg_params *p,
- const struct sys_reg_desc *r)
-{
- kvm_inject_undefined(vcpu);
- return false;
-}
-
/*
* ARMv8.1 mandates at least a trivial LORegion implementation, where all the
* RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0
@@ -515,10 +516,8 @@ static bool trap_loregion(struct kvm_vcpu *vcpu,
{
u32 sr = reg_to_encoding(r);
- if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP))
+ return undef_access(vcpu, p, r);
if (p->is_write && sr == SYS_LORID_EL1)
return write_to_read_only(vcpu, p, r);
@@ -1251,10 +1250,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
if (p->is_write) {
- if (!vcpu_mode_priv(vcpu)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!vcpu_mode_priv(vcpu))
+ return undef_access(vcpu, p, r);
__vcpu_sys_reg(vcpu, PMUSERENR_EL0) =
p->regval & ARMV8_PMU_USERENR_MASK;
@@ -1338,14 +1335,6 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
.reset = reset_pmevtyper, \
.access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }
-static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
- const struct sys_reg_desc *r)
-{
- kvm_inject_undefined(vcpu);
-
- return false;
-}
-
/* Macro to expand the AMU counter and type registers*/
#define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access }
#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access }
@@ -1404,8 +1393,7 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
break;
default:
print_sys_reg_msg(p, "%s", "Unhandled trapped timer register");
- kvm_inject_undefined(vcpu);
- return false;
+ return undef_access(vcpu, p, r);
}
if (p->is_write)
@@ -1539,6 +1527,10 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME);
break;
+ case SYS_ID_AA64PFR2_EL1:
+ /* We only expose FPMR */
+ val &= ID_AA64PFR2_EL1_FPMR;
+ break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
@@ -1669,6 +1661,24 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
+static unsigned int sme_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_fpmr(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
@@ -2085,26 +2095,6 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
#define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v)
/*
- * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when
- * HCR_EL2.E2H==1, and only in the sysreg table for convenience of
- * handling traps. Given that, they are always hidden from userspace.
- */
-static unsigned int hidden_user_visibility(const struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd)
-{
- return REG_HIDDEN_USER;
-}
-
-#define EL12_REG(name, acc, rst, v) { \
- SYS_DESC(SYS_##name##_EL12), \
- .access = acc, \
- .reset = rst, \
- .reg = name##_EL1, \
- .val = v, \
- .visibility = hidden_user_visibility, \
-}
-
-/*
* Since reset() callback and field val are not used for idregs, they will be
* used for specific purposes for idregs.
* The reset() would return KVM sanitised register value. The value would be the
@@ -2211,6 +2201,18 @@ static bool access_spsr(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_cntkctl_el12(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval;
+ else
+ p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1);
+
+ return true;
+}
+
static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 val = r->val;
@@ -2301,7 +2303,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
// DBGDTR[TR]X_EL0 share the same encoding
{ SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi },
- { SYS_DESC(SYS_DBGVCR32_EL2), trap_undef, reset_val, DBGVCR32_EL2, 0 },
+ { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 },
{ SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 },
@@ -2359,16 +2361,15 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64PFR0_EL1_MPAM |
ID_AA64PFR0_EL1_SVE |
ID_AA64PFR0_EL1_RAS |
- ID_AA64PFR0_EL1_GIC |
ID_AA64PFR0_EL1_AdvSIMD |
ID_AA64PFR0_EL1_FP), },
ID_SANITISED(ID_AA64PFR1_EL1),
- ID_UNALLOCATED(4,2),
+ ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR),
ID_UNALLOCATED(4,3),
ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
ID_HIDDEN(ID_AA64SMFR0_EL1),
ID_UNALLOCATED(4,6),
- ID_UNALLOCATED(4,7),
+ ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0),
/* CRm=5 */
{ SYS_DESC(SYS_ID_AA64DFR0_EL1),
@@ -2449,6 +2450,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SPSR_EL1), access_spsr},
{ SYS_DESC(SYS_ELR_EL1), access_elr},
+ { SYS_DESC(SYS_ICC_PMR_EL1), undef_access },
+
{ SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 },
{ SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
{ SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
@@ -2503,18 +2506,31 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
- { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only },
+ { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi },
- { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only },
- { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only },
- { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only },
+ { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
+ { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },
{ SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
{ SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 },
@@ -2535,7 +2551,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
CTR_EL0_IDC_MASK |
CTR_EL0_DminLine_MASK |
CTR_EL0_IminLine_MASK),
- { SYS_DESC(SYS_SVCR), undef_access },
+ { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility },
+ { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility },
{ PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr,
.reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr },
@@ -2758,7 +2775,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG_VNCR(VTTBR_EL2, reset_val, 0),
EL2_REG_VNCR(VTCR_EL2, reset_val, 0),
- { SYS_DESC(SYS_DACR32_EL2), trap_undef, reset_unknown, DACR32_EL2 },
+ { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 },
EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0),
EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0),
EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0),
@@ -2767,20 +2784,16 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_SP_EL1), access_sp_el1},
/* AArch32 SPSR_* are RES0 if trapped from a NV guest */
- { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
- { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
- { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
- { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi,
- .visibility = hidden_user_visibility },
-
- { SYS_DESC(SYS_IFSR32_EL2), trap_undef, reset_unknown, IFSR32_EL2 },
+ { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi },
+ { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi },
+
+ { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 },
EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
EL2_REG_REDIR(ESR_EL2, reset_val, 0),
- { SYS_DESC(SYS_FPEXC32_EL2), trap_undef, reset_val, FPEXC32_EL2, 0x700 },
+ { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 },
EL2_REG_REDIR(FAR_EL2, reset_val, 0),
EL2_REG(HPFAR_EL2, access_rw, reset_val, 0),
@@ -2790,7 +2803,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
- { SYS_DESC(SYS_RMR_EL2), trap_undef },
+ { SYS_DESC(SYS_RMR_EL2), undef_access },
+
+ EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0),
EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
@@ -2798,11 +2813,48 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0),
EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
- EL12_REG(CNTKCTL, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },
EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};
+static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ __kvm_at_s1e01(vcpu, op, p->regval);
+
+ return true;
+}
+
+static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ /* There is no FGT associated with AT S1E2A :-( */
+ if (op == OP_AT_S1E2A &&
+ !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) {
+ kvm_inject_undefined(vcpu);
+ return false;
+ }
+
+ __kvm_at_s1e2(vcpu, op, p->regval);
+
+ return true;
+}
+
+static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
+
+ __kvm_at_s12(vcpu, op, p->regval);
+
+ return true;
+}
+
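The three AT handlers above rebuild the full system-instruction encoding from the trapped Op0/Op1/CRn/CRm/Op2 fields before handing it to the address-translation emulation in at.c. A minimal sketch of that packing, assuming the field offsets used by the arm64 sys_reg()/sys_insn() macros follow the MRS/MSR instruction layout (Op0 at bit 19, Op1 at 16, CRn at 12, CRm at 8, Op2 at 5):

	/* Illustrative helper only; mirrors what sys_insn() is assumed to expand to. */
	static inline u32 pack_sys_insn(u32 op0, u32 op1, u32 crn, u32 crm, u32 op2)
	{
		return (op0 << 19) | (op1 << 16) | (crn << 12) | (crm << 8) | (op2 << 5);
	}

	/* AT S1E1R, for example, is Op0=1, Op1=0, CRn=7, CRm=8, Op2=0. */
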
static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr)
{
struct kvm *kvm = vpcu->kvm;
@@ -2824,10 +2876,8 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
- if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
write_lock(&vcpu->kvm->mmu_lock);
@@ -2896,10 +2946,8 @@ static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
u64 limit, vttbr;
- if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm));
@@ -2924,10 +2972,8 @@ static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
u64 base, range, tg, num, scale;
int shift;
- if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
/*
* Because the shadow S2 structure doesn't necessarily reflect that
@@ -2995,10 +3041,8 @@ static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
- if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
&(union tlbi_info) {
@@ -3038,10 +3082,8 @@ static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
WARN_ON(!vcpu_is_el2(vcpu));
- if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding)) {
- kvm_inject_undefined(vcpu);
- return false;
- }
+ if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding))
+ return undef_access(vcpu, p, r);
kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
&(union tlbi_info) {
@@ -3065,6 +3107,14 @@ static struct sys_reg_desc sys_insn_descs[] = {
{ SYS_DESC(SYS_DC_ISW), access_dcsw },
{ SYS_DESC(SYS_DC_IGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_IGDSW), access_dcgsw },
+
+ SYS_INSN(AT_S1E1R, handle_at_s1e01),
+ SYS_INSN(AT_S1E1W, handle_at_s1e01),
+ SYS_INSN(AT_S1E0R, handle_at_s1e01),
+ SYS_INSN(AT_S1E0W, handle_at_s1e01),
+ SYS_INSN(AT_S1E1RP, handle_at_s1e01),
+ SYS_INSN(AT_S1E1WP, handle_at_s1e01),
+
{ SYS_DESC(SYS_DC_CSW), access_dcsw },
{ SYS_DESC(SYS_DC_CGSW), access_dcgsw },
{ SYS_DESC(SYS_DC_CGDSW), access_dcgsw },
@@ -3144,19 +3194,27 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1),
SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1),
+ SYS_INSN(AT_S1E2R, handle_at_s1e2),
+ SYS_INSN(AT_S1E2W, handle_at_s1e2),
+ SYS_INSN(AT_S12E1R, handle_at_s12),
+ SYS_INSN(AT_S12E1W, handle_at_s12),
+ SYS_INSN(AT_S12E0R, handle_at_s12),
+ SYS_INSN(AT_S12E0W, handle_at_s12),
+ SYS_INSN(AT_S1E2A, handle_at_s1e2),
+
SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is),
SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is),
- SYS_INSN(TLBI_ALLE2OS, trap_undef),
- SYS_INSN(TLBI_VAE2OS, trap_undef),
+ SYS_INSN(TLBI_ALLE2OS, undef_access),
+ SYS_INSN(TLBI_VAE2OS, undef_access),
SYS_INSN(TLBI_ALLE1OS, handle_alle1is),
- SYS_INSN(TLBI_VALE2OS, trap_undef),
+ SYS_INSN(TLBI_VALE2OS, undef_access),
SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is),
- SYS_INSN(TLBI_RVAE2IS, trap_undef),
- SYS_INSN(TLBI_RVALE2IS, trap_undef),
+ SYS_INSN(TLBI_RVAE2IS, undef_access),
+ SYS_INSN(TLBI_RVALE2IS, undef_access),
SYS_INSN(TLBI_ALLE1IS, handle_alle1is),
SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is),
@@ -3168,10 +3226,10 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is),
SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is),
- SYS_INSN(TLBI_RVAE2OS, trap_undef),
- SYS_INSN(TLBI_RVALE2OS, trap_undef),
- SYS_INSN(TLBI_RVAE2, trap_undef),
- SYS_INSN(TLBI_RVALE2, trap_undef),
+ SYS_INSN(TLBI_RVAE2OS, undef_access),
+ SYS_INSN(TLBI_RVALE2OS, undef_access),
+ SYS_INSN(TLBI_RVAE2, undef_access),
+ SYS_INSN(TLBI_RVALE2, undef_access),
SYS_INSN(TLBI_ALLE1, handle_alle1is),
SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is),
@@ -3180,19 +3238,19 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is),
- SYS_INSN(TLBI_ALLE2OSNXS, trap_undef),
- SYS_INSN(TLBI_VAE2OSNXS, trap_undef),
+ SYS_INSN(TLBI_ALLE2OSNXS, undef_access),
+ SYS_INSN(TLBI_VAE2OSNXS, undef_access),
SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is),
- SYS_INSN(TLBI_VALE2OSNXS, trap_undef),
+ SYS_INSN(TLBI_VALE2OSNXS, undef_access),
SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is),
- SYS_INSN(TLBI_RVAE2ISNXS, trap_undef),
- SYS_INSN(TLBI_RVALE2ISNXS, trap_undef),
- SYS_INSN(TLBI_ALLE2ISNXS, trap_undef),
- SYS_INSN(TLBI_VAE2ISNXS, trap_undef),
+ SYS_INSN(TLBI_RVAE2ISNXS, undef_access),
+ SYS_INSN(TLBI_RVALE2ISNXS, undef_access),
+ SYS_INSN(TLBI_ALLE2ISNXS, undef_access),
+ SYS_INSN(TLBI_VAE2ISNXS, undef_access),
SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is),
- SYS_INSN(TLBI_VALE2ISNXS, trap_undef),
+ SYS_INSN(TLBI_VALE2ISNXS, undef_access),
SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is),
SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is),
SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is),
@@ -3202,14 +3260,14 @@ static struct sys_reg_desc sys_insn_descs[] = {
SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is),
SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is),
SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is),
- SYS_INSN(TLBI_RVAE2OSNXS, trap_undef),
- SYS_INSN(TLBI_RVALE2OSNXS, trap_undef),
- SYS_INSN(TLBI_RVAE2NXS, trap_undef),
- SYS_INSN(TLBI_RVALE2NXS, trap_undef),
- SYS_INSN(TLBI_ALLE2NXS, trap_undef),
- SYS_INSN(TLBI_VAE2NXS, trap_undef),
+ SYS_INSN(TLBI_RVAE2OSNXS, undef_access),
+ SYS_INSN(TLBI_RVALE2OSNXS, undef_access),
+ SYS_INSN(TLBI_RVAE2NXS, undef_access),
+ SYS_INSN(TLBI_RVALE2NXS, undef_access),
+ SYS_INSN(TLBI_ALLE2NXS, undef_access),
+ SYS_INSN(TLBI_VAE2NXS, undef_access),
SYS_INSN(TLBI_ALLE1NXS, handle_alle1is),
- SYS_INSN(TLBI_VALE2NXS, trap_undef),
+ SYS_INSN(TLBI_VALE2NXS, undef_access),
SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is),
};
@@ -3387,6 +3445,7 @@ static const struct sys_reg_desc cp15_regs[] = {
/* TTBCR2 */
{ AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 },
{ Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 },
+ { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access },
/* DFSR */
{ Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 },
{ Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 },
@@ -3436,8 +3495,28 @@ static const struct sys_reg_desc cp15_regs[] = {
/* AMAIR1 */
{ AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 },
- /* ICC_SRE */
- { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre },
+ { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
+ { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },
{ Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 },
@@ -4274,7 +4353,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int ret;
r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r || sysreg_hidden_user(vcpu, r))
+ if (!r || sysreg_hidden(vcpu, r))
return -ENOENT;
if (r->get_user) {
@@ -4318,7 +4397,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
return -EFAULT;
r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r || sysreg_hidden_user(vcpu, r))
+ if (!r || sysreg_hidden(vcpu, r))
return -ENOENT;
if (sysreg_user_write_ignore(vcpu, r))
@@ -4404,7 +4483,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
if (!(rd->reg || rd->get_user))
return 0;
- if (sysreg_hidden_user(vcpu, rd))
+ if (sysreg_hidden(vcpu, rd))
return 0;
if (!copy_reg_to_user(rd, uind))
@@ -4545,6 +4624,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
mutex_lock(&kvm->arch.config_lock);
vcpu_set_hcr(vcpu);
+ vcpu_set_ich_hcr(vcpu);
if (cpus_have_final_cap(ARM64_HAS_HCX)) {
/*
@@ -4560,6 +4640,9 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP))
vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En;
+
+ if (kvm_has_fpmr(kvm))
+ vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM;
}
if (test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags))
@@ -4600,6 +4683,13 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
HFGITR_EL2_TLBIRVAAE1OS |
HFGITR_EL2_TLBIRVAE1OS);
+ if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, ATS1A, IMP))
+ kvm->arch.fgu[HFGITR_GROUP] |= HFGITR_EL2_ATS1E1A;
+
+ if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN2))
+ kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP |
+ HFGITR_EL2_ATS1E1WP);
+
if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP))
kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 |
HFGxTR_EL2_nPIR_EL1);
@@ -4613,6 +4703,36 @@ out:
mutex_unlock(&kvm->arch.config_lock);
}
+/*
+ * Perform last adjustments to the ID registers that are implied by the
+ * configuration outside of the ID regs themselves, as well as any
+ * initialisation that directly depends on these ID registers (such as
+ * RES0/RES1 behaviours). This is not the place to configure traps though.
+ *
+ * Because this can be called once per CPU, changes must be idempotent.
+ */
+int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ guard(mutex)(&kvm->arch.config_lock);
+
+ if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) &&
+ irqchip_in_kernel(kvm) &&
+ kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) {
+ kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK;
+ kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK;
+ }
+
+ if (vcpu_has_nv(vcpu)) {
+ int ret = kvm_init_nv_sysregs(kvm);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
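kvm_finalize_sys_regs() relies on the scoped guard(mutex)(...) helper from linux/cleanup.h, which releases the lock automatically when the scope is left, including on the early return in the NV case. A rough equivalent with explicit locking, shown only to make the control flow visible:

	mutex_lock(&kvm->arch.config_lock);
	/* ... ID register adjustments as above ... */
	if (vcpu_has_nv(vcpu)) {
		int ret = kvm_init_nv_sysregs(kvm);
		if (ret) {
			mutex_unlock(&kvm->arch.config_lock);
			return ret;
		}
	}
	mutex_unlock(&kvm->arch.config_lock);
	return 0;
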
int __init kvm_sys_reg_table_init(void)
{
bool valid = true;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 997eea21ba2a..1d94ed6efad2 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -95,9 +95,8 @@ struct sys_reg_desc {
};
#define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */
-#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */
-#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */
-#define REG_USER_WI (1 << 3) /* WI from userspace only */
+#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */
+#define REG_USER_WI (1 << 2) /* WI from userspace only */
static __printf(2, 3)
inline void print_sys_reg_msg(const struct sys_reg_params *p,
@@ -165,15 +164,6 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu,
return sysreg_visibility(vcpu, r) & REG_HIDDEN;
}
-static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *r)
-{
- if (likely(!r->visibility))
- return false;
-
- return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER);
-}
-
static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
@@ -235,6 +225,8 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index);
+int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu);
+
#define AA32(_x) .aarch32_map = AA32_##_x
#define Op0(_x) .Op0 = _x
#define Op1(_x) .Op1 = _x
@@ -248,4 +240,11 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index);
CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
Op2(sys_reg_Op2(reg))
+#define CP15_SYS_DESC(reg) \
+ .name = #reg, \
+ .aarch32_map = AA32_DIRECT, \
+ Op0(0), Op1(sys_reg_Op1(reg)), \
+ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \
+ Op2(sys_reg_Op2(reg))
+
#endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index bc74d06398ef..e1397ab2072a 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -85,7 +85,7 @@ static void iter_unmark_lpis(struct kvm *kvm)
struct vgic_irq *irq;
unsigned long intid;
- xa_for_each(&dist->lpi_xa, intid, irq) {
+ xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) {
xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
vgic_put_irq(kvm, irq);
}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 41feb858ff9a..e7c53e8af3d1 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -417,10 +417,8 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
kfree(vgic_cpu->private_irqs);
vgic_cpu->private_irqs = NULL;
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- vgic_unregister_redist_iodev(vcpu);
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
- }
}
void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -448,6 +446,11 @@ void kvm_vgic_destroy(struct kvm *kvm)
kvm_vgic_dist_destroy(kvm);
mutex_unlock(&kvm->arch.config_lock);
+
+ if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vgic_unregister_redist_iodev(vcpu);
+
mutex_unlock(&kvm->slots_lock);
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 3eecdd2f4b8f..b217b256853c 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -292,6 +292,18 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
/* Get the show on the road... */
vgic_v3->vgic_hcr = ICH_HCR_EN;
+}
+
+void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ /* Hide GICv3 sysreg if necessary */
+ if (!kvm_has_gicv3(vcpu->kvm)) {
+ vgic_v3->vgic_hcr |= ICH_HCR_TALL0 | ICH_HCR_TALL1 | ICH_HCR_TC;
+ return;
+ }
+
if (group0_trap)
vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
if (group1_trap)
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 974849ea7101..f50274fd5581 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -36,6 +36,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* we have to disable IRQs before taking this lock and everything lower
* than it.
*
+ * The config_lock has additional ordering requirements:
+ * kvm->slots_lock
+ * kvm->srcu
+ * kvm->arch.config_lock
+ *
* If you need to take multiple locks, always take the upper lock first,
* then the lower ones, e.g. first take the its_lock, then the irq_lock.
* If you are already holding a lock and need to take a higher one, you
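The ordering documented above means that a path needing several of these locks has to take them outermost-first. A minimal sketch of a caller that needs all three, assuming a sleepable context:

	mutex_lock(&kvm->slots_lock);
	idx = srcu_read_lock(&kvm->srcu);
	mutex_lock(&kvm->arch.config_lock);

	/* ... touch state that requires all three to be held ... */

	mutex_unlock(&kvm->arch.config_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->slots_lock);
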
@@ -917,10 +922,13 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
void kvm_vgic_load(struct kvm_vcpu *vcpu)
{
- if (unlikely(!vgic_initialized(vcpu->kvm)))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
+ if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
+ }
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_load(vcpu);
else
vgic_v3_load(vcpu);
@@ -928,10 +936,13 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu)
void kvm_vgic_put(struct kvm_vcpu *vcpu)
{
- if (unlikely(!vgic_initialized(vcpu->kvm)))
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm))) {
+ if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
return;
+ }
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
vgic_v2_put(vcpu);
else
vgic_v3_put(vcpu);
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index ba8f790431bd..f2486b4d9f95 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -346,4 +346,11 @@ void vgic_v4_configure_vsgis(struct kvm *kvm);
void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq);
+void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu);
+
+static inline bool kvm_has_gicv3(struct kvm *kvm)
+{
+ return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);
+}
+
#endif
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 6986827e0d64..264c5f9b97d8 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -38,33 +38,7 @@
seq_printf(m, fmt); \
})
-/*
- * The page dumper groups page table entries of the same type into a single
- * description. It uses pg_state to track the range information while
- * iterating over the pte entries. When the continuity is broken it then
- * dumps out a description of the range.
- */
-struct pg_state {
- struct ptdump_state ptdump;
- struct seq_file *seq;
- const struct addr_marker *marker;
- const struct mm_struct *mm;
- unsigned long start_address;
- int level;
- u64 current_prot;
- bool check_wx;
- unsigned long wx_pages;
- unsigned long uxn_pages;
-};
-
-struct prot_bits {
- u64 mask;
- u64 val;
- const char *set;
- const char *clear;
-};
-
-static const struct prot_bits pte_bits[] = {
+static const struct ptdump_prot_bits pte_bits[] = {
{
.mask = PTE_VALID,
.val = PTE_VALID,
@@ -143,14 +117,7 @@ static const struct prot_bits pte_bits[] = {
}
};
-struct pg_level {
- const struct prot_bits *bits;
- char name[4];
- int num;
- u64 mask;
-};
-
-static struct pg_level pg_level[] __ro_after_init = {
+static struct ptdump_pg_level kernel_pg_levels[] __ro_after_init = {
{ /* pgd */
.name = "PGD",
.bits = pte_bits,
@@ -174,7 +141,7 @@ static struct pg_level pg_level[] __ro_after_init = {
},
};
-static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
+static void dump_prot(struct ptdump_pg_state *st, const struct ptdump_prot_bits *bits,
size_t num)
{
unsigned i;
@@ -192,7 +159,7 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
}
}
-static void note_prot_uxn(struct pg_state *st, unsigned long addr)
+static void note_prot_uxn(struct ptdump_pg_state *st, unsigned long addr)
{
if (!st->check_wx)
return;
@@ -206,7 +173,7 @@ static void note_prot_uxn(struct pg_state *st, unsigned long addr)
st->uxn_pages += (addr - st->start_address) / PAGE_SIZE;
}
-static void note_prot_wx(struct pg_state *st, unsigned long addr)
+static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr)
{
if (!st->check_wx)
return;
@@ -221,16 +188,17 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}
-static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- u64 val)
+void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+ u64 val)
{
- struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump);
+ struct ptdump_pg_level *pg_level = st->pg_level;
static const char units[] = "KMGTPE";
u64 prot = 0;
/* check if the current level has been folded dynamically */
- if ((level == 1 && mm_p4d_folded(st->mm)) ||
- (level == 2 && mm_pud_folded(st->mm)))
+ if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) ||
+ (level == 2 && mm_pud_folded(st->mm))))
level = 0;
if (level >= 0)
@@ -286,15 +254,16 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
unsigned long end = ~0UL;
- struct pg_state st;
+ struct ptdump_pg_state st;
if (info->base_addr < TASK_SIZE_64)
end = TASK_SIZE_64;
- st = (struct pg_state){
+ st = (struct ptdump_pg_state){
.seq = s,
.marker = info->markers,
.mm = info->mm,
+ .pg_level = &kernel_pg_levels[0],
.level = -1,
.ptdump = {
.note_page = note_page,
@@ -312,10 +281,10 @@ static void __init ptdump_initialize(void)
{
unsigned i, j;
- for (i = 0; i < ARRAY_SIZE(pg_level); i++)
- if (pg_level[i].bits)
- for (j = 0; j < pg_level[i].num; j++)
- pg_level[i].mask |= pg_level[i].bits[j].mask;
+ for (i = 0; i < ARRAY_SIZE(kernel_pg_levels); i++)
+ if (kernel_pg_levels[i].bits)
+ for (j = 0; j < kernel_pg_levels[i].num; j++)
+ kernel_pg_levels[i].mask |= kernel_pg_levels[i].bits[j].mask;
}
static struct ptdump_info kernel_ptdump_info __ro_after_init = {
@@ -324,12 +293,13 @@ static struct ptdump_info kernel_ptdump_info __ro_after_init = {
bool ptdump_check_wx(void)
{
- struct pg_state st = {
+ struct ptdump_pg_state st = {
.seq = NULL,
.marker = (struct addr_marker[]) {
{ 0, NULL},
{ -1, NULL},
},
+ .pg_level = &kernel_pg_levels[0],
.level = -1,
.check_wx = true,
.ptdump = {
diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild
index 2bb3676429c0..4635b755b2b4 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -6,7 +6,6 @@ generic-y += mcs_spinlock.h
generic-y += parport.h
generic-y += early_ioremap.h
generic-y += qrwlock.h
-generic-y += qspinlock.h
generic-y += user.h
generic-y += ioctl.h
generic-y += statfs.h
diff --git a/arch/loongarch/include/asm/dma-direct.h b/arch/loongarch/include/asm/dma-direct.h
deleted file mode 100644
index 75ccd808a2af..000000000000
--- a/arch/loongarch/include/asm/dma-direct.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
- */
-#ifndef _LOONGARCH_DMA_DIRECT_H
-#define _LOONGARCH_DMA_DIRECT_H
-
-dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr);
-phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr);
-
-#endif /* _LOONGARCH_DMA_DIRECT_H */
diff --git a/arch/loongarch/include/asm/hw_irq.h b/arch/loongarch/include/asm/hw_irq.h
index af4f4e8fbd85..8156ffb67415 100644
--- a/arch/loongarch/include/asm/hw_irq.h
+++ b/arch/loongarch/include/asm/hw_irq.h
@@ -9,6 +9,8 @@
extern atomic_t irq_err_count;
+#define ARCH_IRQ_INIT_FLAGS IRQ_NOPROBE
+
/*
* interrupt-retrigger: NOP for now. This may not be appropriate for all
* machines, we'll see ...
diff --git a/arch/loongarch/include/asm/kvm_csr.h b/arch/loongarch/include/asm/kvm_csr.h
index 724ca8b7b401..4a76ce796f1f 100644
--- a/arch/loongarch/include/asm/kvm_csr.h
+++ b/arch/loongarch/include/asm/kvm_csr.h
@@ -30,6 +30,7 @@
: [val] "+r" (__v) \
: [reg] "i" (csr) \
: "memory"); \
+ __v; \
})
#define gcsr_xchg(v, m, csr) \
@@ -181,6 +182,8 @@ __BUILD_GCSR_OP(tlbidx)
#define kvm_save_hw_gcsr(csr, gid) (csr->csrs[gid] = gcsr_read(gid))
#define kvm_restore_hw_gcsr(csr, gid) (gcsr_write(csr->csrs[gid], gid))
+#define kvm_read_clear_hw_gcsr(csr, gid) (csr->csrs[gid] = gcsr_write(0, gid))
+
int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu);
static __always_inline unsigned long kvm_read_sw_gcsr(struct loongarch_csrs *csr, int gid)
@@ -208,4 +211,7 @@ static __always_inline void kvm_change_sw_gcsr(struct loongarch_csrs *csr,
csr->csrs[gid] |= val & _mask;
}
+#define KVM_PMU_EVENT_ENABLED (CSR_PERFCTRL_PLV0 | CSR_PERFCTRL_PLV1 | \
+ CSR_PERFCTRL_PLV2 | CSR_PERFCTRL_PLV3)
+
#endif /* __ASM_LOONGARCH_KVM_CSR_H__ */
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index 5f0677e03817..d6bb72424027 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -30,6 +30,7 @@
#define KVM_HALT_POLL_NS_DEFAULT 500000
#define KVM_REQ_TLB_FLUSH_GPA KVM_ARCH_REQ(0)
#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(1)
+#define KVM_REQ_PMU KVM_ARCH_REQ(2)
#define KVM_GUESTDBG_SW_BP_MASK \
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)
@@ -60,9 +61,13 @@ struct kvm_arch_memory_slot {
unsigned long flags;
};
+#define HOST_MAX_PMNUM 16
struct kvm_context {
unsigned long vpid_cache;
struct kvm_vcpu *last_vcpu;
+ /* Host PMU CSR */
+ u64 perf_ctrl[HOST_MAX_PMNUM];
+ u64 perf_cntr[HOST_MAX_PMNUM];
};
struct kvm_world_switch {
@@ -107,6 +112,8 @@ struct kvm_arch {
unsigned int root_level;
spinlock_t phyid_map_lock;
struct kvm_phyid_map *phyid_map;
+ /* Enabled PV features */
+ unsigned long pv_features;
s64 time_offset;
struct kvm_context __percpu *vmcs;
@@ -133,8 +140,15 @@ enum emulation_result {
#define KVM_LARCH_FPU (0x1 << 0)
#define KVM_LARCH_LSX (0x1 << 1)
#define KVM_LARCH_LASX (0x1 << 2)
-#define KVM_LARCH_SWCSR_LATEST (0x1 << 3)
-#define KVM_LARCH_HWCSR_USABLE (0x1 << 4)
+#define KVM_LARCH_LBT (0x1 << 3)
+#define KVM_LARCH_PMU (0x1 << 4)
+#define KVM_LARCH_SWCSR_LATEST (0x1 << 5)
+#define KVM_LARCH_HWCSR_USABLE (0x1 << 6)
+
+#define LOONGARCH_PV_FEAT_UPDATED BIT_ULL(63)
+#define LOONGARCH_PV_FEAT_MASK (BIT(KVM_FEATURE_IPI) | \
+ BIT(KVM_FEATURE_STEAL_TIME) | \
+ BIT(KVM_FEATURE_VIRT_EXTIOI))
struct kvm_vcpu_arch {
/*
@@ -168,10 +182,14 @@ struct kvm_vcpu_arch {
/* FPU state */
struct loongarch_fpu fpu FPU_ALIGN;
+ struct loongarch_lbt lbt;
/* CSR state */
struct loongarch_csrs *csr;
+ /* Guest max PMU CSR id */
+ int max_pmu_csrid;
+
/* GPR used as IO source/target */
u32 io_gpr;
@@ -239,6 +257,21 @@ static inline bool kvm_guest_has_lasx(struct kvm_vcpu_arch *arch)
return arch->cpucfg[2] & CPUCFG2_LASX;
}
+static inline bool kvm_guest_has_lbt(struct kvm_vcpu_arch *arch)
+{
+ return arch->cpucfg[2] & (CPUCFG2_X86BT | CPUCFG2_ARMBT | CPUCFG2_MIPSBT);
+}
+
+static inline bool kvm_guest_has_pmu(struct kvm_vcpu_arch *arch)
+{
+ return arch->cpucfg[6] & CPUCFG6_PMP;
+}
+
+static inline int kvm_get_pmu_num(struct kvm_vcpu_arch *arch)
+{
+ return (arch->cpucfg[6] & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT;
+}
+
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
diff --git a/arch/loongarch/include/asm/kvm_para.h b/arch/loongarch/include/asm/kvm_para.h
index 43ec61589e6c..c4e84227280d 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -2,6 +2,8 @@
#ifndef _ASM_LOONGARCH_KVM_PARA_H
#define _ASM_LOONGARCH_KVM_PARA_H
+#include <uapi/asm/kvm_para.h>
+
/*
* Hypercall code field
*/
@@ -154,10 +156,20 @@ static __always_inline long kvm_hypercall5(u64 fid,
return ret;
}
+#ifdef CONFIG_PARAVIRT
+bool kvm_para_available(void);
+unsigned int kvm_arch_para_features(void);
+#else
+static inline bool kvm_para_available(void)
+{
+ return false;
+}
+
static inline unsigned int kvm_arch_para_features(void)
{
return 0;
}
+#endif
static inline unsigned int kvm_arch_para_hints(void)
{
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h
index c416cb7125c0..d7e8f7d50ee0 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -75,8 +75,13 @@ static inline void kvm_save_lasx(struct loongarch_fpu *fpu) { }
static inline void kvm_restore_lasx(struct loongarch_fpu *fpu) { }
#endif
+#ifdef CONFIG_CPU_HAS_LBT
+int kvm_own_lbt(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_own_lbt(struct kvm_vcpu *vcpu) { return -EINVAL; }
+#endif
+
void kvm_init_timer(struct kvm_vcpu *vcpu, unsigned long hz);
-void kvm_reset_timer(struct kvm_vcpu *vcpu);
void kvm_save_timer(struct kvm_vcpu *vcpu);
void kvm_restore_timer(struct kvm_vcpu *vcpu);
@@ -125,4 +130,9 @@ static inline bool kvm_pvtime_supported(void)
return !!sched_info_on();
}
+static inline bool kvm_guest_has_pv_feature(struct kvm_vcpu *vcpu, unsigned int feature)
+{
+ return vcpu->kvm->arch.pv_features & BIT(feature);
+}
+
#endif /* __ASM_LOONGARCH_KVM_VCPU_H__ */
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 04a78010fc72..24a3f4925cfb 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -119,6 +119,7 @@
#define CPUCFG6_PMP BIT(0)
#define CPUCFG6_PAMVER GENMASK(3, 1)
#define CPUCFG6_PMNUM GENMASK(7, 4)
+#define CPUCFG6_PMNUM_SHIFT 4
#define CPUCFG6_PMBITS GENMASK(13, 8)
#define CPUCFG6_UPM BIT(14)
@@ -160,16 +161,8 @@
/*
* CPUCFG index area: 0x40000000 -- 0x400000ff
- * SW emulation for KVM hypervirsor
+ * SW emulation for KVM hypervisor, see arch/loongarch/include/uapi/asm/kvm_para.h
*/
-#define CPUCFG_KVM_BASE 0x40000000
-#define CPUCFG_KVM_SIZE 0x100
-
-#define CPUCFG_KVM_SIG (CPUCFG_KVM_BASE + 0)
-#define KVM_SIGNATURE "KVM\0"
-#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
-#define KVM_FEATURE_IPI BIT(1)
-#define KVM_FEATURE_STEAL_TIME BIT(2)
#ifndef __ASSEMBLY__
diff --git a/arch/loongarch/include/asm/paravirt.h b/arch/loongarch/include/asm/paravirt.h
index dddec49671ae..3f4323603e6a 100644
--- a/arch/loongarch/include/asm/paravirt.h
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -19,6 +19,7 @@ static inline u64 paravirt_steal_clock(int cpu)
int __init pv_ipi_init(void);
int __init pv_time_init(void);
+int __init pv_spinlock_init(void);
#else
@@ -31,5 +32,11 @@ static inline int pv_time_init(void)
{
return 0;
}
+
+static inline int pv_spinlock_init(void)
+{
+ return 0;
+}
+
#endif // CONFIG_PARAVIRT
#endif
diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h
new file mode 100644
index 000000000000..e76d3aa1e1eb
--- /dev/null
+++ b/arch/loongarch/include/asm/qspinlock.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_QSPINLOCK_H
+#define _ASM_LOONGARCH_QSPINLOCK_H
+
+#include <linux/jump_label.h>
+
+#ifdef CONFIG_PARAVIRT
+
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
+
+#define virt_spin_lock virt_spin_lock
+
+static inline bool virt_spin_lock(struct qspinlock *lock)
+{
+ int val;
+
+ if (!static_branch_unlikely(&virt_spin_lock_key))
+ return false;
+
+ /*
+ * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+ * back to a Test-and-Set spinlock, because fair locks have
+ * horrible lock 'holder' preemption issues.
+ */
+
+__retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+ cpu_relax();
+ goto __retry;
+ }
+
+ return true;
+}
+
+#endif /* CONFIG_PARAVIRT */
+
+#include <asm-generic/qspinlock.h>
+
+#endif // _ASM_LOONGARCH_QSPINLOCK_H
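The virt_spin_lock() fallback above is a plain test-and-set lock: it spins on the whole lock word rather than queueing, so a vCPU is never stuck behind a queue predecessor that the host has preempted. A self-contained user-space model of the same acquire loop, using C11 atomics (names simplified; _Q_LOCKED_VAL is assumed to be 1 as in the kernel's qspinlock):

	#include <stdatomic.h>
	#include <stdbool.h>

	#define Q_LOCKED_VAL	1	/* assumed value of _Q_LOCKED_VAL */

	struct tas_lock { atomic_int val; };

	static bool tas_lock_acquire(struct tas_lock *lock)
	{
		int val;

	retry:
		val = atomic_load(&lock->val);
		/* Only try the cmpxchg when the lock looks free. */
		if (val || !atomic_compare_exchange_strong(&lock->val, &val, Q_LOCKED_VAL)) {
			/* cpu_relax() in the kernel; busy retry here */
			goto retry;
		}
		return true;
	}
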
diff --git a/arch/loongarch/include/uapi/asm/Kbuild b/arch/loongarch/include/uapi/asm/Kbuild
index c6d141d7b7d7..517761419999 100644
--- a/arch/loongarch/include/uapi/asm/Kbuild
+++ b/arch/loongarch/include/uapi/asm/Kbuild
@@ -1,4 +1,2 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += unistd_64.h
-
-generic-y += kvm_para.h
diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h
index ddc5cab0ffd0..70d89070bfeb 100644
--- a/arch/loongarch/include/uapi/asm/kvm.h
+++ b/arch/loongarch/include/uapi/asm/kvm.h
@@ -64,6 +64,7 @@ struct kvm_fpu {
#define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL)
#define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL)
#define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL)
+#define KVM_REG_LOONGARCH_LBT (KVM_REG_LOONGARCH | 0x50000ULL)
#define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL)
#define KVM_CSR_IDX_MASK 0x7fff
#define KVM_CPUCFG_IDX_MASK 0x7fff
@@ -77,11 +78,30 @@ struct kvm_fpu {
/* Debugging: Special instruction for software breakpoint */
#define KVM_REG_LOONGARCH_DEBUG_INST (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 3)
+/* LBT registers */
+#define KVM_REG_LOONGARCH_LBT_SCR0 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 1)
+#define KVM_REG_LOONGARCH_LBT_SCR1 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 2)
+#define KVM_REG_LOONGARCH_LBT_SCR2 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 3)
+#define KVM_REG_LOONGARCH_LBT_SCR3 (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 4)
+#define KVM_REG_LOONGARCH_LBT_EFLAGS (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 5)
+#define KVM_REG_LOONGARCH_LBT_FTOP (KVM_REG_LOONGARCH_LBT | KVM_REG_SIZE_U64 | 6)
+
#define LOONGARCH_REG_SHIFT 3
#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT))
#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG)
#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG)
+/* Device Control API on vm fd */
+#define KVM_LOONGARCH_VM_FEAT_CTRL 0
+#define KVM_LOONGARCH_VM_FEAT_LSX 0
+#define KVM_LOONGARCH_VM_FEAT_LASX 1
+#define KVM_LOONGARCH_VM_FEAT_X86BT 2
+#define KVM_LOONGARCH_VM_FEAT_ARMBT 3
+#define KVM_LOONGARCH_VM_FEAT_MIPSBT 4
+#define KVM_LOONGARCH_VM_FEAT_PMU 5
+#define KVM_LOONGARCH_VM_FEAT_PV_IPI 6
+#define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7
+
/* Device Control API on vcpu fd */
#define KVM_LOONGARCH_VCPU_CPUCFG 0
#define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1
diff --git a/arch/loongarch/include/uapi/asm/kvm_para.h b/arch/loongarch/include/uapi/asm/kvm_para.h
new file mode 100644
index 000000000000..b0604aa9b4bb
--- /dev/null
+++ b/arch/loongarch/include/uapi/asm/kvm_para.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_KVM_PARA_H
+#define _UAPI_ASM_KVM_PARA_H
+
+#include <linux/types.h>
+
+/*
+ * CPUCFG index area: 0x40000000 -- 0x400000ff
+ * SW emulation for KVM hypervisor
+ */
+#define CPUCFG_KVM_BASE 0x40000000
+#define CPUCFG_KVM_SIZE 0x100
+#define CPUCFG_KVM_SIG (CPUCFG_KVM_BASE + 0)
+#define KVM_SIGNATURE "KVM\0"
+#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
+#define KVM_FEATURE_IPI 1
+#define KVM_FEATURE_STEAL_TIME 2
+/* Bits 24 - 31 are features configurable by the user-space VMM */
+#define KVM_FEATURE_VIRT_EXTIOI 24
+
+#endif /* _UAPI_ASM_KVM_PARA_H */
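With this header the KVM feature IDs are bit numbers rather than masks, so callers shift them: the guest tests kvm_arch_para_features() & BIT(feature) (which is what the generic kvm_para_has_feature() helper does), and the notify hypercall now passes BIT(KVM_FEATURE_STEAL_TIME) as its first argument, as seen in paravirt.c below. A short guest-side sketch; stealtime_gpa is a hypothetical guest-physical address of the steal-time area:

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		/* Register the steal-time area; the feature is passed as a mask. */
		kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY,
			       BIT(KVM_FEATURE_STEAL_TIME), stealtime_gpa);
	}
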
diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index 69a85f2479fb..6ab640101457 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -530,6 +530,10 @@ SYM_FUNC_END(_restore_lasx_context)
#ifdef CONFIG_CPU_HAS_LBT
STACK_FRAME_NON_STANDARD _restore_fp
+#ifdef CONFIG_CPU_HAS_LSX
STACK_FRAME_NON_STANDARD _restore_lsx
+#endif
+#ifdef CONFIG_CPU_HAS_LASX
STACK_FRAME_NON_STANDARD _restore_lasx
#endif
+#endif
diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c
index f4991c03514f..adac8fcbb2ac 100644
--- a/arch/loongarch/kernel/irq.c
+++ b/arch/loongarch/kernel/irq.c
@@ -102,9 +102,6 @@ void __init init_IRQ(void)
mp_ops.init_ipi();
#endif
- for (i = 0; i < NR_IRQS; i++)
- irq_set_noprobe(i);
-
for_each_possible_cpu(i) {
page = alloc_pages_node(cpu_to_node(i), GFP_KERNEL, order);
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index 9c9b75b76f62..708eda025ed8 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -13,6 +13,7 @@ static int has_steal_clock;
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
static u64 native_steal_clock(int cpu)
{
@@ -151,11 +152,14 @@ static void pv_init_ipi(void)
}
#endif
-static bool kvm_para_available(void)
+bool kvm_para_available(void)
{
int config;
static int hypervisor_type;
+ if (!cpu_has_hypervisor)
+ return false;
+
if (!hypervisor_type) {
config = read_cpucfg(CPUCFG_KVM_SIG);
if (!memcmp(&config, KVM_SIGNATURE, 4))
@@ -165,17 +169,22 @@ static bool kvm_para_available(void)
return hypervisor_type == HYPERVISOR_KVM;
}
-int __init pv_ipi_init(void)
+unsigned int kvm_arch_para_features(void)
{
- int feature;
+ static unsigned int feature;
- if (!cpu_has_hypervisor)
- return 0;
if (!kvm_para_available())
return 0;
- feature = read_cpucfg(CPUCFG_KVM_FEATURE);
- if (!(feature & KVM_FEATURE_IPI))
+ if (!feature)
+ feature = read_cpucfg(CPUCFG_KVM_FEATURE);
+
+ return feature;
+}
+
+int __init pv_ipi_init(void)
+{
+ if (!kvm_para_has_feature(KVM_FEATURE_IPI))
return 0;
#ifdef CONFIG_SMP
@@ -206,7 +215,7 @@ static int pv_enable_steal_time(void)
}
addr |= KVM_STEAL_PHYS_VALID;
- kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, addr);
+ kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, BIT(KVM_FEATURE_STEAL_TIME), addr);
return 0;
}
@@ -214,7 +223,7 @@ static int pv_enable_steal_time(void)
static void pv_disable_steal_time(void)
{
if (has_steal_clock)
- kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, KVM_FEATURE_STEAL_TIME, 0);
+ kvm_hypercall2(KVM_HCALL_FUNC_NOTIFY, BIT(KVM_FEATURE_STEAL_TIME), 0);
}
#ifdef CONFIG_SMP
@@ -258,15 +267,9 @@ static struct notifier_block pv_reboot_nb = {
int __init pv_time_init(void)
{
- int r, feature;
+ int r;
- if (!cpu_has_hypervisor)
- return 0;
- if (!kvm_para_available())
- return 0;
-
- feature = read_cpucfg(CPUCFG_KVM_FEATURE);
- if (!(feature & KVM_FEATURE_STEAL_TIME))
+ if (!kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
return 0;
has_steal_clock = 1;
@@ -300,3 +303,13 @@ int __init pv_time_init(void)
return 0;
}
+
+int __init pv_spinlock_init(void)
+{
+ if (!cpu_has_hypervisor)
+ return 0;
+
+ static_branch_enable(&virt_spin_lock_key);
+
+ return 0;
+}
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 0f0740f0be27..00e307203ddb 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -603,6 +603,8 @@ void __init setup_arch(char **cmdline_p)
arch_mem_init(cmdline_p);
resource_init();
+ jump_label_init(); /* Initialise the static keys for paravirtualization */
+
#ifdef CONFIG_SMP
plat_smp_setup();
prefill_possible_map();
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index ca405ab86aae..482b3c7e3042 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -476,7 +476,7 @@ core_initcall(ipi_pm_init);
#endif
/* Preload SMP state for boot cpu */
-void smp_prepare_boot_cpu(void)
+void __init smp_prepare_boot_cpu(void)
{
unsigned int cpu, node, rr_node;
@@ -509,6 +509,8 @@ void smp_prepare_boot_cpu(void)
rr_node = next_node_in(rr_node, node_online_map);
}
}
+
+ pv_spinlock_init();
}
/* called from main before smp_init() */
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index ea73f9dc2cc6..90894f70ff4a 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -50,9 +50,7 @@ static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst)
vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE;
break;
case CPUCFG_KVM_FEATURE:
- ret = KVM_FEATURE_IPI;
- if (kvm_pvtime_supported())
- ret |= KVM_FEATURE_STEAL_TIME;
+ ret = vcpu->kvm->arch.pv_features & LOONGARCH_PV_FEAT_MASK;
vcpu->arch.gprs[rd] = ret;
break;
default:
@@ -127,6 +125,14 @@ static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst)
rj = inst.reg2csr_format.rj;
csrid = inst.reg2csr_format.csr;
+ if (csrid >= LOONGARCH_CSR_PERFCTRL0 && csrid <= vcpu->arch.max_pmu_csrid) {
+ if (kvm_guest_has_pmu(&vcpu->arch)) {
+ vcpu->arch.pc -= 4;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ return EMULATE_DONE;
+ }
+ }
+
/* Process CSR ops */
switch (rj) {
case 0: /* process csrrd */
@@ -697,25 +703,22 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu)
id = kvm_read_reg(vcpu, LOONGARCH_GPR_A1);
data = kvm_read_reg(vcpu, LOONGARCH_GPR_A2);
switch (id) {
- case KVM_FEATURE_STEAL_TIME:
- if (!kvm_pvtime_supported())
- return KVM_HCALL_INVALID_CODE;
-
+ case BIT(KVM_FEATURE_STEAL_TIME):
if (data & ~(KVM_STEAL_PHYS_MASK | KVM_STEAL_PHYS_VALID))
return KVM_HCALL_INVALID_PARAMETER;
vcpu->arch.st.guest_addr = data;
if (!(data & KVM_STEAL_PHYS_VALID))
- break;
+ return 0;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
- break;
+ return 0;
default:
- break;
+ return KVM_HCALL_INVALID_CODE;
};
- return 0;
+ return KVM_HCALL_INVALID_CODE;
};
/*
@@ -748,6 +751,14 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
+static int kvm_handle_lbt_disabled(struct kvm_vcpu *vcpu)
+{
+ if (kvm_own_lbt(vcpu))
+ kvm_queue_exception(vcpu, EXCCODE_INE, 0);
+
+ return RESUME_GUEST;
+}
+
static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu)
{
unsigned int min, cpu, i;
@@ -781,19 +792,21 @@ static int kvm_send_pv_ipi(struct kvm_vcpu *vcpu)
*/
static void kvm_handle_service(struct kvm_vcpu *vcpu)
{
+ long ret = KVM_HCALL_INVALID_CODE;
unsigned long func = kvm_read_reg(vcpu, LOONGARCH_GPR_A0);
- long ret;
switch (func) {
case KVM_HCALL_FUNC_IPI:
- kvm_send_pv_ipi(vcpu);
- ret = KVM_HCALL_SUCCESS;
+ if (kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_IPI)) {
+ kvm_send_pv_ipi(vcpu);
+ ret = KVM_HCALL_SUCCESS;
+ }
break;
case KVM_HCALL_FUNC_NOTIFY:
- ret = kvm_save_notify(vcpu);
+ if (kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME))
+ ret = kvm_save_notify(vcpu);
break;
default:
- ret = KVM_HCALL_INVALID_CODE;
break;
}
@@ -865,6 +878,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = {
[EXCCODE_FPDIS] = kvm_handle_fpu_disabled,
[EXCCODE_LSXDIS] = kvm_handle_lsx_disabled,
[EXCCODE_LASXDIS] = kvm_handle_lasx_disabled,
+ [EXCCODE_BTDIS] = kvm_handle_lbt_disabled,
[EXCCODE_GSPR] = kvm_handle_gspr,
[EXCCODE_HVC] = kvm_handle_hypercall,
};
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 844736b99d38..27e9b94c0a0b 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -261,7 +261,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -ENOIOCTLCMD;
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
unsigned long env, gcfg = 0;
@@ -300,7 +300,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
write_csr_gcfg(0);
write_csr_gstat(0);
diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S
index 80e988985a6a..0c292f818492 100644
--- a/arch/loongarch/kvm/switch.S
+++ b/arch/loongarch/kvm/switch.S
@@ -277,6 +277,10 @@ SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest)
#ifdef CONFIG_CPU_HAS_LBT
STACK_FRAME_NON_STANDARD kvm_restore_fpu
+#ifdef CONFIG_CPU_HAS_LSX
STACK_FRAME_NON_STANDARD kvm_restore_lsx
+#endif
+#ifdef CONFIG_CPU_HAS_LASX
STACK_FRAME_NON_STANDARD kvm_restore_lasx
#endif
+#endif
diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c
index bcc6b6d063d9..74a4b5c272d6 100644
--- a/arch/loongarch/kvm/timer.c
+++ b/arch/loongarch/kvm/timer.c
@@ -188,10 +188,3 @@ void kvm_save_timer(struct kvm_vcpu *vcpu)
kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ESTAT);
preempt_enable();
}
-
-void kvm_reset_timer(struct kvm_vcpu *vcpu)
-{
- write_gcsr_timercfg(0);
- kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_TCFG, 0);
- hrtimer_cancel(&vcpu->arch.swtimer);
-}
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 16756ffb55e8..0697b1064251 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -6,6 +6,7 @@
#include <linux/kvm_host.h>
#include <linux/entry-kvm.h>
#include <asm/fpu.h>
+#include <asm/lbt.h>
#include <asm/loongarch.h>
#include <asm/setup.h>
#include <asm/time.h>
@@ -31,6 +32,126 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
+static inline void kvm_save_host_pmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_context *context;
+
+ context = this_cpu_ptr(vcpu->kvm->arch.vmcs);
+ context->perf_cntr[0] = read_csr_perfcntr0();
+ context->perf_cntr[1] = read_csr_perfcntr1();
+ context->perf_cntr[2] = read_csr_perfcntr2();
+ context->perf_cntr[3] = read_csr_perfcntr3();
+ context->perf_ctrl[0] = write_csr_perfctrl0(0);
+ context->perf_ctrl[1] = write_csr_perfctrl1(0);
+ context->perf_ctrl[2] = write_csr_perfctrl2(0);
+ context->perf_ctrl[3] = write_csr_perfctrl3(0);
+}
+
+static inline void kvm_restore_host_pmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_context *context;
+
+ context = this_cpu_ptr(vcpu->kvm->arch.vmcs);
+ write_csr_perfcntr0(context->perf_cntr[0]);
+ write_csr_perfcntr1(context->perf_cntr[1]);
+ write_csr_perfcntr2(context->perf_cntr[2]);
+ write_csr_perfcntr3(context->perf_cntr[3]);
+ write_csr_perfctrl0(context->perf_ctrl[0]);
+ write_csr_perfctrl1(context->perf_ctrl[1]);
+ write_csr_perfctrl2(context->perf_ctrl[2]);
+ write_csr_perfctrl3(context->perf_ctrl[3]);
+}
+
+
+static inline void kvm_save_guest_pmu(struct kvm_vcpu *vcpu)
+{
+ struct loongarch_csrs *csr = vcpu->arch.csr;
+
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR0);
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR1);
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR2);
+ kvm_save_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR3);
+ kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0);
+ kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1);
+ kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2);
+ kvm_read_clear_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3);
+}
+
+static inline void kvm_restore_guest_pmu(struct kvm_vcpu *vcpu)
+{
+ struct loongarch_csrs *csr = vcpu->arch.csr;
+
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR0);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR1);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR2);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCNTR3);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2);
+ kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3);
+}
+
+static int kvm_own_pmu(struct kvm_vcpu *vcpu)
+{
+ unsigned long val;
+
+ if (!kvm_guest_has_pmu(&vcpu->arch))
+ return -EINVAL;
+
+ kvm_save_host_pmu(vcpu);
+
+ /* Set PM0-PM(num) to guest */
+ val = read_csr_gcfg() & ~CSR_GCFG_GPERF;
+ val |= (kvm_get_pmu_num(&vcpu->arch) + 1) << CSR_GCFG_GPERF_SHIFT;
+ write_csr_gcfg(val);
+
+ kvm_restore_guest_pmu(vcpu);
+
+ return 0;
+}
+
+static void kvm_lose_pmu(struct kvm_vcpu *vcpu)
+{
+ unsigned long val;
+ struct loongarch_csrs *csr = vcpu->arch.csr;
+
+ if (!(vcpu->arch.aux_inuse & KVM_LARCH_PMU))
+ return;
+
+ kvm_save_guest_pmu(vcpu);
+
+ /* Disable pmu access from guest */
+ write_csr_gcfg(read_csr_gcfg() & ~CSR_GCFG_GPERF);
+
+ /*
+	 * Clear KVM_LARCH_PMU if the guest is not using the PMU CSRs when
+	 * exiting the guest, so that the PMU CSR context does not need to
+	 * be handled on the next trap into the guest.
+ */
+ val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0);
+ val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1);
+ val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2);
+ val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3);
+ if (!(val & KVM_PMU_EVENT_ENABLED))
+ vcpu->arch.aux_inuse &= ~KVM_LARCH_PMU;
+
+ kvm_restore_host_pmu(vcpu);
+}
+
+static void kvm_restore_pmu(struct kvm_vcpu *vcpu)
+{
+ if ((vcpu->arch.aux_inuse & KVM_LARCH_PMU))
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+}
+
+static void kvm_check_pmu(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_PMU, vcpu)) {
+ kvm_own_pmu(vcpu);
+ vcpu->arch.aux_inuse |= KVM_LARCH_PMU;
+ }
+}
+
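Taken together these helpers implement a lazy PMU context switch: a guest PMU CSR access traps, kvm_handle_csr() in exit.c rewinds the PC and raises KVM_REQ_PMU, kvm_check_pmu() hands the hardware counters to the guest just before the next entry, and kvm_lose_pmu() takes them back on exit unless a guest counter is still enabled. A condensed sketch of one round trip, using only the helpers defined in this series (control flow only):

	/* 1. Guest touches a PMU CSR -> GSPR exit, kvm_handle_csr(): */
	vcpu->arch.pc -= 4;			/* re-execute the CSR op after entry */
	kvm_make_request(KVM_REQ_PMU, vcpu);	/* ask for the PMU before re-entry   */

	/* 2. kvm_pre_enter_guest() -> kvm_check_pmu(): */
	if (kvm_check_request(KVM_REQ_PMU, vcpu)) {
		kvm_own_pmu(vcpu);		/* save host PMU, grant GCFG.GPERF   */
		vcpu->arch.aux_inuse |= KVM_LARCH_PMU;
	}

	/*
	 * 3. kvm_handle_exit() -> kvm_lose_pmu(): guest PMU state is saved and
	 *    the host counters restored, unless a guest PERFCTRL is still active.
	 */
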
static void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
{
u32 version;
@@ -158,6 +279,7 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu)
/* Make sure the vcpu mode has been written */
smp_store_mb(vcpu->mode, IN_GUEST_MODE);
kvm_check_vpid(vcpu);
+ kvm_check_pmu(vcpu);
/*
* Called after function kvm_check_vpid()
@@ -195,6 +317,8 @@ static int kvm_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
/* Set a default exit reason */
run->exit_reason = KVM_EXIT_UNKNOWN;
+ kvm_lose_pmu(vcpu);
+
guest_timing_exit_irqoff();
guest_state_exit_irqoff();
local_irq_enable();
@@ -468,6 +592,22 @@ static int _kvm_setcsr(struct kvm_vcpu *vcpu, unsigned int id, u64 val)
kvm_write_sw_gcsr(csr, id, val);
+ /*
+	 * After modifying the vcpu's PMU CSR values, raise KVM_REQ_PMU
+	 * if any of the PMU CSRs are in use.
+ */
+ if (id >= LOONGARCH_CSR_PERFCTRL0 && id <= LOONGARCH_CSR_PERFCNTR3) {
+ unsigned long val;
+
+ val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0) |
+ kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1) |
+ kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL2) |
+ kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3);
+
+ if (val & KVM_PMU_EVENT_ENABLED)
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+
return ret;
}
@@ -497,6 +637,12 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v)
*v |= CPUCFG2_LSX;
if (cpu_has_lasx)
*v |= CPUCFG2_LASX;
+ if (cpu_has_lbt_x86)
+ *v |= CPUCFG2_X86BT;
+ if (cpu_has_lbt_arm)
+ *v |= CPUCFG2_ARMBT;
+ if (cpu_has_lbt_mips)
+ *v |= CPUCFG2_MIPSBT;
return 0;
case LOONGARCH_CPUCFG3:
@@ -506,6 +652,12 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v)
case LOONGARCH_CPUCFG5:
*v = GENMASK(31, 0);
return 0;
+ case LOONGARCH_CPUCFG6:
+ if (cpu_has_pmp)
+ *v = GENMASK(14, 0);
+ else
+ *v = 0;
+ return 0;
case LOONGARCH_CPUCFG16:
*v = GENMASK(16, 0);
return 0;
@@ -550,6 +702,17 @@ static int kvm_check_cpucfg(int id, u64 val)
/* LASX architecturally implies LSX and FP but val does not satisfy that */
return -EINVAL;
return 0;
+ case LOONGARCH_CPUCFG6:
+ if (val & CPUCFG6_PMP) {
+ u32 host = read_cpucfg(LOONGARCH_CPUCFG6);
+ if ((val & CPUCFG6_PMBITS) != (host & CPUCFG6_PMBITS))
+ return -EINVAL;
+ if ((val & CPUCFG6_PMNUM) > (host & CPUCFG6_PMNUM))
+ return -EINVAL;
+ if ((val & CPUCFG6_UPM) && !(host & CPUCFG6_UPM))
+ return -EINVAL;
+ }
+ return 0;
default:
/*
* Values for the other CPUCFG IDs are not being further validated
@@ -577,6 +740,34 @@ static int kvm_get_one_reg(struct kvm_vcpu *vcpu,
else
ret = -EINVAL;
break;
+ case KVM_REG_LOONGARCH_LBT:
+ if (!kvm_guest_has_lbt(&vcpu->arch))
+ return -ENXIO;
+
+ switch (reg->id) {
+ case KVM_REG_LOONGARCH_LBT_SCR0:
+ *v = vcpu->arch.lbt.scr0;
+ break;
+ case KVM_REG_LOONGARCH_LBT_SCR1:
+ *v = vcpu->arch.lbt.scr1;
+ break;
+ case KVM_REG_LOONGARCH_LBT_SCR2:
+ *v = vcpu->arch.lbt.scr2;
+ break;
+ case KVM_REG_LOONGARCH_LBT_SCR3:
+ *v = vcpu->arch.lbt.scr3;
+ break;
+ case KVM_REG_LOONGARCH_LBT_EFLAGS:
+ *v = vcpu->arch.lbt.eflags;
+ break;
+ case KVM_REG_LOONGARCH_LBT_FTOP:
+ *v = vcpu->arch.fpu.ftop;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ break;
case KVM_REG_LOONGARCH_KVM:
switch (reg->id) {
case KVM_REG_LOONGARCH_COUNTER:
@@ -635,6 +826,37 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu,
if (ret)
break;
vcpu->arch.cpucfg[id] = (u32)v;
+ if (id == LOONGARCH_CPUCFG6)
+ vcpu->arch.max_pmu_csrid =
+ LOONGARCH_CSR_PERFCTRL0 + 2 * kvm_get_pmu_num(&vcpu->arch) + 1;
+ break;
+ case KVM_REG_LOONGARCH_LBT:
+ if (!kvm_guest_has_lbt(&vcpu->arch))
+ return -ENXIO;
+
+ switch (reg->id) {
+ case KVM_REG_LOONGARCH_LBT_SCR0:
+ vcpu->arch.lbt.scr0 = v;
+ break;
+ case KVM_REG_LOONGARCH_LBT_SCR1:
+ vcpu->arch.lbt.scr1 = v;
+ break;
+ case KVM_REG_LOONGARCH_LBT_SCR2:
+ vcpu->arch.lbt.scr2 = v;
+ break;
+ case KVM_REG_LOONGARCH_LBT_SCR3:
+ vcpu->arch.lbt.scr3 = v;
+ break;
+ case KVM_REG_LOONGARCH_LBT_EFLAGS:
+ vcpu->arch.lbt.eflags = v;
+ break;
+ case KVM_REG_LOONGARCH_LBT_FTOP:
+ vcpu->arch.fpu.ftop = v;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
break;
case KVM_REG_LOONGARCH_KVM:
switch (reg->id) {
@@ -647,7 +869,7 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu,
vcpu->kvm->arch.time_offset = (signed long)(v - drdtime());
break;
case KVM_REG_LOONGARCH_VCPU_RESET:
- kvm_reset_timer(vcpu);
+ vcpu->arch.st.guest_addr = 0;
memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending));
memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear));
break;
@@ -728,7 +950,10 @@ static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
switch (attr->attr) {
- case 2:
+ case LOONGARCH_CPUCFG2:
+ case LOONGARCH_CPUCFG6:
+ return 0;
+ case CPUCFG_KVM_FEATURE:
return 0;
default:
return -ENXIO;
@@ -740,8 +965,8 @@ static int kvm_loongarch_cpucfg_has_attr(struct kvm_vcpu *vcpu,
static int kvm_loongarch_pvtime_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- if (!kvm_pvtime_supported() ||
- attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
+ if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME)
+ || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
return -ENXIO;
return 0;
@@ -773,9 +998,18 @@ static int kvm_loongarch_cpucfg_get_attr(struct kvm_vcpu *vcpu,
uint64_t val;
uint64_t __user *uaddr = (uint64_t __user *)attr->addr;
- ret = _kvm_get_cpucfg_mask(attr->attr, &val);
- if (ret)
- return ret;
+ switch (attr->attr) {
+ case 0 ... (KVM_MAX_CPUCFG_REGS - 1):
+ ret = _kvm_get_cpucfg_mask(attr->attr, &val);
+ if (ret)
+ return ret;
+ break;
+ case CPUCFG_KVM_FEATURE:
+ val = vcpu->kvm->arch.pv_features & LOONGARCH_PV_FEAT_MASK;
+ break;
+ default:
+ return -ENXIO;
+ }
put_user(val, uaddr);
@@ -788,8 +1022,8 @@ static int kvm_loongarch_pvtime_get_attr(struct kvm_vcpu *vcpu,
u64 gpa;
u64 __user *user = (u64 __user *)attr->addr;
- if (!kvm_pvtime_supported() ||
- attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
+ if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME)
+ || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
return -ENXIO;
gpa = vcpu->arch.st.guest_addr;
@@ -821,7 +1055,28 @@ static int kvm_loongarch_vcpu_get_attr(struct kvm_vcpu *vcpu,
static int kvm_loongarch_cpucfg_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- return -ENXIO;
+ u64 val, valid;
+ u64 __user *user = (u64 __user *)attr->addr;
+ struct kvm *kvm = vcpu->kvm;
+
+ switch (attr->attr) {
+ case CPUCFG_KVM_FEATURE:
+ if (get_user(val, user))
+ return -EFAULT;
+
+ valid = LOONGARCH_PV_FEAT_MASK;
+ if (val & ~valid)
+ return -EINVAL;
+
+		/* All vCPUs need to set the same PV features */
+ if ((kvm->arch.pv_features & LOONGARCH_PV_FEAT_UPDATED)
+ && ((kvm->arch.pv_features & valid) != val))
+ return -EINVAL;
+ kvm->arch.pv_features = val | LOONGARCH_PV_FEAT_UPDATED;
+ return 0;
+ default:
+ return -ENXIO;
+ }
}
static int kvm_loongarch_pvtime_set_attr(struct kvm_vcpu *vcpu,
@@ -831,8 +1086,8 @@ static int kvm_loongarch_pvtime_set_attr(struct kvm_vcpu *vcpu,
u64 gpa, __user *user = (u64 __user *)attr->addr;
struct kvm *kvm = vcpu->kvm;
- if (!kvm_pvtime_supported() ||
- attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
+ if (!kvm_guest_has_pv_feature(vcpu, KVM_FEATURE_STEAL_TIME)
+ || attr->attr != KVM_LOONGARCH_VCPU_PVTIME_GPA)
return -ENXIO;
if (get_user(gpa, user))
@@ -977,12 +1232,66 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
return 0;
}
+#ifdef CONFIG_CPU_HAS_LBT
+int kvm_own_lbt(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_guest_has_lbt(&vcpu->arch))
+ return -EINVAL;
+
+ preempt_disable();
+ set_csr_euen(CSR_EUEN_LBTEN);
+ _restore_lbt(&vcpu->arch.lbt);
+ vcpu->arch.aux_inuse |= KVM_LARCH_LBT;
+ preempt_enable();
+
+ return 0;
+}
+
+static void kvm_lose_lbt(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ if (vcpu->arch.aux_inuse & KVM_LARCH_LBT) {
+ _save_lbt(&vcpu->arch.lbt);
+ clear_csr_euen(CSR_EUEN_LBTEN);
+ vcpu->arch.aux_inuse &= ~KVM_LARCH_LBT;
+ }
+ preempt_enable();
+}
+
+static void kvm_check_fcsr(struct kvm_vcpu *vcpu, unsigned long fcsr)
+{
+ /*
+	 * If TM is enabled, saving/restoring the top register will
+	 * trigger an LBT exception, so enable LBT here in advance.
+ */
+ if (fcsr & FPU_CSR_TM)
+ kvm_own_lbt(vcpu);
+}
+
+static void kvm_check_fcsr_alive(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.aux_inuse & KVM_LARCH_FPU) {
+ if (vcpu->arch.aux_inuse & KVM_LARCH_LBT)
+ return;
+ kvm_check_fcsr(vcpu, read_fcsr(LOONGARCH_FCSR0));
+ }
+}
+#else
+static inline void kvm_lose_lbt(struct kvm_vcpu *vcpu) { }
+static inline void kvm_check_fcsr(struct kvm_vcpu *vcpu, unsigned long fcsr) { }
+static inline void kvm_check_fcsr_alive(struct kvm_vcpu *vcpu) { }
+#endif
+
/* Enable FPU and restore context */
void kvm_own_fpu(struct kvm_vcpu *vcpu)
{
preempt_disable();
- /* Enable FPU */
+ /*
+ * Enable FPU for guest
+ * Set FR and FRE according to guest context
+ */
+ kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr);
set_csr_euen(CSR_EUEN_FPEN);
kvm_restore_fpu(&vcpu->arch.fpu);
@@ -1002,6 +1311,7 @@ int kvm_own_lsx(struct kvm_vcpu *vcpu)
preempt_disable();
/* Enable LSX for guest */
+ kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr);
set_csr_euen(CSR_EUEN_LSXEN | CSR_EUEN_FPEN);
switch (vcpu->arch.aux_inuse & KVM_LARCH_FPU) {
case KVM_LARCH_FPU:
@@ -1036,6 +1346,7 @@ int kvm_own_lasx(struct kvm_vcpu *vcpu)
preempt_disable();
+ kvm_check_fcsr(vcpu, vcpu->arch.fpu.fcsr);
set_csr_euen(CSR_EUEN_FPEN | CSR_EUEN_LSXEN | CSR_EUEN_LASXEN);
switch (vcpu->arch.aux_inuse & (KVM_LARCH_FPU | KVM_LARCH_LSX)) {
case KVM_LARCH_LSX:
@@ -1067,6 +1378,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
{
preempt_disable();
+ kvm_check_fcsr_alive(vcpu);
if (vcpu->arch.aux_inuse & KVM_LARCH_LASX) {
kvm_save_lasx(&vcpu->arch.fpu);
vcpu->arch.aux_inuse &= ~(KVM_LARCH_LSX | KVM_LARCH_FPU | KVM_LARCH_LASX);
@@ -1089,6 +1401,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
/* Disable FPU */
clear_csr_euen(CSR_EUEN_FPEN);
}
+ kvm_lose_lbt(vcpu);
preempt_enable();
}
@@ -1235,6 +1548,9 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+ /* Restore hardware PMU CSRs */
+ kvm_restore_pmu(vcpu);
+
/* Don't bother restoring registers multiple times unless necessary */
if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE)
return 0;
diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c
index 6b2e4f66ad26..4ba734aaef87 100644
--- a/arch/loongarch/kvm/vm.c
+++ b/arch/loongarch/kvm/vm.c
@@ -5,6 +5,7 @@
#include <linux/kvm_host.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_vcpu.h>
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
@@ -39,6 +40,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
spin_lock_init(&kvm->arch.phyid_map_lock);
kvm_init_vmcs(kvm);
+
+ /* Enable all PV features by default */
+ kvm->arch.pv_features = BIT(KVM_FEATURE_IPI);
+ if (kvm_pvtime_supported())
+ kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME);
+
kvm->arch.gpa_size = BIT(cpu_vabits - 1);
kvm->arch.root_level = CONFIG_PGTABLE_LEVELS - 1;
kvm->arch.invalid_ptes[0] = 0;
@@ -99,7 +106,67 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
return r;
}
+static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_LOONGARCH_VM_FEAT_LSX:
+ if (cpu_has_lsx)
+ return 0;
+ return -ENXIO;
+ case KVM_LOONGARCH_VM_FEAT_LASX:
+ if (cpu_has_lasx)
+ return 0;
+ return -ENXIO;
+ case KVM_LOONGARCH_VM_FEAT_X86BT:
+ if (cpu_has_lbt_x86)
+ return 0;
+ return -ENXIO;
+ case KVM_LOONGARCH_VM_FEAT_ARMBT:
+ if (cpu_has_lbt_arm)
+ return 0;
+ return -ENXIO;
+ case KVM_LOONGARCH_VM_FEAT_MIPSBT:
+ if (cpu_has_lbt_mips)
+ return 0;
+ return -ENXIO;
+ case KVM_LOONGARCH_VM_FEAT_PMU:
+ if (cpu_has_pmp)
+ return 0;
+ return -ENXIO;
+ case KVM_LOONGARCH_VM_FEAT_PV_IPI:
+ return 0;
+ case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME:
+ if (kvm_pvtime_supported())
+ return 0;
+ return -ENXIO;
+ default:
+ return -ENXIO;
+ }
+}
+
+static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_LOONGARCH_VM_FEAT_CTRL:
+ return kvm_vm_feature_has_attr(kvm, attr);
+ default:
+ return -ENXIO;
+ }
+}
+
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
- return -ENOIOCTLCMD;
+ void __user *argp = (void __user *)arg;
+ struct kvm *kvm = filp->private_data;
+ struct kvm_device_attr attr;
+
+ switch (ioctl) {
+ case KVM_HAS_DEVICE_ATTR:
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ return kvm_vm_has_attr(kvm, &attr);
+ default:
+ return -ENOIOCTLCMD;
+ }
}
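A hedged sketch of the new VM-level probing path from userspace: KVM_HAS_DEVICE_ATTR on the VM fd with group KVM_LOONGARCH_VM_FEAT_CTRL returns 0 when the feature check above succeeds and fails with ENXIO otherwise. The KVM_LOONGARCH_VM_FEAT_* names are assumed to be the uapi spellings of the constants used in this hunk.

/* Hypothetical userspace sketch (not part of the patch). */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int vm_has_feature(int vm_fd, uint64_t feat)
{
	struct kvm_device_attr attr = {
		.group = KVM_LOONGARCH_VM_FEAT_CTRL,	/* assumed uapi constant */
		.attr = feat,
	};

	/* 0: supported; -1 with errno == ENXIO: not supported */
	return ioctl(vm_fd, KVM_HAS_DEVICE_ATTR, &attr) == 0;
}

For example, vm_has_feature(vm_fd, KVM_LOONGARCH_VM_FEAT_LASX) would report whether LASX can be enabled for guests on this host.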
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 3827dc76edd8..4520c5741579 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -193,11 +193,6 @@ asmlinkage void __init mmu_init(void)
{
unsigned int kstart, ksize;
- if (!memblock.reserved.cnt) {
- pr_emerg("Error memory count\n");
- machine_restart(NULL);
- }
-
if ((u32) memblock.memory.regions[0].size < 0x400000) {
pr_emerg("Memory must be greater than 4MB\n");
machine_restart(NULL);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 6743a57c1ab4..f7222eb594ea 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -728,8 +728,8 @@ struct kvm_mips_callbacks {
int (*handle_fpe)(struct kvm_vcpu *vcpu);
int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
int (*handle_guest_exit)(struct kvm_vcpu *vcpu);
- int (*hardware_enable)(void);
- void (*hardware_disable)(void);
+ int (*enable_virtualization_cpu)(void);
+ void (*disable_virtualization_cpu)(void);
int (*check_extension)(struct kvm *kvm, long ext);
int (*vcpu_init)(struct kvm_vcpu *vcpu);
void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c
index 368e8475870f..5f6e9e2ebbdb 100644
--- a/arch/mips/kernel/cevt-r4k.c
+++ b/arch/mips/kernel/cevt-r4k.c
@@ -303,13 +303,6 @@ int r4k_clockevent_init(void)
if (!c0_compare_int_usable())
return -ENXIO;
- /*
- * With vectored interrupts things are getting platform specific.
- * get_c0_compare_int is a hook to allow a platform to return the
- * interrupt number of its liking.
- */
- irq = get_c0_compare_int();
-
cd = &per_cpu(mips_clockevent_device, cpu);
cd->name = "MIPS";
@@ -320,7 +313,6 @@ int r4k_clockevent_init(void)
min_delta = calculate_min_delta();
cd->rating = 300;
- cd->irq = irq;
cd->cpumask = cpumask_of(cpu);
cd->set_next_event = mips_next_event;
cd->event_handler = mips_event_handler;
@@ -332,6 +324,13 @@ int r4k_clockevent_init(void)
cp0_timer_irq_installed = 1;
+ /*
+ * With vectored interrupts things are getting platform specific.
+ * get_c0_compare_int is a hook to allow a platform to return the
+ * interrupt number of its liking.
+ */
+ irq = get_c0_compare_int();
+
if (request_irq(irq, c0_compare_interrupt, flags, "timer",
c0_compare_interrupt))
pr_err("Failed to request irq %d (timer)\n", irq);
diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c
index bda7f193baab..af7412549e6e 100644
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -1724,12 +1724,16 @@ static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu)
c->ases |= (MIPS_ASE_LOONGSON_MMI | MIPS_ASE_LOONGSON_CAM |
MIPS_ASE_LOONGSON_EXT | MIPS_ASE_LOONGSON_EXT2);
c->ases &= ~MIPS_ASE_VZ; /* VZ of Loongson-3A2000/3000 is incomplete */
+ change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER,
+ LOONGSON_CONF6_INTIMER);
break;
case PRID_IMP_LOONGSON_64G:
__cpu_name[cpu] = "ICT Loongson-3";
set_elf_platform(cpu, "loongson3a");
set_isa(c, MIPS_CPU_ISA_M64R2);
decode_cpucfg(c);
+ change_c0_config6(LOONGSON_CONF6_EXTIMER | LOONGSON_CONF6_INTIMER,
+ LOONGSON_CONF6_INTIMER);
break;
default:
panic("Unknown Loongson Processor ID!");
diff --git a/arch/mips/kernel/csrc-r4k.c b/arch/mips/kernel/csrc-r4k.c
index bdb1fa8931f4..59eca397f297 100644
--- a/arch/mips/kernel/csrc-r4k.c
+++ b/arch/mips/kernel/csrc-r4k.c
@@ -21,9 +21,7 @@ static struct clocksource clocksource_mips = {
.name = "MIPS",
.read = c0_hpt_read,
.mask = CLOCKSOURCE_MASK(32),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS |
- CLOCK_SOURCE_MUST_VERIFY |
- CLOCK_SOURCE_VERIFY_PERCPU,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static u64 __maybe_unused notrace r4k_read_sched_clock(void)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index b5de770b092e..60b43ea85c12 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -125,14 +125,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
return 1;
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
- return kvm_mips_callbacks->hardware_enable();
+ return kvm_mips_callbacks->enable_virtualization_cpu();
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
- kvm_mips_callbacks->hardware_disable();
+ kvm_mips_callbacks->disable_virtualization_cpu();
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 99d5a71e4300..ccab4d76b126 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -2869,7 +2869,7 @@ static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size)
return ret + 1;
}
-static int kvm_vz_hardware_enable(void)
+static int kvm_vz_enable_virtualization_cpu(void)
{
unsigned int mmu_size, guest_mmu_size, ftlb_size;
u64 guest_cvmctl, cvmvmconfig;
@@ -2983,7 +2983,7 @@ static int kvm_vz_hardware_enable(void)
return 0;
}
-static void kvm_vz_hardware_disable(void)
+static void kvm_vz_disable_virtualization_cpu(void)
{
u64 cvmvmconfig;
unsigned int mmu_size;
@@ -3280,8 +3280,8 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
.handle_msa_disabled = kvm_trap_vz_handle_msa_disabled,
.handle_guest_exit = kvm_trap_vz_handle_guest_exit,
- .hardware_enable = kvm_vz_hardware_enable,
- .hardware_disable = kvm_vz_hardware_disable,
+ .enable_virtualization_cpu = kvm_vz_enable_virtualization_cpu,
+ .disable_virtualization_cpu = kvm_vz_disable_virtualization_cpu,
.check_extension = kvm_vz_check_extension,
.vcpu_init = kvm_vz_vcpu_init,
.vcpu_uninit = kvm_vz_vcpu_uninit,
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 34d91cb8b259..96970fa75e4a 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -459,7 +459,6 @@ void free_initmem(void)
unsigned long kernel_end = (unsigned long)&_end;
/* Remap kernel text and data, but do not touch init section yet. */
- kernel_set_to_readonly = true;
map_pages(init_end, __pa(init_end), kernel_end - init_end,
PAGE_KERNEL, 0);
@@ -493,11 +492,18 @@ void free_initmem(void)
#ifdef CONFIG_STRICT_KERNEL_RWX
void mark_rodata_ro(void)
{
- /* rodata memory was already mapped with KERNEL_RO access rights by
- pagetable_init() and map_pages(). No need to do additional stuff here */
- unsigned long roai_size = __end_ro_after_init - __start_ro_after_init;
+ unsigned long start = (unsigned long) &__start_rodata;
+ unsigned long end = (unsigned long) &__end_rodata;
+
+ pr_info("Write protecting the kernel read-only data: %luk\n",
+ (end - start) >> 10);
+
+ kernel_set_to_readonly = true;
+ map_pages(start, __pa(start), end - start, PAGE_KERNEL, 0);
- pr_info("Write protected read-only-after-init data: %luk\n", roai_size >> 10);
+ /* force the kernel to see the new page table entries */
+ flush_cache_all();
+ flush_tlb_all();
}
#endif
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 9508399dd036..b481738c4bb5 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -52,7 +52,7 @@
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
#define pgd_ERROR(e) \
- pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+ pr_err("%s:%d: bad pgd %08llx.\n", __FILE__, __LINE__, (unsigned long long)pgd_val(e))
/*
* This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
@@ -170,7 +170,7 @@ static inline void pmd_clear(pmd_t *pmdp)
#define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT)
#else
#define pmd_page_vaddr(pmd) \
- ((const void *)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1)))
+ ((const void *)((unsigned long)pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1)))
#define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT)
#endif
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 7b3d4c592a10..f3086e39e7d2 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -49,16 +49,22 @@ static inline unsigned long pud_val(pud_t x)
#endif /* CONFIG_PPC64 */
/* PGD level */
-#if defined(CONFIG_PPC_E500) && defined(CONFIG_PTE_64BIT)
+#if defined(CONFIG_PPC_85xx) && defined(CONFIG_PTE_64BIT)
typedef struct { unsigned long long pgd; } pgd_t;
+
+static inline unsigned long long pgd_val(pgd_t x)
+{
+ return x.pgd;
+}
#else
typedef struct { unsigned long pgd; } pgd_t;
-#endif
-#define __pgd(x) ((pgd_t) { (x) })
+
static inline unsigned long pgd_val(pgd_t x)
{
return x.pgd;
}
+#endif
+#define __pgd(x) ((pgd_t) { (x) })
/* Page protection bits */
typedef struct { unsigned long pgprot; } pgprot_t;
diff --git a/arch/powerpc/kernel/vdso/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S
index 426e1ccc6971..8f57107000a2 100644
--- a/arch/powerpc/kernel/vdso/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso/vdso32.lds.S
@@ -74,6 +74,8 @@ SECTIONS
.got : { *(.got) } :text
.plt : { *(.plt) }
+ .rela.dyn : { *(.rela .rela*) }
+
_end = .;
__end = .;
PROVIDE(end = .);
@@ -87,7 +89,7 @@ SECTIONS
*(.branch_lt)
*(.data .data.* .gnu.linkonce.d.* .sdata*)
*(.bss .sbss .dynbss .dynsbss)
- *(.got1 .glink .iplt .rela*)
+ *(.got1 .glink .iplt)
}
}
diff --git a/arch/powerpc/kernel/vdso/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S
index bda6c8cdd459..400819258c06 100644
--- a/arch/powerpc/kernel/vdso/vdso64.lds.S
+++ b/arch/powerpc/kernel/vdso/vdso64.lds.S
@@ -69,7 +69,7 @@ SECTIONS
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
.gcc_except_table : { *(.gcc_except_table) }
- .rela.dyn ALIGN(8) : { *(.rela.dyn) }
+ .rela.dyn ALIGN(8) : { *(.rela .rela*) }
.got ALIGN(8) : { *(.got .toc) }
@@ -86,7 +86,7 @@ SECTIONS
*(.data .data.* .gnu.linkonce.d.* .sdata*)
*(.bss .sbss .dynbss .dynsbss)
*(.opd)
- *(.glink .iplt .plt .rela*)
+ *(.glink .iplt .plt)
}
}
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 5de4dd549f6e..bcc7e4dff8c3 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -697,7 +697,15 @@ again:
}
release:
- qnodesp->count--; /* release the node */
+ /*
+ * Clear the lock before releasing the node, as another CPU might see stale
+ * values if an interrupt occurs after we increment qnodesp->count
+ * but before node->lock is initialized. The barrier ensures that
+ * there are no further stores to the node after it has been released.
+ */
+ node->lock = NULL;
+ barrier();
+ qnodesp->count--;
}
void queued_spin_lock_slowpath(struct qspinlock *lock)
diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c
index 113edf76d3ce..d26656b07b72 100644
--- a/arch/powerpc/mm/nohash/tlb_64e.c
+++ b/arch/powerpc/mm/nohash/tlb_64e.c
@@ -33,7 +33,7 @@
* though this will probably be made common with other nohash
* implementations at some point
*/
-int mmu_pte_psize; /* Page size used for PTE pages */
+static int mmu_pte_psize; /* Page size used for PTE pages */
int mmu_vmemmap_psize; /* Page size used for the virtual mem map */
int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */
unsigned long linear_map_top; /* Top of linear mapping */
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 0f3cd7c3a436..939ea7f6a228 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -552,8 +552,8 @@ config RISCV_ISA_SVPBMT
config TOOLCHAIN_HAS_V
bool
default y
- depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64iv)
- depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32iv)
+ depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64imv)
+ depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32imv)
depends on LLD_VERSION >= 140000 || LD_VERSION >= 23800
depends on AS_HAS_OPTION_ARCH
diff --git a/arch/riscv/include/asm/kvm_vcpu_pmu.h b/arch/riscv/include/asm/kvm_vcpu_pmu.h
index fa0f535bbbf0..1d85b6617508 100644
--- a/arch/riscv/include/asm/kvm_vcpu_pmu.h
+++ b/arch/riscv/include/asm/kvm_vcpu_pmu.h
@@ -10,6 +10,7 @@
#define __KVM_VCPU_RISCV_PMU_H
#include <linux/perf/riscv_pmu.h>
+#include <asm/kvm_vcpu_insn.h>
#include <asm/sbi.h>
#ifdef CONFIG_RISCV_PMU_SBI
@@ -64,11 +65,11 @@ struct kvm_pmu {
#if defined(CONFIG_32BIT)
#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \
-{.base = CSR_CYCLEH, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, \
-{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm },
+{.base = CSR_CYCLEH, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm }, \
+{.base = CSR_CYCLE, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm },
#else
#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \
-{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm },
+{.base = CSR_CYCLE, .count = 32, .func = kvm_riscv_vcpu_pmu_read_hpm },
#endif
int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid);
@@ -104,8 +105,20 @@ void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu);
struct kvm_pmu {
};
+static inline int kvm_riscv_vcpu_pmu_read_legacy(struct kvm_vcpu *vcpu, unsigned int csr_num,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask)
+{
+ if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) {
+ *val = 0;
+ return KVM_INSN_CONTINUE_NEXT_SEPC;
+ } else {
+ return KVM_INSN_ILLEGAL_TRAP;
+ }
+}
+
#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \
-{.base = 0, .count = 0, .func = NULL },
+{.base = CSR_CYCLE, .count = 3, .func = kvm_riscv_vcpu_pmu_read_legacy },
static inline void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) {}
static inline int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 8702b8721a27..efa1b3519b23 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -14,36 +14,14 @@
#include <asm/ptrace.h>
-/*
- * addr is a hint to the maximum userspace address that mmap should provide, so
- * this macro needs to return the largest address space available so that
- * mmap_end < addr, being mmap_end the top of that address space.
- * See Documentation/arch/riscv/vm-layout.rst for more details.
- */
#define arch_get_mmap_end(addr, len, flags) \
({ \
- unsigned long mmap_end; \
- typeof(addr) _addr = (addr); \
- if ((_addr) == 0 || is_compat_task() || \
- ((_addr + len) > BIT(VA_BITS - 1))) \
- mmap_end = STACK_TOP_MAX; \
- else \
- mmap_end = (_addr + len); \
- mmap_end; \
+ STACK_TOP_MAX; \
})
#define arch_get_mmap_base(addr, base) \
({ \
- unsigned long mmap_base; \
- typeof(addr) _addr = (addr); \
- typeof(base) _base = (base); \
- unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \
- if ((_addr) == 0 || is_compat_task() || \
- ((_addr + len) > BIT(VA_BITS - 1))) \
- mmap_base = (_base); \
- else \
- mmap_base = (_addr + len) - rnd_gap; \
- mmap_base; \
+ base; \
})
#ifdef CONFIG_64BIT
diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 7cffd4ffecd0..7bd3746028c9 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/cpumask.h>
+#include <linux/jump_label.h>
#ifdef CONFIG_RISCV_SBI
enum sbi_ext_id {
@@ -304,6 +305,7 @@ struct sbiret {
};
void sbi_init(void);
+long __sbi_base_ecall(int fid);
struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1,
unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5,
@@ -373,7 +375,23 @@ static inline unsigned long sbi_mk_version(unsigned long major,
| (minor & SBI_SPEC_VERSION_MINOR_MASK);
}
-int sbi_err_map_linux_errno(int err);
+static inline int sbi_err_map_linux_errno(int err)
+{
+ switch (err) {
+ case SBI_SUCCESS:
+ return 0;
+ case SBI_ERR_DENIED:
+ return -EPERM;
+ case SBI_ERR_INVALID_PARAM:
+ return -EINVAL;
+ case SBI_ERR_INVALID_ADDRESS:
+ return -EFAULT;
+ case SBI_ERR_NOT_SUPPORTED:
+ case SBI_ERR_FAILURE:
+ default:
+ return -ENOTSUPP;
+ };
+}
extern bool sbi_debug_console_available;
int sbi_debug_console_write(const char *bytes, unsigned int num_bytes);
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 06d407f1b30b..7f88cc4931f5 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -20,17 +20,21 @@ endif
ifdef CONFIG_RISCV_ALTERNATIVE_EARLY
CFLAGS_alternative.o := -mcmodel=medany
CFLAGS_cpufeature.o := -mcmodel=medany
+CFLAGS_sbi_ecall.o := -mcmodel=medany
ifdef CONFIG_FTRACE
CFLAGS_REMOVE_alternative.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_cpufeature.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_sbi_ecall.o = $(CC_FLAGS_FTRACE)
endif
ifdef CONFIG_RELOCATABLE
CFLAGS_alternative.o += -fno-pie
CFLAGS_cpufeature.o += -fno-pie
+CFLAGS_sbi_ecall.o += -fno-pie
endif
ifdef CONFIG_KASAN
KASAN_SANITIZE_alternative.o := n
KASAN_SANITIZE_cpufeature.o := n
+KASAN_SANITIZE_sbi_ecall.o := n
endif
endif
@@ -88,7 +92,7 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += mcount-dyn.o
obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o
-obj-$(CONFIG_RISCV_SBI) += sbi.o
+obj-$(CONFIG_RISCV_SBI) += sbi.o sbi_ecall.o
ifeq ($(CONFIG_RISCV_SBI), y)
obj-$(CONFIG_SMP) += sbi-ipi.o
obj-$(CONFIG_SMP) += cpu_ops_sbi.o
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index 837bdab2601b..1989b8cade1b 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -14,9 +14,6 @@
#include <asm/smp.h>
#include <asm/tlbflush.h>
-#define CREATE_TRACE_POINTS
-#include <asm/trace.h>
-
/* default SBI version is 0.1 */
unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT;
EXPORT_SYMBOL(sbi_spec_version);
@@ -27,55 +24,6 @@ static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask,
unsigned long start, unsigned long size,
unsigned long arg4, unsigned long arg5) __ro_after_init;
-struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1,
- unsigned long arg2, unsigned long arg3,
- unsigned long arg4, unsigned long arg5,
- int fid, int ext)
-{
- struct sbiret ret;
-
- trace_sbi_call(ext, fid);
-
- register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0);
- register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1);
- register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2);
- register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3);
- register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4);
- register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5);
- register uintptr_t a6 asm ("a6") = (uintptr_t)(fid);
- register uintptr_t a7 asm ("a7") = (uintptr_t)(ext);
- asm volatile ("ecall"
- : "+r" (a0), "+r" (a1)
- : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7)
- : "memory");
- ret.error = a0;
- ret.value = a1;
-
- trace_sbi_return(ext, ret.error, ret.value);
-
- return ret;
-}
-EXPORT_SYMBOL(__sbi_ecall);
-
-int sbi_err_map_linux_errno(int err)
-{
- switch (err) {
- case SBI_SUCCESS:
- return 0;
- case SBI_ERR_DENIED:
- return -EPERM;
- case SBI_ERR_INVALID_PARAM:
- return -EINVAL;
- case SBI_ERR_INVALID_ADDRESS:
- return -EFAULT;
- case SBI_ERR_NOT_SUPPORTED:
- case SBI_ERR_FAILURE:
- default:
- return -ENOTSUPP;
- };
-}
-EXPORT_SYMBOL(sbi_err_map_linux_errno);
-
#ifdef CONFIG_RISCV_SBI_V01
static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask)
{
@@ -535,17 +483,6 @@ long sbi_probe_extension(int extid)
}
EXPORT_SYMBOL(sbi_probe_extension);
-static long __sbi_base_ecall(int fid)
-{
- struct sbiret ret;
-
- ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0);
- if (!ret.error)
- return ret.value;
- else
- return sbi_err_map_linux_errno(ret.error);
-}
-
static inline long sbi_get_spec_version(void)
{
return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION);
diff --git a/arch/riscv/kernel/sbi_ecall.c b/arch/riscv/kernel/sbi_ecall.c
new file mode 100644
index 000000000000..24aabb4fbde3
--- /dev/null
+++ b/arch/riscv/kernel/sbi_ecall.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Rivos Inc. */
+
+#include <asm/sbi.h>
+#define CREATE_TRACE_POINTS
+#include <asm/trace.h>
+
+long __sbi_base_ecall(int fid)
+{
+ struct sbiret ret;
+
+ ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0);
+ if (!ret.error)
+ return ret.value;
+ else
+ return sbi_err_map_linux_errno(ret.error);
+}
+EXPORT_SYMBOL(__sbi_base_ecall);
+
+struct sbiret __sbi_ecall(unsigned long arg0, unsigned long arg1,
+ unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5,
+ int fid, int ext)
+{
+ struct sbiret ret;
+
+ trace_sbi_call(ext, fid);
+
+ register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0);
+ register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1);
+ register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2);
+ register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3);
+ register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4);
+ register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5);
+ register uintptr_t a6 asm ("a6") = (uintptr_t)(fid);
+ register uintptr_t a7 asm ("a7") = (uintptr_t)(ext);
+ asm volatile ("ecall"
+ : "+r" (a0), "+r" (a1)
+ : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7)
+ : "memory");
+ ret.error = a0;
+ ret.value = a1;
+
+ trace_sbi_return(ext, ret.error, ret.value);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sbi_ecall);
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 192cd5603e95..d4fd8af7aaf5 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -417,7 +417,7 @@ int handle_misaligned_load(struct pt_regs *regs)
val.data_u64 = 0;
if (user_mode(regs)) {
- if (raw_copy_from_user(&val, (u8 __user *)addr, len))
+ if (copy_from_user(&val, (u8 __user *)addr, len))
return -1;
} else {
memcpy(&val, (u8 *)addr, len);
@@ -515,7 +515,7 @@ int handle_misaligned_store(struct pt_regs *regs)
return -EOPNOTSUPP;
if (user_mode(regs)) {
- if (raw_copy_to_user((u8 __user *)addr, &val, len))
+ if (copy_to_user((u8 __user *)addr, &val, len))
return -1;
} else {
memcpy((u8 *)addr, &val, len);
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index bab2ec34cd87..f3427f6de608 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -20,7 +20,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -EINVAL;
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT);
csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT);
@@ -35,7 +35,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
kvm_riscv_aia_disable();
diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
index bcf41d6e0df0..2707a51b082c 100644
--- a/arch/riscv/kvm/vcpu_pmu.c
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -391,19 +391,9 @@ int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
static void kvm_pmu_clear_snapshot_area(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
- int snapshot_area_size = sizeof(struct riscv_pmu_snapshot_data);
- if (kvpmu->sdata) {
- if (kvpmu->snapshot_addr != INVALID_GPA) {
- memset(kvpmu->sdata, 0, snapshot_area_size);
- kvm_vcpu_write_guest(vcpu, kvpmu->snapshot_addr,
- kvpmu->sdata, snapshot_area_size);
- } else {
- pr_warn("snapshot address invalid\n");
- }
- kfree(kvpmu->sdata);
- kvpmu->sdata = NULL;
- }
+ kfree(kvpmu->sdata);
+ kvpmu->sdata = NULL;
kvpmu->snapshot_addr = INVALID_GPA;
}
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index 62f409d4176e..7de128be8db9 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -127,8 +127,8 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
run->riscv_sbi.args[3] = cp->a3;
run->riscv_sbi.args[4] = cp->a4;
run->riscv_sbi.args[5] = cp->a5;
- run->riscv_sbi.ret[0] = cp->a0;
- run->riscv_sbi.ret[1] = cp->a1;
+ run->riscv_sbi.ret[0] = SBI_ERR_NOT_SUPPORTED;
+ run->riscv_sbi.ret[1] = 0;
}
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index eb0649a61b4c..1785782c2e55 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -252,7 +252,7 @@ static void __init setup_bootmem(void)
* The size of the linear page mapping may restrict the amount of
* usable RAM.
*/
- if (IS_ENABLED(CONFIG_64BIT)) {
+ if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) {
max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
memblock_cap_memory_range(phys_ram_base,
max_mapped_addr - phys_ram_base);
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a822f952f64a..c60e699e99f5 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -604,6 +604,19 @@ config RANDOMIZE_BASE
as a security feature that deters exploit attempts relying on
knowledge of the location of kernel internals.
+config RANDOMIZE_IDENTITY_BASE
+ bool "Randomize the address of the identity mapping base"
+ depends on RANDOMIZE_BASE
+ default DEBUG_VM
+ help
+ The identity mapping base address is pinned to zero by default.
+ Allow randomization of that base to help expose bugs caused by
+ conflating physical and virtual addresses of data structures.
+ This has no impact on the base address at which the kernel
+ image is loaded.
+
+ If unsure, say N.
+
config KERNEL_IMAGE_BASE
hex "Kernel image base address"
range 0x100000 0x1FFFFFE0000000 if !KASAN
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index ce232552bc1c..c73b5118ad42 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -162,7 +162,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr,
loc = (long)*reloc + phys_offset;
if (loc < min_addr || loc > max_addr)
error("64-bit relocation outside of kernel!\n");
- *(u64 *)loc += offset - __START_KERNEL;
+ *(u64 *)loc += offset;
}
}
@@ -177,7 +177,7 @@ static void kaslr_adjust_got(unsigned long offset)
*/
for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) {
if (*entry)
- *entry += offset - __START_KERNEL;
+ *entry += offset;
}
}
@@ -252,7 +252,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size)
vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page);
/* choose kernel address space layout: 4 or 3 levels. */
- BUILD_BUG_ON(!IS_ALIGNED(__START_KERNEL, THREAD_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(TEXT_OFFSET, THREAD_SIZE));
BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE));
BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE);
vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE);
@@ -341,7 +341,8 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size)
BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS));
max_mappable = max(ident_map_size, MAX_DCSS_ADDR);
max_mappable = min(max_mappable, vmemmap_start);
- __identity_base = round_down(vmemmap_start - max_mappable, rte_size);
+ if (IS_ENABLED(CONFIG_RANDOMIZE_IDENTITY_BASE))
+ __identity_base = round_down(vmemmap_start - max_mappable, rte_size);
return asce_limit;
}
@@ -388,31 +389,25 @@ static void kaslr_adjust_vmlinux_info(long offset)
#endif
}
-static void fixup_vmlinux_info(void)
-{
- vmlinux.entry -= __START_KERNEL;
- kaslr_adjust_vmlinux_info(-__START_KERNEL);
-}
-
void startup_kernel(void)
{
- unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size;
- unsigned long nokaslr_offset_phys, kaslr_large_page_offset;
- unsigned long amode31_lma = 0;
+ unsigned long vmlinux_size = vmlinux.image_size + vmlinux.bss_size;
+ unsigned long nokaslr_text_lma, text_lma = 0, amode31_lma = 0;
+ unsigned long kernel_size = TEXT_OFFSET + vmlinux_size;
+ unsigned long kaslr_large_page_offset;
unsigned long max_physmem_end;
unsigned long asce_limit;
unsigned long safe_addr;
psw_t psw;
- fixup_vmlinux_info();
setup_lpp();
/*
* Non-randomized kernel physical start address must be _SEGMENT_SIZE
* aligned (see below).
*/
- nokaslr_offset_phys = ALIGN(mem_safe_offset(), _SEGMENT_SIZE);
- safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size);
+ nokaslr_text_lma = ALIGN(mem_safe_offset(), _SEGMENT_SIZE);
+ safe_addr = PAGE_ALIGN(nokaslr_text_lma + vmlinux_size);
/*
* Reserve decompressor memory together with decompression heap,
@@ -456,16 +451,27 @@ void startup_kernel(void)
*/
kaslr_large_page_offset = __kaslr_offset & ~_SEGMENT_MASK;
if (kaslr_enabled()) {
- unsigned long end = ident_map_size - kaslr_large_page_offset;
+ unsigned long size = vmlinux_size + kaslr_large_page_offset;
- __kaslr_offset_phys = randomize_within_range(kernel_size, _SEGMENT_SIZE, 0, end);
+ text_lma = randomize_within_range(size, _SEGMENT_SIZE, TEXT_OFFSET, ident_map_size);
}
- if (!__kaslr_offset_phys)
- __kaslr_offset_phys = nokaslr_offset_phys;
- __kaslr_offset_phys |= kaslr_large_page_offset;
+ if (!text_lma)
+ text_lma = nokaslr_text_lma;
+ text_lma |= kaslr_large_page_offset;
+
+ /*
+ * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region is
+ * never accessed via the kernel image mapping as per the linker script:
+ *
+ * . = TEXT_OFFSET;
+ *
+ * Therefore, this region could be used for something else and does
+ * not need to be reserved. See how it is skipped in setup_vmem().
+ */
+ __kaslr_offset_phys = text_lma - TEXT_OFFSET;
kaslr_adjust_vmlinux_info(__kaslr_offset_phys);
- physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size);
- deploy_kernel((void *)__kaslr_offset_phys);
+ physmem_reserve(RR_VMLINUX, text_lma, vmlinux_size);
+ deploy_kernel((void *)text_lma);
/* vmlinux decompression is done, shrink reserved low memory */
physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end);
@@ -488,7 +494,7 @@ void startup_kernel(void)
amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, amode31_min, SZ_2G);
}
if (!amode31_lma)
- amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size;
+ amode31_lma = text_lma - vmlinux.amode31_size;
physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size);
/*
@@ -504,8 +510,8 @@ void startup_kernel(void)
* - copy_bootdata() must follow setup_vmem() to propagate changes
* to bootdata made by setup_vmem()
*/
- clear_bss_section(__kaslr_offset_phys);
- kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size,
+ clear_bss_section(text_lma);
+ kaslr_adjust_relocs(text_lma, text_lma + vmlinux.image_size,
__kaslr_offset, __kaslr_offset_phys);
kaslr_adjust_got(__kaslr_offset);
setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit);
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
index 2847cc059ab7..145035f84a0e 100644
--- a/arch/s390/boot/vmem.c
+++ b/arch/s390/boot/vmem.c
@@ -90,7 +90,7 @@ static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kern
}
memgap_start = end;
}
- kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW);
+ kasan_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KASAN_MAP_SHADOW);
kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW);
kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
@@ -475,7 +475,17 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l
(unsigned long)__identity_va(end),
POPULATE_IDENTITY);
}
- pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL);
+
+ /*
+ * [kernel_start..kernel_start + TEXT_OFFSET] region is never
+ * accessed as per the linker script:
+ *
+ * . = TEXT_OFFSET;
+ *
+ * Therefore, skip mapping TEXT_OFFSET bytes to prevent access to
+ * [__kaslr_offset_phys..__kaslr_offset_phys + TEXT_OFFSET] region.
+ */
+ pgtable_populate(kernel_start + TEXT_OFFSET, kernel_end, POPULATE_KERNEL);
pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT);
pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore),
POPULATE_ABS_LOWCORE);
diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S
index a750711d44c8..66670212a361 100644
--- a/arch/s390/boot/vmlinux.lds.S
+++ b/arch/s390/boot/vmlinux.lds.S
@@ -109,7 +109,12 @@ SECTIONS
#ifdef CONFIG_KERNEL_UNCOMPRESSED
. = ALIGN(PAGE_SIZE);
. += AMODE31_SIZE; /* .amode31 section */
- . = ALIGN(1 << 20); /* _SEGMENT_SIZE */
+
+ /*
+ * Make sure the location counter is not less than TEXT_OFFSET.
+ * _SEGMENT_SIZE is not available here; use ALIGN(1 << 20) instead.
+ */
+ . = MAX(TEXT_OFFSET, ALIGN(1 << 20));
#else
. = ALIGN(8);
#endif
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index ea63a7342f5f..0c989caed19a 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -59,6 +59,7 @@ CONFIG_CMM=m
CONFIG_APPLDATA_BASE=y
CONFIG_S390_HYPFS_FS=y
CONFIG_KVM=m
+CONFIG_KVM_S390_UCONTROL=y
CONFIG_S390_UNWIND_SELFTEST=m
CONFIG_S390_KPROBES_SANITY_TEST=m
CONFIG_S390_MODULES_SANITY_TEST=m
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 06416b3f94f5..16e4caa931f1 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -279,8 +279,9 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#define AMODE31_SIZE (3 * PAGE_SIZE)
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
-#define __START_KERNEL 0x100000
#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE
#define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE)
+#define TEXT_OFFSET 0x100000
+
#endif /* _S390_PAGE_H */
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 4ec99f73fa27..a3fea683b227 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -734,7 +734,23 @@ static void __init memblock_add_physmem_info(void)
}
/*
- * Reserve memory used for lowcore/command line/kernel image.
+ * Reserve memory used for lowcore.
+ */
+static void __init reserve_lowcore(void)
+{
+ void *lowcore_start = get_lowcore();
+ void *lowcore_end = lowcore_start + sizeof(struct lowcore);
+ void *start, *end;
+
+ if ((void *)__identity_base < lowcore_end) {
+ start = max(lowcore_start, (void *)__identity_base);
+ end = min(lowcore_end, (void *)(__identity_base + ident_map_size));
+ memblock_reserve(__pa(start), __pa(end) - __pa(start));
+ }
+}
+
+/*
+ * Reserve memory used for absolute lowcore/command line/kernel image.
*/
static void __init reserve_kernel(void)
{
@@ -918,6 +934,7 @@ void __init setup_arch(char **cmdline_p)
/* Do some memory reservations *before* memory is added to memblock */
reserve_pgtables();
+ reserve_lowcore();
reserve_kernel();
reserve_initrd();
reserve_certificate_list();
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index e67cd409b858..ae5d0a9d6911 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -39,7 +39,7 @@ PHDRS {
SECTIONS
{
- . = __START_KERNEL;
+ . = TEXT_OFFSET;
.text : {
_stext = .; /* Start of text section */
_text = .; /* Text and read-only data */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0fd96860fc45..bb7134faaebf 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -348,20 +348,29 @@ static inline int plo_test_bit(unsigned char nr)
return cc == 0;
}
-static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
+static __always_inline void __sortl_query(u8 (*query)[32])
{
asm volatile(
" lghi 0,0\n"
- " lgr 1,%[query]\n"
+ " la 1,%[query]\n"
/* Parameter registers are ignored */
- " .insn rrf,%[opc] << 16,2,4,6,0\n"
+ " .insn rre,0xb9380000,2,4\n"
+ : [query] "=R" (*query)
:
- : [query] "d" ((unsigned long)query), [opc] "i" (opcode)
- : "cc", "memory", "0", "1");
+ : "cc", "0", "1");
}
-#define INSN_SORTL 0xb938
-#define INSN_DFLTCC 0xb939
+static __always_inline void __dfltcc_query(u8 (*query)[32])
+{
+ asm volatile(
+ " lghi 0,0\n"
+ " la 1,%[query]\n"
+ /* Parameter registers are ignored */
+ " .insn rrf,0xb9390000,2,4,6,0\n"
+ : [query] "=R" (*query)
+ :
+ : "cc", "0", "1");
+}
static void __init kvm_s390_cpu_feat_init(void)
{
@@ -415,10 +424,10 @@ static void __init kvm_s390_cpu_feat_init(void)
kvm_s390_available_subfunc.kdsa);
if (test_facility(150)) /* SORTL */
- __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl);
+ __sortl_query(&kvm_s390_available_subfunc.sortl);
if (test_facility(151)) /* DFLTCC */
- __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc);
+ __dfltcc_query(&kvm_s390_available_subfunc.dfltcc);
if (MACHINE_HAS_ESOP)
allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c
index a74dbd5c9896..30a732c808f3 100644
--- a/arch/s390/tools/relocs.c
+++ b/arch/s390/tools/relocs.c
@@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel)
case R_390_GOTOFF64:
break;
case R_390_64:
- add_reloc(&relocs64, offset - ehdr.e_entry);
+ add_reloc(&relocs64, offset);
break;
default:
die("Unsupported relocation type: %d\n", r_type);
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 078e2bac2553..da8b66dce0da 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -389,7 +389,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val)
.r12 = size,
.r13 = EPT_READ,
.r14 = addr,
- .r15 = *val,
};
if (__tdx_hypercall(&args))
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0c9c2706d4ec..9e519d8a810a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4589,6 +4589,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * Erratum HSW11 requires a period larger than 100, just like erratum BDM11,
+ * so a minimum period of 128 is enforced for INST_RETIRED.ALL here as well.
+ *
+ * The message 'interrupt took too long' can be observed on any counter that
+ * was armed with a period < 32 when two events expired in the same NMI, so
+ * a minimum period of 32 is enforced for all other events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4606,8 +4625,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -6766,6 +6784,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index 6b122a31da06..aa21c105eef1 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -179,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function)
case 0x1d:
case 0x1e:
case 0x1f:
+ case 0x24:
case 0x8000001d:
return true;
}
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb17f31b06d2..de16862bf230 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -591,6 +591,13 @@ struct fpu_state_config {
* even without XSAVE support, i.e. legacy features FP + SSE
*/
u64 legacy_features;
+ /*
+ * @independent_features:
+ *
+ * Features that are supported by XSAVES, but not managed as part of
+ * the FPU core, such as LBR
+ */
+ u64 independent_features;
};
/* FPU state configuration information */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 68ad4f923664..861d080ed4c6 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,8 +14,8 @@ BUILD_BUG_ON(1)
* be __static_call_return0.
*/
KVM_X86_OP(check_processor_compatibility)
-KVM_X86_OP(hardware_enable)
-KVM_X86_OP(hardware_disable)
+KVM_X86_OP(enable_virtualization_cpu)
+KVM_X86_OP(disable_virtualization_cpu)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
@@ -125,7 +125,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
-KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(get_feature_msr)
KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4a68cb3eba78..5f794814226f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -36,6 +36,7 @@
#include <asm/kvm_page_track.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>
+#include <asm/reboot.h>
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
@@ -211,6 +212,7 @@ enum exit_fastpath_completion {
EXIT_FASTPATH_NONE,
EXIT_FASTPATH_REENTER_GUEST,
EXIT_FASTPATH_EXIT_HANDLED,
+ EXIT_FASTPATH_EXIT_USERSPACE,
};
typedef enum exit_fastpath_completion fastpath_t;
@@ -280,10 +282,6 @@ enum x86_intercept_stage;
#define PFERR_PRIVATE_ACCESS BIT_ULL(49)
#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
-#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
- PFERR_WRITE_MASK | \
- PFERR_PRESENT_MASK)
-
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -1629,8 +1627,10 @@ struct kvm_x86_ops {
int (*check_processor_compatibility)(void);
- int (*hardware_enable)(void);
- void (*hardware_disable)(void);
+ int (*enable_virtualization_cpu)(void);
+ void (*disable_virtualization_cpu)(void);
+ cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
+
void (*hardware_unsetup)(void);
bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
@@ -1727,6 +1727,8 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+
+ const bool x2apic_icr_is_split;
const unsigned long required_apicv_inhibits;
bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
@@ -1806,7 +1808,7 @@ struct kvm_x86_ops {
int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
void (*guest_memory_reclaimed)(struct kvm *kvm);
- int (*get_msr_feature)(struct kvm_msr_entry *entry);
+ int (*get_feature_msr)(u32 msr, u64 *data);
int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len);
@@ -2136,7 +2138,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry);
+
+static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa)
+{
+ return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
+}
+
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free);
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
@@ -2345,7 +2355,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
- KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
+ KVM_X86_QUIRK_SLOT_ZAP_ALL)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d79b59..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 6536873f8fc0..d0ef2a678d66 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,8 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
-#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
typedef void (cpu_emergency_virt_cb)(void);
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_disable_virtualization(void);
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 12dbd2588ca7..8b1b6ce1e51b 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk)
__resctrl_sched_in(tsk);
}
-static inline u32 resctrl_arch_system_num_rmid_idx(void)
-{
- /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
- return boot_cpu_data.x86_cache_max_rmid + 1;
-}
-
static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
*rmid = idx;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index bf57a824f722..a8debbf2f702 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -439,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
+#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 66fd4b2a37a3..373638691cd4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1775,12 +1775,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
+ u32 x2apic_id;
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
-
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1790,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 1930fce9dfe9..8591d53c144b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -119,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = {
},
};
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->num_rmid;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c5a026fee5e0..1339f8328db5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -788,6 +788,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
/*
* Clear XSAVE features that are disabled in the normal CPUID.
*/
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 2ee0b9c53dcc..afb404cd2059 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -62,9 +62,9 @@ static inline u64 xfeatures_mask_supervisor(void)
static inline u64 xfeatures_mask_independent(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
- return XFEATURE_MASK_INDEPENDENT;
+ return fpu_kernel_cfg.independent_features;
}
/* XSAVE/XRSTOR wrapper functions */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 472a1537b7a9..730c2f34d347 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -19,7 +19,6 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
@@ -144,8 +143,10 @@ config KVM_AMD_SEV
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
help
- Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
- with Encrypted State (SEV-ES) on AMD processors.
+ Provides support for launching encrypted VMs which use Secure
+ Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
+ Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
+ Secure Nested Paging (SEV-SNP) technologies on AMD processors.
config KVM_SMM
bool "System Management Mode emulation"
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2617be544480..41786b834b16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -705,7 +705,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
- F(AMX_COMPLEX)
+ F(AMX_COMPLEX) | F(AVX10)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
@@ -721,6 +721,10 @@ void kvm_set_cpu_caps(void)
SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
);
+ kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX,
+ F(AVX10_128) | F(AVX10_256) | F(AVX10_512)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -949,7 +953,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
switch (function) {
case 0:
/* Limited to the highest leaf implemented in KVM. */
- entry->eax = min(entry->eax, 0x1fU);
+ entry->eax = min(entry->eax, 0x24U);
break;
case 1:
cpuid_entry_override(entry, CPUID_1_EDX);
@@ -1174,6 +1178,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
}
break;
+ case 0x24: {
+ u8 avx10_version;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * The AVX10 version is encoded in EBX[7:0]. Note, the version
+ * is guaranteed to be >=1 if AVX10 is supported. Note #2, the
+ * version needs to be captured before overriding EBX features!
+ */
+ avx10_version = min_t(u8, entry->ebx & 0xff, 1);
+ cpuid_entry_override(entry, CPUID_24_0_EBX);
+ entry->ebx |= avx10_version;
+
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ }
case KVM_CPUID_SIGNATURE: {
const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
entry->eax = KVM_CPUID_FEATURES;
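To illustrate the consumer side of the new 0x24 leaf, a small guest-side sketch using GCC's <cpuid.h>. The EBX layout assumed here (AVX10 version in bits 7:0, 128/256/512-bit vector support in bits 16/17/18) follows the published AVX10 enumeration and is not something this hunk defines.

/* Hypothetical guest-side sketch (not part of the patch). */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_max(0, NULL) < 0x24)
		return 0;		/* leaf not advertised by KVM */

	__get_cpuid_count(0x24, 0, &eax, &ebx, &ecx, &edx);
	printf("AVX10 version %u, 128/256/512-bit: %u/%u/%u\n",
	       ebx & 0xff,
	       !!(ebx & (1u << 16)), !!(ebx & (1u << 17)),
	       !!(ebx & (1u << 18)));
	return 0;
}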
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 5bb481aefcbc..c7180cb5f464 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1944,7 +1944,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
u64 ns = 0;
ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
- unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
ktime_t now;
@@ -2453,6 +2453,43 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
+
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+ if (data & X2APIC_ICR_RESERVED_BITS)
+ return 1;
+
+ /*
+ * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
+ * only AMD requires it to be zero, Intel essentially just ignores the
+ * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
+ * the CPU performs the reserved bits checks, i.e. the underlying CPU
+ * behavior will "win". Arbitrarily clear the BUSY bit, as there is no
+ * sane way to provide consistent behavior with respect to hardware.
+ */
+ data &= ~APIC_ICR_BUSY;
+
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ if (kvm_x86_ops.x2apic_icr_is_split) {
+ kvm_lapic_set_reg(apic, APIC_ICR, data);
+ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ }
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+ if (kvm_x86_ops.x2apic_icr_is_split)
+ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+ return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
@@ -2470,7 +2507,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
* maybe-unnecessary write, and both are in the noise anyways.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR)
- kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
+ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
else
kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
}
@@ -2990,18 +3027,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
/*
* In x2APIC mode, the LDR is fixed and based on the id. And
- * ICR is internally a single 64-bit register, but needs to be
- * split to ICR+ICR2 in userspace for backwards compatibility.
+ * if the ICR is _not_ split, ICR is internally a single 64-bit
+ * register, but needs to be split to ICR+ICR2 in userspace for
+ * backwards compatibility.
*/
- if (set) {
+ if (set)
*ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
- } else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ if (!kvm_x86_ops.x2apic_icr_is_split) {
+ if (set) {
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
}
}
@@ -3194,22 +3235,12 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
return 0;
}
-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
-{
- data &= ~APIC_ICR_BUSY;
-
- kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
- kvm_lapic_set_reg64(apic, APIC_ICR, data);
- trace_kvm_apic_write(APIC_ICR, data);
- return 0;
-}
-
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
{
u32 low;
if (reg == APIC_ICR) {
- *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ *data = kvm_x2apic_icr_read(apic);
return 0;
}
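The lapic.c changes above route all x2APIC ICR accesses through kvm_x2apic_icr_read()/kvm_x2apic_icr_write(), which pick between two storage layouts based on the new x2apic_icr_is_split hook (set to true for SVM and false for VMX later in this series). Below is a minimal standalone sketch of the two layouts; the struct and helper names are made up for illustration.

#include <assert.h>
#include <stdint.h>

struct icr_regs {
	uint32_t icr;	/* low 32 bits: vector, delivery mode, ... */
	uint32_t icr2;	/* destination; used only by the split layout */
	uint64_t icr64;	/* single 64-bit register; non-split layout */
};

static uint64_t icr_read(const struct icr_regs *r, int split)
{
	return split ? ((uint64_t)r->icr2 << 32) | r->icr : r->icr64;
}

static void icr_write(struct icr_regs *r, int split, uint64_t val)
{
	if (split) {
		r->icr = (uint32_t)val;
		r->icr2 = (uint32_t)(val >> 32);
	} else {
		r->icr64 = val;
	}
}

int main(void)
{
	struct icr_regs r = {0};
	uint64_t val = ((uint64_t)0xab << 32) | 0x40fe;	/* dest 0xab, vector 0xfe */

	icr_write(&r, 1, val);
	assert(icr_read(&r, 1) == val);

	icr_write(&r, 0, val);
	assert(icr_read(&r, 0) == val);
	return 0;
}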
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 7ef8ae73e82d..7c95eedd771e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -96,7 +96,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 4341e0e28571..9dc5dd43ae7f 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -223,8 +223,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
bool kvm_mmu_may_ignore_guest_pat(void);
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
-
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 928cf84778b0..e081f785fb23 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -614,32 +614,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Returns the Accessed status of the PTE and resets it at the same time. */
-static bool mmu_spte_age(u64 *sptep)
-{
- u64 spte = mmu_spte_get_lockless(sptep);
-
- if (!is_accessed_spte(spte))
- return false;
-
- if (spte_ad_enabled(spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(spte))
- kvm_set_pfn_dirty(spte_to_pfn(spte));
-
- spte = mark_spte_for_access_track(spte);
- mmu_spte_update_no_track(sptep, spte);
- }
-
- return true;
-}
-
static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
{
return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
@@ -938,6 +912,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* pte_list_desc containing more mappings.
*/
+#define KVM_RMAP_MANY BIT(0)
/*
* Returns the number of pointers in the rmap chain, not counting the new one.
@@ -950,16 +925,16 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
if (!rmap_head->val) {
rmap_head->val = (unsigned long)spte;
- } else if (!(rmap_head->val & 1)) {
+ } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -968,10 +943,10 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
}
desc->sptes[desc->spte_count++] = spte;
}
@@ -982,7 +957,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i)
{
- struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -1011,7 +986,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
if (!head_desc->more)
rmap_head->val = 0;
else
- rmap_head->val = (unsigned long)head_desc->more | 1;
+ rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -1024,13 +999,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
return;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
return;
rmap_head->val = 0;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
@@ -1063,12 +1038,12 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
if (!rmap_head->val)
return false;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
goto out;
}
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -1088,10 +1063,10 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
if (!rmap_head->val)
return 0;
- else if (!(rmap_head->val & 1))
+ else if (!(rmap_head->val & KVM_RMAP_MANY))
return 1;
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
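The rmap hunks above only give the magic low bit a name, KVM_RMAP_MANY; the encoding itself is unchanged: rmap_head->val is either a single SPTE pointer, or a pointer to a pte_list_desc with bit 0 set. A small standalone sketch of that tagged-pointer scheme follows; the struct, macro and field names are stand-ins, not the kernel's.

#include <assert.h>
#include <stdint.h>

#define MANY_BIT 0x1ul	/* stand-in for KVM_RMAP_MANY */

struct desc {		/* stand-in for struct pte_list_desc */
	int spte_count;
};

static int head_has_many(unsigned long val)
{
	return val & MANY_BIT;
}

static struct desc *head_to_desc(unsigned long val)
{
	return (struct desc *)(val & ~MANY_BIT);
}

int main(void)
{
	static uint64_t spte;
	static struct desc d = { .spte_count = 2 };
	unsigned long single = (unsigned long)&spte;		/* one mapping: raw SPTE pointer */
	unsigned long many = (unsigned long)&d | MANY_BIT;	/* chain: desc pointer with bit 0 set */

	assert(!head_has_many(single));
	assert(head_has_many(many));
	assert(head_to_desc(many)->spte_count == 2);
	return 0;
}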
@@ -1153,13 +1128,13 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
if (!rmap_head->val)
return NULL;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
iter->desc = NULL;
sptep = (u64 *)rmap_head->val;
goto out;
}
- iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
@@ -1307,15 +1282,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
return flush;
}
-/**
- * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
- * @kvm: kvm instance
- * @slot: slot to protect
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should protect
- *
- * Used when we do not need to care about huge page mappings.
- */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1339,16 +1305,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
- * protect the page if the D-bit isn't supported.
- * @kvm: kvm instance
- * @slot: slot to clear D-bit
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should clear D-bit
- *
- * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
- */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1372,24 +1328,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * PT level pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- *
- * We need to care about huge page mappings: e.g. during dirty logging we may
- * have such mappings.
- */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
/*
- * Huge pages are NOT write protected when we start dirty logging in
- * initially-all-set mode; must write protect them here so that they
- * are split to 4K on the first write.
+ * If the slot was assumed to be "initially all dirty", write-protect
+ * huge pages to ensure they are split to 4KiB on the first write (KVM
+ * dirty logs at 4KiB granularity). If eager page splitting is enabled,
+ * immediately try to split huge pages, e.g. so that vCPUs don't get
+ * saddled with the cost of splitting.
*
* The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
* of memslot has no such restriction, so the range can cross two large
@@ -1411,7 +1359,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
PG_LEVEL_2M);
}
- /* Now handle 4K PTEs. */
+ /*
+ * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
+ * mask. If PML is enabled and the GFN doesn't need to be write-
+ * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
+ * Otherwise clear the Writable bit.
+ *
+ * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
+ * enabled but it chooses between clearing the Dirty bit and Writable
+ * bit based on the context.
+ */
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
@@ -1453,16 +1410,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
-static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot)
-{
- return kvm_zap_all_rmap_sptes(kvm, rmap_head);
-}
-
static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level)
+ const struct kvm_memory_slot *slot)
{
- return __kvm_zap_rmap(kvm, rmap_head, slot);
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}
struct slot_rmap_walk_iterator {
@@ -1513,7 +1464,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
while (++iterator->rmap <= iterator->end_rmap) {
- iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
if (iterator->rmap->val)
return;
@@ -1534,23 +1485,71 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level);
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
-static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
- struct kvm_gfn_range *range,
- rmap_handler_t handler)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool can_yield, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool ret = false;
- for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- range->start, range->end - 1, &iterator)
- ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level);
+ lockdep_assert_held_write(&kvm->mmu_lock);
- return ret;
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap, slot);
+
+ if (!can_yield)
+ continue;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush && flush_on_yield) {
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+ return flush;
+}
+
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
+{
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ true, flush_on_yield, false);
+}
+
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
+{
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+}
+
+static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end, bool can_yield,
+ bool flush)
+{
+ return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, can_yield, true, flush);
}
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
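__walk_slot_rmaps(), now shared with the former kvm_handle_gfn_range() callers, keeps the same cooperative pattern: accumulate a pending flush across ranges and, before yielding mmu_lock, flush only the portion that has already been visited. Below is a rough standalone sketch of that pattern; zap_one(), should_yield() and flush_range() are invented stand-ins.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the per-range handler; pretend every other GFN had work. */
static bool zap_one(unsigned long gfn)
{
	return (gfn % 2) == 0;
}

/* Stand-in for contention on mmu_lock. */
static bool should_yield(unsigned long gfn)
{
	return (gfn % 4) == 3;
}

static void flush_range(unsigned long start, unsigned long nr_pages)
{
	printf("flush [%lu, %lu)\n", start, start + nr_pages);
}

static bool walk_ranges(unsigned long start, unsigned long end, bool flush_on_yield)
{
	bool flush = false;
	unsigned long gfn;

	for (gfn = start; gfn <= end; gfn++) {
		flush |= zap_one(gfn);

		if (should_yield(gfn)) {
			if (flush && flush_on_yield) {
				/* Flush only what was visited before dropping the lock. */
				flush_range(start, gfn - start + 1);
				flush = false;
			}
			/* cond_resched_rwlock_write() would go here. */
		}
	}
	return flush;	/* caller still owes a flush if true */
}

int main(void)
{
	if (walk_ranges(0, 9, true))
		flush_range(0, 10);
	return 0;
}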
@@ -1558,7 +1557,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool flush = false;
if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
+ flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
+ range->start, range->end,
+ range->may_block, flush);
if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
@@ -1570,31 +1571,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
return flush;
}
-static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level)
-{
- u64 *sptep;
- struct rmap_iterator iter;
- int young = 0;
-
- for_each_rmap_spte(rmap_head, &iter, sptep)
- young |= mmu_spte_age(sptep);
-
- return young;
-}
-
-static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level)
-{
- u64 *sptep;
- struct rmap_iterator iter;
-
- for_each_rmap_spte(rmap_head, &iter, sptep)
- if (is_accessed_spte(*sptep))
- return true;
- return false;
-}
-
#define RMAP_RECYCLE_THRESHOLD 1000
static void __rmap_add(struct kvm *kvm,
@@ -1629,12 +1605,52 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
__rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
}
+static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range, bool test_only)
+{
+ struct slot_rmap_walk_iterator iterator;
+ struct rmap_iterator iter;
+ bool young = false;
+ u64 *sptep;
+
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator) {
+ for_each_rmap_spte(iterator.rmap, &iter, sptep) {
+ u64 spte = *sptep;
+
+ if (!is_accessed_spte(spte))
+ continue;
+
+ if (test_only)
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that
+ * it doesn't get lost when the SPTE is marked
+ * for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+ young = true;
+ }
+ }
+ return young;
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, false);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1647,7 +1663,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, true);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
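kvm_rmap_age_gfn_range() folds the old kvm_age_rmap()/kvm_test_age_rmap() handlers into a single walk: test_only short-circuits on the first young SPTE, otherwise an A/D-enabled SPTE simply has its Accessed bit cleared while a non-A/D SPTE is rewritten into the access-tracked form after capturing its dirty state. The simplified standalone sketch below only mirrors that branch structure; the bit positions and the TRACKED marker are invented for the sketch.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Invented bit positions, just for the sketch. */
#define SPTE_ACCESSED	(1ull << 8)
#define SPTE_WRITABLE	(1ull << 1)
#define SPTE_TRACKED	(1ull << 60)	/* "marked for access tracking" */

/* Returns true if the SPTE was young; *pfn_dirty captures lost dirty state. */
static bool age_spte(uint64_t *spte, bool ad_enabled, bool *pfn_dirty)
{
	if (!(*spte & SPTE_ACCESSED))
		return false;

	if (ad_enabled) {
		*spte &= ~SPTE_ACCESSED;	/* hardware re-sets it on next access */
	} else {
		if (*spte & SPTE_WRITABLE)
			*pfn_dirty = true;	/* don't lose the dirty information */
		*spte = (*spte & ~(SPTE_ACCESSED | SPTE_WRITABLE)) | SPTE_TRACKED;
	}
	return true;
}

int main(void)
{
	uint64_t spte = SPTE_ACCESSED | SPTE_WRITABLE;
	bool dirty = false;

	assert(age_spte(&spte, false, &dirty));		/* young, access-tracked now */
	assert(dirty && (spte & SPTE_TRACKED));
	assert(!age_spte(&spte, false, &dirty));	/* no longer "accessed" */
	return 0;
}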
@@ -2713,36 +2729,49 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
write_unlock(&kvm->mmu_lock);
}
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry)
{
- struct kvm_mmu_page *sp;
+ struct kvm *kvm = vcpu->kvm;
LIST_HEAD(invalid_list);
- int r;
+ struct kvm_mmu_page *sp;
+ gpa_t gpa = cr2_or_gpa;
+ bool r = false;
+
+ /*
+ * Bail early if there aren't any write-protected shadow pages to avoid
+ * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked
+ * unnecessarily taking mmu_lock, e.g. if the gfn is write-tracked
+ * mmu_lock is safe, as this is purely an optimization, i.e. a false
+ * positive is benign, and a false negative will simply result in KVM
+ * skipping the unprotect+retry path, which is also an optimization.
+ */
+ if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
+ goto out;
+
+ if (!vcpu->arch.mmu->root_role.direct) {
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+ if (gpa == INVALID_GPA)
+ goto out;
+ }
- r = 0;
write_lock(&kvm->mmu_lock);
- for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- r = 1;
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
+
+ /*
+ * Snapshot the result before zapping, as zapping will remove all list
+ * entries, i.e. checking the list later would yield a false negative.
+ */
+ r = !list_empty(&invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
write_unlock(&kvm->mmu_lock);
- return r;
-}
-
-static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- if (vcpu->arch.mmu->root_role.direct)
- return 0;
-
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
+out:
+ if (r || always_retry) {
+ vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+ }
return r;
}
@@ -2914,10 +2943,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
trace_kvm_mmu_set_spte(level, gfn, sptep);
}
- if (wrprot) {
- if (write_fault)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && write_fault)
+ ret = RET_PF_WRITE_PROTECTED;
if (flush)
kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
@@ -4549,7 +4576,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_RETRY;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4618,8 +4645,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (!flags) {
trace_kvm_page_fault(vcpu, fault_address, error_code);
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
@@ -4642,7 +4667,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
int r;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4721,6 +4746,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
switch (r) {
case RET_PF_FIXED:
case RET_PF_SPURIOUS:
+ case RET_PF_WRITE_PROTECTED:
return 0;
case RET_PF_EMULATE:
@@ -4750,7 +4776,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
* reload is efficient when called repeatedly, so we can do it on
* every iteration.
*/
- kvm_mmu_reload(vcpu);
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
if (kvm_arch_has_private_mem(vcpu->kvm) &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
@@ -5963,6 +5991,106 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
write_unlock(&vcpu->kvm->mmu_lock);
}
+static bool is_write_to_guest_page_table(u64 error_code)
+{
+ const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
+
+ return (error_code & mask) == mask;
+}
+
+static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 error_code, int *emulation_type)
+{
+ bool direct = vcpu->arch.mmu->root_role.direct;
+
+ /*
+ * Do not try to unprotect and retry if the vCPU re-faulted on the same
+ * RIP with the same address that was previously unprotected, as doing
+ * so will likely put the vCPU into an infinite loop. E.g. if the vCPU uses
+ * a non-page-table modifying instruction on the PDE that points to the
+ * instruction, then unprotecting the gfn will unmap the instruction's
+ * code, i.e. make it impossible for the instruction to ever complete.
+ */
+ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
+ vcpu->arch.last_retry_addr == cr2_or_gpa)
+ return RET_PF_EMULATE;
+
+ /*
+ * Reset the unprotect+retry values that guard against infinite loops.
+ * The values will be refreshed if KVM explicitly unprotects a gfn and
+ * retries, in all other cases it's safe to retry in the future even if
+ * the next page fault happens on the same RIP+address.
+ */
+ vcpu->arch.last_retry_eip = 0;
+ vcpu->arch.last_retry_addr = 0;
+
+ /*
+ * It should be impossible to reach this point with an MMIO cache hit,
+ * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+ * writable memslot, and creating a memslot should invalidate the MMIO
+ * cache by way of changing the memslot generation. WARN and disallow
+ * retry if MMIO is detected, as retrying MMIO emulation is pointless
+ * and could put the vCPU into an infinite loop because the processor
+ * will keep faulting on the non-existent MMIO address.
+ */
+ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+ return RET_PF_EMULATE;
+
+ /*
+ * Before emulating the instruction, check to see if the access was due
+ * to a read-only violation while the CPU was walking non-nested NPT
+ * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
+ * If L1 is sharing (a subset of) its page tables with L2, e.g. by
+ * having nCR3 share lower level page tables with hCR3, then when KVM
+ * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
+ * unknowingly write-protecting L1's guest page tables, which KVM isn't
+ * shadowing.
+ *
+ * Because the CPU (by default) walks NPT page tables using a write
+ * access (to ensure the CPU can do A/D updates), page walks in L1 can
+ * trigger write faults for the above case even when L1 isn't modifying
+ * PTEs. As a result, KVM will unnecessarily emulate (or at least, try
+ * to emulate) an excessive number of L1 instructions; because L1's MMU
+ * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
+ * and thus no need to emulate in order to guarantee forward progress.
+ *
+ * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
+ * proceed without triggering emulation. If one or more shadow pages
+ * was zapped, skip emulation and resume L1 to let it natively execute
+ * the instruction. If no shadow pages were zapped, then the write-
+ * fault is due to something else entirely, i.e. KVM needs to emulate,
+ * as resuming the guest will put it into an infinite loop.
+ *
+ * Note, this code also applies to Intel CPUs, even though it is *very*
+ * unlikely that an L1 will share its page tables (IA32/PAE/paging64
+ * format) with L2's page tables (EPT format).
+ *
+ * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
+ * unprotect the gfn and retry if an event is awaiting reinjection. If
+ * KVM emulates multiple instructions before completing event injection,
+ * the event could be delayed beyond what is architecturally allowed,
+ * e.g. KVM could inject an IRQ after the TPR has been raised.
+ */
+ if (((direct && is_write_to_guest_page_table(error_code)) ||
+ (!direct && kvm_event_needs_reinjection(vcpu))) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
+ return RET_PF_RETRY;
+
+ /*
+ * The gfn is write-protected, but if KVM detects it's emulating an
+ * instruction that is unlikely to be used to modify page tables, or if
+ * emulation fails, KVM can try to unprotect the gfn and let the CPU
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying an instruction from a nested guest as KVM is only explicitly
+ * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+ * going to magically fix whatever issue caused L2 to fail.
+ */
+ if (!is_guest_mode(vcpu))
+ *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+
+ return RET_PF_EMULATE;
+}
+
int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
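kvm_mmu_write_protect_fault() above decides, in order: bail to emulation on a repeat fault at the same RIP and address, bail on an MMIO cache hit, otherwise try unprotect+retry for direct writes to guest page tables (or, on an indirect MMU, when an event awaits reinjection), and finally fall back to emulation. A condensed standalone sketch of that ordering follows; the flags and enum are stand-ins for the real predicates.

#include <assert.h>
#include <stdbool.h>

enum wp_action { WP_RETRY, WP_EMULATE };

static enum wp_action write_protect_fault(bool repeat_fault, bool mmio_cached,
					  bool direct, bool write_to_guest_pt,
					  bool event_needs_reinjection,
					  bool unprotect_succeeded)
{
	if (repeat_fault)	/* same RIP + address as the last unprotect+retry */
		return WP_EMULATE;
	if (mmio_cached)	/* retrying MMIO emulation would loop forever */
		return WP_EMULATE;
	if (((direct && write_to_guest_pt) ||
	     (!direct && event_needs_reinjection)) && unprotect_succeeded)
		return WP_RETRY;
	return WP_EMULATE;	/* caller may still set EMULTYPE_ALLOW_RETRY_PF */
}

int main(void)
{
	assert(write_protect_fault(false, false, true, true, false, true) == WP_RETRY);
	assert(write_protect_fault(true, false, true, true, false, true) == WP_EMULATE);
	assert(write_protect_fault(false, true, true, true, false, true) == WP_EMULATE);
	return 0;
}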
@@ -6008,6 +6136,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
if (r < 0)
return r;
+ if (r == RET_PF_WRITE_PROTECTED)
+ r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+ &emulation_type);
+
if (r == RET_PF_FIXED)
vcpu->stat.pf_fixed++;
else if (r == RET_PF_EMULATE)
@@ -6018,32 +6150,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
if (r != RET_PF_EMULATE)
return 1;
- /*
- * Before emulating the instruction, check if the error code
- * was due to a RO violation while translating the guest page.
- * This can occur when using nested virtualization with nested
- * paging in both guests. If true, we simply unprotect the page
- * and resume the guest.
- */
- if (vcpu->arch.mmu->root_role.direct &&
- (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
- return 1;
- }
-
- /*
- * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
- * optimistically try to just unprotect the page and let the processor
- * re-execute the instruction that caused the page fault. Do not allow
- * retrying MMIO emulation, as it's not only pointless but could also
- * cause us to enter an infinite loop because the processor will keep
- * faulting on the non-existent MMIO address. Retrying an instruction
- * from a nested guest is also pointless and dangerous as we are only
- * explicitly shadowing L1's page tables, i.e. unprotecting something
- * for L1 isn't going to magically fix whatever issue cause L2 to fail.
- */
- if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
- emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
@@ -6202,59 +6308,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
-/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot);
-
-static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn,
- bool flush_on_yield, bool flush)
-{
- struct slot_rmap_walk_iterator iterator;
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
- end_gfn, &iterator) {
- if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap, slot);
-
- if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && flush_on_yield) {
- kvm_flush_remote_tlbs_range(kvm, start_gfn,
- iterator.gfn - start_gfn + 1);
- flush = false;
- }
- cond_resched_rwlock_write(&kvm->mmu_lock);
- }
- }
-
- return flush;
-}
-
-static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- bool flush_on_yield)
-{
- return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
- slot->base_gfn, slot->base_gfn + slot->npages - 1,
- flush_on_yield, false);
-}
-
-static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- bool flush_on_yield)
-{
- return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
-}
-
static void free_mmu_pages(struct kvm_mmu *mmu)
{
if (!tdp_enabled && mmu->pae_root)
@@ -6528,9 +6581,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (WARN_ON_ONCE(start >= end))
continue;
- flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
+ end, true, flush);
}
}
@@ -6818,7 +6870,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
*/
for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
- level, level, start, end - 1, true, false);
+ level, level, start, end - 1, true, true, false);
}
/* Must be called with the mmu_lock held in write-mode. */
@@ -6997,10 +7049,42 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
kvm_mmu_zap_all(kvm);
}
+/*
+ * Zap leaf SPTEs within the memslot range when a memslot is moved/deleted.
+ *
+ * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required; in the
+ * worst case we'll have unused shadow pages lying around until they
+ * are recycled due to age or when the VM is destroyed.
+ */
+static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_gfn_range range = {
+ .slot = slot,
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .may_block = true,
+ };
+
+ write_lock(&kvm->mmu_lock);
+ if (kvm_unmap_gfn_range(kvm, &range))
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+}
+
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_zap_all_fast(kvm);
+ if (kvm_memslot_flush_zap_all(kvm))
+ kvm_mmu_zap_all_fast(kvm);
+ else
+ kvm_mmu_zap_memslot_leafs(kvm, slot);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1721d97743e9..c98827840e07 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* RET_PF_CONTINUE: So far, so good, keep handling the page fault.
* RET_PF_RETRY: let CPU fault again on the address.
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotect the
+ * gfn and retry, or emulate the instruction directly.
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
* RET_PF_FIXED: The faulting entry has been fixed.
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
@@ -274,6 +276,7 @@ enum {
RET_PF_CONTINUE = 0,
RET_PF_RETRY,
RET_PF_EMULATE,
+ RET_PF_WRITE_PROTECTED,
RET_PF_INVALID,
RET_PF_FIXED,
RET_PF_SPURIOUS,
@@ -349,8 +352,6 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
-void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 195d98bc8de8..f35a830ce469 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -57,6 +57,7 @@
TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
TRACE_DEFINE_ENUM(RET_PF_RETRY);
TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
TRACE_DEFINE_ENUM(RET_PF_INVALID);
TRACE_DEFINE_ENUM(RET_PF_FIXED);
TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 69941cebb3a8..ae7d39ff2d07 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -646,10 +646,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* really care if it changes underneath us after this point).
*/
if (FNAME(gpte_changed)(vcpu, gw, top_level))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
/*
* Load a new root and retry the faulting instruction in the extremely
@@ -659,7 +659,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
*/
if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
- goto out_gpte_changed;
+ return RET_PF_RETRY;
}
for_each_shadow_entry(vcpu, fault->addr, it) {
@@ -674,34 +674,38 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
false, access);
- if (sp != ERR_PTR(-EEXIST)) {
- /*
- * We must synchronize the pagetable before linking it
- * because the guest doesn't need to flush tlb when
- * the gpte is changed from non-present to present.
- * Otherwise, the guest may use the wrong mapping.
- *
- * For PG_LEVEL_4K, kvm_mmu_get_page() has already
- * synchronized it transiently via kvm_sync_page().
- *
- * For higher level pagetable, we synchronize it via
- * the slower mmu_sync_children(). If it needs to
- * break, some progress has been made; return
- * RET_PF_RETRY and retry on the next #PF.
- * KVM_REQ_MMU_SYNC is not necessary but it
- * expedites the process.
- */
- if (sp->unsync_children &&
- mmu_sync_children(vcpu, sp, false))
- return RET_PF_RETRY;
- }
+ /*
+ * Synchronize the new page before linking it, as the CPU (KVM)
+ * is architecturally disallowed from inserting non-present
+ * entries into the TLB, i.e. the guest isn't required to flush
+ * the TLB when changing the gPTE from non-present to present.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
+ * synchronized the page via kvm_sync_page().
+ *
+ * For higher level pages, which cannot be unsync themselves
+ * but can have unsync children, synchronize via the slower
+ * mmu_sync_children(). If KVM needs to drop mmu_lock due to
+ * contention or to reschedule, instruct the caller to retry
+ * the #PF (mmu_sync_children() ensures forward progress will
+ * be made).
+ */
+ if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
/*
- * Verify that the gpte in the page we've just write
- * protected is still there.
+ * Verify that the gpte in the page, which is now either
+ * write-protected or unsync, wasn't modified between the fault
+ * and acquiring mmu_lock. This needs to be done even when
+ * reusing an existing shadow page to ensure the information
+ * gathered by the walker matches the information stored in the
+ * shadow page (which could have been modified by a different
+ * vCPU even if the page was already linked). Holding mmu_lock
+ * prevents the shadow page from changing after this point.
*/
if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
@@ -755,9 +759,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return ret;
-
-out_gpte_changed:
- return RET_PF_RETRY;
}
/*
@@ -805,7 +806,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (page_fault_handle_page_track(vcpu, fault)) {
shadow_page_table_clear_flood(vcpu, fault->addr);
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
}
r = mmu_topup_memory_caches(vcpu, true);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index d4527965e48c..8f7eb3ad88fc 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -391,9 +391,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
mmio_value = 0;
/*
- * The masked MMIO value must obviously match itself and a removed SPTE
- * must not get a false positive. Removed SPTEs and MMIO SPTEs should
- * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * The masked MMIO value must obviously match itself and a frozen SPTE
+ * must not get a false positive. Frozen SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and frozen SPTEs must
* not set any RWX bits.
*/
if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index ef793c459b05..2cb816ea2430 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -214,7 +214,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*/
#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
-/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
+/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_frozen_spte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c7dc49ee7388..3b996c1fdaab 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -359,10 +359,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
/*
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
- * already marked as removed then another thread
+ * already marked as frozen then another thread
* handling a page fault could overwrite it, so
* set the SPTE until it is set from some other
- * value to the removed SPTE value.
+ * value to the frozen SPTE value.
*/
for (;;) {
old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
@@ -536,8 +536,8 @@ static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
u64 *sptep = rcu_dereference(iter->sptep);
/*
- * The caller is responsible for ensuring the old SPTE is not a REMOVED
- * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
+ * The caller is responsible for ensuring the old SPTE is not a FROZEN
+ * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
* and pre-checking before inserting a new SPTE is advantageous as it
* avoids unnecessary work.
*/
@@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
*/
- if (wrprot) {
- if (fault->write)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && fault->write)
+ ret = RET_PF_WRITE_PROTECTED;
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 2f4e155080ba..0d17d6b70639 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -17,6 +17,7 @@ enum kvm_only_cpuid_leafs {
CPUID_8000_0007_EDX,
CPUID_8000_0022_EAX,
CPUID_7_2_EDX,
+ CPUID_24_0_EBX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -46,6 +47,7 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19)
/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0)
@@ -55,6 +57,11 @@ enum kvm_only_cpuid_leafs {
#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */
+#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16)
+#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17)
+#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18)
+
/* CPUID level 0x80000007 (EDX). */
#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
@@ -90,6 +97,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
[CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
+ [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
};
/*
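The reverse_cpuid[] entry added above is what lets a KVM-only feature word such as CPUID_24_0_EBX be translated back to the CPUID leaf, sub-leaf and output register it came from. Below is a toy standalone version of that lookup table; the types, word numbers and contents are illustrative only.

#include <stdint.h>
#include <stdio.h>

enum reg { REG_EAX, REG_EBX, REG_ECX, REG_EDX };

struct cpuid_reg {
	uint32_t function;
	uint32_t index;
	enum reg reg;
};

/* Toy version of the reverse mapping: feature word -> CPUID leaf/register. */
static const struct cpuid_reg word_to_leaf[] = {
	/* word 0: stand-in for CPUID_7_2_EDX  */ { 0x07, 2, REG_EDX },
	/* word 1: stand-in for CPUID_24_0_EBX */ { 0x24, 0, REG_EBX },
};

int main(void)
{
	const struct cpuid_reg *r = &word_to_leaf[1];

	printf("word 1 -> CPUID 0x%x.%u, output register %d\n",
	       r->function, r->index, (int)r->reg);
	return 0;
}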
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index 00e3c27d2a87..85241c0c7f56 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -624,17 +624,31 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
#endif
/*
- * Give leave_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. enter guest mode) before loading state from the SMM
- * state-save area.
+ * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest
+ * mode should happen _after_ loading state from SMRAM. However, KVM
+ * piggybacks the nested VM-Enter flows (which is wrong for many other
+ * reasons), and so nSVM/nVMX would clobber state that is loaded from
+ * SMRAM and from the VMCS/VMCB.
*/
if (kvm_x86_call(leave_smm)(vcpu, &smram))
return X86EMUL_UNHANDLEABLE;
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- return rsm_load_state_64(ctxt, &smram.smram64);
+ ret = rsm_load_state_64(ctxt, &smram.smram64);
else
#endif
- return rsm_load_state_32(ctxt, &smram.smram32);
+ ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+ /*
+ * If RSM fails and triggers shutdown, architecturally the shutdown
+ * occurs *before* the transition to guest mode. But due to KVM's
+ * flawed handling of RSM to L2 (see above), the vCPU may already be
+ * in_guest_mode(). Force the vCPU out of guest mode before delivering
+ * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit
+ * that architecturally shouldn't be possible.
+ */
+ if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+ return ret;
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d6f252555ab3..c1062e8c3f50 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -592,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void)
}
}
-static void svm_emergency_disable(void)
+static void svm_emergency_disable_virtualization_cpu(void)
{
kvm_rebooting = true;
kvm_cpu_svm_disable();
}
-static void svm_hardware_disable(void)
+static void svm_disable_virtualization_cpu(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
@@ -610,7 +610,7 @@ static void svm_hardware_disable(void)
amd_pmu_disable_virt();
}
-static int svm_hardware_enable(void)
+static int svm_enable_virtualization_cpu(void)
{
struct svm_cpu_data *sd;
@@ -1533,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* TSC_AUX is always virtualized for SEV-ES guests when the feature is
* available. The user return MSR support is not required in this case
* because TSC_AUX is restored on #VMEXIT from the host save area
- * (which has been initialized in svm_hardware_enable()).
+ * (which has been initialized in svm_enable_virtualization_cpu()).
*/
if (likely(tsc_aux_uret_slot >= 0) &&
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
@@ -2825,17 +2825,17 @@ static int efer_trap(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, ret);
}
-static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+static int svm_get_feature_msr(u32 msr, u64 *data)
{
- msr->data = 0;
+ *data = 0;
- switch (msr->index) {
+ switch (msr) {
case MSR_AMD64_DE_CFG:
if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
+ *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
@@ -2876,6 +2876,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CSTAR:
msr_info->data = svm->vmcb01.ptr->save.cstar;
break;
+ case MSR_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.gs.base;
+ break;
+ case MSR_FS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.fs.base;
+ break;
case MSR_KERNEL_GS_BASE:
msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
break;
@@ -3101,6 +3107,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_CSTAR:
svm->vmcb01.ptr->save.cstar = data;
break;
+ case MSR_GS_BASE:
+ svm->vmcb01.ptr->save.gs.base = data;
+ break;
+ case MSR_FS_BASE:
+ svm->vmcb01.ptr->save.fs.base = data;
+ break;
case MSR_KERNEL_GS_BASE:
svm->vmcb01.ptr->save.kernel_gs_base = data;
break;
@@ -3132,7 +3144,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* feature is available. The user return MSR support is not
* required in this case because TSC_AUX is restored on #VMEXIT
* from the host save area (which has been initialized in
- * svm_hardware_enable()).
+ * svm_enable_virtualization_cpu()).
*/
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
break;
@@ -3179,18 +3191,21 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
case MSR_AMD64_DE_CFG: {
- struct kvm_msr_entry msr_entry;
+ u64 supported_de_cfg;
- msr_entry.index = msr->index;
- if (svm_get_msr_feature(&msr_entry))
+ if (svm_get_feature_msr(ecx, &supported_de_cfg))
return 1;
- /* Check the supported bits */
- if (data & ~msr_entry.data)
+ if (data & ~supported_de_cfg)
return 1;
- /* Don't allow the guest to change a bit, #GP */
- if (!msr->host_initiated && (data ^ msr_entry.data))
+ /*
+ * Don't let the guest change the host-programmed value. The
+ * MSR is very model specific, i.e. contains multiple bits that
+ * are completely unknown to KVM, and the one bit known to KVM
+ * is simply a reflection of hardware capabilities.
+ */
+ if (!msr->host_initiated && data != svm->msr_decfg)
return 1;
svm->msr_decfg = data;
@@ -4144,12 +4159,21 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
- if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
- to_svm(vcpu)->vmcb->control.exit_info_1)
+ switch (svm->vmcb->control.exit_code) {
+ case SVM_EXIT_MSR:
+ if (!svm->vmcb->control.exit_info_1)
+ break;
return handle_fastpath_set_msr_irqoff(vcpu);
+ case SVM_EXIT_HLT:
+ return handle_fastpath_hlt(vcpu);
+ default:
+ break;
+ }
return EXIT_FASTPATH_NONE;
}
@@ -4980,8 +5004,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
- .hardware_enable = svm_hardware_enable,
- .hardware_disable = svm_hardware_disable,
+ .enable_virtualization_cpu = svm_enable_virtualization_cpu,
+ .disable_virtualization_cpu = svm_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
.vcpu_create = svm_vcpu_create,
@@ -4999,7 +5024,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
- .get_msr_feature = svm_get_msr_feature,
+ .get_feature_msr = svm_get_feature_msr,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
@@ -5050,6 +5075,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
+
+ .x2apic_icr_is_split = true,
.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
.apicv_post_state_restore = avic_apicv_post_state_restore,
@@ -5224,6 +5251,9 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
+
+ /* Don't advertise Bus Lock Detect to guest if SVM support is absent */
+ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
}
static __init int svm_hardware_setup(void)
@@ -5410,8 +5440,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static void __svm_exit(void)
{
kvm_x86_vendor_exit();
-
- cpu_emergency_unregister_virt_callback(svm_emergency_disable);
}
static int __init svm_init(void)
@@ -5427,8 +5455,6 @@ static int __init svm_init(void)
if (r)
return r;
- cpu_emergency_register_virt_callback(svm_emergency_disable);
-
/*
* Common KVM initialization _must_ come last, after this, /dev/kvm is
* exposed to userspace!
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0bf35ebe8a1b..7668e2fb8043 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -23,8 +23,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.hardware_unsetup = vmx_hardware_unsetup,
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
+ .enable_virtualization_cpu = vmx_enable_virtualization_cpu,
+ .disable_virtualization_cpu = vmx_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
+
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),
@@ -41,7 +43,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.vcpu_put = vmx_vcpu_put,
.update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
+ .get_feature_msr = vmx_get_feature_msr,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
@@ -89,6 +91,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.enable_nmi_window = vmx_enable_nmi_window,
.enable_irq_window = vmx_enable_irq_window,
.update_cr8_intercept = vmx_update_cr8_intercept,
+
+ .x2apic_icr_is_split = false,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 39a26ecf87ea..9cfcfebd5f99 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -755,7 +755,7 @@ fault:
return -EIO;
}
-static void vmx_emergency_disable(void)
+void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
@@ -1998,15 +1998,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
return !(msr->data & ~valid_bits);
}
-int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
{
- switch (msr->index) {
+ switch (msr) {
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
}
@@ -2844,7 +2844,7 @@ fault:
return -EFAULT;
}
-int vmx_hardware_enable(void)
+int vmx_enable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2881,7 +2881,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-void vmx_hardware_disable(void)
+void vmx_disable_virtualization_cpu(void)
{
vmclear_local_loaded_vmcss();
@@ -7265,6 +7265,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
default:
return EXIT_FASTPATH_NONE;
}
@@ -8584,8 +8586,6 @@ static void __vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
- cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
vmx_cleanup_l1d_flush();
}
@@ -8632,8 +8632,6 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
- cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
vmx_check_vmcs12_offsets();
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 42498fa63abb..3839afb921e2 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -17,10 +17,6 @@
#include "run_flags.h"
#include "../mmu.h"
-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-#define MSR_TYPE_RW 3
-
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3221cd1d01..a55981c5216e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -13,8 +13,9 @@ extern struct kvm_x86_init_ops vt_init_ops __initdata;
void vmx_hardware_unsetup(void);
int vmx_check_processor_compat(void);
-int vmx_hardware_enable(void);
-void vmx_hardware_disable(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
int vmx_vm_init(struct kvm *kvm);
void vmx_vm_destroy(struct kvm *kvm);
int vmx_vcpu_precreate(struct kvm *kvm);
@@ -56,7 +57,7 @@ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
-int vmx_get_msr_feature(struct kvm_msr_entry *msr);
+int vmx_get_feature_msr(u32 msr, u64 *data);
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3ef6299ef823..0c1d54d9ef45 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -305,24 +305,237 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
static struct kmem_cache *x86_emulator_cache;
/*
- * When called, it means the previous get/set msr reached an invalid msr.
- * Return true if we want to ignore/silent this failed msr access.
+ * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
+ * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
+ * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
+ * MSRs that KVM emulates without strictly requiring host support.
+ * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
+ * msrs_to_save and emulated_msrs.
+ */
+
+static const u32 msrs_to_save_base[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+};
+
+static const u32 msrs_to_save_pmu[] = {
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+
+ /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+
+ /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+ MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+};
+
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
+static unsigned num_msrs_to_save;
+
+static const u32 emulated_msrs_all[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+
+#ifdef CONFIG_KVM_HYPERV
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+#endif
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSC_DEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
+ MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
+
+ /*
+ * KVM always supports the "true" VMX control MSRs, even if the host
+ * does not. The VMX MSRs as a whole are considered "emulated" as KVM
+ * doesn't strictly require them to exist in the host (ignoring that
+ * KVM would refuse to load in the first place if the core set of MSRs
+ * aren't supported).
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
+};
+
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
+static unsigned num_emulated_msrs;
+
+/*
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
+ */
+static const u32 msr_based_features_all_except_vmx[] = {
+ MSR_AMD64_DE_CFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+};
+
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
+static unsigned int num_msr_based_features;
+
+/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
*/
-static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
+static bool kvm_is_immutable_feature_msr(u32 msr)
{
- const char *op = write ? "wrmsr" : "rdmsr";
+ int i;
- if (ignore_msrs) {
- if (report_ignored_msrs)
- kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
- op, msr, data);
- /* Mask the error */
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
return true;
- } else {
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+static bool kvm_is_advertised_msr(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ for (i = 0; i < num_emulated_msrs; i++) {
+ if (emulated_msrs[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
+typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated);
+
+static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
+ u64 *data, bool host_initiated,
+ enum kvm_msr_access rw,
+ msr_access_t msr_access_fn)
+{
+ const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
+ int ret;
+
+ BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
+
+ /*
+ * Zero the data on read failures to avoid leaking stack data to the
+ * guest and/or userspace, e.g. if the failure is ignored below.
+ */
+ ret = msr_access_fn(vcpu, msr, data, host_initiated);
+ if (ret && rw == MSR_TYPE_R)
+ *data = 0;
+
+ if (ret != KVM_MSR_RET_UNSUPPORTED)
+ return ret;
+
+ /*
+ * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
+ * advertises to userspace, even if an MSR isn't fully supported.
+ * Simply check that @data is '0', which covers both the write '0' case
+ * and all reads (in which case @data is zeroed on failure; see above).
+ */
+ if (host_initiated && !*data && kvm_is_advertised_msr(msr))
+ return 0;
+
+ if (!ignore_msrs) {
kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
- op, msr, data);
- return false;
+ op, msr, *data);
+ return ret;
}
+
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
+
+ return 0;
}
static struct kmem_cache *kvm_alloc_emulator_cache(void)
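
The new kvm_do_msr_access() above folds the old kvm_msr_ignored_check() logic into one place: run the accessor, zero the output on a failed read, let host-initiated accesses of advertised-but-unsupported MSRs succeed (reads, and writes of '0'), and only then apply the ignore_msrs / report_ignored_msrs policy. The user-space sketch below reproduces that decision order; the accessor callback, the advertised-MSR check, the MSR numbers, and the module-parameter stand-ins are all invented for the demo.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RET_UNSUPPORTED 2   /* stand-in for KVM_MSR_RET_UNSUPPORTED */

enum msr_access { ACCESS_R = 1, ACCESS_W = 2 };

typedef int (*msr_access_fn)(uint32_t index, uint64_t *data, bool host_initiated);

/* Invented module-parameter stand-ins. */
static bool ignore_msrs = false;
static bool report_ignored_msrs = true;

static bool is_advertised_msr(uint32_t msr)
{
    return msr == 0x10;     /* pretend only MSR 0x10 is advertised */
}

/* Same decision order as kvm_do_msr_access(): run the accessor, zero the
 * result on a failed read, tolerate host accesses to advertised MSRs, and
 * finally apply the ignore/report policy for everything else. */
static int do_msr_access(uint32_t msr, uint64_t *data, bool host_initiated,
                         enum msr_access rw, msr_access_fn fn)
{
    int ret = fn(msr, data, host_initiated);

    if (ret && rw == ACCESS_R)
        *data = 0;
    if (ret != RET_UNSUPPORTED)
        return ret;
    if (host_initiated && !*data && is_advertised_msr(msr))
        return 0;
    if (!ignore_msrs)
        return ret;
    if (report_ignored_msrs)
        fprintf(stderr, "ignored %s: 0x%x\n",
                rw == ACCESS_W ? "wrmsr" : "rdmsr", msr);
    return 0;
}

static int fake_get(uint32_t index, uint64_t *data, bool host_initiated)
{
    (void)host_initiated;
    if (index == 0x20) { *data = 42; return 0; }
    return RET_UNSUPPORTED;
}

int main(void)
{
    uint64_t v = 0;
    printf("known:      %d (v=%llu)\n",
           do_msr_access(0x20, &v, false, ACCESS_R, fake_get), (unsigned long long)v);
    printf("advertised: %d\n", do_msr_access(0x10, &v, true, ACCESS_R, fake_get));
    printf("unknown:    %d\n", do_msr_access(0x99, &v, false, ACCESS_R, fake_get));
    return 0;
}
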
@@ -355,7 +568,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
/*
* Disabling irqs at this point since the following code could be
- * interrupted and executed through kvm_arch_hardware_disable()
+ * interrupted and executed through kvm_arch_disable_virtualization_cpu()
*/
local_irq_save(flags);
if (msrs->registered) {
@@ -413,8 +626,7 @@ EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
u64 value;
int i;
@@ -621,12 +833,6 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
-/* Forcibly leave the nested mode in cases like a vCPU reset */
-static void kvm_leave_nested(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops.nested_ops->leave_nested(vcpu);
-}
-
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
@@ -1412,178 +1618,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
- * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
- * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
- * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
- * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
- * MSRs that KVM emulates without strictly requiring host support.
- * msr_based_features holds MSRs that enumerate features, i.e. are effectively
- * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
- * msrs_to_save and emulated_msrs.
- */
-
-static const u32 msrs_to_save_base[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
- MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
- MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
- MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
- MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
- MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
- MSR_IA32_UMWAIT_CONTROL,
-
- MSR_IA32_XFD, MSR_IA32_XFD_ERR,
-};
-
-static const u32 msrs_to_save_pmu[] = {
- MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
- MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
- MSR_CORE_PERF_GLOBAL_CTRL,
- MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
-
- /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
- MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
- MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
- MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
- MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
- MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
- MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
-
- MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
- MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
-
- /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
- MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
- MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
- MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
- MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
-
- MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
-};
-
-static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
- ARRAY_SIZE(msrs_to_save_pmu)];
-static unsigned num_msrs_to_save;
-
-static const u32 emulated_msrs_all[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
-
-#ifdef CONFIG_KVM_HYPERV
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
- HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
- HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
- HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
- HV_X64_MSR_RESET,
- HV_X64_MSR_VP_INDEX,
- HV_X64_MSR_VP_RUNTIME,
- HV_X64_MSR_SCONTROL,
- HV_X64_MSR_STIMER0_CONFIG,
- HV_X64_MSR_VP_ASSIST_PAGE,
- HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
- HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
- HV_X64_MSR_SYNDBG_OPTIONS,
- HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
- HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
- HV_X64_MSR_SYNDBG_PENDING_BUFFER,
-#endif
-
- MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
-
- MSR_IA32_TSC_ADJUST,
- MSR_IA32_TSC_DEADLINE,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
- MSR_IA32_MISC_ENABLE,
- MSR_IA32_MCG_STATUS,
- MSR_IA32_MCG_CTL,
- MSR_IA32_MCG_EXT_CTL,
- MSR_IA32_SMBASE,
- MSR_SMI_COUNT,
- MSR_PLATFORM_INFO,
- MSR_MISC_FEATURES_ENABLES,
- MSR_AMD64_VIRT_SPEC_CTRL,
- MSR_AMD64_TSC_RATIO,
- MSR_IA32_POWER_CTL,
- MSR_IA32_UCODE_REV,
-
- /*
- * KVM always supports the "true" VMX control MSRs, even if the host
- * does not. The VMX MSRs as a whole are considered "emulated" as KVM
- * doesn't strictly require them to exist in the host (ignoring that
- * KVM would refuse to load in the first place if the core set of MSRs
- * aren't supported).
- */
- MSR_IA32_VMX_BASIC,
- MSR_IA32_VMX_TRUE_PINBASED_CTLS,
- MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
- MSR_IA32_VMX_TRUE_EXIT_CTLS,
- MSR_IA32_VMX_TRUE_ENTRY_CTLS,
- MSR_IA32_VMX_MISC,
- MSR_IA32_VMX_CR0_FIXED0,
- MSR_IA32_VMX_CR4_FIXED0,
- MSR_IA32_VMX_VMCS_ENUM,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- MSR_IA32_VMX_EPT_VPID_CAP,
- MSR_IA32_VMX_VMFUNC,
-
- MSR_K7_HWCR,
- MSR_KVM_POLL_CONTROL,
-};
-
-static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
-static unsigned num_emulated_msrs;
-
-/*
- * List of MSRs that control the existence of MSR-based features, i.e. MSRs
- * that are effectively CPUID leafs. VMX MSRs are also included in the set of
- * feature MSRs, but are handled separately to allow expedited lookups.
- */
-static const u32 msr_based_features_all_except_vmx[] = {
- MSR_AMD64_DE_CFG,
- MSR_IA32_UCODE_REV,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
-};
-
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
- (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
-static unsigned int num_msr_based_features;
-
-/*
- * All feature MSRs except uCode revID, which tracks the currently loaded uCode
- * patch, are immutable once the vCPU model is defined.
- */
-static bool kvm_is_immutable_feature_msr(u32 msr)
-{
- int i;
-
- if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
- return true;
-
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
- if (msr == msr_based_features_all_except_vmx[i])
- return msr != MSR_IA32_UCODE_REV;
- }
-
- return false;
-}
-
-/*
* Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
* does not yet virtualize. These include:
* 10 - MISC_PACKAGE_CTRLS
@@ -1660,40 +1694,31 @@ static u64 kvm_get_arch_capabilities(void)
return data;
}
-static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
+static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
{
- switch (msr->index) {
+ WARN_ON_ONCE(!host_initiated);
+
+ switch (index) {
case MSR_IA32_ARCH_CAPABILITIES:
- msr->data = kvm_get_arch_capabilities();
+ *data = kvm_get_arch_capabilities();
break;
case MSR_IA32_PERF_CAPABILITIES:
- msr->data = kvm_caps.supported_perf_cap;
+ *data = kvm_caps.supported_perf_cap;
break;
case MSR_IA32_UCODE_REV:
- rdmsrl_safe(msr->index, &msr->data);
+ rdmsrl_safe(index, data);
break;
default:
- return kvm_x86_call(get_msr_feature)(msr);
+ return kvm_x86_call(get_feature_msr)(index, data);
}
return 0;
}
-static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- struct kvm_msr_entry msr;
- int r;
-
- /* Unconditionally clear the output for simplicity */
- msr.data = 0;
- msr.index = index;
- r = kvm_get_msr_feature(&msr);
-
- if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
- r = 0;
-
- *data = msr.data;
-
- return r;
+ return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
+ kvm_get_feature_msr);
}
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1880,16 +1905,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
return kvm_x86_call(set_msr)(vcpu, &msr);
}
+static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ return __kvm_set_msr(vcpu, index, *data, host_initiated);
+}
+
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 data, bool host_initiated)
{
- int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
-
- if (ret == KVM_MSR_RET_INVALID)
- if (kvm_msr_ignored_check(index, data, true))
- ret = 0;
-
- return ret;
+ return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
+ _kvm_set_msr);
}
/*
@@ -1928,16 +1954,8 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 *data, bool host_initiated)
{
- int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
-
- if (ret == KVM_MSR_RET_INVALID) {
- /* Unconditionally clear *data for simplicity */
- *data = 0;
- if (kvm_msr_ignored_check(index, 0, false))
- ret = 0;
- }
-
- return ret;
+ return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R,
+ __kvm_get_msr);
}
static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
@@ -1999,7 +2017,7 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
static u64 kvm_msr_reason(int r)
{
switch (r) {
- case KVM_MSR_RET_INVALID:
+ case KVM_MSR_RET_UNSUPPORTED:
return KVM_MSR_EXIT_REASON_UNKNOWN;
case KVM_MSR_RET_FILTERED:
return KVM_MSR_EXIT_REASON_FILTER;
@@ -2162,31 +2180,34 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
{
u32 msr = kvm_rcx_read(vcpu);
u64 data;
- fastpath_t ret = EXIT_FASTPATH_NONE;
+ fastpath_t ret;
+ bool handled;
kvm_vcpu_srcu_read_lock(vcpu);
switch (msr) {
case APIC_BASE_MSR + (APIC_ICR >> 4):
data = kvm_read_edx_eax(vcpu);
- if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
- kvm_skip_emulated_instruction(vcpu);
- ret = EXIT_FASTPATH_EXIT_HANDLED;
- }
+ handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
break;
case MSR_IA32_TSC_DEADLINE:
data = kvm_read_edx_eax(vcpu);
- if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
- kvm_skip_emulated_instruction(vcpu);
- ret = EXIT_FASTPATH_REENTER_GUEST;
- }
+ handled = !handle_fastpath_set_tscdeadline(vcpu, data);
break;
default:
+ handled = false;
break;
}
- if (ret != EXIT_FASTPATH_NONE)
+ if (handled) {
+ if (!kvm_skip_emulated_instruction(vcpu))
+ ret = EXIT_FASTPATH_EXIT_USERSPACE;
+ else
+ ret = EXIT_FASTPATH_REENTER_GUEST;
trace_kvm_msr_write(msr, data);
+ } else {
+ ret = EXIT_FASTPATH_NONE;
+ }
kvm_vcpu_srcu_read_unlock(vcpu);
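
The rework of handle_fastpath_set_msr_irqoff() above separates "did we handle the write" from "what exit code do we return": a handled write now either re-enters the guest or, when skipping the instruction has to bounce to userspace, returns the new EXIT_FASTPATH_EXIT_USERSPACE. A small user-space sketch of that control flow follows; the skip stub and the boolean inputs are invented for the demo.

#include <stdbool.h>
#include <stdio.h>

typedef enum {
    FASTPATH_NONE,
    FASTPATH_REENTER_GUEST,
    FASTPATH_EXIT_USERSPACE,
} fastpath_t;

/* Stub: returns false when the skip must be completed in userspace. */
static bool skip_emulated_instruction(bool needs_userspace)
{
    return !needs_userspace;
}

static fastpath_t fastpath_msr_write(bool handled, bool needs_userspace)
{
    fastpath_t ret;

    if (handled) {
        if (!skip_emulated_instruction(needs_userspace))
            ret = FASTPATH_EXIT_USERSPACE;
        else
            ret = FASTPATH_REENTER_GUEST;
        printf("trace: msr write handled in fastpath\n");
    } else {
        ret = FASTPATH_NONE;
    }
    return ret;
}

int main(void)
{
    printf("%d %d %d\n",
           fastpath_msr_write(true, false),   /* re-enter guest */
           fastpath_msr_write(true, true),    /* exit to userspace */
           fastpath_msr_write(false, false)); /* fall back to full handler */
    return 0;
}
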
@@ -3746,18 +3767,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
-static bool kvm_is_msr_to_save(u32 msr_index)
-{
- unsigned int i;
-
- for (i = 0; i < num_msrs_to_save; i++) {
- if (msrs_to_save[i] == msr_index)
- return true;
- }
-
- return false;
-}
-
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u32 msr = msr_info->index;
@@ -4139,15 +4148,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to write '0' to MSRs that KVM reports
- * as to-be-saved, even if an MSRs isn't fully supported.
- */
- if (msr_info->host_initiated && !data &&
- kvm_is_msr_to_save(msr))
- break;
-
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
}
@@ -4498,17 +4499,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to read MSRs that KVM reports as
- * to-be-saved, even if an MSR isn't fully supported.
- */
- if (msr_info->host_initiated &&
- kvm_is_msr_to_save(msr_info->index)) {
- msr_info->data = 0;
- break;
- }
-
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
}
@@ -4656,7 +4647,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ASYNC_PF_INT:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
- case KVM_CAP_READONLY_MEM:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
case KVM_CAP_DISABLE_QUIRKS:
@@ -4815,6 +4805,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_TYPES:
r = kvm_caps.supported_vm_types;
break;
+ case KVM_CAP_READONLY_MEM:
+ r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1;
+ break;
default:
break;
}
@@ -4944,7 +4937,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
break;
}
case KVM_GET_MSRS:
- r = msr_io(NULL, argp, do_get_msr_feature, 1);
+ r = msr_io(NULL, argp, do_get_feature_msr, 1);
break;
#ifdef CONFIG_KVM_HYPERV
case KVM_GET_SUPPORTED_HV_CPUID:
@@ -6040,7 +6033,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
break;
+ kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ kvm_vcpu_srcu_read_unlock(vcpu);
break;
}
case KVM_GET_DEBUGREGS: {
@@ -7379,11 +7374,9 @@ out:
static void kvm_probe_feature_msr(u32 msr_index)
{
- struct kvm_msr_entry msr = {
- .index = msr_index,
- };
+ u64 data;
- if (kvm_get_msr_feature(&msr))
+ if (kvm_get_feature_msr(NULL, msr_index, &data, true))
return;
msr_based_features[num_msr_based_features++] = msr_index;
@@ -8861,60 +8854,13 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return 1;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- int emulation_type)
+static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa,
+ int emulation_type)
{
- gpa_t gpa = cr2_or_gpa;
- kvm_pfn_t pfn;
-
if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
- if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
- WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
- return false;
-
- if (!vcpu->arch.mmu->root_role.direct) {
- /*
- * Write permission should be allowed since only
- * write access need to be emulated.
- */
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
-
- /*
- * If the mapping is invalid in guest, let cpu retry
- * it to generate fault.
- */
- if (gpa == INVALID_GPA)
- return true;
- }
-
- /*
- * Do not retry the unhandleable instruction if it faults on the
- * readonly host memory, otherwise it will goto a infinite loop:
- * retry instruction -> write #PF -> emulation fail -> retry
- * instruction -> ...
- */
- pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-
- /*
- * If the instruction failed on the error pfn, it can not be fixed,
- * report the error to userspace.
- */
- if (is_error_noslot_pfn(pfn))
- return false;
-
- kvm_release_pfn_clean(pfn);
-
- /*
- * If emulation may have been triggered by a write to a shadowed page
- * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
- * guest to let the CPU re-execute the instruction in the hope that the
- * CPU can cleanly execute the instruction that KVM failed to emulate.
- */
- if (vcpu->kvm->arch.indirect_shadow_pages)
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
/*
* If the failed instruction faulted on an access to page tables that
* are used to translate any part of the instruction, KVM can't resolve
@@ -8925,54 +8871,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* then zap the SPTE to unprotect the gfn, and then do it all over
* again. Report the error to userspace.
*/
- return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
-}
-
-static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- gpa_t cr2_or_gpa, int emulation_type)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
-
- last_retry_eip = vcpu->arch.last_retry_eip;
- last_retry_addr = vcpu->arch.last_retry_addr;
+ if (emulation_type & EMULTYPE_WRITE_PF_TO_SP)
+ return false;
/*
- * If the emulation is caused by #PF and it is non-page_table
- * writing instruction, it means the VM-EXIT is caused by shadow
- * page protected, we can zap the shadow page and retry this
- * instruction directly.
- *
- * Note: if the guest uses a non-page-table modifying instruction
- * on the PDE that points to the instruction, then we will unmap
- * the instruction and go to an infinite loop. So, we cache the
- * last retried eip and the last fault address, if we meet the eip
- * and the address again, we can break out of the potential infinite
- * loop.
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
*/
- vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
-
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
- return false;
-
- if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
- WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
- return false;
-
- if (x86_page_table_writing_insn(ctxt))
- return false;
-
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
- return false;
-
- vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2_or_gpa;
-
- if (!vcpu->arch.mmu->root_role.direct)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
-
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true);
+ /*
+ * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible
+ * all SPTEs were already zapped by a different task. The alternative
+ * is to report the error to userspace and likely terminate the guest,
+ * and the last_retry_{eip,addr} checks will prevent retrying the page
+ * fault indefinitely, i.e. there's nothing to lose by retrying.
+ */
return true;
}
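
kvm_unprotect_and_retry_on_failure() above keeps only two cheap checks (retrying is allowed at all, and the fault was not a write to a page table used to translate the faulting instruction) before unprotecting the gfn and asking the caller to retry; the old pfn probing and last_retry_{eip,addr} caching move out of this path. A boolean sketch of that trimmed-down decision, with the flag values and the unprotect stub invented for the demo:

#include <stdbool.h>
#include <stdio.h>

#define EMULTYPE_ALLOW_RETRY_PF  (1u << 0)
#define EMULTYPE_WRITE_PF_TO_SP  (1u << 1)

/* Stub for the "zap shadow pages covering this gfn" side effect. */
static void unprotect_gfn(unsigned long gpa)
{
    printf("unprotecting gfn for gpa %#lx\n", gpa);
}

/* Mirrors the trimmed helper: bail out only when retrying is disallowed or
 * provably futile, otherwise unprotect and ask the caller to retry. */
static bool unprotect_and_retry_on_failure(unsigned long gpa, unsigned int emulation_type)
{
    if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
        return false;
    if (emulation_type & EMULTYPE_WRITE_PF_TO_SP)
        return false;

    unprotect_gfn(gpa);
    return true;
}

int main(void)
{
    printf("%d\n", unprotect_and_retry_on_failure(0x1000, EMULTYPE_ALLOW_RETRY_PF));
    printf("%d\n", unprotect_and_retry_on_failure(0x1000, 0));
    return 0;
}
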
@@ -9172,6 +9088,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))))
+ emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF;
+
r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
if (r != X86EMUL_CONTINUE) {
if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
@@ -9202,8 +9123,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (reexecute_instruction(vcpu, cr2_or_gpa,
- emulation_type))
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
return 1;
if (ctxt->have_exception &&
@@ -9250,7 +9171,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return 1;
}
- if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+ /*
+ * If emulation was caused by a write-protection #PF on a non-page_table
+ * writing instruction, try to unprotect the gfn, i.e. zap shadow pages,
+ * and retry the instruction, as the vCPU is likely no longer using the
+ * gfn as a page table.
+ */
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ !x86_page_table_writing_insn(ctxt) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
return 1;
/* this is needed for vmware backdoor interface to work since it
@@ -9281,7 +9210,8 @@ restart:
return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
return 1;
return handle_emulation_failure(vcpu, emulation_type);
@@ -9749,7 +9679,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
guard(mutex)(&vendor_module_lock);
- if (kvm_x86_ops.hardware_enable) {
+ if (kvm_x86_ops.enable_virtualization_cpu) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
@@ -9876,7 +9806,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return 0;
out_unwind_ops:
- kvm_x86_ops.hardware_enable = NULL;
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
kvm_mmu_vendor_module_exit();
@@ -9917,56 +9847,11 @@ void kvm_x86_vendor_exit(void)
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
mutex_lock(&vendor_module_lock);
- kvm_x86_ops.hardware_enable = NULL;
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
mutex_unlock(&vendor_module_lock);
}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
-static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
-{
- /*
- * The vCPU has halted, e.g. executed HLT. Update the run state if the
- * local APIC is in-kernel, the run loop will detect the non-runnable
- * state and halt the vCPU. Exit to userspace if the local APIC is
- * managed by userspace, in which case userspace is responsible for
- * handling wake events.
- */
- ++vcpu->stat.halt_exits;
- if (lapic_in_kernel(vcpu)) {
- vcpu->arch.mp_state = state;
- return 1;
- } else {
- vcpu->run->exit_reason = reason;
- return 0;
- }
-}
-
-int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
-{
- return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
- int ret = kvm_skip_emulated_instruction(vcpu);
- /*
- * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
- return kvm_emulate_halt_noskip(vcpu) && ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
-{
- int ret = kvm_skip_emulated_instruction(vcpu);
-
- return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
- KVM_EXIT_AP_RESET_HOLD) && ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
-
#ifdef CONFIG_X86_64
static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
unsigned long clock_type)
@@ -11203,6 +11088,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
+ if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
+ return 0;
+
r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
return r;
@@ -11216,6 +11104,67 @@ out:
return r;
}
+static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
+
+static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+{
+ if (!list_empty_careful(&vcpu->async_pf.done))
+ return true;
+
+ if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+ kvm_apic_init_sipi_allowed(vcpu))
+ return true;
+
+ if (vcpu->arch.pv.pv_unhalted)
+ return true;
+
+ if (kvm_is_exception_pending(vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_call(nmi_allowed)(vcpu, false)))
+ return true;
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending &&
+ kvm_x86_call(smi_allowed)(vcpu, false)))
+ return true;
+#endif
+
+ if (kvm_test_request(KVM_REQ_PMI, vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
+ return true;
+
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu, false))
+ return true;
+
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
+ return false;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+}
+
/* Called within kvm->srcu read side. */
static inline int vcpu_block(struct kvm_vcpu *vcpu)
{
@@ -11287,12 +11236,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
return 1;
}
-static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
-{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted);
-}
-
/* Called within kvm->srcu read side. */
static int vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -11344,6 +11287,98 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
return r;
}
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
+{
+ /*
+ * The vCPU has halted, e.g. executed HLT. Update the run state if the
+ * local APIC is in-kernel, the run loop will detect the non-runnable
+ * state and halt the vCPU. Exit to userspace if the local APIC is
+ * managed by userspace, in which case userspace is responsible for
+ * handling wake events.
+ */
+ ++vcpu->stat.halt_exits;
+ if (lapic_in_kernel(vcpu)) {
+ if (kvm_vcpu_has_events(vcpu))
+ vcpu->arch.pv.pv_unhalted = false;
+ else
+ vcpu->arch.mp_state = state;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = reason;
+ return 0;
+ }
+}
+
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_emulate_halt_noskip(vcpu) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+ ret = kvm_emulate_halt(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ if (!ret)
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+
+ if (kvm_vcpu_running(vcpu))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+ return EXIT_FASTPATH_EXIT_HANDLED;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_hlt);
+
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
+
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_apicv_active(vcpu) &&
+ kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
+}
+
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ return kvm_arch_dy_has_pending_interrupt(vcpu);
+}
+
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
@@ -12512,7 +12547,17 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
-int kvm_arch_hardware_enable(void)
+void kvm_arch_enable_virtualization(void)
+{
+ cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+void kvm_arch_disable_virtualization(void)
+{
+ cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+int kvm_arch_enable_virtualization_cpu(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
@@ -12528,7 +12573,7 @@ int kvm_arch_hardware_enable(void)
if (ret)
return ret;
- ret = kvm_x86_call(hardware_enable)();
+ ret = kvm_x86_call(enable_virtualization_cpu)();
if (ret != 0)
return ret;
@@ -12608,9 +12653,9 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
- kvm_x86_call(hardware_disable)();
+ kvm_x86_call(disable_virtualization_cpu)();
drop_user_return_notifiers();
}
@@ -13158,87 +13203,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
-{
- if (!list_empty_careful(&vcpu->async_pf.done))
- return true;
-
- if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
- kvm_apic_init_sipi_allowed(vcpu))
- return true;
-
- if (vcpu->arch.pv.pv_unhalted)
- return true;
-
- if (kvm_is_exception_pending(vcpu))
- return true;
-
- if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
- (vcpu->arch.nmi_pending &&
- kvm_x86_call(nmi_allowed)(vcpu, false)))
- return true;
-
-#ifdef CONFIG_KVM_SMM
- if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
- (vcpu->arch.smi_pending &&
- kvm_x86_call(smi_allowed)(vcpu, false)))
- return true;
-#endif
-
- if (kvm_test_request(KVM_REQ_PMI, vcpu))
- return true;
-
- if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
- return true;
-
- if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
- return true;
-
- if (kvm_hv_has_stimer_pending(vcpu))
- return true;
-
- if (is_guest_mode(vcpu) &&
- kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu, false))
- return true;
-
- if (kvm_xen_has_pending_events(vcpu))
- return true;
-
- return false;
-}
-
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
-}
-
-bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_apicv_active(vcpu) &&
- kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
-}
-
-bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.preempted_in_kernel;
-}
-
-bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
-{
- if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
- return true;
-
- if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
-#ifdef CONFIG_KVM_SMM
- kvm_test_request(KVM_REQ_SMI, vcpu) ||
-#endif
- kvm_test_request(KVM_REQ_EVENT, vcpu))
- return true;
-
- return kvm_arch_dy_has_pending_interrupt(vcpu);
-}
-
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.guest_state_protected)
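
Taken together, the halt-related x86.c hunks above let a HLT exit stay on the fastpath: __kvm_emulate_halt() now declines to change mp_state when kvm_vcpu_has_events() says a wake event is already pending, and handle_fastpath_hlt() maps the outcome onto re-enter / handled / exit-to-userspace. The compact user-space model below reduces the vCPU to two booleans to show that decision; the struct and its fields are invented for illustration and are not the kernel's data layout.

#include <stdbool.h>
#include <stdio.h>

typedef enum {
    FASTPATH_EXIT_USERSPACE,    /* userspace-managed APIC: userspace handles wake */
    FASTPATH_REENTER_GUEST,     /* an event is already pending, keep running */
    FASTPATH_EXIT_HANDLED,      /* really halted: fall into the blocking loop */
} fastpath_t;

struct vcpu_model {
    bool lapic_in_kernel;
    bool event_pending;
    bool halted;
};

/* Mirrors the shape of __kvm_emulate_halt() + handle_fastpath_hlt(): only mark
 * the vCPU halted when nothing would immediately wake it again. */
static fastpath_t fastpath_hlt(struct vcpu_model *v)
{
    if (!v->lapic_in_kernel)
        return FASTPATH_EXIT_USERSPACE;

    if (!v->event_pending)
        v->halted = true;

    return v->halted ? FASTPATH_EXIT_HANDLED : FASTPATH_REENTER_GUEST;
}

int main(void)
{
    struct vcpu_model idle    = { .lapic_in_kernel = true,  .event_pending = false };
    struct vcpu_model pending = { .lapic_in_kernel = true,  .event_pending = true  };
    struct vcpu_model uapic   = { .lapic_in_kernel = false, .event_pending = false };

    printf("idle=%d pending=%d userspace-apic=%d\n",
           fastpath_hlt(&idle), fastpath_hlt(&pending), fastpath_hlt(&uapic));
    return 0;
}
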
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5585c2ef147a..a84c48ef5278 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -109,6 +109,12 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+/* Forcibly leave the nested mode in cases like a vCPU reset */
+static inline void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
{
return vcpu->arch.last_vmentry_cpu != -1;
@@ -335,6 +341,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu);
extern struct kvm_caps kvm_caps;
extern struct kvm_host_values kvm_host;
@@ -505,13 +512,26 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
+enum kvm_msr_access {
+ MSR_TYPE_R = BIT(0),
+ MSR_TYPE_W = BIT(1),
+ MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W,
+};
+
/*
* Internal error codes that are used to indicate that MSR emulation encountered
- * an error that should result in #GP in the guest, unless userspace
- * handles it.
+ * an error that should result in #GP in the guest, unless userspace handles it.
+ * Note, '1', '0', and negative numbers are off limits, as they are used by KVM
+ * as part of KVM's lightly documented internal KVM_RUN return codes.
+ *
+ * UNSUPPORTED - The MSR isn't supported, either because it is completely
+ * unknown to KVM, or because the MSR should not exist according
+ * to the vCPU model.
+ *
+ * FILTERED - Access to the MSR is denied by a userspace MSR filter.
*/
-#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */
-#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */
+#define KVM_MSR_RET_UNSUPPORTED 2
+#define KVM_MSR_RET_FILTERED 3
#define __cr4_reserved_bits(__cpu_has, __c) \
({ \
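
The x86.h hunks above give the MSR access type its own enum (replacing the defines dropped from vmx.h earlier in this diff) and rename KVM_MSR_RET_INVALID to KVM_MSR_RET_UNSUPPORTED while documenting why the values 2 and 3 are used: 0, 1, and negative numbers already carry meaning on KVM_RUN return paths. A tiny sketch of that three-way contract, with the caller-side descriptions invented for the demo:

#include <stdio.h>

/* Stand-ins mirroring the documented contract: 1 continues in-kernel,
 * 0 exits to userspace, negatives are errors, 2/3 are MSR-specific. */
#define RET_UNSUPPORTED 2
#define RET_FILTERED    3

static const char *describe(int ret)
{
    switch (ret) {
    case 0:               return "exit to userspace";
    case 1:               return "handled in kernel";
    case RET_UNSUPPORTED: return "MSR unknown or absent from the vCPU model";
    case RET_FILTERED:    return "denied by a userspace MSR filter";
    default:              return ret < 0 ? "error" : "unexpected";
    }
}

int main(void)
{
    int codes[] = { 0, 1, RET_UNSUPPORTED, RET_FILTERED, -22 };
    size_t i;

    for (i = 0; i < sizeof(codes) / sizeof(codes[0]); i++)
        printf("%d: %s\n", codes[i], describe(codes[i]));
    return 0;
}
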
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac8b206..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
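
The add_pages() hunk rejects hotplug ranges whose last byte lies above PHYSMEM_END before touching the page tables; the end address is computed as ((start_pfn + nr_pages) << PAGE_SHIFT) - 1, i.e. the inclusive last byte of the range. A user-space sketch of the same bound check, with PAGE_SHIFT kept at 4 KiB pages and an arbitrary limit standing in for the kernel's PHYSMEM_END:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12ULL
/* Arbitrary stand-in for PHYSMEM_END (inclusive last valid physical byte). */
#define PHYSMEM_END  ((1ULL << 46) - 1)

/* Returns -1 (think -ERANGE) if the range's last byte exceeds the limit. */
static int check_add_pages(uint64_t start_pfn, uint64_t nr_pages)
{
    uint64_t end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;

    if (end > PHYSMEM_END)
        return -1;
    return 0;
}

int main(void)
{
    /* One page ending exactly at the limit is accepted... */
    uint64_t last_pfn = PHYSMEM_END >> PAGE_SHIFT;
    printf("at limit:   %d\n", check_add_pages(last_pfn, 1));
    /* ...one page past it is rejected. */
    printf("past limit: %d\n", check_add_pages(last_pfn + 1, 1));
    return 0;
}
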
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
+ /* Calculate the end of the region */
+ vaddr += get_padding(&kaslr_regions[i]);
/*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
*/
- vaddr += get_padding(&kaslr_regions[i]);
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
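
The kaslr.c changes attach an optional .end pointer to each randomized region and use it to shrink physmem_end from its preset of (1ULL << MAX_PHYSMEM_BITS) - 1 down to what the randomized direct map can actually cover: after placing the region base and adding its padding, __pa_nodebug(vaddr - 1) is the last physical byte that still maps inside it. The simplified user-space model below keeps one region, skips the entropy step, and replaces the virtual-to-physical translation with a plain offset; the base address and the 10 TB size are invented for the demo.

#include <stdint.h>
#include <stdio.h>

#define TB(x)            ((uint64_t)(x) << 40)
#define MAX_PHYSMEM_BITS 46ULL

/* Invented layout: the direct map starts at this virtual base and the region
 * is sized at 10 TB for the example. __pa() is modeled as a plain offset. */
static const uint64_t direct_map_base = 0xffff888000000000ULL;
static uint64_t physmem_end;

static uint64_t pa(uint64_t vaddr, uint64_t region_base)
{
    return vaddr - region_base;
}

int main(void)
{
    uint64_t vaddr = direct_map_base;
    uint64_t region_size = TB(10);

    /* Preset: everything the architecture could address. */
    physmem_end = (1ULL << MAX_PHYSMEM_BITS) - 1;

    /* After KASLR places the region, trim to what the region covers. */
    vaddr += region_size;
    physmem_end = pa(vaddr - 1, direct_map_base);

    printf("physmem_end = %#llx (last byte of a %llu TB direct map)\n",
           (unsigned long long)physmem_end,
           (unsigned long long)(region_size >> 40));
    return 0;
}
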