Diffstat (limited to 'arch')
476 files changed, 11249 insertions, 4822 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index de5200eb55d1..832f68af7c77 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1025,6 +1025,14 @@ config ARCH_WANTS_EXECMEM_LATE enough entropy for module space randomization, for instance arm64. +config ARCH_HAS_EXECMEM_ROX + bool + depends on MMU && !HIGHMEM + help + For architectures that support allocations of executable memory + with read-only execute permissions. Architecture must implement + execmem_fill_trapping_insns() callback to enable this. + config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help @@ -1683,4 +1691,10 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT config ARCH_NEED_CMPXCHG_1_EMU bool +config ARCH_WANTS_PRE_LINK_VMLINUX + bool + help + An architecture can select this if it provides arch/<arch>/tools/Makefile + with .arch.vmlinux.o target to be linked into vmlinux. + endmenu diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig index 1816c1dc22b1..3280bd9e6578 100644 --- a/arch/alpha/configs/defconfig +++ b/arch/alpha/configs/defconfig @@ -51,7 +51,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_CMOS=y CONFIG_EXT2_FS=y -CONFIG_REISERFS_FS=m CONFIG_ISO9660_FS=y CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 396caece6d6d..483965c5a4de 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -5,3 +5,4 @@ generic-y += agp.h generic-y += asm-offsets.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += text-patching.h diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 261af54fd601..5ec4c77e432e 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -14,7 +14,7 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h index 2526fd3be5fd..05a444d77c53 100644 --- a/arch/alpha/include/asm/spinlock_types.h +++ b/arch/alpha/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define _ALPHA_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." 
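As a minimal sketch of what an architecture selecting the new ARCH_HAS_EXECMEM_ROX option above has to provide: the help text only says that execmem_fill_trapping_insns() must be implemented, so the signature below, the TRAP_INSN_BYTE opcode and the arch_text_poke_set() helper are assumptions modelled on existing arch code, not part of this diff.

	#include <linux/string.h>

	/* Fill a region of executable memory with trapping instructions. */
	void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
	{
		if (writeable)
			/* Range is still writable: a plain memset is enough. */
			memset(ptr, TRAP_INSN_BYTE, size);
		else
			/* Range is already read-only + executable: go through the
			 * architecture's text-patching path instead. */
			arch_text_poke_set(ptr, TRAP_INSN_BYTE, size);
	}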
#endif typedef struct { diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 763929e814e9..1e700468a685 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -78,6 +78,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 251b73c5481e..302507bf9b5d 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -146,6 +146,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 49285a3ce239..4c69522e0328 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -6,3 +6,4 @@ generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index e1cb170c2bf0..38916ac4bce4 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -583,10 +583,6 @@ CONFIG_EXT2_FS_SECURITY=y CONFIG_EXT3_FS=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y -CONFIG_REISERFS_FS=m -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -CONFIG_REISERFS_FS_SECURITY=y CONFIG_XFS_FS=m CONFIG_AUTOFS_FS=m CONFIG_FUSE_FS=m diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h index e4eb54f6cd9f..a35aba7f548c 100644 --- a/arch/arm/include/asm/jump_label.h +++ b/arch/arm/include/asm/jump_label.h @@ -9,13 +9,17 @@ #define JUMP_LABEL_NOP_SIZE 4 +/* This macro is also expanded on the Rust side. */ +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + "1:\n\t" \ + WASM(nop) "\n\t" \ + ".pushsection __jump_table, \"aw\"\n\t" \ + ".word 1b, " label ", " key "\n\t" \ + ".popsection\n\t" \ + static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm goto("1:\n\t" - WASM(nop) "\n\t" - ".pushsection __jump_table, \"aw\"\n\t" - ".word 1b, %l[l_yes], %c0\n\t" - ".popsection\n\t" + asm goto(ARCH_STATIC_BRANCH_ASM("%c0", "%l[l_yes]") : : "i" (&((char *)key)[branch]) : : l_yes); return false; diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index 0c14b36ef101..5404a2a96bf3 100644 --- a/arch/arm/include/asm/spinlock_types.h +++ b/arch/arm/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." 
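A hedged userspace sketch of how the MADV_GUARD_INSTALL / MADV_GUARD_REMOVE advice values added above are meant to be used. The numeric values are taken from the alpha uapi hunk, and on older libc headers the constants may need to be defined by hand; per the comment in the hunk, access to a guarded range raises a fatal signal until the guard is removed.

	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef MADV_GUARD_INSTALL
	#define MADV_GUARD_INSTALL 102	/* values from the hunk above (alpha uapi) */
	#define MADV_GUARD_REMOVE  103
	#endif

	/* Guard one page at 'base', then lift the guard again. */
	static int guard_and_unguard(void *base)
	{
		long pagesz = sysconf(_SC_PAGESIZE);

		if (madvise(base, pagesz, MADV_GUARD_INSTALL))
			return -1;	/* e.g. EINVAL on kernels without the feature */

		/* Any access to [base, base + pagesz) now delivers a fatal signal. */

		return madvise(base, pagesz, MADV_GUARD_REMOVE);
	}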
#endif #define TICKET_SHIFT 16 diff --git a/arch/arm/include/asm/patch.h b/arch/arm/include/asm/text-patching.h index 0b48247c4600..0b48247c4600 100644 --- a/arch/arm/include/asm/patch.h +++ b/arch/arm/include/asm/text-patching.h diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index e61591f33a6c..845acf9ce21e 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -23,7 +23,7 @@ #include <asm/insn.h> #include <asm/set_memory.h> #include <asm/stacktrace.h> -#include <asm/patch.h> +#include <asm/text-patching.h> /* * The compiler emitted profiling hook consists of diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c index eb9c24b6e8e2..a06a92d0f550 100644 --- a/arch/arm/kernel/jump_label.c +++ b/arch/arm/kernel/jump_label.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/jump_label.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/insn.h> static void __arch_jump_label_transform(struct jump_entry *entry, diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index 22f937e6f3ff..ab76c55fd610 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -15,7 +15,7 @@ #include <linux/kgdb.h> #include <linux/uaccess.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index e9e828b6bb30..4d45e60cd46d 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -9,7 +9,7 @@ #include <asm/fixmap.h> #include <asm/smp_plat.h> #include <asm/opcodes.h> -#include <asm/patch.h> +#include <asm/text-patching.h> struct patch { void *addr; diff --git a/arch/arm/mach-imx/Kconfig b/arch/arm/mach-imx/Kconfig index ab767f059929..e4fe059cd861 100644 --- a/arch/arm/mach-imx/Kconfig +++ b/arch/arm/mach-imx/Kconfig @@ -6,7 +6,6 @@ menuconfig ARCH_MXC select CLKSRC_IMX_GPT select GENERIC_IRQ_CHIP select GPIOLIB - select PINCTRL select PM_OPP if PM select SOC_BUS select SRAM @@ -49,7 +48,6 @@ config SOC_IMX31 config SOC_IMX35 bool "i.MX35 support" select MXC_AVIC - select PINCTRL_IMX35 help This enables support for Freescale i.MX35 processor @@ -61,7 +59,6 @@ config SOC_IMX1 bool "i.MX1 support" select CPU_ARM920T select MXC_AVIC - select PINCTRL_IMX1 help This enables support for Freescale i.MX1 processor @@ -73,7 +70,6 @@ config SOC_IMX25 bool "i.MX25 support" select CPU_ARM926T select MXC_AVIC - select PINCTRL_IMX25 help This enables support for Freescale i.MX25 processor @@ -81,7 +77,6 @@ config SOC_IMX27 bool "i.MX27 support" select CPU_ARM926T select MXC_AVIC - select PINCTRL_IMX27 help This enables support for Freescale i.MX27 processor @@ -98,7 +93,6 @@ config SOC_IMX5 config SOC_IMX50 bool "i.MX50 support" - select PINCTRL_IMX50 select SOC_IMX5 help @@ -106,14 +100,12 @@ config SOC_IMX50 config SOC_IMX51 bool "i.MX51 support" - select PINCTRL_IMX51 select SOC_IMX5 help This enables support for Freescale i.MX51 processor config SOC_IMX53 bool "i.MX53 support" - select PINCTRL_IMX53 select SOC_IMX5 help @@ -137,7 +129,6 @@ config SOC_IMX6Q select ARM_ERRATA_775420 select HAVE_ARM_SCU if SMP select HAVE_ARM_TWD - select PINCTRL_IMX6Q select SOC_IMX6 help @@ -147,7 +138,6 @@ config SOC_IMX6SL bool "i.MX6 SoloLite support" select ARM_ERRATA_754322 select ARM_ERRATA_775420 - select PINCTRL_IMX6SL select SOC_IMX6 help @@ -157,7 +147,6 @@ config SOC_IMX6SLL bool "i.MX6 SoloLiteLite support" select ARM_ERRATA_754322 select 
ARM_ERRATA_775420 - select PINCTRL_IMX6SLL select SOC_IMX6 help @@ -167,7 +156,6 @@ config SOC_IMX6SX bool "i.MX6 SoloX support" select ARM_ERRATA_754322 select ARM_ERRATA_775420 - select PINCTRL_IMX6SX select SOC_IMX6 help @@ -175,7 +163,6 @@ config SOC_IMX6SX config SOC_IMX6UL bool "i.MX6 UltraLite support" - select PINCTRL_IMX6UL select SOC_IMX6 select ARM_ERRATA_814220 @@ -211,7 +198,6 @@ config SOC_IMX7D_CM4 config SOC_IMX7D bool "i.MX7 Dual support" - select PINCTRL_IMX7D select SOC_IMX7D_CA7 if ARCH_MULTI_V7 select SOC_IMX7D_CM4 if ARM_SINGLE_ARMV7M select ARM_ERRATA_814220 if ARCH_MULTI_V7 @@ -221,7 +207,6 @@ config SOC_IMX7D config SOC_IMX7ULP bool "i.MX7ULP support" select CLKSRC_IMX_TPM - select PINCTRL_IMX7ULP select SOC_IMX7D_CA7 if ARCH_MULTI_V7 select SOC_IMX7D_CM4 if ARM_SINGLE_ARMV7M help @@ -237,7 +222,6 @@ config SOC_IMXRT config SOC_VF610 bool "Vybrid Family VF610 support" select ARM_GIC if ARCH_MULTI_V7 - select PINCTRL_VF610 help This enables support for Freescale Vybrid VF610 processor. diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 831793cd6ff9..2bec87c3327d 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -61,32 +61,8 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if defined(CONFIG_SPLIT_PTE_PTLOCKS) -/* - * If we are using split PTE locks, then we need to take the page - * lock here. Otherwise we are using shared mm->page_table_lock - * which is already locked, thus cannot take it. - */ -static inline void do_pte_lock(spinlock_t *ptl) -{ - /* - * Use nested version here to indicate that we are already - * holding one similar spinlock. - */ - spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); -} - -static inline void do_pte_unlock(spinlock_t *ptl) -{ - spin_unlock(ptl); -} -#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline void do_pte_lock(spinlock_t *ptl) {} -static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ - static int adjust_pte(struct vm_area_struct *vma, unsigned long address, - unsigned long pfn) + unsigned long pfn, struct vm_fault *vmf) { spinlock_t *ptl; pgd_t *pgd; @@ -94,6 +70,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pmd_t pmdval; int ret; pgd = pgd_offset(vma->vm_mm, address); @@ -112,20 +89,33 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, if (pmd_none_or_clear_bad(pmd)) return 0; +again: /* * This is called while another page table is mapped, so we * must use the nested version. This also means we need to * open-code the spin-locking. */ - pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl); + pte = pte_offset_map_rw_nolock(vma->vm_mm, pmd, address, &pmdval, &ptl); if (!pte) return 0; - do_pte_lock(ptl); + /* + * If we are using split PTE locks, then we need to take the page + * lock here. Otherwise we are using shared mm->page_table_lock + * which is already locked, thus cannot take it. 
+ */ + if (ptl != vmf->ptl) { + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + pte_unmap_unlock(pte, ptl); + goto again; + } + } ret = do_adjust_pte(vma, address, pfn, pte); - do_pte_unlock(ptl); + if (ptl != vmf->ptl) + spin_unlock(ptl); pte_unmap(pte); return ret; @@ -133,7 +123,8 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, static void make_coherent(struct address_space *mapping, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, unsigned long pfn) + unsigned long addr, pte_t *ptep, unsigned long pfn, + struct vm_fault *vmf) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *mpnt; @@ -160,7 +151,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, if (!(mpnt->vm_flags & VM_MAYSHARE)) continue; offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn); + aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn, vmf); } flush_dcache_mmap_unlock(mapping); if (aliases) @@ -203,7 +194,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) - make_coherent(mapping, vma, addr, ptep, pfn); + make_coherent(mapping, vma, addr, ptep, pfn, vmf); else if (vma->vm_flags & VM_EXEC) __flush_icache_all(); } diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index d8238da095df..9fd877c87a38 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -25,7 +25,7 @@ #include <asm/cacheflush.h> #include <linux/percpu.h> #include <linux/bug.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/sections.h> #include "../decode-arm.h" diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c index 7f65048380ca..966c6042c5ad 100644 --- a/arch/arm/probes/kprobes/opt-arm.c +++ b/arch/arm/probes/kprobes/opt-arm.c @@ -14,7 +14,7 @@ /* for arm_gen_branch */ #include <asm/insn.h> /* for patch_text */ -#include <asm/patch.h> +#include <asm/text-patching.h> #include "core.h" diff --git a/arch/arm64/boot/dts/qcom/sc8180x.dtsi b/arch/arm64/boot/dts/qcom/sc8180x.dtsi index 717ec4ad63f3..745a7d0b8381 100644 --- a/arch/arm64/boot/dts/qcom/sc8180x.dtsi +++ b/arch/arm64/boot/dts/qcom/sc8180x.dtsi @@ -3889,7 +3889,7 @@ }; cpufreq_hw: cpufreq@18323000 { - compatible = "qcom,cpufreq-hw"; + compatible = "qcom,sc8180x-cpufreq-hw", "qcom,cpufreq-hw"; reg = <0 0x18323000 0 0x1400>, <0 0x18325800 0 0x1400>; reg-names = "freq-domain0", "freq-domain1"; diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 9b73fd0cd721..81e4157f92b7 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -46,6 +46,7 @@ struct cpuinfo_arm64 { u64 reg_revidr; u64 reg_gmid; u64 reg_smidr; + u64 reg_mpamidr; u64 reg_id_aa64dfr0; u64 reg_id_aa64dfr1; diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index a08a1212ffbb..201a46efd918 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -62,6 +62,11 @@ cpucap_is_possible(const unsigned int cap) return IS_ENABLED(CONFIG_ARM64_WORKAROUND_REPEAT_TLBI); case ARM64_WORKAROUND_SPECULATIVE_SSBS: return IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386); + case ARM64_MPAM: + /* + * KVM MPAM support doesn't rely on the host kernel supporting MPAM. 
+ */ + return true; } return true; diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d63c20ccefc..b64e49bd9d10 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -613,6 +613,13 @@ static inline bool id_aa64pfr1_sme(u64 pfr1) return val > 0; } +static inline bool id_aa64pfr0_mpam(u64 pfr0) +{ + u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT); + + return val > 0; +} + static inline bool id_aa64pfr1_mte(u64 pfr1) { u32 val = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT); @@ -850,6 +857,16 @@ static inline bool system_supports_haft(void) cpus_have_final_cap(ARM64_HAFT); } +static __always_inline bool system_supports_mpam(void) +{ + return alternative_has_cap_unlikely(ARM64_MPAM); +} + +static __always_inline bool system_supports_mpam_hcr(void) +{ + return alternative_has_cap_unlikely(ARM64_MPAM_HCR); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 27086a81eae3..85ef966c08cd 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -249,6 +249,19 @@ msr spsr_el2, x0 .endm +.macro __init_el2_mpam + /* Memory Partitioning And Monitoring: disable EL2 traps */ + mrs x1, id_aa64pfr0_el1 + ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4 + cbz x0, .Lskip_mpam_\@ // skip if no MPAM + msr_s SYS_MPAM2_EL2, xzr // use the default partition + // and disable lower traps + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 +.Lskip_mpam_\@: +.endm + /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. Note that everything gets initialised as @@ -266,6 +279,7 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr + __init_el2_mpam __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index a0a5bbae7229..424ed421cd97 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -19,10 +19,14 @@ #define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\"\n\t" \ ".align 3\n\t" \ - ".long 1b - ., %l["#label"] - .\n\t" \ - ".quad %c0 - .\n\t" \ - ".popsection\n\t" \ - : : "i"(key) : : label + ".long 1b - ., " label " - .\n\t" \ + ".quad " key " - .\n\t" \ + ".popsection\n\t" + +/* This macro is also expanded on the Rust side. 
*/ +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + "1: nop\n\t" \ + JUMP_TABLE_ENTRY(key, label) static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) @@ -30,8 +34,8 @@ static __always_inline bool arch_static_branch(struct static_key * const key, char *k = &((char *)key)[branch]; asm goto( - "1: nop \n\t" - JUMP_TABLE_ENTRY(k, l_yes) + ARCH_STATIC_BRANCH_ASM("%c0", "%l[l_yes]") + : : "i"(k) : : l_yes ); return false; @@ -43,9 +47,11 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke const bool branch) { char *k = &((char *)key)[branch]; + asm goto( "1: b %l[l_yes] \n\t" - JUMP_TABLE_ENTRY(k, l_yes) + JUMP_TABLE_ENTRY("%c0", "%l[l_yes]") + : : "i"(k) : : l_yes ); return false; l_yes: diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 109a85ee6910..3e0f0de1d2da 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -103,6 +103,7 @@ #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) #define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) +#define MPAMHCR_HOST_FLAGS 0 /* TCR_EL2 Registers bits */ #define TCR_EL2_DS (1UL << 32) @@ -311,35 +312,6 @@ GENMASK(19, 18) | \ GENMASK(15, 0)) -/* Hyp Debug Configuration Register bits */ -#define MDCR_EL2_E2TB_MASK (UL(0x3)) -#define MDCR_EL2_E2TB_SHIFT (UL(24)) -#define MDCR_EL2_HPMFZS (UL(1) << 36) -#define MDCR_EL2_HPMFZO (UL(1) << 29) -#define MDCR_EL2_MTPME (UL(1) << 28) -#define MDCR_EL2_TDCC (UL(1) << 27) -#define MDCR_EL2_HLP (UL(1) << 26) -#define MDCR_EL2_HCCD (UL(1) << 23) -#define MDCR_EL2_TTRF (UL(1) << 19) -#define MDCR_EL2_HPMD (UL(1) << 17) -#define MDCR_EL2_TPMS (UL(1) << 14) -#define MDCR_EL2_E2PB_MASK (UL(0x3)) -#define MDCR_EL2_E2PB_SHIFT (UL(12)) -#define MDCR_EL2_TDRA (UL(1) << 11) -#define MDCR_EL2_TDOSA (UL(1) << 10) -#define MDCR_EL2_TDA (UL(1) << 9) -#define MDCR_EL2_TDE (UL(1) << 8) -#define MDCR_EL2_HPME (UL(1) << 7) -#define MDCR_EL2_TPM (UL(1) << 6) -#define MDCR_EL2_TPMCR (UL(1) << 5) -#define MDCR_EL2_HPMN_MASK (UL(0x1F)) -#define MDCR_EL2_RES0 (GENMASK(63, 37) | \ - GENMASK(35, 30) | \ - GENMASK(25, 24) | \ - GENMASK(22, 20) | \ - BIT(18) | \ - GENMASK(16, 15)) - /* * FGT register definitions * diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 67afac659231..ca2590344313 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -76,7 +76,6 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, - __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a601a9305b10..cf811009a33c 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -225,6 +225,11 @@ static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) return vcpu_has_nv(vcpu) && __is_hyp_ctxt(&vcpu->arch.ctxt); } +static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu) +{ + return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu); +} + /* * The layout of SPSR for an AArch32 state is different when observed from an * AArch64 SPSR_ELx or an AArch32 SPSR_*. 
This function generates the AArch32 @@ -693,4 +698,8 @@ static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu) return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN); } +static inline void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +{ + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); +} #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bf64fed9820e..e18e9244d17a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -74,8 +74,6 @@ enum kvm_mode kvm_get_mode(void); static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; #endif -DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); - extern unsigned int __ro_after_init kvm_sve_max_vl; extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); @@ -374,7 +372,7 @@ struct kvm_arch { u64 ctr_el0; - /* Masks for VNCR-baked sysregs */ + /* Masks for VNCR-backed and general EL2 sysregs */ struct kvm_sysreg_masks *sysreg_masks; /* @@ -408,6 +406,9 @@ struct kvm_vcpu_fault_info { r = __VNCR_START__ + ((VNCR_ ## r) / 8), \ __after_##r = __MAX__(__before_##r - 1, r) +#define MARKER(m) \ + m, __after_##m = m - 1 + enum vcpu_sysreg { __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ @@ -468,13 +469,15 @@ enum vcpu_sysreg { /* EL2 registers */ SCTLR_EL2, /* System Control Register (EL2) */ ACTLR_EL2, /* Auxiliary Control Register (EL2) */ - MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ ZCR_EL2, /* SVE Control Register (EL2) */ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ + PIRE0_EL2, /* Permission Indirection Register 0 (EL2) */ + PIR_EL2, /* Permission Indirection Register 1 (EL2) */ + POR_EL2, /* Permission Overlay Register 2 (EL2) */ SPSR_EL2, /* EL2 saved program status register */ ELR_EL2, /* EL2 exception link register */ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ @@ -494,7 +497,13 @@ enum vcpu_sysreg { CNTHV_CTL_EL2, CNTHV_CVAL_EL2, - __VNCR_START__, /* Any VNCR-capable reg goes after this point */ + /* Anything from this can be RES0/RES1 sanitised */ + MARKER(__SANITISED_REG_START__), + TCR2_EL2, /* Extended Translation Control Register (EL2) */ + MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ + + /* Any VNCR-capable reg goes after this point */ + MARKER(__VNCR_START__), VNCR(SCTLR_EL1),/* System Control Register */ VNCR(ACTLR_EL1),/* Auxiliary Control Register */ @@ -554,7 +563,7 @@ struct kvm_sysreg_masks { struct { u64 res0; u64 res1; - } mask[NR_SYS_REGS - __VNCR_START__]; + } mask[NR_SYS_REGS - __SANITISED_REG_START__]; }; struct kvm_cpu_context { @@ -1002,13 +1011,13 @@ static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) -u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *, enum vcpu_sysreg); +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); #define __vcpu_sys_reg(v,r) \ (*({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ u64 *__r = __ctxt_sys_reg(ctxt, (r)); \ - if (vcpu_has_nv((v)) && (r) >= __VNCR_START__) \ - *__r = kvm_vcpu_sanitise_vncr_reg((v), (r)); \ + if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ + *__r = 
kvm_vcpu_apply_reg_masks((v), (r), *__r);\ __r; \ })) @@ -1037,6 +1046,10 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; + case TCR2_EL1: *val = read_sysreg_s(SYS_TCR2_EL12); break; + case PIR_EL1: *val = read_sysreg_s(SYS_PIR_EL12); break; + case PIRE0_EL1: *val = read_sysreg_s(SYS_PIRE0_EL12); break; + case POR_EL1: *val = read_sysreg_s(SYS_POR_EL12); break; case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; @@ -1083,6 +1096,10 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; + case TCR2_EL1: write_sysreg_s(val, SYS_TCR2_EL12); break; + case PIR_EL1: write_sysreg_s(val, SYS_PIR_EL12); break; + case PIRE0_EL1: write_sysreg_s(val, SYS_PIRE0_EL12); break; + case POR_EL1: write_sysreg_s(val, SYS_POR_EL12); break; case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; @@ -1140,7 +1157,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -#define vcpu_has_run_once(vcpu) !!rcu_access_pointer((vcpu)->pid) +#define vcpu_has_run_once(vcpu) (!!READ_ONCE((vcpu)->pid)) #ifndef __KVM_NVHE_HYPERVISOR__ #define kvm_call_hyp_nvhe(f, ...) \ @@ -1503,4 +1520,13 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (system_supports_fpmr() && \ kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) +#define kvm_has_tcr2(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, TCRX, IMP)) + +#define kvm_has_s1pie(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1PIE, IMP)) + +#define kvm_has_s1poe(k) \ + (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 03f4c3d7839c..aab04097b505 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -674,10 +674,8 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); * * If there is a valid, leaf page-table entry used to translate @addr, then * set the access flag in that entry. - * - * Return: The old page-table entry prior to setting the flag, 0 on failure. 
*/ -kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); /** * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index b9b992908a56..8b9f33cf561b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -110,7 +110,7 @@ #define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ -#define PHYSMEM_END __pa(PAGE_END - 1) +#define DIRECT_MAP_PHYSMEM_END __pa(PAGE_END - 1) #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 37774c793006..90f61b17275e 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -13,6 +13,7 @@ int set_memory_valid(unsigned long addr, int numpages, int enable); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); int set_memory_encrypted(unsigned long addr, int numpages); diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h index 11ab1c077697..7cde3d8bd0ad 100644 --- a/arch/arm64/include/asm/spinlock_types.h +++ b/arch/arm64/include/asm/spinlock_types.h @@ -6,7 +6,7 @@ #define __ASM_SPINLOCK_TYPES_H #if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #include <asm-generic/qspinlock_types.h> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9c98ff448bd9..b8303a83c0bf 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -542,18 +542,6 @@ #define SYS_MAIR_EL2 sys_reg(3, 4, 10, 2, 0) #define SYS_AMAIR_EL2 sys_reg(3, 4, 10, 3, 0) -#define SYS_MPAMHCR_EL2 sys_reg(3, 4, 10, 4, 0) -#define SYS_MPAMVPMV_EL2 sys_reg(3, 4, 10, 4, 1) -#define SYS_MPAM2_EL2 sys_reg(3, 4, 10, 5, 0) -#define __SYS__MPAMVPMx_EL2(x) sys_reg(3, 4, 10, 6, x) -#define SYS_MPAMVPM0_EL2 __SYS__MPAMVPMx_EL2(0) -#define SYS_MPAMVPM1_EL2 __SYS__MPAMVPMx_EL2(1) -#define SYS_MPAMVPM2_EL2 __SYS__MPAMVPMx_EL2(2) -#define SYS_MPAMVPM3_EL2 __SYS__MPAMVPMx_EL2(3) -#define SYS_MPAMVPM4_EL2 __SYS__MPAMVPMx_EL2(4) -#define SYS_MPAMVPM5_EL2 __SYS__MPAMVPMx_EL2(5) -#define SYS_MPAMVPM6_EL2 __SYS__MPAMVPMx_EL2(6) -#define SYS_MPAMVPM7_EL2 __SYS__MPAMVPMx_EL2(7) #define SYS_VBAR_EL2 sys_reg(3, 4, 12, 0, 0) #define SYS_RVBAR_EL2 sys_reg(3, 4, 12, 0, 1) diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/text-patching.h index 587bdb91ab7a..587bdb91ab7a 100644 --- a/arch/arm64/include/asm/patching.h +++ b/arch/arm64/include/asm/text-patching.h diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index 06f8ec0906a6..4f9bbd4d6c26 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -50,7 +50,6 @@ #define VNCR_VBAR_EL1 0x250 #define VNCR_TCR2_EL1 0x270 #define VNCR_PIRE0_EL1 0x290 -#define VNCR_PIRE0_EL2 0x298 #define VNCR_PIR_EL1 0x2A0 #define VNCR_POR_EL1 0x2A8 #define VNCR_ICH_LR0_EL2 0x400 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 964df31da975..66736ff04011 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ 
b/arch/arm64/include/uapi/asm/kvm.h @@ -484,6 +484,12 @@ enum { */ #define KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (1ULL << 0) +/* + * Shutdown caused by a PSCI v1.3 SYSTEM_OFF2 call. + * Valid only when the system event has a type of KVM_SYSTEM_EVENT_SHUTDOWN. + */ +#define KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2 (1ULL << 0) + /* run->fail_entry.hardware_entry_failure_reason codes. */ #define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 351aa825ec40..6ce71f444ed8 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -688,6 +688,14 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_mpamidr[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0), + ARM64_FTR_END, +}; + /* * Common ftr bits for a 32bit register with all hidden, strict * attributes, with 4bit feature fields and a default safe value of @@ -808,6 +816,9 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4), + /* Op1 = 0, CRn = 10, CRm = 4 */ + ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr), + /* Op1 = 1, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid), @@ -1167,6 +1178,9 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); } @@ -1423,6 +1437,11 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, + info->reg_mpamidr, boot->reg_mpamidr); + } + /* * The kernel uses the LDGM/STGM instructions and the number of tags * they read/write depends on the GMID_EL1.BS field. Check that the @@ -2389,6 +2408,36 @@ cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap) return !!(cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT); } +static bool +test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) +{ + if (!has_cpuid_feature(entry, scope)) + return false; + + /* Check firmware actually enabled MPAM on this cpu. */ + return (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN); +} + +static void +cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) +{ + /* + * Access by the kernel (at EL1) should use the reserved PARTID + * which is configured unrestricted. This avoids priority-inversion + * where latency sensitive tasks have to wait for a task that has + * been throttled to release the lock. 
+ */ + write_sysreg_s(0, SYS_MPAM1_EL1); +} + +static bool +test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + + return idr & MPAMIDR_EL1_HAS_HCR; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .capability = ARM64_ALWAYS_BOOT, @@ -2901,6 +2950,20 @@ static const struct arm64_cpu_capabilities arm64_features[] = { }, #endif { + .desc = "Memory Partitioning And Monitoring", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM, + .matches = test_has_mpam, + .cpu_enable = cpu_enable_mpam, + ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1) + }, + { + .desc = "Memory Partitioning And Monitoring Virtualisation", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_MPAM_HCR, + .matches = test_has_mpam_hcr, + }, + { .desc = "NV1", .capability = ARM64_HAS_HCR_NV1, .type = ARM64_CPUCAP_SYSTEM_FEATURE, @@ -3436,6 +3499,36 @@ static void verify_hyp_capabilities(void) } } +static void verify_mpam_capabilities(void) +{ + u64 cpu_idr = read_cpuid(ID_AA64PFR0_EL1); + u64 sys_idr = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u16 cpu_partid_max, cpu_pmg_max, sys_partid_max, sys_pmg_max; + + if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, cpu_idr) != + FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, sys_idr)) { + pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_idr = read_cpuid(MPAMIDR_EL1); + sys_idr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, cpu_idr) != + FIELD_GET(MPAMIDR_EL1_HAS_HCR, sys_idr)) { + pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id()); + cpu_die_early(); + } + + cpu_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, cpu_idr); + cpu_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, cpu_idr); + sys_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, sys_idr); + sys_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, sys_idr); + if (cpu_partid_max < sys_partid_max || cpu_pmg_max < sys_pmg_max) { + pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id()); + cpu_die_early(); + } +} + /* * Run through the enabled system capabilities and enable() it on this CPU. * The capabilities were decided based on the available CPUs at the boot time. 
@@ -3462,6 +3555,9 @@ static void verify_local_cpu_capabilities(void) if (is_hyp_mode_available()) verify_hyp_capabilities(); + + if (system_supports_mpam()) + verify_mpam_capabilities(); } void check_local_cpu_capabilities(void) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index f2f92c6b1c85..d79e88fccdfc 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -479,6 +479,9 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); + if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + cpuinfo_detect_icache_policy(info); } diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index b2d947175cbe..245cb419ca24 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,7 +15,7 @@ #include <asm/debug-monitors.h> #include <asm/ftrace.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS struct fregs_offset { diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index f63ea915d6ad..b345425193d2 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -9,7 +9,7 @@ #include <linux/jump_label.h> #include <linux/smp.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 4e1f983df3d1..f3c4d3a8a20f 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -17,7 +17,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index 945df74005c7..7f99723fbb8c 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -10,7 +10,7 @@ #include <asm/fixmap.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/sections.h> static DEFINE_RAW_SPINLOCK(patch_lock); diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 48d88e07611d..d9e462eafb95 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -27,7 +27,7 @@ #include <asm/debug-monitors.h> #include <asm/insn.h> #include <asm/irq.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/ptrace.h> #include <asm/sections.h> #include <asm/system_misc.h> diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index ee318f6df647..4e26bd356a48 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -41,7 +41,7 @@ #include <asm/extable.h> #include <asm/insn.h> #include <asm/kprobes.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 879982b1cc73..1215df590418 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -206,8 +206,7 @@ void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) static inline bool userspace_irqchip(struct kvm *kvm) { - return static_branch_unlikely(&userspace_irqchip_in_use) && - 
unlikely(!irqchip_in_kernel(kvm)); + return unlikely(!irqchip_in_kernel(kvm)); } static void soft_timer_start(struct hrtimer *hrt, u64 ns) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 48cafb65d6ac..a102c3aebdbc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -69,7 +69,6 @@ DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); static bool vgic_present, kvm_arm_initialised; static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); -DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); bool is_kvm_arm_initialised(void) { @@ -503,9 +502,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) - static_branch_dec(&userspace_irqchip_in_use); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); @@ -848,22 +844,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; } - if (!irqchip_in_kernel(kvm)) { - /* - * Tell the rest of the code that there are userspace irqchip - * VMs in the wild. - */ - static_branch_inc(&userspace_irqchip_in_use); - } - - /* - * Initialize traps for protected VMs. - * NOTE: Move to run in EL2 directly, rather than via a hypercall, once - * the code is in place for first run initialization at EL2. - */ - if (kvm_vm_is_protected(kvm)) - kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); @@ -1077,7 +1057,7 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) * state gets updated in kvm_timer_update_run and * kvm_pmu_update_run below). */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { if (kvm_timer_should_notify_user(vcpu) || kvm_pmu_should_notify_user(vcpu)) { *ret = -EINTR; @@ -1199,7 +1179,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ kvm_pmu_sync_hwstate(vcpu); - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); @@ -1245,7 +1225,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * we don't want vtimer interrupts to race with syncing the * timer virtual interrupt state. 
*/ - if (static_branch_unlikely(&userspace_irqchip_in_use)) + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_arch_vcpu_ctxsync_fp(vcpu); diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 39f0e87a340e..8c5d7990e5b3 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -24,6 +24,9 @@ struct s1_walk_info { unsigned int txsz; int sl; bool hpd; + bool e0poe; + bool poe; + bool pan; bool be; bool s2; }; @@ -37,6 +40,16 @@ struct s1_walk_result { u8 APTable; bool UXNTable; bool PXNTable; + bool uwxn; + bool uov; + bool ur; + bool uw; + bool ux; + bool pwxn; + bool pov; + bool pr; + bool pw; + bool px; }; struct { u8 fst; @@ -87,6 +100,51 @@ static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 o } } +static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + if (!kvm_has_s1pie(vcpu->kvm)) + return false; + + switch (regime) { + case TR_EL2: + case TR_EL20: + return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; + case TR_EL10: + return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && + (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1x_PIE); + default: + BUG(); + } +} + +static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) +{ + u64 val; + + if (!kvm_has_s1poe(vcpu->kvm)) { + wi->poe = wi->e0poe = false; + return; + } + + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + val = vcpu_read_sys_reg(vcpu, TCR2_EL2); + wi->poe = val & TCR2_EL2_POE; + wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); + break; + case TR_EL10: + if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { + wi->poe = wi->e0poe = false; + return; + } + + val = __vcpu_sys_reg(vcpu, TCR2_EL1); + wi->poe = val & TCR2_EL1x_POE; + wi->e0poe = val & TCR2_EL1x_E0POE; + } +} + static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, struct s1_walk_result *wr, u64 va) { @@ -98,6 +156,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, wi->regime = compute_translation_regime(vcpu, op); as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W); + wi->pan = (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) && + (*vcpu_cpsr(vcpu) & PSR_PAN_BIT); va55 = va & BIT(55); @@ -180,6 +240,14 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi, (va55 ? FIELD_GET(TCR_HPD1, tcr) : FIELD_GET(TCR_HPD0, tcr))); + /* R_JHSVW */ + wi->hpd |= s1pie_enabled(vcpu, wi->regime); + + /* Do we have POE? 
*/ + compute_s1poe(vcpu, wi); + + /* R_BVXDG */ + wi->hpd |= (wi->poe || wi->e0poe); /* Someone was silly enough to encode TG0/TG1 differently */ if (va55) { @@ -412,6 +480,11 @@ struct mmu_config { u64 ttbr1; u64 tcr; u64 mair; + u64 tcr2; + u64 pir; + u64 pire0; + u64 por_el0; + u64 por_el1; u64 sctlr; u64 vttbr; u64 vtcr; @@ -424,6 +497,17 @@ static void __mmu_config_save(struct mmu_config *config) config->ttbr1 = read_sysreg_el1(SYS_TTBR1); config->tcr = read_sysreg_el1(SYS_TCR); config->mair = read_sysreg_el1(SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + config->tcr2 = read_sysreg_el1(SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + config->pir = read_sysreg_el1(SYS_PIR); + config->pire0 = read_sysreg_el1(SYS_PIRE0); + } + if (system_supports_poe()) { + config->por_el1 = read_sysreg_el1(SYS_POR); + config->por_el0 = read_sysreg_s(SYS_POR_EL0); + } + } config->sctlr = read_sysreg_el1(SYS_SCTLR); config->vttbr = read_sysreg(vttbr_el2); config->vtcr = read_sysreg(vtcr_el2); @@ -444,6 +528,17 @@ static void __mmu_config_restore(struct mmu_config *config) write_sysreg_el1(config->ttbr1, SYS_TTBR1); write_sysreg_el1(config->tcr, SYS_TCR); write_sysreg_el1(config->mair, SYS_MAIR); + if (cpus_have_final_cap(ARM64_HAS_TCR2)) { + write_sysreg_el1(config->tcr2, SYS_TCR2); + if (cpus_have_final_cap(ARM64_HAS_S1PIE)) { + write_sysreg_el1(config->pir, SYS_PIR); + write_sysreg_el1(config->pire0, SYS_PIRE0); + } + if (system_supports_poe()) { + write_sysreg_el1(config->por_el1, SYS_POR); + write_sysreg_s(config->por_el0, SYS_POR_EL0); + } + } write_sysreg_el1(config->sctlr, SYS_SCTLR); write_sysreg(config->vttbr, vttbr_el2); write_sysreg(config->vtcr, vtcr_el2); @@ -739,6 +834,9 @@ static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3)) return false; + if (s1pie_enabled(vcpu, regime)) + return true; + if (regime == TR_EL10) sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); else @@ -747,111 +845,343 @@ static bool pan3_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) return sctlr & SCTLR_EL1_EPAN; } -static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) { - bool perm_fail, ur, uw, ux, pr, pw, px; - struct s1_walk_result wr = {}; - struct s1_walk_info wi = {}; - int ret, idx; + bool wxn; - ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); - if (ret) - goto compute_par; - - if (wr.level == S1_MMU_DISABLED) - goto compute_par; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - - ret = walk_s1(vcpu, &wi, &wr, vaddr); - - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (ret) - goto compute_par; - - /* FIXME: revisit when adding indirect permission support */ - /* AArch64.S1DirectBasePermissions() */ - if (wi.regime != TR_EL2) { - switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) { + /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) { case 0b00: - pr = pw = true; - ur = uw = false; + wr->pr = wr->pw = true; + wr->ur = wr->uw = false; break; case 0b01: - pr = pw = ur = uw = true; + wr->pr = wr->pw = wr->ur = wr->uw = true; break; case 0b10: - pr = true; - pw = ur = uw = false; + wr->pr = true; + wr->pw = wr->ur = wr->uw = false; break; case 0b11: - pr = ur = true; - pw = uw = false; + wr->pr = wr->ur = true; + wr->pw = wr->uw = false; break; } - switch (wr.APTable) { + /* We don't use 
px for anything yet, but hey... */ + wr->px = !((wr->desc & PTE_PXN) || wr->uw); + wr->ux = !(wr->desc & PTE_UXN); + } else { + wr->ur = wr->uw = wr->ux = false; + + if (!(wr->desc & PTE_RDONLY)) { + wr->pr = wr->pw = true; + } else { + wr->pr = true; + wr->pw = false; + } + + /* XN maps to UXN */ + wr->px = !(wr->desc & PTE_UXN); + } + + switch (wi->regime) { + case TR_EL2: + case TR_EL20: + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); + break; + case TR_EL10: + wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); + break; + } + + wr->pwxn = wr->uwxn = wxn; + wr->pov = wi->poe; + wr->uov = wi->e0poe; +} + +static void compute_s1_hierarchical_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + /* Hierarchical part of AArch64.S1DirectBasePermissions() */ + if (wi->regime != TR_EL2) { + switch (wr->APTable) { case 0b00: break; case 0b01: - ur = uw = false; + wr->ur = wr->uw = false; break; case 0b10: - pw = uw = false; + wr->pw = wr->uw = false; break; case 0b11: - pw = ur = uw = false; + wr->pw = wr->ur = wr->uw = false; break; } - /* We don't use px for anything yet, but hey... */ - px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw); - ux = !((wr.desc & PTE_UXN) || wr.UXNTable); + wr->px &= !wr->PXNTable; + wr->ux &= !wr->UXNTable; + } else { + if (wr->APTable & BIT(1)) + wr->pw = false; + + /* XN maps to UXN */ + wr->px &= !wr->UXNTable; + } +} - if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) { - bool pan; +#define perm_idx(v, r, i) ((vcpu_read_sys_reg((v), (r)) >> ((i) * 4)) & 0xf) + +#define set_priv_perms(wr, r, w, x) \ + do { \ + (wr)->pr = (r); \ + (wr)->pw = (w); \ + (wr)->px = (x); \ + } while (0) + +#define set_unpriv_perms(wr, r, w, x) \ + do { \ + (wr)->ur = (r); \ + (wr)->uw = (w); \ + (wr)->ux = (x); \ + } while (0) + +#define set_priv_wxn(wr, v) \ + do { \ + (wr)->pwxn = (v); \ + } while (0) + +#define set_unpriv_wxn(wr, v) \ + do { \ + (wr)->uwxn = (v); \ + } while (0) + +/* Similar to AArch64.S1IndirectBasePermissions(), without GCS */ +#define set_perms(w, wr, ip) \ + do { \ + /* R_LLZDZ */ \ + switch ((ip)) { \ + case 0b0000: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b0010: \ + set_ ## w ## _perms((wr), false, false, true ); \ + break; \ + case 0b0011: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b0100: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b0101: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b0110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b0111: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1000: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1001: \ + set_ ## w ## _perms((wr), true , false, false); \ + break; \ + case 0b1010: \ + set_ ## w ## _perms((wr), true , false, true ); \ + break; \ + case 0b1011: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1100: \ + set_ ## w ## _perms((wr), true , true , false); \ + break; \ + case 0b1101: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + case 0b1110: \ + set_ ## w ## _perms((wr), true , true , true ); \ + break; \ + case 0b1111: \ + set_ ## w ## _perms((wr), false, false, false); \ + break; \ + } \ + \ + /* R_HJYGR */ \ + set_ ## w ## _wxn((wr), ((ip) == 0b0110)); \ + \ + } while (0) + +static void compute_s1_indirect_permissions(struct 
kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 up, pp, idx; - pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; - pan &= ur || uw || (pan3_enabled(vcpu, wi.regime) && ux); - pw &= !pan; - pr &= !pan; - } - } else { - ur = uw = ux = false; + idx = pte_pi_index(wr->desc); - if (!(wr.desc & PTE_RDONLY)) { - pr = pw = true; - } else { - pr = true; - pw = false; - } + switch (wi->regime) { + case TR_EL10: + pp = perm_idx(vcpu, PIR_EL1, idx); + up = perm_idx(vcpu, PIRE0_EL1, idx); + break; + case TR_EL20: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = perm_idx(vcpu, PIRE0_EL2, idx); + break; + case TR_EL2: + pp = perm_idx(vcpu, PIR_EL2, idx); + up = 0; + break; + } - if (wr.APTable & BIT(1)) - pw = false; + set_perms(priv, wr, pp); - /* XN maps to UXN */ - px = !((wr.desc & PTE_UXN) || wr.UXNTable); + if (wi->regime != TR_EL2) + set_perms(unpriv, wr, up); + else + set_unpriv_perms(wr, false, false, false); + + wr->pov = wi->poe && !(pp & BIT(3)); + wr->uov = wi->e0poe && !(up & BIT(3)); + + /* R_VFPJF */ + if (wr->px && wr->uw) { + set_priv_perms(wr, false, false, false); + set_unpriv_perms(wr, false, false, false); + } +} + +static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + u8 idx, pov_perms, uov_perms; + + idx = FIELD_GET(PTE_PO_IDX_MASK, wr->desc); + + switch (wi->regime) { + case TR_EL10: + pov_perms = perm_idx(vcpu, POR_EL1, idx); + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL20: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + uov_perms = perm_idx(vcpu, POR_EL0, idx); + break; + case TR_EL2: + pov_perms = perm_idx(vcpu, POR_EL2, idx); + uov_perms = 0; + break; + } + + if (pov_perms & ~POE_RXW) + pov_perms = POE_NONE; + + if (wi->poe && wr->pov) { + wr->pr &= pov_perms & POE_R; + wr->px &= pov_perms & POE_X; + wr->pw &= pov_perms & POE_W; + } + + if (uov_perms & ~POE_RXW) + uov_perms = POE_NONE; + + if (wi->e0poe && wr->uov) { + wr->ur &= uov_perms & POE_R; + wr->ux &= uov_perms & POE_X; + wr->uw &= uov_perms & POE_W; } +} + +static void compute_s1_permissions(struct kvm_vcpu *vcpu, + struct s1_walk_info *wi, + struct s1_walk_result *wr) +{ + bool pan; + + if (!s1pie_enabled(vcpu, wi->regime)) + compute_s1_direct_permissions(vcpu, wi, wr); + else + compute_s1_indirect_permissions(vcpu, wi, wr); + + if (!wi->hpd) + compute_s1_hierarchical_permissions(vcpu, wi, wr); + + if (wi->poe || wi->e0poe) + compute_s1_overlay_permissions(vcpu, wi, wr); + + /* R_QXXPC */ + if (wr->pwxn) { + if (!wr->pov && wr->pw) + wr->px = false; + if (wr->pov && wr->px) + wr->pw = false; + } + + /* R_NPBXC */ + if (wr->uwxn) { + if (!wr->uov && wr->uw) + wr->ux = false; + if (wr->uov && wr->ux) + wr->uw = false; + } + + pan = wi->pan && (wr->ur || wr->uw || + (pan3_enabled(vcpu, wi->regime) && wr->ux)); + wr->pw &= !pan; + wr->pr &= !pan; +} - perm_fail = false; +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) +{ + struct s1_walk_result wr = {}; + struct s1_walk_info wi = {}; + bool perm_fail = false; + int ret, idx; + + ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr); + if (ret) + goto compute_par; + + if (wr.level == S1_MMU_DISABLED) + goto compute_par; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = walk_s1(vcpu, &wi, &wr, vaddr); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (ret) + goto compute_par; + + compute_s1_permissions(vcpu, &wi, &wr); switch (op) { case OP_AT_S1E1RP: case OP_AT_S1E1R: case OP_AT_S1E2R: - perm_fail = !pr; + perm_fail = !wr.pr; break; case 
OP_AT_S1E1WP: case OP_AT_S1E1W: case OP_AT_S1E2W: - perm_fail = !pw; + perm_fail = !wr.pw; break; case OP_AT_S1E0R: - perm_fail = !ur; + perm_fail = !wr.ur; break; case OP_AT_S1E0W: - perm_fail = !uw; + perm_fail = !wr.uw; break; case OP_AT_S1E1A: case OP_AT_S1E2A: @@ -914,6 +1244,17 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR_EL1), SYS_TCR); write_sysreg_el1(vcpu_read_sys_reg(vcpu, MAIR_EL1), SYS_MAIR); + if (kvm_has_tcr2(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, TCR2_EL1), SYS_TCR2); + if (kvm_has_s1pie(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIR_EL1), SYS_PIR); + write_sysreg_el1(vcpu_read_sys_reg(vcpu, PIRE0_EL1), SYS_PIRE0); + } + if (kvm_has_s1poe(vcpu->kvm)) { + write_sysreg_el1(vcpu_read_sys_reg(vcpu, POR_EL1), SYS_POR); + write_sysreg_s(vcpu_read_sys_reg(vcpu, POR_EL0), SYS_POR_EL0); + } + } write_sysreg_el1(vcpu_read_sys_reg(vcpu, SCTLR_EL1), SYS_SCTLR); __load_stage2(mmu, mmu->arch); @@ -992,12 +1333,9 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) * switching context behind everybody's back, disable interrupts... */ scoped_guard(write_lock_irqsave, &vcpu->kvm->mmu_lock) { - struct kvm_s2_mmu *mmu; u64 val, hcr; bool fail; - mmu = &vcpu->kvm->arch.mmu; - val = hcr = read_sysreg(hcr_el2); val &= ~HCR_TGE; val |= HCR_VM; diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 05b6435d02a9..1ffbfd1c3cf2 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -16,9 +16,13 @@ enum trap_behaviour { BEHAVE_HANDLE_LOCALLY = 0, + BEHAVE_FORWARD_READ = BIT(0), BEHAVE_FORWARD_WRITE = BIT(1), - BEHAVE_FORWARD_ANY = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + BEHAVE_FORWARD_RW = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, + + /* Traps that take effect in Host EL0, this is rare! 
*/ + BEHAVE_FORWARD_IN_HOST_EL0 = BIT(2), }; struct trap_bits { @@ -79,7 +83,6 @@ enum cgt_group_id { CGT_MDCR_E2TB, CGT_MDCR_TDCC, - CGT_CPACR_E0POE, CGT_CPTR_TAM, CGT_CPTR_TCPAC, @@ -106,6 +109,7 @@ enum cgt_group_id { CGT_HCR_TPU_TOCU, CGT_HCR_NV1_nNV2_ENSCXT, CGT_MDCR_TPM_TPMCR, + CGT_MDCR_TPM_HPMN, CGT_MDCR_TDE_TDA, CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE_TDRA, @@ -122,6 +126,7 @@ enum cgt_group_id { CGT_CNTHCTL_EL1PTEN, CGT_CPTR_TTA, + CGT_MDCR_HPMN, /* Must be last */ __NR_CGT_GROUP_IDS__ @@ -138,7 +143,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TID2, .mask = HCR_TID2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID3] = { .index = HCR_EL2, @@ -162,37 +167,37 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TIDCP, .mask = HCR_TIDCP, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TACR] = { .index = HCR_EL2, .value = HCR_TACR, .mask = HCR_TACR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TSW] = { .index = HCR_EL2, .value = HCR_TSW, .mask = HCR_TSW, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */ .index = HCR_EL2, .value = HCR_TPC, .mask = HCR_TPC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TPU] = { .index = HCR_EL2, .value = HCR_TPU, .mask = HCR_TPU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLB] = { .index = HCR_EL2, .value = HCR_TTLB, .mask = HCR_TTLB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TVM] = { .index = HCR_EL2, @@ -204,7 +209,7 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TDZ, .mask = HCR_TDZ, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TRVM] = { .index = HCR_EL2, @@ -216,205 +221,201 @@ static const struct trap_bits coarse_trap_bits[] = { .index = HCR_EL2, .value = HCR_TLOR, .mask = HCR_TLOR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TERR] = { .index = HCR_EL2, .value = HCR_TERR, .mask = HCR_TERR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_APK] = { .index = HCR_EL2, .value = 0, .mask = HCR_APK, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV_nNV2] = { .index = HCR_EL2, .value = HCR_NV, .mask = HCR_NV | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_NV1_nNV2] = { .index = HCR_EL2, .value = HCR_NV | HCR_NV1, .mask = HCR_NV | HCR_NV1 | HCR_NV2, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_AT] = { .index = HCR_EL2, .value = HCR_AT, .mask = HCR_AT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_nFIEN] = { .index = HCR_EL2, .value = 0, .mask = HCR_FIEN, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TID4] = { .index = HCR_EL2, .value = HCR_TID4, .mask = HCR_TID4, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TICAB] = { .index = HCR_EL2, .value = HCR_TICAB, .mask = HCR_TICAB, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TOCU] = { .index = HCR_EL2, 
.value = HCR_TOCU, .mask = HCR_TOCU, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_ENSCXT] = { .index = HCR_EL2, .value = 0, .mask = HCR_ENSCXT, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBIS] = { .index = HCR_EL2, .value = HCR_TTLBIS, .mask = HCR_TTLBIS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCR_TTLBOS] = { .index = HCR_EL2, .value = HCR_TTLBOS, .mask = HCR_TTLBOS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMCR] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMCR, .mask = MDCR_EL2_TPMCR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TPM] = { .index = MDCR_EL2, .value = MDCR_EL2_TPM, .mask = MDCR_EL2_TPM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW | + BEHAVE_FORWARD_IN_HOST_EL0, }, [CGT_MDCR_TDE] = { .index = MDCR_EL2, .value = MDCR_EL2_TDE, .mask = MDCR_EL2_TDE, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDA, .mask = MDCR_EL2_TDA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDOSA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDOSA, .mask = MDCR_EL2_TDOSA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDRA] = { .index = MDCR_EL2, .value = MDCR_EL2_TDRA, .mask = MDCR_EL2_TDRA, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2PB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2PB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TPMS] = { .index = MDCR_EL2, .value = MDCR_EL2_TPMS, .mask = MDCR_EL2_TPMS, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TTRF] = { .index = MDCR_EL2, .value = MDCR_EL2_TTRF, .mask = MDCR_EL2_TTRF, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_E2TB] = { .index = MDCR_EL2, .value = 0, .mask = BIT(MDCR_EL2_E2TB_SHIFT), - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_MDCR_TDCC] = { .index = MDCR_EL2, .value = MDCR_EL2_TDCC, .mask = MDCR_EL2_TDCC, - .behaviour = BEHAVE_FORWARD_ANY, - }, - [CGT_CPACR_E0POE] = { - .index = CPTR_EL2, - .value = CPACR_ELx_E0POE, - .mask = CPACR_ELx_E0POE, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TAM] = { .index = CPTR_EL2, .value = CPTR_EL2_TAM, .mask = CPTR_EL2_TAM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_CPTR_TCPAC] = { .index = CPTR_EL2, .value = CPTR_EL2_TCPAC, .mask = CPTR_EL2_TCPAC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_EnFPM] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_EnFPM, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_HCRX_TCR2En] = { .index = HCRX_EL2, .value = 0, .mask = HCRX_EL2_TCR2En, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TC] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TC, .mask = ICH_HCR_TC, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL0] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TALL0, .mask = ICH_HCR_TALL0, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TALL1] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TALL1, .mask = ICH_HCR_TALL1, - .behaviour = BEHAVE_FORWARD_ANY, 
+ .behaviour = BEHAVE_FORWARD_RW, }, [CGT_ICH_HCR_TDIR] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TDIR, .mask = ICH_HCR_TDIR, - .behaviour = BEHAVE_FORWARD_ANY, + .behaviour = BEHAVE_FORWARD_RW, }, }; @@ -435,6 +436,7 @@ static const enum cgt_group_id *coarse_control_combo[] = { MCB(CGT_HCR_TPU_TOCU, CGT_HCR_TPU, CGT_HCR_TOCU), MCB(CGT_HCR_NV1_nNV2_ENSCXT, CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT), MCB(CGT_MDCR_TPM_TPMCR, CGT_MDCR_TPM, CGT_MDCR_TPMCR), + MCB(CGT_MDCR_TPM_HPMN, CGT_MDCR_TPM, CGT_MDCR_HPMN), MCB(CGT_MDCR_TDE_TDA, CGT_MDCR_TDE, CGT_MDCR_TDA), MCB(CGT_MDCR_TDE_TDOSA, CGT_MDCR_TDE, CGT_MDCR_TDOSA), MCB(CGT_MDCR_TDE_TDRA, CGT_MDCR_TDE, CGT_MDCR_TDRA), @@ -474,7 +476,7 @@ static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) @@ -482,7 +484,7 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10)) return BEHAVE_HANDLE_LOCALLY; - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; } static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) @@ -493,7 +495,35 @@ static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) val = translate_cptr_el2_to_cpacr_el1(val); if (val & CPACR_ELx_TTA) - return BEHAVE_FORWARD_ANY; + return BEHAVE_FORWARD_RW; + + return BEHAVE_HANDLE_LOCALLY; +} + +static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + unsigned int idx; + + + switch (sysreg) { + case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30): + case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): + idx = (sys_reg_CRm(sysreg) & 0x3) << 3 | sys_reg_Op2(sysreg); + break; + case SYS_PMXEVTYPER_EL0: + case SYS_PMXEVCNTR_EL0: + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, + __vcpu_sys_reg(vcpu, PMSELR_EL0)); + break; + default: + /* Someone used this trap helper for something else... 
*/ + KVM_BUG_ON(1, vcpu->kvm); + return BEHAVE_HANDLE_LOCALLY; + } + + if (kvm_pmu_counter_is_hyp(vcpu, idx)) + return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0; return BEHAVE_HANDLE_LOCALLY; } @@ -505,6 +535,7 @@ static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), CCC(CGT_CPTR_TTA, check_cptr_tta), + CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; /* @@ -711,6 +742,10 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_MAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_AMAIR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_CONTEXTIDR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIR_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_PIRE0_EL1, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL0, CGT_HCR_TVM_TRVM), + SR_TRAP(SYS_POR_EL1, CGT_HCR_TVM_TRVM), SR_TRAP(SYS_TCR2_EL1, CGT_HCR_TVM_TRVM_HCRX_TCR2En), SR_TRAP(SYS_DC_ZVA, CGT_HCR_TDZ), SR_TRAP(SYS_DC_GVA, CGT_HCR_TDZ), @@ -919,77 +954,77 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_PMOVSCLR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID0_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMCEID1_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVTYPER_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMSWINC_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMSELR_EL0, CGT_MDCR_TPM), - SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM), + SR_TRAP(SYS_PMXEVCNTR_EL0, CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCNTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMUSERENR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENSET_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMINTENCLR_EL1, CGT_MDCR_TPM), SR_TRAP(SYS_PMMIR_EL1, CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(8), 
CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM), - SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM), + SR_TRAP(SYS_PMEVCNTRn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(14), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVCNTRn_EL0(30), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(0), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(1), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(2), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(3), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(4), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(5), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(6), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(7), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(8), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(9), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(10), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(11), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(12), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(13), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(14), CGT_MDCR_TPM_HPMN), + 
SR_TRAP(SYS_PMEVTYPERn_EL0(15), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(16), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(17), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(18), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(19), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(20), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(21), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(22), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(23), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(24), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(25), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(26), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(27), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(28), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(29), CGT_MDCR_TPM_HPMN), + SR_TRAP(SYS_PMEVTYPERn_EL0(30), CGT_MDCR_TPM_HPMN), SR_TRAP(SYS_PMCCFILTR_EL0, CGT_MDCR_TPM), SR_TRAP(SYS_MDCCSR_EL0, CGT_MDCR_TDCC_TDE_TDA), SR_TRAP(SYS_MDCCINT_EL1, CGT_MDCR_TDCC_TDE_TDA), @@ -1141,7 +1176,6 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_AMEVTYPER1_EL0(13), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(14), CGT_CPTR_TAM), SR_TRAP(SYS_AMEVTYPER1_EL0(15), CGT_CPTR_TAM), - SR_TRAP(SYS_POR_EL0, CGT_CPACR_E0POE), /* op0=2, op1=1, and CRn<0b1000 */ SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0), sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA), @@ -2021,7 +2055,8 @@ check_mcb: cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; for (int i = 0; cgids[i] != __RESERVED__; i++) { - if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) { + if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ && + cgids[i] < __COMPLEX_CONDITIONS__) { kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); ret = -EINVAL; } @@ -2126,11 +2161,19 @@ static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr) return masks->mask[sr - __VNCR_START__].res0; } -static bool check_fgt_bit(struct kvm *kvm, bool is_read, +static bool check_fgt_bit(struct kvm_vcpu *vcpu, bool is_read, u64 val, const union trap_config tc) { + struct kvm *kvm = vcpu->kvm; enum vcpu_sysreg sr; + /* + * KVM doesn't know about any FGTs that apply to the host, and hopefully + * that'll remain the case. + */ + if (is_hyp_ctxt(vcpu)) + return false; + if (tc.pol) return (val & BIT(tc.bit)); @@ -2207,7 +2250,15 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) * If we're not nesting, immediately return to the caller, with the * sysreg index, should we have it. */ - if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) + if (!vcpu_has_nv(vcpu)) + goto local; + + /* + * There are a few traps that take effect InHost, but are constrained + * to EL0. Don't bother with computing the trap behaviour if the vCPU + * isn't in EL0. 
+ */ + if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu)) goto local; switch ((enum fgt_group_id)tc.fgt) { @@ -2253,12 +2304,14 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) goto local; } - if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu->kvm, is_read, - val, tc)) + if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, is_read, val, tc)) goto inject; b = compute_trap_behaviour(vcpu, tc); + if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu)) + goto local; + if (((b & BEHAVE_FORWARD_READ) && is_read) || ((b & BEHAVE_FORWARD_WRITE) && !is_read)) goto inject; @@ -2393,6 +2446,8 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); + + kvm_pmu_nested_transition(vcpu); } static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2, @@ -2475,6 +2530,8 @@ static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2, kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); + kvm_pmu_nested_transition(vcpu); + return 1; } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index e738a353b20e..12dad841f2a5 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1051,21 +1051,19 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } while (length > 0) { - kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL); + struct page *page = __gfn_to_page(kvm, gfn, write); void *maddr; unsigned long num_tags; - struct page *page; struct folio *folio; - if (is_error_noslot_pfn(pfn)) { + if (!page) { ret = -EFAULT; goto out; } - page = pfn_to_online_page(pfn); - if (!page) { + if (!pfn_to_online_page(page_to_pfn(page))) { /* Reject ZONE_DEVICE memory */ - kvm_release_pfn_clean(pfn); + kvm_release_page_unused(page); ret = -EFAULT; goto out; } @@ -1082,7 +1080,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, /* No tags in memory, so write zeros */ num_tags = MTE_GRANULES_PER_PAGE - clear_user(tags, MTE_GRANULES_PER_PAGE); - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); } else { /* * Only locking to serialise with a concurrent @@ -1104,7 +1102,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, else set_page_mte_tagged(page); - kvm_release_pfn_dirty(pfn); + kvm_release_page_dirty(page); } if (num_tags != MTE_GRANULES_PER_PAGE) { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5310fe1da616..34f53707892d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -204,6 +204,35 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) __deactivate_fgt(hctxt, vcpu, kvm, HAFGRTR_EL2); } +static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) +{ + u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + + if (!system_supports_mpam()) + return; + + /* trap guest access to MPAMIDR_EL1 */ + if (system_supports_mpam_hcr()) { + write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); + } else { + /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ + r |= MPAM2_EL2_TIDR; + } + + write_sysreg_s(r, SYS_MPAM2_EL2); +} + +static inline void __deactivate_traps_mpam(void) +{ + if (!system_supports_mpam()) + return; + + write_sysreg_s(0, SYS_MPAM2_EL2); + + if (system_supports_mpam_hcr()) + write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); +} + static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ @@ -244,6 +273,7 @@ static inline void 
__activate_traps_common(struct kvm_vcpu *vcpu) } __activate_traps_hfgxtr(vcpu); + __activate_traps_mpam(vcpu); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) @@ -263,6 +293,7 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); + __deactivate_traps_mpam(); } static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr) diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 1579a3c08a36..a651c43ad679 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -58,7 +58,7 @@ static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt) return false; vcpu = ctxt_to_vcpu(ctxt); - return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1PIE, IMP); + return kvm_has_s1pie(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) @@ -69,7 +69,7 @@ static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt) return false; vcpu = ctxt_to_vcpu(ctxt); - return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, TCRX, IMP); + return kvm_has_tcr2(kern_hyp_va(vcpu->kvm)); } static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) @@ -80,7 +80,7 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt) return false; vcpu = ctxt_to_vcpu(ctxt); - return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, IMP); + return kvm_has_s1poe(kern_hyp_va(vcpu->kvm)); } static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) @@ -152,9 +152,10 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0), tpidrro_el0); } -static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) +static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt, + u64 mpidr) { - write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2); + write_sysreg(mpidr, vmpidr_el2); if (has_vhe() || !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 45a84f0ade04..1e6d995968a1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,6 +15,4 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); - #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index fefc89209f9e..6aa0b13d86e5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -105,8 +105,10 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & + (HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; @@ -349,13 +351,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } -static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); - - 
__pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); -} - static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -411,7 +406,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), - HANDLE_FUNC(__pkvm_vcpu_init_traps), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 077d4098548d..01616c39a810 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,6 +6,9 @@ #include <linux/kvm_host.h> #include <linux/mm.h> + +#include <asm/kvm_emulate.h> + #include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> #include <nvhe/memory.h> @@ -201,11 +204,46 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) } } +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; + + if (has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); +} + /* * Initialize trap register values in protected mode. */ -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) { + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!vcpu_is_protected(vcpu))) + return; + pvm_init_trap_regs(vcpu); pvm_init_traps_aa64pfr0(vcpu); pvm_init_traps_aa64pfr1(vcpu); @@ -289,6 +327,65 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_spin_unlock(&vm_table_lock); } +static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) +{ + struct kvm *kvm = &hyp_vm->kvm; + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* No restrictions for non-protected VMs. 
*/ + if (!kvm_vm_is_protected(kvm)) { + bitmap_copy(kvm->arch.vcpu_features, + host_kvm->arch.vcpu_features, + KVM_VCPU_MAX_FEATURES); + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* + * For protected VMs, always allow: + * - CPU starting in poweroff state + * - PSCI v0.2 + */ + set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + /* + * Check if remaining features are allowed: + * - Performance Monitoring + * - Scalable Vectors + * - Pointer Authentication + */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, + allowed_features, KVM_VCPU_MAX_FEATURES); +} + +static void pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) { + kvm_vcpu_enable_ptrauth(vcpu); + } else { + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); + } +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -310,6 +407,18 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + pkvm_init_features_from_host(hyp_vm, host_kvm); +} + +static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { + vcpu_clear_flag(vcpu, GUEST_HAS_SVE); + vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + } } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -335,6 +444,11 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); + pkvm_vcpu_init_ptrauth(hyp_vcpu); + pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index dfe8fe0f7eaf..9c2ce1e0e99a 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -265,6 +265,8 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: case PSCI_1_1_FN64_SYSTEM_RESET2: + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: return psci_forward(host_ctxt); case PSCI_1_0_FN64_SYSTEM_SUSPEND: return psci_system_suspend(func_id, host_ctxt); diff --git 
a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 174007f3fadd..cbdd18cd3f98 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -95,7 +95,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; - enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -147,24 +146,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } - pkvm_create_host_sve_mappings(); - - /* - * Map the host sections RO in the hypervisor, but transfer the - * ownership from the host to the hypervisor itself to make sure they - * can't be donated or shared with another entity. - * - * The ownership transition requires matching changes in the host - * stage-2. This will be done later (see finalize_host_mappings()) once - * the hyp_vmemmap is addressable. - */ - prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(&kvm_vgic_global_state, - &kvm_vgic_global_state + 1, prot); - if (ret) - return ret; - - return 0; + return pkvm_create_host_sve_mappings(); } static void update_nvhe_init_params(void) diff --git a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c index 29305022bc04..dba101565de3 100644 --- a/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/sysreg-sr.c @@ -28,7 +28,7 @@ void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt) void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt) { - __sysreg_restore_el1_state(ctxt); + __sysreg_restore_el1_state(ctxt, ctxt_sys_reg(ctxt, MPIDR_EL1)); __sysreg_restore_common_state(ctxt); __sysreg_restore_user_state(ctxt); __sysreg_restore_el2_return_state(ctxt); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b11bcebac908..40bd55966540 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1245,19 +1245,16 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) NULL, NULL, 0); } -kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) { - kvm_pte_t pte = 0; int ret; ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - &pte, NULL, + NULL, NULL, KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); if (!ret) dsb(ishst); - - return pte; } struct stage2_age_data { diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 18d4677002b1..3f9741e51d41 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -1012,9 +1012,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) - val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index e12bd7d6d2dc..5f78a39053a7 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -15,6 +15,131 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_nested.h> +static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) +{ + /* These registers are common with EL1 
*/ + __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); + __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); + + __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); + __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); + __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); + __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); + __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); + __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); + __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); + __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); + + /* + * In VHE mode those registers are compatible between EL1 and EL2, + * and the guest uses the _EL1 versions on the CPU naturally. + * So we save them into their _EL2 versions here. + * For nVHE mode we trap accesses to those registers, so our + * _EL2 copy in sys_regs[] is always up-to-date and we don't need + * to save anything here. + */ + if (vcpu_el2_e2h_is_set(vcpu)) { + u64 val; + + /* + * We don't save CPTR_EL2, as accesses to CPACR_EL1 + * are always trapped, ensuring that the in-memory + * copy is always up-to-date. A small blessing... + */ + __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); + __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); + __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); + __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); + __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); + } + + /* + * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where + * the interesting CNTHCTL_EL2 bits live. So preserve these + * bits when reading back the guest-visible value. + */ + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; + __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; + } + + __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); + __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); + __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); +} + +static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) +{ + u64 val; + + /* These registers are common with EL1 */ + write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); + write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); + + write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); + write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); + + if (vcpu_el2_e2h_is_set(vcpu)) { + /* + * In VHE mode those registers are compatible between + * EL1 and EL2. + */ + write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); + } else { + /* + * CNTHCTL_EL2 only affects EL1 when running nVHE, so + * no need to restore it. 
+ */ + val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); + write_sysreg_el1(val, SYS_SCTLR); + val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); + write_sysreg_el1(val, SYS_CPACR); + val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); + write_sysreg_el1(val, SYS_TTBR0); + val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); + write_sysreg_el1(val, SYS_TCR); + } + + if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); + + if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); + } + + if (ctxt_has_s1poe(&vcpu->arch.ctxt)) + write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); + } + + write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); + write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); + write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); + write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); + write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); +} + /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and * pstate, which are handled as part of the el2 return state) on every @@ -66,6 +191,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; + u64 mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); @@ -89,7 +215,29 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); - __sysreg_restore_el1_state(guest_ctxt); + + if (unlikely(__is_hyp_ctxt(guest_ctxt))) { + __sysreg_restore_vel2_state(vcpu); + } else { + if (vcpu_has_nv(vcpu)) { + /* + * Use the guest hypervisor's VPIDR_EL2 when in a + * nested state. The hardware value of MIDR_EL1 gets + * restored on put. + */ + write_sysreg(ctxt_sys_reg(guest_ctxt, VPIDR_EL2), vpidr_el2); + + /* + * As we're restoring a nested guest, set the value + * provided by the guest hypervisor. 
+ */ + mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); + } else { + mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); + } + + __sysreg_restore_el1_state(guest_ctxt, mpidr); + } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } @@ -112,12 +260,20 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); - __sysreg_save_el1_state(guest_ctxt); + if (unlikely(__is_hyp_ctxt(guest_ctxt))) + __sysreg_save_vel2_state(vcpu); + else + __sysreg_save_el1_state(guest_ctxt); + __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); + /* If leaving a nesting guest, restore MIDR_EL1 default view */ + if (vcpu_has_nv(vcpu)) + write_sysreg(read_cpuid_id(), vpidr_el2); + vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index ee6573befb81..27ce4cb44904 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -575,6 +575,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_ARM_PSCI_0_2: case KVM_ARM_PSCI_1_0: case KVM_ARM_PSCI_1_1: + case KVM_ARM_PSCI_1_2: + case KVM_ARM_PSCI_1_3: if (!wants_02) return -EINVAL; vcpu->kvm->arch.psci_version = val; diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index cd6b7b83e2c3..ab365e839874 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -72,6 +72,31 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) return data; } +static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu) +{ + if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) + return false; + + if (vcpu_el1_is_32bit(vcpu)) { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): + case unpack_vcpu_flag(EXCEPT_AA32_IABT): + case unpack_vcpu_flag(EXCEPT_AA32_DABT): + return true; + default: + return false; + } + } else { + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): + case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): + return true; + default: + return false; + } + } +} + /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * or in-kernel IO emulation @@ -84,8 +109,11 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) unsigned int len; int mask; - /* Detect an already handled MMIO return */ - if (unlikely(!vcpu->mmio_needed)) + /* + * Detect if the MMIO return was already handled or if userspace aborted + * the MMIO access. + */ + if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu))) return 1; vcpu->mmio_needed = 0; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 56d9a7f414fe..c9d46ad57e52 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1451,6 +1451,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; + struct page *page; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1572,7 +1573,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * vma_lookup() or __kvm_faultin_pfn() become stale prior to * acquiring kvm->mmu_lock. 
* * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -1581,8 +1582,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, - write_fault, &writable, NULL); + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -1595,7 +1596,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If the page was identified as device early by looking at * the VMA flags, vma_pagesize is already representing the * largest quantity we can map. If instead it was mapped - * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE * and must not be upgraded. * * In both cases, we don't let transparent_hugepage_adjust() @@ -1704,33 +1705,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); read_unlock(&kvm->mmu_lock); /* Mark the page dirty only if the fault is handled successfully */ - if (writable && !ret) { - kvm_set_pfn_dirty(pfn); + if (writable && !ret) mark_page_dirty_in_slot(kvm, memslot, gfn); - } - kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; } /* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - kvm_pte_t pte; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); + kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); read_unlock(&vcpu->kvm->mmu_lock); - - if (kvm_pte_valid(pte)) - kvm_set_pfn_accessed(kvm_pte_to_pfn(pte)); } /** diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index c4b17d90fc49..aeaa6017ffd8 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -917,12 +917,13 @@ static void limit_nv_id_regs(struct kvm *kvm) ID_AA64MMFR4_EL1_E2H0_NI_NV1); kvm_set_vm_id_reg(kvm, SYS_ID_AA64MMFR4_EL1, val); - /* Only limited support for PMU, Debug, BPs and WPs */ + /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); val &= (NV_FTR(DFR0, PMUVer) | NV_FTR(DFR0, WRPs) | NV_FTR(DFR0, BRPs) | - NV_FTR(DFR0, DebugVer)); + NV_FTR(DFR0, DebugVer) | + NV_FTR(DFR0, HPMN0)); /* Cap Debug to ARMv8.1 */ tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val); @@ -933,15 +934,15 @@ static void limit_nv_id_regs(struct kvm *kvm) kvm_set_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1, val); } -u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) +u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu, + enum vcpu_sysreg sr, u64 v) { - u64 v = ctxt_sys_reg(&vcpu->arch.ctxt, sr); struct kvm_sysreg_masks *masks; masks = vcpu->kvm->arch.sysreg_masks; if (masks) { - sr -= __VNCR_START__; + sr -= __SANITISED_REG_START__; v &= ~masks->mask[sr].res0; v |= masks->mask[sr].res1; @@ -952,7 +953,11 @@ u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *vcpu, enum vcpu_sysreg sr) static void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1) { - int i = sr - __VNCR_START__; + int i = sr - __SANITISED_REG_START__; + + BUILD_BUG_ON(!__builtin_constant_p(sr)); + BUILD_BUG_ON(sr < __SANITISED_REG_START__); + BUILD_BUG_ON(sr >= 
NR_SYS_REGS); kvm->arch.sysreg_masks->mask[i].res0 = res0; kvm->arch.sysreg_masks->mask[i].res1 = res1; @@ -1050,7 +1055,7 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HCRX_EL2_PTTWI; if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SCTLRX, IMP)) res0 |= HCRX_EL2_SCTLR2En; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + if (!kvm_has_tcr2(kvm)) res0 |= HCRX_EL2_TCR2En; if (!kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) res0 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); @@ -1101,9 +1106,9 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= (HFGxTR_EL2_nSMPRI_EL1 | HFGxTR_EL2_nTPIDR2_EL0); if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) res0 |= HFGxTR_EL2_nRCWMASK_EL1; - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_s1pie(kvm)) res0 |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (!kvm_has_s1poe(kvm)) res0 |= (HFGxTR_EL2_nPOR_EL0 | HFGxTR_EL2_nPOR_EL1); if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) res0 |= HFGxTR_EL2_nS2POR_EL1; @@ -1200,6 +1205,28 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= ~(res0 | res1); set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1); + /* TCR2_EL2 */ + res0 = TCR2_EL2_RES0; + res1 = TCR2_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP)) + res0 |= (TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1 | TCR2_EL2_D128); + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, MEC, IMP)) + res0 |= TCR2_EL2_AMEC1 | TCR2_EL2_AMEC0; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT)) + res0 |= TCR2_EL2_HAFT; + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP)) + res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH; + if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP)) + res0 |= TCR2_EL2_AIE; + if (!kvm_has_s1poe(kvm)) + res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE; + if (!kvm_has_s1pie(kvm)) + res0 |= TCR2_EL2_PIE; + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 | + TCR2_EL2_AMEC1 | TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1); + set_sysreg_masks(kvm, TCR2_EL2, res0, res1); + /* SCTLR_EL1 */ res0 = SCTLR_EL1_RES0; res1 = SCTLR_EL1_RES1; @@ -1207,6 +1234,43 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= SCTLR_EL1_EPAN; set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); + /* MDCR_EL2 */ + res0 = MDCR_EL2_RES0; + res1 = MDCR_EL2_RES1; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) + res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR | + MDCR_EL2_TPM | MDCR_EL2_HPME); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) + res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP)) + res0 |= MDCR_EL2_EnSPM; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1)) + res0 |= MDCR_EL2_HPMD; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) + res0 |= MDCR_EL2_TTRF; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5)) + res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP)) + res0 |= MDCR_EL2_E2TB; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) + res0 |= MDCR_EL2_TDCC; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) + res0 |= MDCR_EL2_MTPME; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7)) + res0 |= MDCR_EL2_HPMFZO; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP)) + res0 |= MDCR_EL2_PMSSE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2)) + res0 |= MDCR_EL2_HPMFZS; + if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP)) + res0 |= MDCR_EL2_PMEE; + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9)) + res0 
|= MDCR_EL2_EBWE; + if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP)) + res0 |= MDCR_EL2_EnSTEPOP; + set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + return 0; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index ac36c438b8c1..8ad62284fa23 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -89,7 +89,11 @@ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = kvm_vcpu_read_pmcr(kvm_pmc_to_vcpu(pmc)); + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 val = kvm_vcpu_read_pmcr(vcpu); + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP; return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); @@ -111,6 +115,11 @@ static u32 counter_index_to_evtreg(u64 idx) return (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + idx; } +static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc) +{ + return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx)); +} + static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); @@ -244,7 +253,7 @@ void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) */ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + unsigned long mask = kvm_pmu_implemented_counter_mask(vcpu); int i; for_each_set_bit(i, &mask, 32) @@ -265,7 +274,37 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) irq_work_sync(&vcpu->arch.pmu.overflow_work); } -u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + unsigned int hpmn; + + if (!vcpu_has_nv(vcpu) || idx == ARMV8_PMU_CYCLE_IDX) + return false; + + /* + * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't + * implemented. Since KVM's ability to emulate HPMN=0 does not directly + * depend on hardware (all PMU registers are trapped), make the + * implementation choice that all counters are included in the second + * range reserved for EL2/EL3. 
+ */ + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + return idx >= hpmn; +} + +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + u64 hpmn; + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return mask; + + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + return mask & ~GENMASK(vcpu->kvm->arch.pmcr_n - 1, hpmn); +} + +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) { u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); @@ -574,7 +613,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); + unsigned long mask = kvm_pmu_accessible_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, i), 0, true); @@ -585,8 +624,44 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - return (kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) && - (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); + unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx))) + return false; + + if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx)) + return mdcr & MDCR_EL2_HPME; + + return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E; +} + +static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsu = evtreg & ARMV8_PMU_EXCLUDE_NS_EL0; + bool u = evtreg & ARMV8_PMU_EXCLUDE_EL0; + + return u == nsu; +} + +static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc) +{ + u64 evtreg = kvm_pmc_read_evtreg(pmc); + bool nsk = evtreg & ARMV8_PMU_EXCLUDE_NS_EL1; + bool p = evtreg & ARMV8_PMU_EXCLUDE_EL1; + + return p == nsk; +} + +static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc) +{ + struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); + u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2); + + if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && (mdcr & MDCR_EL2_HPMD)) + return false; + + return kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2; } /** @@ -599,17 +674,15 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu; struct perf_event *event; struct perf_event_attr attr; - u64 eventsel, reg, data; - bool p, u, nsk, nsu; + u64 eventsel, evtreg; - reg = counter_index_to_evtreg(pmc->idx); - data = __vcpu_sys_reg(vcpu, reg); + evtreg = kvm_pmc_read_evtreg(pmc); kvm_pmu_stop_counter(pmc); if (pmc->idx == ARMV8_PMU_CYCLE_IDX) eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; else - eventsel = data & kvm_pmu_event_mask(vcpu->kvm); + eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm); /* * Neither SW increment nor chained events need to be backed @@ -627,23 +700,26 @@ static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc) !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; - p = data & ARMV8_PMU_EXCLUDE_EL1; - u = data & ARMV8_PMU_EXCLUDE_EL0; - nsk = data & ARMV8_PMU_EXCLUDE_NS_EL1; - nsu = data & ARMV8_PMU_EXCLUDE_NS_EL0; - memset(&attr, 0, sizeof(struct perf_event_attr)); attr.type = arm_pmu->pmu.type; attr.size = sizeof(attr); attr.pinned = 1; attr.disabled = !kvm_pmu_counter_is_enabled(pmc); - attr.exclude_user = (u != nsu); - attr.exclude_kernel = (p != nsk); + attr.exclude_user = !kvm_pmc_counts_at_el0(pmc); 
attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ attr.config = eventsel; /* + * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the + * guest's EL2 filter. + */ + if (unlikely(is_hyp_ctxt(vcpu))) + attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc); + else + attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc); + + /* * If counting with a 64bit counter, advertise it to the perf * code, carefully dealing with the initial sample period * which also depends on the overflow. @@ -804,7 +880,7 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); @@ -1139,3 +1215,32 @@ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); } + +void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) +{ + bool reprogrammed = false; + unsigned long mask; + int i; + + if (!kvm_vcpu_has_pmu(vcpu)) + return; + + mask = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + for_each_set_bit(i, &mask, 32) { + struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, i); + + /* + * We only need to reconfigure events where the filter is + * different at EL1 vs. EL2, as we're multiplexing the true EL1 + * event filter bit for nested. + */ + if (kvm_pmc_counts_at_el1(pmc) == kvm_pmc_counts_at_el2(pmc)) + continue; + + kvm_pmu_create_perf_event(pmc); + reprogrammed = true; + } + + if (reprogrammed) + kvm_vcpu_pmu_restore_guest(vcpu); +} diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 1f69b667332b..3b5dbe9a0a0e 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -194,6 +194,12 @@ static void kvm_psci_system_off(struct kvm_vcpu *vcpu) kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0); } +static void kvm_psci_system_off2(struct kvm_vcpu *vcpu) +{ + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, + KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2); +} + static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) { kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0); @@ -322,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) switch(psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: - val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; + val = PSCI_VERSION(1, minor); break; case PSCI_1_0_FN_PSCI_FEATURES: arg = smccc_get_arg1(vcpu); @@ -358,6 +364,11 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) if (minor >= 1) val = 0; break; + case PSCI_1_3_FN_SYSTEM_OFF2: + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor >= 3) + val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF; + break; } break; case PSCI_1_0_FN_SYSTEM_SUSPEND: @@ -392,6 +403,33 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor) break; } break; + case PSCI_1_3_FN_SYSTEM_OFF2: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_1_3_FN64_SYSTEM_OFF2: + if (minor < 3) + break; + + arg = smccc_get_arg1(vcpu); + /* + * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2 + * must be zero. + */ + if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) || + smccc_get_arg2(vcpu) != 0) { + val = PSCI_RET_INVALID_PARAMS; + break; + } + kvm_psci_system_off2(vcpu); + /* + * We shouldn't be going back to the guest after receiving a + * SYSTEM_OFF2 request. Preload a return value of + * INTERNAL_FAILURE should userspace ignore the exit and resume + * the vCPU. 
+ */ + val = PSCI_RET_INTERNAL_FAILURE; + ret = 0; + break; default: return kvm_psci_0_2_call(vcpu); } @@ -449,6 +487,10 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) } switch (version) { + case KVM_ARM_PSCI_1_3: + return kvm_psci_1_x_call(vcpu, 3); + case KVM_ARM_PSCI_1_2: + return kvm_psci_1_x_call(vcpu, 2); case KVM_ARM_PSCI_1_1: return kvm_psci_1_x_call(vcpu, 1); case KVM_ARM_PSCI_1_0: diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 0b0ae5ae7bc2..470524b31951 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -167,11 +167,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); -} - /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ff8c4e1b847e..83c6b4a07ef5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -110,6 +110,14 @@ static bool get_el2_to_el1_mapping(unsigned int reg, PURE_EL2_SYSREG( RVBAR_EL2 ); PURE_EL2_SYSREG( TPIDR_EL2 ); PURE_EL2_SYSREG( HPFAR_EL2 ); + PURE_EL2_SYSREG( HCRX_EL2 ); + PURE_EL2_SYSREG( HFGRTR_EL2 ); + PURE_EL2_SYSREG( HFGWTR_EL2 ); + PURE_EL2_SYSREG( HFGITR_EL2 ); + PURE_EL2_SYSREG( HDFGRTR_EL2 ); + PURE_EL2_SYSREG( HDFGWTR_EL2 ); + PURE_EL2_SYSREG( HAFGRTR_EL2 ); + PURE_EL2_SYSREG( CNTVOFF_EL2 ); PURE_EL2_SYSREG( CNTHCTL_EL2 ); MAPPED_EL2_SYSREG(SCTLR_EL2, SCTLR_EL1, translate_sctlr_el2_to_sctlr_el1 ); @@ -126,10 +134,15 @@ static bool get_el2_to_el1_mapping(unsigned int reg, MAPPED_EL2_SYSREG(ESR_EL2, ESR_EL1, NULL ); MAPPED_EL2_SYSREG(FAR_EL2, FAR_EL1, NULL ); MAPPED_EL2_SYSREG(MAIR_EL2, MAIR_EL1, NULL ); + MAPPED_EL2_SYSREG(TCR2_EL2, TCR2_EL1, NULL ); + MAPPED_EL2_SYSREG(PIR_EL2, PIR_EL1, NULL ); + MAPPED_EL2_SYSREG(PIRE0_EL2, PIRE0_EL1, NULL ); + MAPPED_EL2_SYSREG(POR_EL2, POR_EL1, NULL ); MAPPED_EL2_SYSREG(AMAIR_EL2, AMAIR_EL1, NULL ); MAPPED_EL2_SYSREG(ELR_EL2, ELR_EL1, NULL ); MAPPED_EL2_SYSREG(SPSR_EL2, SPSR_EL1, NULL ); MAPPED_EL2_SYSREG(ZCR_EL2, ZCR_EL1, NULL ); + MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL ); default: return false; } @@ -149,6 +162,21 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) goto memory_read; /* + * CNTHCTL_EL2 requires some special treatment to + * account for the bits that can be set via CNTKCTL_EL1. + */ + switch (reg) { + case CNTHCTL_EL2: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el1(SYS_CNTKCTL); + val &= CNTKCTL_VALID_BITS; + val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS; + return val; + } + break; + } + + /* * If this register does not have an EL1 counterpart, * then read the stored EL2 version. */ @@ -165,6 +193,9 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) /* Get the current version of the EL1 counterpart. */ WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val)); + if (reg >= __SANITISED_REG_START__) + val = kvm_vcpu_apply_reg_masks(vcpu, reg, val); + return val; } @@ -198,6 +229,19 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) */ __vcpu_sys_reg(vcpu, reg) = val; + switch (reg) { + case CNTHCTL_EL2: + /* + * If E2H=0, CNHTCTL_EL2 is a pure shadow register. + * Otherwise, some of the bits are backed by + * CNTKCTL_EL1, while the rest is kept in memory. + * Yes, this is fun stuff. 
+ */ + if (vcpu_el2_e2h_is_set(vcpu)) + write_sysreg_el1(val, SYS_CNTKCTL); + return; + } + /* No EL1 counterpart? We're done here.? */ if (reg == el1r) return; @@ -390,10 +434,6 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, bool was_enabled = vcpu_has_cache_enabled(vcpu); u64 val, mask, shift; - if (reg_to_encoding(r) == SYS_TCR2_EL1 && - !kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) - return undef_access(vcpu, p, r); - BUG_ON(!p->is_write); get_access_mask(r, &mask, &shift); @@ -1128,7 +1168,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va { bool set; - val &= kvm_pmu_valid_counter_mask(vcpu); + val &= kvm_pmu_accessible_counter_mask(vcpu); switch (r->reg) { case PMOVSSET_EL0: @@ -1151,7 +1191,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); *val = __vcpu_sys_reg(vcpu, r->reg) & mask; return 0; @@ -1165,7 +1205,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); if (p->is_write) { val = p->regval & mask; if (r->Op2 & 0x1) { @@ -1188,7 +1228,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -1212,7 +1252,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_valid_counter_mask(vcpu); + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); if (pmu_access_el0_disabled(vcpu)) return false; @@ -1242,7 +1282,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_write_swinc_el0_disabled(vcpu)) return false; - mask = kvm_pmu_valid_counter_mask(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu); kvm_pmu_software_increment(vcpu, p->regval & mask); return true; } @@ -1509,6 +1549,9 @@ static u8 pmuver_to_perfmon(u8 pmuver) } } +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) @@ -1522,6 +1565,12 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = read_sanitised_ftr_reg(id); switch (id) { + case SYS_ID_AA64DFR0_EL1: + val = sanitise_id_aa64dfr0_el1(vcpu, val); + break; + case SYS_ID_AA64PFR0_EL1: + val = sanitise_id_aa64pfr0_el1(vcpu, val); + break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); @@ -1535,6 +1584,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac); break; case SYS_ID_AA64PFR2_EL1: /* We only expose FPMR */ @@ 
-1692,11 +1742,8 @@ static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); - if (!vcpu_has_sve(vcpu)) val &= ~ID_AA64PFR0_EL1_SVE_MASK; @@ -1724,6 +1771,13 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, val &= ~ID_AA64PFR0_EL1_AMU_MASK; + /* + * MPAM is disabled by default as KVM also needs a set of PARTID to + * program the MPAMVPMx_EL2 PARTID remapping registers with. But some + * older kernels let the guest see the ID bit. + */ + val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + return val; } @@ -1737,11 +1791,8 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, (val); \ }) -static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) { - u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); - val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); /* @@ -1834,6 +1885,70 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, return set_id_reg(vcpu, rd, val); } +static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK; + + /* + * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits + * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to + * guests, but didn't add trap handling. KVM doesn't support MPAM and + * always returns an UNDEF for these registers. The guest must see 0 + * for this field. + * + * But KVM must also accept values from user-space that were provided + * by KVM. On CPUs that support MPAM, permit user-space to write + * the sanitizied value to ID_AA64PFR0_EL1.MPAM, but ignore this field. + */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + u64 mpam_mask = ID_AA64PFR1_EL1_MPAM_frac_MASK; + + /* See set_id_aa64pfr0_el1 for comment about MPAM */ + if ((hw_val & mpam_mask) == (user_val & mpam_mask)) + user_val &= ~ID_AA64PFR1_EL1_MPAM_frac_MASK; + + return set_id_reg(vcpu, rd, user_val); +} + +static int set_ctr_el0(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, u64 user_val) +{ + u8 user_L1Ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val); + + /* + * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved. + * Hence only allow to set VIPT(0b10) or PIPT(0b11) for L1Ip based + * on what hardware reports. + * + * Using a VIPT software model on PIPT will lead to over invalidation, + * but still correct. Hence, we can allow downgrading PIPT to VIPT, + * but not the other way around. This is handled via arm64_ftr_safe_value() + * as CTR_EL0 ftr_bits has L1Ip field with type FTR_EXACT and safe value + * set as VIPT. 
+ */ + switch (user_L1Ip) { + case CTR_EL0_L1Ip_RESERVED_VPIPT: + case CTR_EL0_L1Ip_RESERVED_AIVIVT: + return -EINVAL; + case CTR_EL0_L1Ip_VIPT: + case CTR_EL0_L1Ip_PIPT: + return set_id_reg(vcpu, rd, user_val); + default: + return -ENOENT; + } +} + /* * cpufeature ID register user accessors * @@ -2104,6 +2219,15 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = v, \ } +#define EL2_REG_FILTERED(name, acc, rst, v, filter) { \ + SYS_DESC(SYS_##name), \ + .access = acc, \ + .reset = rst, \ + .reg = name, \ + .visibility = filter, \ + .val = v, \ +} + #define EL2_REG_VNCR(name, rst, v) EL2_REG(name, bad_vncr_trap, rst, v) #define EL2_REG_REDIR(name, rst, v) EL2_REG(name, bad_redir_trap, rst, v) @@ -2150,6 +2274,15 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = mask, \ } +/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ +#define ID_FILTERED(sysreg, name, mask) { \ + ID_DESC(sysreg), \ + .set_user = set_##name, \ + .visibility = id_visibility, \ + .reset = kvm_read_sanitised_id_reg, \ + .val = (mask), \ +} + /* * sys_reg_desc initialiser for architecturally unallocated cpufeature ID * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2 @@ -2236,16 +2369,18 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return __vcpu_sys_reg(vcpu, r->reg) = val; } +static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + unsigned int (*fn)(const struct kvm_vcpu *, + const struct sys_reg_desc *)) +{ + return el2_visibility(vcpu, rd) ?: fn(vcpu, rd); +} + static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - unsigned int r; - - r = el2_visibility(vcpu, rd); - if (r) - return r; - - return sve_visibility(vcpu, rd); + return __el2_visibility(vcpu, rd, sve_visibility); } static bool access_zcr_el2(struct kvm_vcpu *vcpu, @@ -2273,12 +2408,48 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (kvm_has_s1poe(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1poe_visibility); +} + +static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_tcr2(vcpu->kvm)) + return 0; + + return REG_HIDDEN; +} + +static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, tcr2_visibility); +} + +static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + if (kvm_has_s1pie(vcpu->kvm)) return 0; return REG_HIDDEN; } +static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + return __el2_visibility(vcpu, rd, s1pie_visibility); +} + /* * Architected system registers. 
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 @@ -2374,18 +2545,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_reg, - .reset = read_sanitised_id_aa64pfr0_el1, - .val = ~(ID_AA64PFR0_EL1_AMU | - ID_AA64PFR0_EL1_MPAM | - ID_AA64PFR0_EL1_SVE | - ID_AA64PFR0_EL1_RAS | - ID_AA64PFR0_EL1_AdvSIMD | - ID_AA64PFR0_EL1_FP), }, - ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | + ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1, + ~(ID_AA64PFR0_EL1_AMU | + ID_AA64PFR0_EL1_MPAM | + ID_AA64PFR0_EL1_SVE | + ID_AA64PFR0_EL1_RAS | + ID_AA64PFR0_EL1_AdvSIMD | + ID_AA64PFR0_EL1_FP)), + ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1, + ~(ID_AA64PFR1_EL1_PFAR | ID_AA64PFR1_EL1_DF2 | ID_AA64PFR1_EL1_MTEX | ID_AA64PFR1_EL1_THE | @@ -2406,11 +2574,6 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0), /* CRm=5 */ - { SYS_DESC(SYS_ID_AA64DFR0_EL1), - .access = access_id_reg, - .get_user = get_id_reg, - .set_user = set_id_aa64dfr0_el1, - .reset = read_sanitised_id_aa64dfr0_el1, /* * Prior to FEAT_Debugv8.9, the architecture defines context-aware * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). @@ -2423,10 +2586,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { * See DDI0487K.a, section D2.8.3 Breakpoint types and linking * of breakpoints for more details. */ - .val = ID_AA64DFR0_EL1_DoubleLock_MASK | - ID_AA64DFR0_EL1_WRPs_MASK | - ID_AA64DFR0_EL1_PMUVer_MASK | - ID_AA64DFR0_EL1_DebugVer_MASK, }, + ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1, + ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_PMUVer_MASK | + ID_AA64DFR0_EL1_DebugVer_MASK), ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), @@ -2489,7 +2653,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, - { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0 }, + { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0, + .visibility = tcr2_visibility }, PTRAUTH_KEY(APIA), PTRAUTH_KEY(APIB), @@ -2543,8 +2708,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, - { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, - { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1, + .visibility = s1pie_visibility }, + { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1, + .visibility = s1pie_visibility }, { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1, .visibility = s1poe_visibility }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, @@ -2553,8 +2720,11 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_LOREA_EL1), trap_loregion }, { SYS_DESC(SYS_LORN_EL1), trap_loregion }, { SYS_DESC(SYS_LORC_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAMIDR_EL1), undef_access }, { SYS_DESC(SYS_LORID_EL1), trap_loregion }, + { SYS_DESC(SYS_MPAM1_EL1), undef_access }, + { SYS_DESC(SYS_MPAM0_EL1), undef_access }, { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, 
DISR_EL1, 0 }, @@ -2599,10 +2769,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CCSIDR2_EL1), undef_access }, { SYS_DESC(SYS_SMIDR_EL1), undef_access }, { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 }, - ID_WRITABLE(CTR_EL0, CTR_EL0_DIC_MASK | - CTR_EL0_IDC_MASK | - CTR_EL0_DminLine_MASK | - CTR_EL0_IminLine_MASK), + ID_FILTERED(CTR_EL0, ctr_el0, + CTR_EL0_DIC_MASK | + CTR_EL0_IDC_MASK | + CTR_EL0_DminLine_MASK | + CTR_EL0_L1Ip_MASK | + CTR_EL0_IminLine_MASK), { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility }, { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility }, @@ -2818,14 +2990,16 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(HFGITR_EL2, reset_val, 0), EL2_REG_VNCR(HACR_EL2, reset_val, 0), - { SYS_DESC(SYS_ZCR_EL2), .access = access_zcr_el2, .reset = reset_val, - .visibility = sve_el2_visibility, .reg = ZCR_EL2 }, + EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0, + sve_el2_visibility), EL2_REG_VNCR(HCRX_EL2, reset_val, 0), EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), + EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1, + tcr2_el2_visibility), EL2_REG_VNCR(VTTBR_EL2, reset_val, 0), EL2_REG_VNCR(VTCR_EL2, reset_val, 0), @@ -2853,7 +3027,24 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), EL2_REG(MAIR_EL2, access_rw, reset_val, 0), + EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0, + s1pie_el2_visibility), + EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0, + s1poe_el2_visibility), EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_MPAMHCR_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access }, + { SYS_DESC(SYS_MPAM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access }, + { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access }, EL2_REG(VBAR_EL2, access_rw, reset_val, 0), EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), @@ -4719,7 +4910,7 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP)) vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2); - if (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, TCRX, IMP)) + if (kvm_has_tcr2(kvm)) vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En; if (kvm_has_fpmr(kvm)) @@ -4769,11 +4960,11 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_ATS1E1RP | HFGITR_EL2_ATS1E1WP); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1PIE, IMP)) + if (!kvm_has_s1pie(kvm)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPIRE0_EL1 | HFGxTR_EL2_nPIR_EL1); - if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, S1POE, IMP)) + if (!kvm_has_s1poe(kvm)) kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | HFGxTR_EL2_nPOR_EL0); diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index ba945ba78cc7..198296933e7e 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -782,6 +782,9 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, ite = 
find_ite(its, device_id, event_id); if (ite && its_is_collection_mapped(ite->collection)) { + struct its_device *device = find_its_device(its, device_id); + int ite_esz = vgic_its_get_abi(its)->ite_esz; + gpa_t gpa = device->itt_addr + ite->event_id * ite_esz; /* * Though the spec talks about removing the pending state, we * don't bother here since we clear the ITTE anyway and the @@ -790,7 +793,8 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, vgic_its_invalidate_cache(its); its_free_ite(kvm, ite); - return 0; + + return vgic_its_write_entry_lock(its, gpa, 0, ite_esz); } return E_ITS_DISCARD_UNMAPPED_INTERRUPT; @@ -1139,9 +1143,11 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, bool valid = its_cmd_get_validbit(its_cmd); u8 num_eventid_bits = its_cmd_get_size(its_cmd); gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); + int dte_esz = vgic_its_get_abi(its)->dte_esz; struct its_device *device; + gpa_t gpa; - if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) + if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa)) return E_ITS_MAPD_DEVICE_OOR; if (valid && num_eventid_bits > VITS_TYPER_IDBITS) @@ -1162,7 +1168,7 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, * is an error, so we are done in any case. */ if (!valid) - return 0; + return vgic_its_write_entry_lock(its, gpa, 0, dte_esz); device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); @@ -2086,7 +2092,6 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, struct its_ite *ite, gpa_t gpa, int ite_esz) { - struct kvm *kvm = its->dev->kvm; u32 next_offset; u64 val; @@ -2095,7 +2100,8 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return vgic_write_guest_lock(kvm, gpa, &val, ite_esz); + + return vgic_its_write_entry_lock(its, gpa, val, ite_esz); } /** @@ -2239,7 +2245,6 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, gpa_t ptr, int dte_esz) { - struct kvm *kvm = its->dev->kvm; u64 val, itt_addr_field; u32 next_offset; @@ -2250,7 +2255,8 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return vgic_write_guest_lock(kvm, ptr, &val, dte_esz); + + return vgic_its_write_entry_lock(its, ptr, val, dte_esz); } /** @@ -2437,7 +2443,8 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz); + + return vgic_its_write_entry_lock(its, gpa, val, esz); } /* @@ -2453,8 +2460,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) u64 val; int ret; - BUG_ON(esz > sizeof(val)); - ret = kvm_read_guest_lock(kvm, gpa, &val, esz); + ret = vgic_its_read_entry_lock(its, gpa, &val, esz); if (ret) return ret; val = le64_to_cpu(val); @@ -2492,7 +2498,6 @@ static int vgic_its_save_collection_table(struct vgic_its *its) u64 baser = its->baser_coll_table; gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; - u64 val; size_t 
max_size, filled = 0; int ret, cte_esz = abi->cte_esz; @@ -2516,10 +2521,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) * table is not fully filled, add a last dummy element * with valid bit unset */ - val = 0; - BUG_ON(cte_esz > sizeof(val)); - ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); - return ret; + return vgic_its_write_entry_lock(its, gpa, 0, cte_esz); } /* diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index f2486b4d9f95..309295f5e1b0 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -146,6 +146,29 @@ static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } +static inline int vgic_its_read_entry_lock(struct vgic_its *its, gpa_t eaddr, + u64 *eval, unsigned long esize) +{ + struct kvm *kvm = its->dev->kvm; + + if (KVM_BUG_ON(esize != sizeof(*eval), kvm)) + return -EINVAL; + + return kvm_read_guest_lock(kvm, eaddr, eval, esize); + +} + +static inline int vgic_its_write_entry_lock(struct vgic_its *its, gpa_t eaddr, + u64 eval, unsigned long esize) +{ + struct kvm *kvm = its->dev->kvm; + + if (KVM_BUG_ON(esize != sizeof(eval), kvm)) + return -EINVAL; + + return vgic_write_guest_lock(kvm, eaddr, &eval, esize); +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c2f89a678ac0..ef63651099a9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1023,7 +1023,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; - return vma_alloc_folio(flags, 0, vma, vaddr, false); + return vma_alloc_folio(flags, 0, vma, vaddr); } void tag_clear_highpage(struct page *page) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 6ae6ae806454..39fd1f7ff02a 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -282,7 +282,23 @@ int realm_register_memory_enc_ops(void) return arm64_mem_crypt_ops_register(&realm_crypt_ops); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long addr = (unsigned long)page_address(page); + + if (!can_set_direct_map()) + return 0; + + return set_memory_valid(addr, nr, valid); +} + #ifdef CONFIG_DEBUG_PAGEALLOC +/* + * This is - apart from the return value - doing the same + * thing as the new set_direct_map_valid_noflush() function. + * + * Unify? Explain the conceptual differences? 
+ */ void __kernel_map_pages(struct page *page, int numpages, int enable) { if (!can_set_direct_map()) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 27ef366363e4..66708b95493a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -19,7 +19,7 @@ #include <asm/cacheflush.h> #include <asm/debug-monitors.h> #include <asm/insn.h> -#include <asm/patching.h> +#include <asm/text-patching.h> #include <asm/set_memory.h> #include "bpf_jit.h" diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 8dfb2fa51d12..eb17f59e543c 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -62,6 +62,8 @@ HW_DBM KVM_HVHE KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE +MPAM +MPAM_HCR MTE MTE_ASYMM SME diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 283279af932c..b081b54d6d22 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1200,7 +1200,7 @@ UnsignedEnum 55:52 BRBE 0b0001 IMP 0b0010 BRBE_V1P1 EndEnum -Enum 51:48 MTPMU +SignedEnum 51:48 MTPMU 0b0000 NI_IMPDEF 0b0001 IMP 0b1111 NI @@ -1208,6 +1208,7 @@ EndEnum UnsignedEnum 47:44 TraceBuffer 0b0000 NI 0b0001 IMP + 0b0010 TRBE_V1P1 EndEnum UnsignedEnum 43:40 TraceFilt 0b0000 NI @@ -1224,11 +1225,18 @@ UnsignedEnum 35:32 PMSVer 0b0011 V1P2 0b0100 V1P3 0b0101 V1P4 + 0b0110 V1P5 EndEnum Field 31:28 CTX_CMPs -Res0 27:24 +UnsignedEnum 27:24 SEBEP + 0b0000 NI + 0b0001 IMP +EndEnum Field 23:20 WRPs -Res0 19:16 +UnsignedEnum 19:16 PMSS + 0b0000 NI + 0b0001 IMP +EndEnum Field 15:12 BRPs UnsignedEnum 11:8 PMUVer 0b0000 NI @@ -1288,6 +1296,32 @@ Field 15:8 BRPs Field 7:0 SYSPMUID EndSysreg +Sysreg ID_AA64DFR2_EL1 3 0 0 5 2 +Res0 63:28 +UnsignedEnum 27:24 TRBE_EXC + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 23:20 SPE_nVM + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 19:16 SPE_EXC + 0b0000 NI + 0b0001 IMP +EndEnum +Res0 15:8 +UnsignedEnum 7:4 BWE + 0b0000 NI + 0b0001 FEAT_BWE + 0b0002 FEAT_BWE2 +EndEnum +UnsignedEnum 3:0 STEP + 0b0000 NI + 0b0001 IMP +EndEnum +EndSysreg + Sysreg ID_AA64AFR0_EL1 3 0 0 5 4 Res0 63:32 Field 31:28 IMPDEF7 @@ -2400,6 +2434,41 @@ Field 1 AFSR1_EL1 Field 0 AFSR0_EL1 EndSysregFields +Sysreg MDCR_EL2 3 4 1 1 1 +Res0 63:51 +Field 50 EnSTEPOP +Res0 49:44 +Field 43 EBWE +Res0 42 +Field 41:40 PMEE +Res0 39:37 +Field 36 HPMFZS +Res0 35:32 +Field 31:30 PMSSE +Field 29 HPMFZO +Field 28 MTPME +Field 27 TDCC +Field 26 HLP +Field 25:24 E2TB +Field 23 HCCD +Res0 22:20 +Field 19 TTRF +Res0 18 +Field 17 HPMD +Res0 16 +Field 15 EnSPM +Field 14 TPMS +Field 13:12 E2PB +Field 11 TDRA +Field 10 TDOSA +Field 9 TDA +Field 8 TDE +Field 7 HPME +Field 6 TPM +Field 5 TPMCR +Field 4:0 HPMN +EndSysreg + Sysreg HFGRTR_EL2 3 4 1 1 4 Fields HFGxTR_EL2 EndSysreg @@ -2749,6 +2818,126 @@ Field 1 E2SPE Field 0 E0HSPE EndSysreg +Sysreg MPAMHCR_EL2 3 4 10 4 0 +Res0 63:32 +Field 31 TRAP_MPAMIDR_EL1 +Res0 30:9 +Field 8 GSTAPP_PLK +Res0 7:2 +Field 1 EL1_VPMEN +Field 0 EL0_VPMEN +EndSysreg + +Sysreg MPAMVPMV_EL2 3 4 10 4 1 +Res0 63:32 +Field 31 VPM_V31 +Field 30 VPM_V30 +Field 29 VPM_V29 +Field 28 VPM_V28 +Field 27 VPM_V27 +Field 26 VPM_V26 +Field 25 VPM_V25 +Field 24 VPM_V24 +Field 23 VPM_V23 +Field 22 VPM_V22 +Field 21 VPM_V21 +Field 20 VPM_V20 +Field 19 VPM_V19 +Field 18 VPM_V18 +Field 17 VPM_V17 +Field 16 VPM_V16 +Field 15 VPM_V15 +Field 14 VPM_V14 +Field 13 VPM_V13 +Field 12 VPM_V12 +Field 11 VPM_V11 +Field 10 VPM_V10 +Field 9 VPM_V9 +Field 8 VPM_V8 +Field 7 VPM_V7 +Field 6 VPM_V6 +Field 5 VPM_V5 +Field 4 VPM_V4 +Field 3 VPM_V3 +Field 2 VPM_V2 +Field 1 
VPM_V1 +Field 0 VPM_V0 +EndSysreg + +Sysreg MPAM2_EL2 3 4 10 5 0 +Field 63 MPAMEN +Res0 62:59 +Field 58 TIDR +Res0 57 +Field 56 ALTSP_HFC +Field 55 ALTSP_EL2 +Field 54 ALTSP_FRCD +Res0 53:51 +Field 50 EnMPAMSM +Field 49 TRAPMPAM0EL1 +Field 48 TRAPMPAM1EL1 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + +Sysreg MPAMVPM0_EL2 3 4 10 6 0 +Field 63:48 PhyPARTID3 +Field 47:32 PhyPARTID2 +Field 31:16 PhyPARTID1 +Field 15:0 PhyPARTID0 +EndSysreg + +Sysreg MPAMVPM1_EL2 3 4 10 6 1 +Field 63:48 PhyPARTID7 +Field 47:32 PhyPARTID6 +Field 31:16 PhyPARTID5 +Field 15:0 PhyPARTID4 +EndSysreg + +Sysreg MPAMVPM2_EL2 3 4 10 6 2 +Field 63:48 PhyPARTID11 +Field 47:32 PhyPARTID10 +Field 31:16 PhyPARTID9 +Field 15:0 PhyPARTID8 +EndSysreg + +Sysreg MPAMVPM3_EL2 3 4 10 6 3 +Field 63:48 PhyPARTID15 +Field 47:32 PhyPARTID14 +Field 31:16 PhyPARTID13 +Field 15:0 PhyPARTID12 +EndSysreg + +Sysreg MPAMVPM4_EL2 3 4 10 6 4 +Field 63:48 PhyPARTID19 +Field 47:32 PhyPARTID18 +Field 31:16 PhyPARTID17 +Field 15:0 PhyPARTID16 +EndSysreg + +Sysreg MPAMVPM5_EL2 3 4 10 6 5 +Field 63:48 PhyPARTID23 +Field 47:32 PhyPARTID22 +Field 31:16 PhyPARTID21 +Field 15:0 PhyPARTID20 +EndSysreg + +Sysreg MPAMVPM6_EL2 3 4 10 6 6 +Field 63:48 PhyPARTID27 +Field 47:32 PhyPARTID26 +Field 31:16 PhyPARTID25 +Field 15:0 PhyPARTID24 +EndSysreg + +Sysreg MPAMVPM7_EL2 3 4 10 6 7 +Field 63:48 PhyPARTID31 +Field 47:32 PhyPARTID30 +Field 31:16 PhyPARTID29 +Field 15:0 PhyPARTID28 +EndSysreg + Sysreg CONTEXTIDR_EL2 3 4 13 0 1 Fields CONTEXTIDR_ELx EndSysreg @@ -2781,6 +2970,10 @@ Sysreg FAR_EL12 3 5 6 0 0 Field 63:0 ADDR EndSysreg +Sysreg MPAM1_EL12 3 5 10 5 0 +Fields MPAM1_ELx +EndSysreg + Sysreg CONTEXTIDR_EL12 3 5 13 0 1 Fields CONTEXTIDR_ELx EndSysreg @@ -2831,8 +3024,7 @@ Field 13 AMEC1 Field 12 AMEC0 Field 11 HAFT Field 10 PTTWI -Field 9:8 SKL1 -Field 7:6 SKL0 +Res0 9:6 Field 5 D128 Field 4 AIE Field 3 POE @@ -2895,6 +3087,10 @@ Sysreg PIRE0_EL12 3 5 10 2 2 Fields PIRx_ELx EndSysreg +Sysreg PIRE0_EL2 3 4 10 2 2 +Fields PIRx_ELx +EndSysreg + Sysreg PIR_EL1 3 0 10 2 3 Fields PIRx_ELx EndSysreg @@ -2915,6 +3111,10 @@ Sysreg POR_EL1 3 0 10 2 4 Fields PIRx_ELx EndSysreg +Sysreg POR_EL2 3 4 10 2 4 +Fields PIRx_ELx +EndSysreg + Sysreg POR_EL12 3 5 10 2 4 Fields PIRx_ELx EndSysreg @@ -2953,6 +3153,22 @@ Res0 1 Field 0 EN EndSysreg +Sysreg MPAMIDR_EL1 3 0 10 4 4 +Res0 63:62 +Field 61 HAS_SDEFLT +Field 60 HAS_FORCE_NS +Field 59 SP4 +Field 58 HAS_TIDR +Field 57 HAS_ALTSP +Res0 56:40 +Field 39:32 PMG_MAX +Res0 31:21 +Field 20:18 VPMR_MAX +Field 17 HAS_HCR +Res0 16 +Field 15:0 PARTID_MAX +EndSysreg + Sysreg LORID_EL1 3 0 10 4 7 Res0 63:24 Field 23:16 LD @@ -2960,6 +3176,27 @@ Res0 15:8 Field 7:0 LR EndSysreg +Sysreg MPAM1_EL1 3 0 10 5 0 +Field 63 MPAMEN +Res0 62:61 +Field 60 FORCED_NS +Res0 59:55 +Field 54 ALTSP_FRCD +Res0 53:48 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + +Sysreg MPAM0_EL1 3 0 10 5 1 +Res0 63:48 +Field 47:40 PMG_D +Field 39:32 PMG_I +Field 31:16 PARTID_D +Field 15:0 PARTID_I +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 9a9bc65b57a9..3a5c7f6e5aac 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += qspinlock.h generic-y += parport.h generic-y += user.h generic-y += vmlinux.lds.h +generic-y += text-patching.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 
8c1a78c8f527..1efa1e993d4b 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -5,3 +5,4 @@ generic-y += extable.h generic-y += iomap.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += text-patching.h diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h index d5f66495b670..63add2d863e8 100644 --- a/arch/hexagon/include/asm/spinlock_types.h +++ b/arch/hexagon/include/asm/spinlock_types.h @@ -9,7 +9,7 @@ #define _ASM_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index d9fce0fd475a..dae3a9104ca6 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -23,6 +23,7 @@ config LOONGARCH select ARCH_HAS_KERNEL_FPU_SUPPORT if CPU_HAS_FPU select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY @@ -66,6 +67,7 @@ config LOONGARCH select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS @@ -155,6 +157,7 @@ config LOONGARCH select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index ae3f80622f4c..567bd122a9ee 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -59,7 +59,7 @@ endif ifdef CONFIG_64BIT ld-emul = $(64bit-emul) -cflags-y += -mabi=lp64s +cflags-y += -mabi=lp64s -mcmodel=normal endif cflags-y += -pipe $(CC_FLAGS_NO_FPU) @@ -104,7 +104,7 @@ ifdef CONFIG_OBJTOOL KBUILD_CFLAGS += -fno-jump-tables endif -KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat +KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat -Ccode-model=small KBUILD_RUSTFLAGS_KERNEL += -Zdirect-access-external-data=yes KBUILD_RUSTFLAGS_MODULE += -Zdirect-access-external-data=no diff --git a/arch/loongarch/boot/dts/loongson-2k1000.dtsi b/arch/loongarch/boot/dts/loongson-2k1000.dtsi index 92180140eb56..8dff2aa52417 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k1000.dtsi @@ -266,7 +266,7 @@ status = "disabled"; }; - dma-controller@1fe00c20 { + apbdma2: dma-controller@1fe00c20 { compatible = "loongson,ls2k1000-apbdma"; reg = <0x0 0x1fe00c20 0x0 0x8>; interrupt-parent = <&liointc1>; @@ -276,7 +276,7 @@ status = "disabled"; }; - dma-controller@1fe00c30 { + apbdma3: dma-controller@1fe00c30 { compatible = "loongson,ls2k1000-apbdma"; reg = <0x0 0x1fe00c30 0x0 0x8>; interrupt-parent = <&liointc1>; @@ -352,6 +352,19 @@ status = "disabled"; }; + i2s: i2s@1fe2d000 { + compatible = "loongson,ls2k1000-i2s"; + reg = <0 0x1fe2d000 0 0x14>, + <0 0x1fe00438 0 0x8>; + interrupt-parent = <&liointc0>; + interrupts = <5 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + dmas = <&apbdma2 0>, <&apbdma3 0>; + dma-names = "tx", "rx"; + #sound-dai-cells = <0>; + status = "disabled"; + }; + spi0: spi@1fff0220 { compatible = "loongson,ls2k1000-spi"; reg = <0x0 0x1fff0220 0x0 0x10>; diff --git 
a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index 0953c5707825..b4ff55a33e90 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -173,6 +173,22 @@ status = "disabled"; }; + i2c@1fe00120 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1fe00120 0x0 0x8>; + interrupt-parent = <&liointc>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + i2c@1fe00130 { + compatible = "loongson,ls2k-i2c"; + reg = <0x0 0x1fe00130 0x0 0x8>; + interrupt-parent = <&liointc>; + interrupts = <9 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + uart0: serial@1fe001e0 { compatible = "ns16550a"; reg = <0x0 0x1fe001e0 0x0 0x10>; @@ -243,9 +259,11 @@ status = "disabled"; }; - hda@7,0 { + i2s@7,0 { reg = <0x3800 0x0 0x0 0x0 0x0>; - interrupts = <58 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <78 IRQ_TYPE_LEVEL_HIGH>, + <79 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "tx", "rx"; interrupt-parent = <&pic>; status = "disabled"; }; diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 75b366407a60..4dffc90192f7 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -1,4 +1,5 @@ # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_KERNEL_ZSTD=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_NO_HZ=y @@ -70,6 +71,14 @@ CONFIG_ACPI_IPMI=m CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_PCI_SLOT=y CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_BGRT=y +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y +CONFIG_LOONGSON3_CPUFREQ=m CONFIG_VIRTUALIZATION=y CONFIG_KVM=m CONFIG_JUMP_LABEL=y @@ -78,6 +87,9 @@ CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y +CONFIG_MODULE_COMPRESS=y +CONFIG_MODULE_COMPRESS_ZSTD=y +CONFIG_MODULE_DECOMPRESS=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y @@ -85,6 +97,8 @@ CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_FC_APPID=y CONFIG_BLK_CGROUP_IOCOST=y CONFIG_BLK_CGROUP_IOPRIO=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y CONFIG_PARTITION_ADVANCED=y CONFIG_BSD_DISKLABEL=y CONFIG_UNIXWARE_DISKLABEL=y @@ -413,7 +427,16 @@ CONFIG_PARPORT_PC=y CONFIG_PARPORT_SERIAL=y CONFIG_PARPORT_PC_FIFO=y CONFIG_ZRAM=m +CONFIG_ZRAM_BACKEND_LZ4=y +CONFIG_ZRAM_BACKEND_LZ4HC=y +CONFIG_ZRAM_BACKEND_ZSTD=y +CONFIG_ZRAM_BACKEND_DEFLATE=y +CONFIG_ZRAM_BACKEND_842=y +CONFIG_ZRAM_BACKEND_LZO=y CONFIG_ZRAM_DEF_COMP_ZSTD=y +CONFIG_ZRAM_WRITEBACK=y +CONFIG_ZRAM_MEMORY_TRACKING=y +CONFIG_ZRAM_MULTI_COMP=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m @@ -433,6 +456,9 @@ CONFIG_NVME_TARGET_RDMA=m CONFIG_NVME_TARGET_FC=m CONFIG_NVME_TARGET_TCP=m CONFIG_EEPROM_AT24=m +CONFIG_PVPANIC=y +CONFIG_PVPANIC_MMIO=m +CONFIG_PVPANIC_PCI=m CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_CHR_DEV_SG=y @@ -470,12 +496,10 @@ CONFIG_PATA_ATIIXP=y CONFIG_PATA_PCMCIA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=m -CONFIG_MD_LINEAR=m CONFIG_MD_RAID0=m CONFIG_MD_RAID1=m CONFIG_MD_RAID10=m CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y CONFIG_DM_CRYPT=m @@ -489,6 +513,16 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m +CONFIG_DM_INIT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=m 
+CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_DM_VDO=m CONFIG_TARGET_CORE=m CONFIG_TCM_IBLOCK=m CONFIG_TCM_FILEIO=m @@ -500,6 +534,13 @@ CONFIG_NETDEVICES=y CONFIG_BONDING=m CONFIG_DUMMY=y CONFIG_WIREGUARD=m +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m CONFIG_MACVLAN=m CONFIG_MACVTAP=m CONFIG_IPVLAN=m @@ -580,12 +621,14 @@ CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m CONFIG_USB_RTL8150=m CONFIG_USB_RTL8152=m +CONFIG_USB_USBNET=m # CONFIG_USB_NET_AX8817X is not set # CONFIG_USB_NET_AX88179_178A is not set CONFIG_USB_NET_CDC_EEM=m CONFIG_USB_NET_HUAWEI_CDC_NCM=m CONFIG_USB_NET_CDC_MBIM=m # CONFIG_USB_NET_NET1080 is not set +CONFIG_USB_NET_RNDIS_HOST=m # CONFIG_USB_BELKIN is not set # CONFIG_USB_ARMLINUX is not set # CONFIG_USB_NET_ZAURUS is not set @@ -594,10 +637,11 @@ CONFIG_ATH9K_HTC=m CONFIG_IWLWIFI=m CONFIG_IWLDVM=m CONFIG_IWLMVM=m -CONFIG_HOSTAP=m CONFIG_MT7601U=m CONFIG_RT2X00=m CONFIG_RT2800USB=m +CONFIG_RTL8180=m +CONFIG_RTL8187=m CONFIG_RTL8192CE=m CONFIG_RTL8192SE=m CONFIG_RTL8192DE=m @@ -607,18 +651,26 @@ CONFIG_RTL8188EE=m CONFIG_RTL8192EE=m CONFIG_RTL8821AE=m CONFIG_RTL8192CU=m +CONFIG_RTL8192DU=m # CONFIG_RTLWIFI_DEBUG is not set CONFIG_RTL8XXXU=m CONFIG_RTW88=m CONFIG_RTW88_8822BE=m +CONFIG_RTW88_8822BU=m CONFIG_RTW88_8822CE=m +CONFIG_RTW88_8822CU=m CONFIG_RTW88_8723DE=m +CONFIG_RTW88_8723DU=m CONFIG_RTW88_8821CE=m +CONFIG_RTW88_8821CU=m CONFIG_RTW89=m +CONFIG_RTW89_8851BE=m CONFIG_RTW89_8852AE=m +CONFIG_RTW89_8852BE=m +CONFIG_RTW89_8852BTE=m CONFIG_RTW89_8852CE=m +CONFIG_RTW89_8922AE=m CONFIG_ZD1211RW=m -CONFIG_USB_NET_RNDIS_WLAN=m CONFIG_USB4_NET=m CONFIG_INPUT_MOUSEDEV=y CONFIG_INPUT_MOUSEDEV_PSAUX=y @@ -651,6 +703,9 @@ CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_VIRTIO=m CONFIG_I2C_CHARDEV=y CONFIG_I2C_PIIX4=y +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PCI=y CONFIG_I2C_GPIO=y CONFIG_I2C_LS2X=y CONFIG_SPI=y @@ -727,11 +782,22 @@ CONFIG_SND_HDA_CODEC_CONEXANT=y CONFIG_SND_USB_AUDIO=m CONFIG_SND_SOC=m CONFIG_SND_SOC_LOONGSON_CARD=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8311=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8323=m +CONFIG_SND_SOC_ES8326=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_UDA1334=m +CONFIG_SND_SOC_UDA1342=m CONFIG_SND_VIRTIO=m CONFIG_HIDRAW=y CONFIG_UHID=m CONFIG_HID_A4TECH=m CONFIG_HID_CHERRY=m +CONFIG_HID_ELAN=m CONFIG_HID_LOGITECH=m CONFIG_HID_LOGITECH_DJ=m CONFIG_LOGITECH_FF=y @@ -740,7 +806,11 @@ CONFIG_LOGIG940_FF=y CONFIG_HID_MICROSOFT=m CONFIG_HID_MULTITOUCH=m CONFIG_HID_SUNPLUS=m +CONFIG_HID_WACOM=m CONFIG_USB_HIDDEV=y +CONFIG_I2C_HID_ACPI=m +CONFIG_I2C_HID_OF=m +CONFIG_I2C_HID_OF_ELAN=m CONFIG_USB=y CONFIG_USB_OTG=y CONFIG_USB_MON=y @@ -775,7 +845,7 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y CONFIG_RTC_DRV_LOONGSON=y CONFIG_DMADEVICES=y -CONFIG_LS2X_APB_DMA=y +CONFIG_LOONGSON2_APB_DMA=y CONFIG_UDMABUF=y CONFIG_DMABUF_HEAPS=y CONFIG_DMABUF_HEAPS_SYSTEM=y @@ -852,6 +922,9 @@ CONFIG_F2FS_FS=m CONFIG_F2FS_FS_SECURITY=y CONFIG_F2FS_CHECK_FS=y CONFIG_F2FS_FS_COMPRESSION=y +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y +CONFIG_FS_VERITY=y CONFIG_FANOTIFY=y CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA=y @@ -904,16 +977,14 @@ CONFIG_SQUASHFS_ZSTD=y CONFIG_MINIX_FS=m CONFIG_ROMFS_FS=m CONFIG_PSTORE=m 
-CONFIG_PSTORE_LZO_COMPRESS=m -CONFIG_PSTORE_LZ4_COMPRESS=m -CONFIG_PSTORE_LZ4HC_COMPRESS=m -CONFIG_PSTORE_842_COMPRESS=y -CONFIG_PSTORE_ZSTD_COMPRESS=y -CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS=y CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_EROFS_FS_ZIP_LZMA=y +CONFIG_EROFS_FS_ZIP_DEFLATE=y +CONFIG_EROFS_FS_ZIP_ZSTD=y +CONFIG_EROFS_FS_ONDEMAND=y CONFIG_EROFS_FS_PCPU_KTHREAD=y CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 5b5a6c90e6e2..80ddb5edb845 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += ioctl.h generic-y += mmzone.h generic-y += statfs.h generic-y += param.h +generic-y += text-patching.h diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index 5da32c00d483..b837c65a4894 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -16,12 +16,7 @@ static inline int prepare_hugepage_range(struct file *file, unsigned long len) { unsigned long task_size = STACK_TOP; - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; if (len > task_size) return -ENOMEM; if (task_size - len < addr) diff --git a/arch/loongarch/include/asm/irq.h b/arch/loongarch/include/asm/irq.h index 9c2ca785faa9..a0ca84da8541 100644 --- a/arch/loongarch/include/asm/irq.h +++ b/arch/loongarch/include/asm/irq.h @@ -65,6 +65,7 @@ extern struct acpi_vector_group pch_group[MAX_IO_PICS]; extern struct acpi_vector_group msi_group[MAX_IO_PICS]; #define CORES_PER_EIO_NODE 4 +#define CORES_PER_VEIO_NODE 256 #define LOONGSON_CPU_UART0_VEC 10 /* CPU UART0 */ #define LOONGSON_CPU_THSENS_VEC 14 /* CPU Thsens */ diff --git a/arch/loongarch/include/asm/jump_label.h b/arch/loongarch/include/asm/jump_label.h index 29acfe3de3fa..8a924bd69d19 100644 --- a/arch/loongarch/include/asm/jump_label.h +++ b/arch/loongarch/include/asm/jump_label.h @@ -13,18 +13,22 @@ #define JUMP_LABEL_NOP_SIZE 4 -#define JUMP_TABLE_ENTRY \ +/* This macro is also expanded on the Rust side. */ +#define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\" \n\t" \ ".align 3 \n\t" \ - ".long 1b - ., %l[l_yes] - . \n\t" \ - ".quad %0 - . \n\t" \ + ".long 1b - ., " label " - . \n\t" \ + ".quad " key " - . 
\n\t" \ ".popsection \n\t" +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + "1: nop \n\t" \ + JUMP_TABLE_ENTRY(key, label) + static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm goto( - "1: nop \n\t" - JUMP_TABLE_ENTRY + ARCH_STATIC_BRANCH_ASM("%0", "%l[l_yes]") : : "i"(&((char *)key)[branch]) : : l_yes); return false; @@ -37,7 +41,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke { asm goto( "1: b %l[l_yes] \n\t" - JUMP_TABLE_ENTRY + JUMP_TABLE_ENTRY("%0", "%l[l_yes]") : : "i"(&((char *)key)[branch]) : : l_yes); return false; diff --git a/arch/loongarch/include/asm/kvm_eiointc.h b/arch/loongarch/include/asm/kvm_eiointc.h new file mode 100644 index 000000000000..a3a40aba8acf --- /dev/null +++ b/arch/loongarch/include/asm/kvm_eiointc.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#ifndef __ASM_KVM_EIOINTC_H +#define __ASM_KVM_EIOINTC_H + +#include <kvm/iodev.h> + +#define EIOINTC_IRQS 256 +#define EIOINTC_ROUTE_MAX_VCPUS 256 +#define EIOINTC_IRQS_U8_NUMS (EIOINTC_IRQS / 8) +#define EIOINTC_IRQS_U16_NUMS (EIOINTC_IRQS_U8_NUMS / 2) +#define EIOINTC_IRQS_U32_NUMS (EIOINTC_IRQS_U8_NUMS / 4) +#define EIOINTC_IRQS_U64_NUMS (EIOINTC_IRQS_U8_NUMS / 8) +/* map to ipnum per 32 irqs */ +#define EIOINTC_IRQS_NODETYPE_COUNT 16 + +#define EIOINTC_BASE 0x1400 +#define EIOINTC_SIZE 0x900 + +#define EIOINTC_NODETYPE_START 0xa0 +#define EIOINTC_NODETYPE_END 0xbf +#define EIOINTC_IPMAP_START 0xc0 +#define EIOINTC_IPMAP_END 0xc7 +#define EIOINTC_ENABLE_START 0x200 +#define EIOINTC_ENABLE_END 0x21f +#define EIOINTC_BOUNCE_START 0x280 +#define EIOINTC_BOUNCE_END 0x29f +#define EIOINTC_ISR_START 0x300 +#define EIOINTC_ISR_END 0x31f +#define EIOINTC_COREISR_START 0x400 +#define EIOINTC_COREISR_END 0x41f +#define EIOINTC_COREMAP_START 0x800 +#define EIOINTC_COREMAP_END 0x8ff + +#define EIOINTC_VIRT_BASE (0x40000000) +#define EIOINTC_VIRT_SIZE (0x1000) + +#define EIOINTC_VIRT_FEATURES (0x0) +#define EIOINTC_HAS_VIRT_EXTENSION (0) +#define EIOINTC_HAS_ENABLE_OPTION (1) +#define EIOINTC_HAS_INT_ENCODE (2) +#define EIOINTC_HAS_CPU_ENCODE (3) +#define EIOINTC_VIRT_HAS_FEATURES ((1U << EIOINTC_HAS_VIRT_EXTENSION) \ + | (1U << EIOINTC_HAS_ENABLE_OPTION) \ + | (1U << EIOINTC_HAS_INT_ENCODE) \ + | (1U << EIOINTC_HAS_CPU_ENCODE)) +#define EIOINTC_VIRT_CONFIG (0x4) +#define EIOINTC_ENABLE (1) +#define EIOINTC_ENABLE_INT_ENCODE (2) +#define EIOINTC_ENABLE_CPU_ENCODE (3) + +#define LOONGSON_IP_NUM 8 + +struct loongarch_eiointc { + spinlock_t lock; + struct kvm *kvm; + struct kvm_io_device device; + struct kvm_io_device device_vext; + uint32_t num_cpu; + uint32_t features; + uint32_t status; + + /* hardware state */ + union nodetype { + u64 reg_u64[EIOINTC_IRQS_NODETYPE_COUNT / 4]; + u32 reg_u32[EIOINTC_IRQS_NODETYPE_COUNT / 2]; + u16 reg_u16[EIOINTC_IRQS_NODETYPE_COUNT]; + u8 reg_u8[EIOINTC_IRQS_NODETYPE_COUNT * 2]; + } nodetype; + + /* one bit shows the state of one irq */ + union bounce { + u64 reg_u64[EIOINTC_IRQS_U64_NUMS]; + u32 reg_u32[EIOINTC_IRQS_U32_NUMS]; + u16 reg_u16[EIOINTC_IRQS_U16_NUMS]; + u8 reg_u8[EIOINTC_IRQS_U8_NUMS]; + } bounce; + + union isr { + u64 reg_u64[EIOINTC_IRQS_U64_NUMS]; + u32 reg_u32[EIOINTC_IRQS_U32_NUMS]; + u16 reg_u16[EIOINTC_IRQS_U16_NUMS]; + u8 reg_u8[EIOINTC_IRQS_U8_NUMS]; + } isr; + union coreisr { + u64 reg_u64[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS]; + u32 
reg_u32[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U32_NUMS]; + u16 reg_u16[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U16_NUMS]; + u8 reg_u8[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U8_NUMS]; + } coreisr; + union enable { + u64 reg_u64[EIOINTC_IRQS_U64_NUMS]; + u32 reg_u32[EIOINTC_IRQS_U32_NUMS]; + u16 reg_u16[EIOINTC_IRQS_U16_NUMS]; + u8 reg_u8[EIOINTC_IRQS_U8_NUMS]; + } enable; + + /* use one byte to config ipmap for 32 irqs at once */ + union ipmap { + u64 reg_u64; + u32 reg_u32[EIOINTC_IRQS_U32_NUMS / 4]; + u16 reg_u16[EIOINTC_IRQS_U16_NUMS / 4]; + u8 reg_u8[EIOINTC_IRQS_U8_NUMS / 4]; + } ipmap; + /* use one byte to config coremap for one irq */ + union coremap { + u64 reg_u64[EIOINTC_IRQS / 8]; + u32 reg_u32[EIOINTC_IRQS / 4]; + u16 reg_u16[EIOINTC_IRQS / 2]; + u8 reg_u8[EIOINTC_IRQS]; + } coremap; + + DECLARE_BITMAP(sw_coreisr[EIOINTC_ROUTE_MAX_VCPUS][LOONGSON_IP_NUM], EIOINTC_IRQS); + uint8_t sw_coremap[EIOINTC_IRQS]; +}; + +int kvm_loongarch_register_eiointc_device(void); +void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level); + +#endif /* __ASM_KVM_EIOINTC_H */ diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index d6bb72424027..7b8367c39da8 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -18,8 +18,13 @@ #include <asm/inst.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_ipi.h> +#include <asm/kvm_eiointc.h> +#include <asm/kvm_pch_pic.h> #include <asm/loongarch.h> +#define __KVM_HAVE_ARCH_INTC_INITIALIZED + /* Loongarch KVM register ids */ #define KVM_GET_IOC_CSR_IDX(id) ((id & KVM_CSR_IDX_MASK) >> LOONGARCH_REG_SHIFT) #define KVM_GET_IOC_CPUCFG_IDX(id) ((id & KVM_CPUCFG_IDX_MASK) >> LOONGARCH_REG_SHIFT) @@ -44,6 +49,12 @@ struct kvm_vm_stat { struct kvm_vm_stat_generic generic; u64 pages; u64 hugepages; + u64 ipi_read_exits; + u64 ipi_write_exits; + u64 eiointc_read_exits; + u64 eiointc_write_exits; + u64 pch_pic_read_exits; + u64 pch_pic_write_exits; }; struct kvm_vcpu_stat { @@ -84,7 +95,7 @@ struct kvm_world_switch { * * For LOONGARCH_CSR_CPUID register, max CPUID size if 512 * For IPI hardware, max destination CPUID size 1024 - * For extioi interrupt controller, max destination CPUID size is 256 + * For eiointc interrupt controller, max destination CPUID size is 256 * For msgint interrupt controller, max supported CPUID size is 65536 * * Currently max CPUID is defined as 256 for KVM hypervisor, in future @@ -117,6 +128,9 @@ struct kvm_arch { s64 time_offset; struct kvm_context __percpu *vmcs; + struct loongarch_ipi *ipi; + struct loongarch_eiointc *eiointc; + struct loongarch_pch_pic *pch_pic; }; #define CSR_MAX_NUMS 0x800 @@ -221,6 +235,8 @@ struct kvm_vcpu_arch { int last_sched_cpu; /* mp state */ struct kvm_mp_state mp_state; + /* ipi state */ + struct ipi_state ipi_state; /* cpucfg */ u32 cpucfg[KVM_MAX_CPUCFG_REGS]; diff --git a/arch/loongarch/include/asm/kvm_ipi.h b/arch/loongarch/include/asm/kvm_ipi.h new file mode 100644 index 000000000000..060163dfb4a3 --- /dev/null +++ b/arch/loongarch/include/asm/kvm_ipi.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#ifndef __ASM_KVM_IPI_H +#define __ASM_KVM_IPI_H + +#include <kvm/iodev.h> + +#define LARCH_INT_IPI 12 + +struct loongarch_ipi { + spinlock_t lock; + struct kvm *kvm; + struct kvm_io_device device; +}; + +struct ipi_state { + spinlock_t lock; + uint32_t status; + uint32_t en; + uint32_t set; + uint32_t clear; + uint64_t buf[4]; +}; + +#define 
IOCSR_IPI_BASE 0x1000 +#define IOCSR_IPI_SIZE 0x160 + +#define IOCSR_IPI_STATUS 0x000 +#define IOCSR_IPI_EN 0x004 +#define IOCSR_IPI_SET 0x008 +#define IOCSR_IPI_CLEAR 0x00c +#define IOCSR_IPI_BUF_20 0x020 +#define IOCSR_IPI_BUF_28 0x028 +#define IOCSR_IPI_BUF_30 0x030 +#define IOCSR_IPI_BUF_38 0x038 +#define IOCSR_IPI_SEND 0x040 +#define IOCSR_MAIL_SEND 0x048 +#define IOCSR_ANY_SEND 0x158 + +int kvm_loongarch_register_ipi_device(void); + +#endif diff --git a/arch/loongarch/include/asm/kvm_pch_pic.h b/arch/loongarch/include/asm/kvm_pch_pic.h new file mode 100644 index 000000000000..e6df6a4c1c70 --- /dev/null +++ b/arch/loongarch/include/asm/kvm_pch_pic.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#ifndef __ASM_KVM_PCH_PIC_H +#define __ASM_KVM_PCH_PIC_H + +#include <kvm/iodev.h> + +#define PCH_PIC_SIZE 0x3e8 + +#define PCH_PIC_INT_ID_START 0x0 +#define PCH_PIC_INT_ID_END 0x7 +#define PCH_PIC_MASK_START 0x20 +#define PCH_PIC_MASK_END 0x27 +#define PCH_PIC_HTMSI_EN_START 0x40 +#define PCH_PIC_HTMSI_EN_END 0x47 +#define PCH_PIC_EDGE_START 0x60 +#define PCH_PIC_EDGE_END 0x67 +#define PCH_PIC_CLEAR_START 0x80 +#define PCH_PIC_CLEAR_END 0x87 +#define PCH_PIC_AUTO_CTRL0_START 0xc0 +#define PCH_PIC_AUTO_CTRL0_END 0xc7 +#define PCH_PIC_AUTO_CTRL1_START 0xe0 +#define PCH_PIC_AUTO_CTRL1_END 0xe7 +#define PCH_PIC_ROUTE_ENTRY_START 0x100 +#define PCH_PIC_ROUTE_ENTRY_END 0x13f +#define PCH_PIC_HTMSI_VEC_START 0x200 +#define PCH_PIC_HTMSI_VEC_END 0x23f +#define PCH_PIC_INT_IRR_START 0x380 +#define PCH_PIC_INT_IRR_END 0x38f +#define PCH_PIC_INT_ISR_START 0x3a0 +#define PCH_PIC_INT_ISR_END 0x3af +#define PCH_PIC_POLARITY_START 0x3e0 +#define PCH_PIC_POLARITY_END 0x3e7 +#define PCH_PIC_INT_ID_VAL 0x7000000UL +#define PCH_PIC_INT_ID_VER 0x1UL + +struct loongarch_pch_pic { + spinlock_t lock; + struct kvm *kvm; + struct kvm_io_device device; + uint64_t mask; /* 1:disable irq, 0:enable irq */ + uint64_t htmsi_en; /* 1:msi */ + uint64_t edge; /* 1:edge triggered, 0:level triggered */ + uint64_t auto_ctrl0; /* only use default value 00b */ + uint64_t auto_ctrl1; /* only use default value 00b */ + uint64_t last_intirr; /* edge detection */ + uint64_t irr; /* interrupt request register */ + uint64_t isr; /* interrupt service register */ + uint64_t polarity; /* 0: high level trigger, 1: low level trigger */ + uint8_t route_entry[64]; /* default value 0, route to int0: eiointc */ + uint8_t htmsi_vector[64]; /* irq route table for routing to eiointc */ + uint64_t pch_pic_base; +}; + +int kvm_loongarch_register_pch_pic_device(void); +void pch_pic_set_irq(struct loongarch_pch_pic *s, int irq, int level); +void pch_msi_set_irq(struct kvm *kvm, int irq, int level); + +#endif /* __ASM_KVM_PCH_PIC_H */ diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 20714b73f14c..da346733a1da 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -268,8 +268,11 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pm */ extern void pgd_init(void *addr); extern void pud_init(void *addr); +#define pud_init pud_init extern void pmd_init(void *addr); +#define pmd_init pmd_init extern void kernel_pte_init(void *addr); +#define kernel_pte_init kernel_pte_init /* * Encode/decode swap entries and swap PTEs. 
Swap PTEs are all PTEs that diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h index d70505b6676c..55dfaefd02c8 100644 --- a/arch/loongarch/include/asm/set_memory.h +++ b/arch/loongarch/include/asm/set_memory.h @@ -17,5 +17,6 @@ int set_memory_rw(unsigned long addr, int numpages); bool kernel_page_present(struct page *page); int set_direct_map_default_noflush(struct page *page); int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); #endif /* _ASM_LOONGARCH_SET_MEMORY_H */ diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 8bf0e6f51546..4f5a9441754e 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -66,8 +66,9 @@ register unsigned long current_stack_pointer __asm__("$sp"); * - pending work-to-be-done flags are in LSW * - other flags in MSW */ -#define TIF_SIGPENDING 1 /* signal pending */ -#define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_NEED_RESCHED 0 /* rescheduling necessary */ +#define TIF_NEED_RESCHED_LAZY 1 /* lazy rescheduling necessary */ +#define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NOTIFY_RESUME 3 /* callback before returning to user */ #define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ @@ -88,8 +89,9 @@ register unsigned long current_stack_pointer __asm__("$sp"); #define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */ #define TIF_PATCH_PENDING 21 /* pending live patching update */ -#define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) +#define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) #define _TIF_NOTIFY_SIGNAL (1<<TIF_NOTIFY_SIGNAL) #define _TIF_NOHZ (1<<TIF_NOHZ) diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h index 70d89070bfeb..5f354f5c6847 100644 --- a/arch/loongarch/include/uapi/asm/kvm.h +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -8,6 +8,8 @@ #include <linux/types.h> +#define __KVM_HAVE_IRQ_LINE + /* * KVM LoongArch specific structures and definitions. 
* @@ -132,4 +134,22 @@ struct kvm_iocsr_entry { #define KVM_IRQCHIP_NUM_PINS 64 #define KVM_MAX_CORES 256 +#define KVM_DEV_LOONGARCH_IPI_GRP_REGS 0x40000001 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS 0x40000002 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS 0x40000003 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE 0x1 +#define KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE 0x2 + +#define KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL 0x40000004 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU 0x0 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE 0x1 +#define KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED 0x3 + +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS 0x40000005 +#define KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL 0x40000006 +#define KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT 0 + #endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index 46d7d40c87e3..a07d7eff4dc5 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -127,7 +127,11 @@ void sync_counter(void) int constant_clockevent_init(void) { unsigned int cpu = smp_processor_id(); - unsigned long min_delta = 0x600; +#ifdef CONFIG_PREEMPT_RT + unsigned long min_delta = 100; +#else + unsigned long min_delta = 1000; +#endif unsigned long max_delta = (1UL << 48) - 1; struct clock_event_device *cd; static int irq = 0, timer_irq_installed = 0; diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index 248744b4d086..97a811077ac3 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -21,13 +21,16 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on AS_HAS_LVZ_EXTENSION select HAVE_KVM_DIRTY_RING_ACQ_REL + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_IRQCHIP + select HAVE_KVM_MSI + select HAVE_KVM_READONLY_MEM select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO - select HAVE_KVM_READONLY_MEM select KVM_XFER_TO_GUEST_WORK select SCHED_INFO help diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile index b2f4cbe01ae8..3a01292f71cc 100644 --- a/arch/loongarch/kvm/Makefile +++ b/arch/loongarch/kvm/Makefile @@ -18,5 +18,9 @@ kvm-y += timer.o kvm-y += tlb.o kvm-y += vcpu.o kvm-y += vm.o +kvm-y += intc/ipi.o +kvm-y += intc/eiointc.o +kvm-y += intc/pch_pic.o +kvm-y += irqfd.o CFLAGS_exit.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 90894f70ff4a..69f3e3782cc9 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -157,7 +157,7 @@ static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst) int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) { int ret; - unsigned long val; + unsigned long *val; u32 addr, rd, rj, opcode; /* @@ -170,6 +170,7 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) ret = EMULATE_DO_IOCSR; run->iocsr_io.phys_addr = addr; run->iocsr_io.is_write = 0; + val = &vcpu->arch.gprs[rd]; /* LoongArch is Little endian */ switch (opcode) { @@ -202,16 +203,25 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) run->iocsr_io.is_write = 1; break; default: - ret = EMULATE_FAIL; - break; + return EMULATE_FAIL; } - if (ret == EMULATE_DO_IOCSR) { - if (run->iocsr_io.is_write) { - val = vcpu->arch.gprs[rd]; - memcpy(run->iocsr_io.data, &val, 
run->iocsr_io.len); - } - vcpu->arch.io_gpr = rd; + if (run->iocsr_io.is_write) { + if (!kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val)) + ret = EMULATE_DONE; + else + /* Save data and let user space write it */ + memcpy(run->iocsr_io.data, val, run->iocsr_io.len); + + trace_kvm_iocsr(KVM_TRACE_IOCSR_WRITE, run->iocsr_io.len, addr, val); + } else { + if (!kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val)) + ret = EMULATE_DONE; + else + /* Save register id for iocsr read completion */ + vcpu->arch.io_gpr = rd; + + trace_kvm_iocsr(KVM_TRACE_IOCSR_READ, run->iocsr_io.len, addr, NULL); } return ret; @@ -447,19 +457,33 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) } if (ret == EMULATE_DO_MMIO) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, run->mmio.len, run->mmio.phys_addr, NULL); + + /* + * If an mmio device such as PCH-PIC is emulated in KVM, + * there is no need to return to user space to handle + * the mmio exception. + */ + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, + run->mmio.len, &vcpu->arch.gprs[rd]); + if (!ret) { + update_pc(&vcpu->arch); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + /* Set for kvm_complete_mmio_read() use */ vcpu->arch.io_gpr = rd; run->mmio.is_write = 0; vcpu->mmio_is_write = 0; - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, run->mmio.len, - run->mmio.phys_addr, NULL); - } else { - kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", - inst.word, vcpu->arch.pc, vcpu->arch.badv); - kvm_arch_vcpu_dump_regs(vcpu); - vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; } + kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->mmio_needed = 0; + return ret; } @@ -600,19 +624,29 @@ int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst) } if (ret == EMULATE_DO_MMIO) { + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len, run->mmio.phys_addr, data); + + /* + * If an mmio device such as PCH-PIC is emulated in KVM, + * there is no need to return to user space to handle + * the mmio exception.
+ */ + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, run->mmio.len, data); + if (!ret) + return EMULATE_DONE; + run->mmio.is_write = 1; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len, - run->mmio.phys_addr, data); - } else { - vcpu->arch.pc = curr_pc; - kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", - inst.word, vcpu->arch.pc, vcpu->arch.badv); - kvm_arch_vcpu_dump_regs(vcpu); - /* Rollback PC if emulation was unsuccessful */ + return EMULATE_DO_MMIO; } + vcpu->arch.pc = curr_pc; + kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + /* Rollback PC if emulation was unsuccessful */ + return ret; } diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c new file mode 100644 index 000000000000..f39929d7bf8a --- /dev/null +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -0,0 +1,1027 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include <asm/kvm_eiointc.h> +#include <asm/kvm_vcpu.h> +#include <linux/count_zeros.h> + +static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s) +{ + int ipnum, cpu, irq_index, irq_mask, irq; + + for (irq = 0; irq < EIOINTC_IRQS; irq++) { + ipnum = s->ipmap.reg_u8[irq / 32]; + if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) { + ipnum = count_trailing_zeros(ipnum); + ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0; + } + irq_index = irq / 32; + irq_mask = BIT(irq & 0x1f); + + cpu = s->coremap.reg_u8[irq]; + if (!!(s->coreisr.reg_u32[cpu][irq_index] & irq_mask)) + set_bit(irq, s->sw_coreisr[cpu][ipnum]); + else + clear_bit(irq, s->sw_coreisr[cpu][ipnum]); + } +} + +static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) +{ + int ipnum, cpu, found, irq_index, irq_mask; + struct kvm_vcpu *vcpu; + struct kvm_interrupt vcpu_irq; + + ipnum = s->ipmap.reg_u8[irq / 32]; + if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) { + ipnum = count_trailing_zeros(ipnum); + ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0; + } + + cpu = s->sw_coremap[irq]; + vcpu = kvm_get_vcpu(s->kvm, cpu); + irq_index = irq / 32; + irq_mask = BIT(irq & 0x1f); + + if (level) { + /* if not enable return false */ + if (((s->enable.reg_u32[irq_index]) & irq_mask) == 0) + return; + s->coreisr.reg_u32[cpu][irq_index] |= irq_mask; + found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS); + set_bit(irq, s->sw_coreisr[cpu][ipnum]); + } else { + s->coreisr.reg_u32[cpu][irq_index] &= ~irq_mask; + clear_bit(irq, s->sw_coreisr[cpu][ipnum]); + found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS); + } + + if (found < EIOINTC_IRQS) + return; /* other irq is handling, needn't update parent irq */ + + vcpu_irq.irq = level ? (INT_HWI0 + ipnum) : -(INT_HWI0 + ipnum); + kvm_vcpu_ioctl_interrupt(vcpu, &vcpu_irq); +} + +static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s, + int irq, void *pvalue, u32 len, bool notify) +{ + int i, cpu; + u64 val = *(u64 *)pvalue; + + for (i = 0; i < len; i++) { + cpu = val & 0xff; + val = val >> 8; + + if (!(s->status & BIT(EIOINTC_ENABLE_CPU_ENCODE))) { + cpu = ffs(cpu) - 1; + cpu = (cpu >= 4) ? 
0 : cpu; + } + + if (s->sw_coremap[irq + i] == cpu) + continue; + + if (notify && test_bit(irq + i, (unsigned long *)s->isr.reg_u8)) { + /* lower irq at old cpu and raise irq at new cpu */ + eiointc_update_irq(s, irq + i, 0); + s->sw_coremap[irq + i] = cpu; + eiointc_update_irq(s, irq + i, 1); + } else { + s->sw_coremap[irq + i] = cpu; + } + } +} + +void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level) +{ + unsigned long flags; + unsigned long *isr = (unsigned long *)s->isr.reg_u8; + + level ? set_bit(irq, isr) : clear_bit(irq, isr); + spin_lock_irqsave(&s->lock, flags); + eiointc_update_irq(s, irq, level); + spin_unlock_irqrestore(&s->lock, flags); +} + +static inline void eiointc_enable_irq(struct kvm_vcpu *vcpu, + struct loongarch_eiointc *s, int index, u8 mask, int level) +{ + u8 val; + int irq; + + val = mask & s->isr.reg_u8[index]; + irq = ffs(val); + while (irq != 0) { + /* + * enable bit change from 0 to 1, + * need to update irq by pending bits + */ + eiointc_update_irq(s, irq - 1 + index * 8, level); + val &= ~BIT(irq - 1); + irq = ffs(val); + } +} + +static int loongarch_eiointc_readb(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s, + gpa_t addr, int len, void *val) +{ + int index, ret = 0; + u8 data = 0; + gpa_t offset; + + offset = addr - EIOINTC_BASE; + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = offset - EIOINTC_NODETYPE_START; + data = s->nodetype.reg_u8[index]; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + index = offset - EIOINTC_IPMAP_START; + data = s->ipmap.reg_u8[index]; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = offset - EIOINTC_ENABLE_START; + data = s->enable.reg_u8[index]; + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + index = offset - EIOINTC_BOUNCE_START; + data = s->bounce.reg_u8[index]; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = offset - EIOINTC_COREISR_START; + data = s->coreisr.reg_u8[vcpu->vcpu_id][index]; + break; + case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: + index = offset - EIOINTC_COREMAP_START; + data = s->coremap.reg_u8[index]; + break; + default: + ret = -EINVAL; + break; + } + *(u8 *)val = data; + + return ret; +} + +static int loongarch_eiointc_readw(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s, + gpa_t addr, int len, void *val) +{ + int index, ret = 0; + u16 data = 0; + gpa_t offset; + + offset = addr - EIOINTC_BASE; + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = (offset - EIOINTC_NODETYPE_START) >> 1; + data = s->nodetype.reg_u16[index]; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + index = (offset - EIOINTC_IPMAP_START) >> 1; + data = s->ipmap.reg_u16[index]; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = (offset - EIOINTC_ENABLE_START) >> 1; + data = s->enable.reg_u16[index]; + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + index = (offset - EIOINTC_BOUNCE_START) >> 1; + data = s->bounce.reg_u16[index]; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = (offset - EIOINTC_COREISR_START) >> 1; + data = s->coreisr.reg_u16[vcpu->vcpu_id][index]; + break; + case EIOINTC_COREMAP_START ... 
EIOINTC_COREMAP_END: + index = (offset - EIOINTC_COREMAP_START) >> 1; + data = s->coremap.reg_u16[index]; + break; + default: + ret = -EINVAL; + break; + } + *(u16 *)val = data; + + return ret; +} + +static int loongarch_eiointc_readl(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s, + gpa_t addr, int len, void *val) +{ + int index, ret = 0; + u32 data = 0; + gpa_t offset; + + offset = addr - EIOINTC_BASE; + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = (offset - EIOINTC_NODETYPE_START) >> 2; + data = s->nodetype.reg_u32[index]; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + index = (offset - EIOINTC_IPMAP_START) >> 2; + data = s->ipmap.reg_u32[index]; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = (offset - EIOINTC_ENABLE_START) >> 2; + data = s->enable.reg_u32[index]; + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + index = (offset - EIOINTC_BOUNCE_START) >> 2; + data = s->bounce.reg_u32[index]; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = (offset - EIOINTC_COREISR_START) >> 2; + data = s->coreisr.reg_u32[vcpu->vcpu_id][index]; + break; + case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: + index = (offset - EIOINTC_COREMAP_START) >> 2; + data = s->coremap.reg_u32[index]; + break; + default: + ret = -EINVAL; + break; + } + *(u32 *)val = data; + + return ret; +} + +static int loongarch_eiointc_readq(struct kvm_vcpu *vcpu, struct loongarch_eiointc *s, + gpa_t addr, int len, void *val) +{ + int index, ret = 0; + u64 data = 0; + gpa_t offset; + + offset = addr - EIOINTC_BASE; + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = (offset - EIOINTC_NODETYPE_START) >> 3; + data = s->nodetype.reg_u64[index]; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + index = (offset - EIOINTC_IPMAP_START) >> 3; + data = s->ipmap.reg_u64; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = (offset - EIOINTC_ENABLE_START) >> 3; + data = s->enable.reg_u64[index]; + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + index = (offset - EIOINTC_BOUNCE_START) >> 3; + data = s->bounce.reg_u64[index]; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = (offset - EIOINTC_COREISR_START) >> 3; + data = s->coreisr.reg_u64[vcpu->vcpu_id][index]; + break; + case EIOINTC_COREMAP_START ... 
EIOINTC_COREMAP_END: + index = (offset - EIOINTC_COREMAP_START) >> 3; + data = s->coremap.reg_u64[index]; + break; + default: + ret = -EINVAL; + break; + } + *(u64 *)val = data; + + return ret; +} + +static int kvm_eiointc_read(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + int ret = -EINVAL; + unsigned long flags; + struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc; + + if (!eiointc) { + kvm_err("%s: eiointc irqchip not valid!\n", __func__); + return -EINVAL; + } + + vcpu->kvm->stat.eiointc_read_exits++; + spin_lock_irqsave(&eiointc->lock, flags); + switch (len) { + case 1: + ret = loongarch_eiointc_readb(vcpu, eiointc, addr, len, val); + break; + case 2: + ret = loongarch_eiointc_readw(vcpu, eiointc, addr, len, val); + break; + case 4: + ret = loongarch_eiointc_readl(vcpu, eiointc, addr, len, val); + break; + case 8: + ret = loongarch_eiointc_readq(vcpu, eiointc, addr, len, val); + break; + default: + WARN_ONCE(1, "%s: Abnormal address access: addr 0x%llx, size %d\n", + __func__, addr, len); + } + spin_unlock_irqrestore(&eiointc->lock, flags); + + return ret; +} + +static int loongarch_eiointc_writeb(struct kvm_vcpu *vcpu, + struct loongarch_eiointc *s, + gpa_t addr, int len, const void *val) +{ + int index, irq, bits, ret = 0; + u8 cpu; + u8 data, old_data; + u8 coreisr, old_coreisr; + gpa_t offset; + + data = *(u8 *)val; + offset = addr - EIOINTC_BASE; + + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = (offset - EIOINTC_NODETYPE_START); + s->nodetype.reg_u8[index] = data; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + /* + * ipmap cannot be set at runtime, can be set only at the beginning + * of irqchip driver, need not update upper irq level + */ + index = (offset - EIOINTC_IPMAP_START); + s->ipmap.reg_u8[index] = data; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = (offset - EIOINTC_ENABLE_START); + old_data = s->enable.reg_u8[index]; + s->enable.reg_u8[index] = data; + /* + * 1: enable irq. + * update irq when isr is set. + */ + data = s->enable.reg_u8[index] & ~old_data & s->isr.reg_u8[index]; + eiointc_enable_irq(vcpu, s, index, data, 1); + /* + * 0: disable irq. + * update irq when isr is set. + */ + data = ~s->enable.reg_u8[index] & old_data & s->isr.reg_u8[index]; + eiointc_enable_irq(vcpu, s, index, data, 0); + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + /* do not emulate hw bounced irq routing */ + index = offset - EIOINTC_BOUNCE_START; + s->bounce.reg_u8[index] = data; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = (offset - EIOINTC_COREISR_START); + /* use attrs to get current cpu index */ + cpu = vcpu->vcpu_id; + coreisr = data; + old_coreisr = s->coreisr.reg_u8[cpu][index]; + /* write 1 to clear interrupt */ + s->coreisr.reg_u8[cpu][index] = old_coreisr & ~coreisr; + coreisr &= old_coreisr; + bits = sizeof(data) * 8; + irq = find_first_bit((void *)&coreisr, bits); + while (irq < bits) { + eiointc_update_irq(s, irq + index * bits, 0); + bitmap_clear((void *)&coreisr, irq, 1); + irq = find_first_bit((void *)&coreisr, bits); + } + break; + case EIOINTC_COREMAP_START ... 
EIOINTC_COREMAP_END: + irq = offset - EIOINTC_COREMAP_START; + index = irq; + s->coremap.reg_u8[index] = data; + eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int loongarch_eiointc_writew(struct kvm_vcpu *vcpu, + struct loongarch_eiointc *s, + gpa_t addr, int len, const void *val) +{ + int i, index, irq, bits, ret = 0; + u8 cpu; + u16 data, old_data; + u16 coreisr, old_coreisr; + gpa_t offset; + + data = *(u16 *)val; + offset = addr - EIOINTC_BASE; + + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = (offset - EIOINTC_NODETYPE_START) >> 1; + s->nodetype.reg_u16[index] = data; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + /* + * ipmap cannot be set at runtime, can be set only at the beginning + * of irqchip driver, need not update upper irq level + */ + index = (offset - EIOINTC_IPMAP_START) >> 1; + s->ipmap.reg_u16[index] = data; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = (offset - EIOINTC_ENABLE_START) >> 1; + old_data = s->enable.reg_u32[index]; + s->enable.reg_u16[index] = data; + /* + * 1: enable irq. + * update irq when isr is set. + */ + data = s->enable.reg_u16[index] & ~old_data & s->isr.reg_u16[index]; + index = index << 1; + for (i = 0; i < sizeof(data); i++) { + u8 mask = (data >> (i * 8)) & 0xff; + eiointc_enable_irq(vcpu, s, index + i, mask, 1); + } + /* + * 0: disable irq. + * update irq when isr is set. + */ + data = ~s->enable.reg_u16[index] & old_data & s->isr.reg_u16[index]; + for (i = 0; i < sizeof(data); i++) { + u8 mask = (data >> (i * 8)) & 0xff; + eiointc_enable_irq(vcpu, s, index, mask, 0); + } + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + /* do not emulate hw bounced irq routing */ + index = (offset - EIOINTC_BOUNCE_START) >> 1; + s->bounce.reg_u16[index] = data; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = (offset - EIOINTC_COREISR_START) >> 1; + /* use attrs to get current cpu index */ + cpu = vcpu->vcpu_id; + coreisr = data; + old_coreisr = s->coreisr.reg_u16[cpu][index]; + /* write 1 to clear interrupt */ + s->coreisr.reg_u16[cpu][index] = old_coreisr & ~coreisr; + coreisr &= old_coreisr; + bits = sizeof(data) * 8; + irq = find_first_bit((void *)&coreisr, bits); + while (irq < bits) { + eiointc_update_irq(s, irq + index * bits, 0); + bitmap_clear((void *)&coreisr, irq, 1); + irq = find_first_bit((void *)&coreisr, bits); + } + break; + case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: + irq = offset - EIOINTC_COREMAP_START; + index = irq >> 1; + s->coremap.reg_u16[index] = data; + eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int loongarch_eiointc_writel(struct kvm_vcpu *vcpu, + struct loongarch_eiointc *s, + gpa_t addr, int len, const void *val) +{ + int i, index, irq, bits, ret = 0; + u8 cpu; + u32 data, old_data; + u32 coreisr, old_coreisr; + gpa_t offset; + + data = *(u32 *)val; + offset = addr - EIOINTC_BASE; + + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = (offset - EIOINTC_NODETYPE_START) >> 2; + s->nodetype.reg_u32[index] = data; + break; + case EIOINTC_IPMAP_START ... 
EIOINTC_IPMAP_END: + /* + * ipmap cannot be set at runtime, can be set only at the beginning + * of irqchip driver, need not update upper irq level + */ + index = (offset - EIOINTC_IPMAP_START) >> 2; + s->ipmap.reg_u32[index] = data; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = (offset - EIOINTC_ENABLE_START) >> 2; + old_data = s->enable.reg_u32[index]; + s->enable.reg_u32[index] = data; + /* + * 1: enable irq. + * update irq when isr is set. + */ + data = s->enable.reg_u32[index] & ~old_data & s->isr.reg_u32[index]; + index = index << 2; + for (i = 0; i < sizeof(data); i++) { + u8 mask = (data >> (i * 8)) & 0xff; + eiointc_enable_irq(vcpu, s, index + i, mask, 1); + } + /* + * 0: disable irq. + * update irq when isr is set. + */ + data = ~s->enable.reg_u32[index] & old_data & s->isr.reg_u32[index]; + for (i = 0; i < sizeof(data); i++) { + u8 mask = (data >> (i * 8)) & 0xff; + eiointc_enable_irq(vcpu, s, index, mask, 0); + } + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + /* do not emulate hw bounced irq routing */ + index = (offset - EIOINTC_BOUNCE_START) >> 2; + s->bounce.reg_u32[index] = data; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = (offset - EIOINTC_COREISR_START) >> 2; + /* use attrs to get current cpu index */ + cpu = vcpu->vcpu_id; + coreisr = data; + old_coreisr = s->coreisr.reg_u32[cpu][index]; + /* write 1 to clear interrupt */ + s->coreisr.reg_u32[cpu][index] = old_coreisr & ~coreisr; + coreisr &= old_coreisr; + bits = sizeof(data) * 8; + irq = find_first_bit((void *)&coreisr, bits); + while (irq < bits) { + eiointc_update_irq(s, irq + index * bits, 0); + bitmap_clear((void *)&coreisr, irq, 1); + irq = find_first_bit((void *)&coreisr, bits); + } + break; + case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: + irq = offset - EIOINTC_COREMAP_START; + index = irq >> 2; + s->coremap.reg_u32[index] = data; + eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int loongarch_eiointc_writeq(struct kvm_vcpu *vcpu, + struct loongarch_eiointc *s, + gpa_t addr, int len, const void *val) +{ + int i, index, irq, bits, ret = 0; + u8 cpu; + u64 data, old_data; + u64 coreisr, old_coreisr; + gpa_t offset; + + data = *(u64 *)val; + offset = addr - EIOINTC_BASE; + + switch (offset) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + index = (offset - EIOINTC_NODETYPE_START) >> 3; + s->nodetype.reg_u64[index] = data; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + /* + * ipmap cannot be set at runtime, can be set only at the beginning + * of irqchip driver, need not update upper irq level + */ + index = (offset - EIOINTC_IPMAP_START) >> 3; + s->ipmap.reg_u64 = data; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + index = (offset - EIOINTC_ENABLE_START) >> 3; + old_data = s->enable.reg_u64[index]; + s->enable.reg_u64[index] = data; + /* + * 1: enable irq. + * update irq when isr is set. + */ + data = s->enable.reg_u64[index] & ~old_data & s->isr.reg_u64[index]; + index = index << 3; + for (i = 0; i < sizeof(data); i++) { + u8 mask = (data >> (i * 8)) & 0xff; + eiointc_enable_irq(vcpu, s, index + i, mask, 1); + } + /* + * 0: disable irq. + * update irq when isr is set. 
+ */ + data = ~s->enable.reg_u64[index] & old_data & s->isr.reg_u64[index]; + for (i = 0; i < sizeof(data); i++) { + u8 mask = (data >> (i * 8)) & 0xff; + eiointc_enable_irq(vcpu, s, index, mask, 0); + } + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + /* do not emulate hw bounced irq routing */ + index = (offset - EIOINTC_BOUNCE_START) >> 3; + s->bounce.reg_u64[index] = data; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + index = (offset - EIOINTC_COREISR_START) >> 3; + /* use attrs to get current cpu index */ + cpu = vcpu->vcpu_id; + coreisr = data; + old_coreisr = s->coreisr.reg_u64[cpu][index]; + /* write 1 to clear interrupt */ + s->coreisr.reg_u64[cpu][index] = old_coreisr & ~coreisr; + coreisr &= old_coreisr; + bits = sizeof(data) * 8; + irq = find_first_bit((void *)&coreisr, bits); + while (irq < bits) { + eiointc_update_irq(s, irq + index * bits, 0); + bitmap_clear((void *)&coreisr, irq, 1); + irq = find_first_bit((void *)&coreisr, bits); + } + break; + case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: + irq = offset - EIOINTC_COREMAP_START; + index = irq >> 3; + s->coremap.reg_u64[index] = data; + eiointc_update_sw_coremap(s, irq, (void *)&data, sizeof(data), true); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int kvm_eiointc_write(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + int ret = -EINVAL; + unsigned long flags; + struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc; + + if (!eiointc) { + kvm_err("%s: eiointc irqchip not valid!\n", __func__); + return -EINVAL; + } + + vcpu->kvm->stat.eiointc_write_exits++; + spin_lock_irqsave(&eiointc->lock, flags); + switch (len) { + case 1: + ret = loongarch_eiointc_writeb(vcpu, eiointc, addr, len, val); + break; + case 2: + ret = loongarch_eiointc_writew(vcpu, eiointc, addr, len, val); + break; + case 4: + ret = loongarch_eiointc_writel(vcpu, eiointc, addr, len, val); + break; + case 8: + ret = loongarch_eiointc_writeq(vcpu, eiointc, addr, len, val); + break; + default: + WARN_ONCE(1, "%s: Abnormal address access: addr 0x%llx, size %d\n", + __func__, addr, len); + } + spin_unlock_irqrestore(&eiointc->lock, flags); + + return ret; +} + +static const struct kvm_io_device_ops kvm_eiointc_ops = { + .read = kvm_eiointc_read, + .write = kvm_eiointc_write, +}; + +static int kvm_eiointc_virt_read(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + unsigned long flags; + u32 *data = val; + struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc; + + if (!eiointc) { + kvm_err("%s: eiointc irqchip not valid!\n", __func__); + return -EINVAL; + } + + addr -= EIOINTC_VIRT_BASE; + spin_lock_irqsave(&eiointc->lock, flags); + switch (addr) { + case EIOINTC_VIRT_FEATURES: + *data = eiointc->features; + break; + case EIOINTC_VIRT_CONFIG: + *data = eiointc->status; + break; + default: + break; + } + spin_unlock_irqrestore(&eiointc->lock, flags); + + return 0; +} + +static int kvm_eiointc_virt_write(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + int ret = 0; + unsigned long flags; + u32 value = *(u32 *)val; + struct loongarch_eiointc *eiointc = vcpu->kvm->arch.eiointc; + + if (!eiointc) { + kvm_err("%s: eiointc irqchip not valid!\n", __func__); + return -EINVAL; + } + + addr -= EIOINTC_VIRT_BASE; + spin_lock_irqsave(&eiointc->lock, flags); + switch (addr) { + case EIOINTC_VIRT_FEATURES: + ret = -EPERM; + break; + case EIOINTC_VIRT_CONFIG: + /* 
+ * eiointc features can only be set at disabled status + */ + if ((eiointc->status & BIT(EIOINTC_ENABLE)) && value) { + ret = -EPERM; + break; + } + eiointc->status = value & eiointc->features; + break; + default: + break; + } + spin_unlock_irqrestore(&eiointc->lock, flags); + + return ret; +} + +static const struct kvm_io_device_ops kvm_eiointc_virt_ops = { + .read = kvm_eiointc_virt_read, + .write = kvm_eiointc_virt_write, +}; + +static int kvm_eiointc_ctrl_access(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + int ret = 0; + unsigned long flags; + unsigned long type = (unsigned long)attr->attr; + u32 i, start_irq; + void __user *data; + struct loongarch_eiointc *s = dev->kvm->arch.eiointc; + + data = (void __user *)attr->addr; + spin_lock_irqsave(&s->lock, flags); + switch (type) { + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: + if (copy_from_user(&s->num_cpu, data, 4)) + ret = -EFAULT; + break; + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: + if (copy_from_user(&s->features, data, 4)) + ret = -EFAULT; + if (!(s->features & BIT(EIOINTC_HAS_VIRT_EXTENSION))) + s->status |= BIT(EIOINTC_ENABLE); + break; + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED: + eiointc_set_sw_coreisr(s); + for (i = 0; i < (EIOINTC_IRQS / 4); i++) { + start_irq = i * 4; + eiointc_update_sw_coremap(s, start_irq, + (void *)&s->coremap.reg_u32[i], sizeof(u32), false); + } + break; + default: + break; + } + spin_unlock_irqrestore(&s->lock, flags); + + return ret; +} + +static int kvm_eiointc_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + bool is_write) +{ + int addr, cpuid, offset, ret = 0; + unsigned long flags; + void *p = NULL; + void __user *data; + struct loongarch_eiointc *s; + + s = dev->kvm->arch.eiointc; + addr = attr->attr; + cpuid = addr >> 16; + addr &= 0xffff; + data = (void __user *)attr->addr; + switch (addr) { + case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: + offset = (addr - EIOINTC_NODETYPE_START) / 4; + p = &s->nodetype.reg_u32[offset]; + break; + case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: + offset = (addr - EIOINTC_IPMAP_START) / 4; + p = &s->ipmap.reg_u32[offset]; + break; + case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: + offset = (addr - EIOINTC_ENABLE_START) / 4; + p = &s->enable.reg_u32[offset]; + break; + case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: + offset = (addr - EIOINTC_BOUNCE_START) / 4; + p = &s->bounce.reg_u32[offset]; + break; + case EIOINTC_ISR_START ... EIOINTC_ISR_END: + offset = (addr - EIOINTC_ISR_START) / 4; + p = &s->isr.reg_u32[offset]; + break; + case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: + offset = (addr - EIOINTC_COREISR_START) / 4; + p = &s->coreisr.reg_u32[cpuid][offset]; + break; + case EIOINTC_COREMAP_START ... 
EIOINTC_COREMAP_END: + offset = (addr - EIOINTC_COREMAP_START) / 4; + p = &s->coremap.reg_u32[offset]; + break; + default: + kvm_err("%s: unknown eiointc register, addr = %d\n", __func__, addr); + return -EINVAL; + } + + spin_lock_irqsave(&s->lock, flags); + if (is_write) { + if (copy_from_user(p, data, 4)) + ret = -EFAULT; + } else { + if (copy_to_user(data, p, 4)) + ret = -EFAULT; + } + spin_unlock_irqrestore(&s->lock, flags); + + return ret; +} + +static int kvm_eiointc_sw_status_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + bool is_write) +{ + int addr, ret = 0; + unsigned long flags; + void *p = NULL; + void __user *data; + struct loongarch_eiointc *s; + + s = dev->kvm->arch.eiointc; + addr = attr->attr; + addr &= 0xffff; + + data = (void __user *)attr->addr; + switch (addr) { + case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU: + p = &s->num_cpu; + break; + case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_FEATURE: + p = &s->features; + break; + case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_STATE: + p = &s->status; + break; + default: + kvm_err("%s: unknown eiointc register, addr = %d\n", __func__, addr); + return -EINVAL; + } + spin_lock_irqsave(&s->lock, flags); + if (is_write) { + if (copy_from_user(p, data, 4)) + ret = -EFAULT; + } else { + if (copy_to_user(data, p, 4)) + ret = -EFAULT; + } + spin_unlock_irqrestore(&s->lock, flags); + + return ret; +} + +static int kvm_eiointc_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: + return kvm_eiointc_regs_access(dev, attr, false); + case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: + return kvm_eiointc_sw_status_access(dev, attr, false); + default: + return -EINVAL; + } +} + +static int kvm_eiointc_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL: + return kvm_eiointc_ctrl_access(dev, attr); + case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: + return kvm_eiointc_regs_access(dev, attr, true); + case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: + return kvm_eiointc_sw_status_access(dev, attr, true); + default: + return -EINVAL; + } +} + +static int kvm_eiointc_create(struct kvm_device *dev, u32 type) +{ + int ret; + struct loongarch_eiointc *s; + struct kvm_io_device *device, *device1; + struct kvm *kvm = dev->kvm; + + /* eiointc has been created */ + if (kvm->arch.eiointc) + return -EINVAL; + + s = kzalloc(sizeof(struct loongarch_eiointc), GFP_KERNEL); + if (!s) + return -ENOMEM; + + spin_lock_init(&s->lock); + s->kvm = kvm; + + /* + * Initialize IOCSR device + */ + device = &s->device; + kvm_iodevice_init(device, &kvm_eiointc_ops); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS, + EIOINTC_BASE, EIOINTC_SIZE, device); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kfree(s); + return ret; + } + + device1 = &s->device_vext; + kvm_iodevice_init(device1, &kvm_eiointc_virt_ops); + ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS, + EIOINTC_VIRT_BASE, EIOINTC_VIRT_SIZE, device1); + if (ret < 0) { + kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &s->device); + kfree(s); + return ret; + } + kvm->arch.eiointc = s; + + return 0; +} + +static void kvm_eiointc_destroy(struct kvm_device *dev) +{ + struct kvm *kvm; + struct loongarch_eiointc *eiointc; + + if (!dev || !dev->kvm || !dev->kvm->arch.eiointc) + return; + + kvm = dev->kvm; + eiointc = kvm->arch.eiointc; + kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &eiointc->device); + 
kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &eiointc->device_vext); + kfree(eiointc); +} + +static struct kvm_device_ops kvm_eiointc_dev_ops = { + .name = "kvm-loongarch-eiointc", + .create = kvm_eiointc_create, + .destroy = kvm_eiointc_destroy, + .set_attr = kvm_eiointc_set_attr, + .get_attr = kvm_eiointc_get_attr, +}; + +int kvm_loongarch_register_eiointc_device(void) +{ + return kvm_register_device_ops(&kvm_eiointc_dev_ops, KVM_DEV_TYPE_LOONGARCH_EIOINTC); +} diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c new file mode 100644 index 000000000000..a233a323e295 --- /dev/null +++ b/arch/loongarch/kvm/intc/ipi.c @@ -0,0 +1,475 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_ipi.h> +#include <asm/kvm_vcpu.h> + +static void ipi_send(struct kvm *kvm, uint64_t data) +{ + int cpu, action; + uint32_t status; + struct kvm_vcpu *vcpu; + struct kvm_interrupt irq; + + cpu = ((data & 0xffffffff) >> 16) & 0x3ff; + vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); + if (unlikely(vcpu == NULL)) { + kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); + return; + } + + action = BIT(data & 0x1f); + spin_lock(&vcpu->arch.ipi_state.lock); + status = vcpu->arch.ipi_state.status; + vcpu->arch.ipi_state.status |= action; + spin_unlock(&vcpu->arch.ipi_state.lock); + if (status == 0) { + irq.irq = LARCH_INT_IPI; + kvm_vcpu_ioctl_interrupt(vcpu, &irq); + } +} + +static void ipi_clear(struct kvm_vcpu *vcpu, uint64_t data) +{ + uint32_t status; + struct kvm_interrupt irq; + + spin_lock(&vcpu->arch.ipi_state.lock); + vcpu->arch.ipi_state.status &= ~data; + status = vcpu->arch.ipi_state.status; + spin_unlock(&vcpu->arch.ipi_state.lock); + if (status == 0) { + irq.irq = -LARCH_INT_IPI; + kvm_vcpu_ioctl_interrupt(vcpu, &irq); + } +} + +static uint64_t read_mailbox(struct kvm_vcpu *vcpu, int offset, int len) +{ + uint64_t data = 0; + + spin_lock(&vcpu->arch.ipi_state.lock); + data = *(ulong *)((void *)vcpu->arch.ipi_state.buf + (offset - 0x20)); + spin_unlock(&vcpu->arch.ipi_state.lock); + + switch (len) { + case 1: + return data & 0xff; + case 2: + return data & 0xffff; + case 4: + return data & 0xffffffff; + case 8: + return data; + default: + kvm_err("%s: unknown data len: %d\n", __func__, len); + return 0; + } +} + +static void write_mailbox(struct kvm_vcpu *vcpu, int offset, uint64_t data, int len) +{ + void *pbuf; + + spin_lock(&vcpu->arch.ipi_state.lock); + pbuf = (void *)vcpu->arch.ipi_state.buf + (offset - 0x20); + + switch (len) { + case 1: + *(unsigned char *)pbuf = (unsigned char)data; + break; + case 2: + *(unsigned short *)pbuf = (unsigned short)data; + break; + case 4: + *(unsigned int *)pbuf = (unsigned int)data; + break; + case 8: + *(unsigned long *)pbuf = (unsigned long)data; + break; + default: + kvm_err("%s: unknown data len: %d\n", __func__, len); + } + spin_unlock(&vcpu->arch.ipi_state.lock); +} + +static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) +{ + int i, ret; + uint32_t val = 0, mask = 0; + + /* + * Bit 27-30 is mask for byte writing. + * If the mask is 0, we need not to do anything. 
+ */ + if ((data >> 27) & 0xf) { + /* Read the old val */ + ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); + if (unlikely(ret)) { + kvm_err("%s: read data from addr %llx failed\n", __func__, addr); + return ret; + } + /* Construct the mask by scanning bits 27-30 */ + for (i = 0; i < 4; i++) { + if (data & (BIT(27 + i))) + mask |= (0xff << (i * 8)); + } + /* Save the old part of val */ + val &= mask; + } + val |= ((uint32_t)(data >> 32) & ~mask); + ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); + if (unlikely(ret)) + kvm_err("%s: write data to addr %llx failed\n", __func__, addr); + + return ret; +} + +static int mail_send(struct kvm *kvm, uint64_t data) +{ + int cpu, mailbox, offset; + struct kvm_vcpu *vcpu; + + cpu = ((data & 0xffffffff) >> 16) & 0x3ff; + vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); + if (unlikely(vcpu == NULL)) { + kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); + return -EINVAL; + } + mailbox = ((data & 0xffffffff) >> 2) & 0x7; + offset = IOCSR_IPI_BASE + IOCSR_IPI_BUF_20 + mailbox * 4; + + return send_ipi_data(vcpu, offset, data); +} + +static int any_send(struct kvm *kvm, uint64_t data) +{ + int cpu, offset; + struct kvm_vcpu *vcpu; + + cpu = ((data & 0xffffffff) >> 16) & 0x3ff; + vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); + if (unlikely(vcpu == NULL)) { + kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); + return -EINVAL; + } + offset = data & 0xffff; + + return send_ipi_data(vcpu, offset, data); +} + +static int loongarch_ipi_readl(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *val) +{ + int ret = 0; + uint32_t offset; + uint64_t res = 0; + + offset = (uint32_t)(addr & 0x1ff); + WARN_ON_ONCE(offset & (len - 1)); + + switch (offset) { + case IOCSR_IPI_STATUS: + spin_lock(&vcpu->arch.ipi_state.lock); + res = vcpu->arch.ipi_state.status; + spin_unlock(&vcpu->arch.ipi_state.lock); + break; + case IOCSR_IPI_EN: + spin_lock(&vcpu->arch.ipi_state.lock); + res = vcpu->arch.ipi_state.en; + spin_unlock(&vcpu->arch.ipi_state.lock); + break; + case IOCSR_IPI_SET: + res = 0; + break; + case IOCSR_IPI_CLEAR: + res = 0; + break; + case IOCSR_IPI_BUF_20 ... IOCSR_IPI_BUF_38 + 7: + if (offset + len > IOCSR_IPI_BUF_38 + 8) { + kvm_err("%s: invalid offset or len: offset = %d, len = %d\n", + __func__, offset, len); + ret = -EINVAL; + break; + } + res = read_mailbox(vcpu, offset, len); + break; + default: + kvm_err("%s: unknown addr: %llx\n", __func__, addr); + ret = -EINVAL; + break; + } + *(uint64_t *)val = res; + + return ret; +} + +static int loongarch_ipi_writel(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *val) +{ + int ret = 0; + uint64_t data; + uint32_t offset; + + data = *(uint64_t *)val; + + offset = (uint32_t)(addr & 0x1ff); + WARN_ON_ONCE(offset & (len - 1)); + + switch (offset) { + case IOCSR_IPI_STATUS: + ret = -EINVAL; + break; + case IOCSR_IPI_EN: + spin_lock(&vcpu->arch.ipi_state.lock); + vcpu->arch.ipi_state.en = data; + spin_unlock(&vcpu->arch.ipi_state.lock); + break; + case IOCSR_IPI_SET: + ret = -EINVAL; + break; + case IOCSR_IPI_CLEAR: + /* Just clear the status of the current vcpu */ + ipi_clear(vcpu, data); + break; + case IOCSR_IPI_BUF_20 ...
IOCSR_IPI_BUF_38 + 7: + if (offset + len > IOCSR_IPI_BUF_38 + 8) { + kvm_err("%s: invalid offset or len: offset = %d, len = %d\n", + __func__, offset, len); + ret = -EINVAL; + break; + } + write_mailbox(vcpu, offset, data, len); + break; + case IOCSR_IPI_SEND: + ipi_send(vcpu->kvm, data); + break; + case IOCSR_MAIL_SEND: + ret = mail_send(vcpu->kvm, *(uint64_t *)val); + break; + case IOCSR_ANY_SEND: + ret = any_send(vcpu->kvm, *(uint64_t *)val); + break; + default: + kvm_err("%s: unknown addr: %llx\n", __func__, addr); + ret = -EINVAL; + break; + } + + return ret; +} + +static int kvm_ipi_read(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + int ret; + struct loongarch_ipi *ipi; + + ipi = vcpu->kvm->arch.ipi; + if (!ipi) { + kvm_err("%s: ipi irqchip not valid!\n", __func__); + return -EINVAL; + } + ipi->kvm->stat.ipi_read_exits++; + ret = loongarch_ipi_readl(vcpu, addr, len, val); + + return ret; +} + +static int kvm_ipi_write(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + int ret; + struct loongarch_ipi *ipi; + + ipi = vcpu->kvm->arch.ipi; + if (!ipi) { + kvm_err("%s: ipi irqchip not valid!\n", __func__); + return -EINVAL; + } + ipi->kvm->stat.ipi_write_exits++; + ret = loongarch_ipi_writel(vcpu, addr, len, val); + + return ret; +} + +static const struct kvm_io_device_ops kvm_ipi_ops = { + .read = kvm_ipi_read, + .write = kvm_ipi_write, +}; + +static int kvm_ipi_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + bool is_write) +{ + int len = 4; + int cpu, addr; + uint64_t val; + void *p = NULL; + struct kvm_vcpu *vcpu; + + cpu = (attr->attr >> 16) & 0x3ff; + addr = attr->attr & 0xff; + + vcpu = kvm_get_vcpu(dev->kvm, cpu); + if (unlikely(vcpu == NULL)) { + kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); + return -EINVAL; + } + + switch (addr) { + case IOCSR_IPI_STATUS: + p = &vcpu->arch.ipi_state.status; + break; + case IOCSR_IPI_EN: + p = &vcpu->arch.ipi_state.en; + break; + case IOCSR_IPI_SET: + p = &vcpu->arch.ipi_state.set; + break; + case IOCSR_IPI_CLEAR: + p = &vcpu->arch.ipi_state.clear; + break; + case IOCSR_IPI_BUF_20: + p = &vcpu->arch.ipi_state.buf[0]; + len = 8; + break; + case IOCSR_IPI_BUF_28: + p = &vcpu->arch.ipi_state.buf[1]; + len = 8; + break; + case IOCSR_IPI_BUF_30: + p = &vcpu->arch.ipi_state.buf[2]; + len = 8; + break; + case IOCSR_IPI_BUF_38: + p = &vcpu->arch.ipi_state.buf[3]; + len = 8; + break; + default: + kvm_err("%s: unknown ipi register, addr = %d\n", __func__, addr); + return -EINVAL; + } + + if (is_write) { + if (len == 4) { + if (get_user(val, (uint32_t __user *)attr->addr)) + return -EFAULT; + *(uint32_t *)p = (uint32_t)val; + } else if (len == 8) { + if (get_user(val, (uint64_t __user *)attr->addr)) + return -EFAULT; + *(uint64_t *)p = val; + } + } else { + if (len == 4) { + val = *(uint32_t *)p; + return put_user(val, (uint32_t __user *)attr->addr); + } else if (len == 8) { + val = *(uint64_t *)p; + return put_user(val, (uint64_t __user *)attr->addr); + } + } + + return 0; +} + +static int kvm_ipi_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_LOONGARCH_IPI_GRP_REGS: + return kvm_ipi_regs_access(dev, attr, false); + default: + kvm_err("%s: unknown group (%d)\n", __func__, attr->group); + return -EINVAL; + } +} + +static int kvm_ipi_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_LOONGARCH_IPI_GRP_REGS: + return 
kvm_ipi_regs_access(dev, attr, true); + default: + kvm_err("%s: unknown group (%d)\n", __func__, attr->group); + return -EINVAL; + } +} + +static int kvm_ipi_create(struct kvm_device *dev, u32 type) +{ + int ret; + struct kvm *kvm; + struct kvm_io_device *device; + struct loongarch_ipi *s; + + if (!dev) { + kvm_err("%s: kvm_device ptr is invalid!\n", __func__); + return -EINVAL; + } + + kvm = dev->kvm; + if (kvm->arch.ipi) { + kvm_err("%s: LoongArch IPI has already been created!\n", __func__); + return -EINVAL; + } + + s = kzalloc(sizeof(struct loongarch_ipi), GFP_KERNEL); + if (!s) + return -ENOMEM; + + spin_lock_init(&s->lock); + s->kvm = kvm; + + /* + * Initialize IOCSR device + */ + device = &s->device; + kvm_iodevice_init(device, &kvm_ipi_ops); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_IOCSR_BUS, IOCSR_IPI_BASE, IOCSR_IPI_SIZE, device); + mutex_unlock(&kvm->slots_lock); + if (ret < 0) { + kvm_err("%s: Initialize IOCSR dev failed, ret = %d\n", __func__, ret); + goto err; + } + + kvm->arch.ipi = s; + return 0; + +err: + kfree(s); + return -EFAULT; +} + +static void kvm_ipi_destroy(struct kvm_device *dev) +{ + struct kvm *kvm; + struct loongarch_ipi *ipi; + + if (!dev || !dev->kvm || !dev->kvm->arch.ipi) + return; + + kvm = dev->kvm; + ipi = kvm->arch.ipi; + kvm_io_bus_unregister_dev(kvm, KVM_IOCSR_BUS, &ipi->device); + kfree(ipi); +} + +static struct kvm_device_ops kvm_ipi_dev_ops = { + .name = "kvm-loongarch-ipi", + .create = kvm_ipi_create, + .destroy = kvm_ipi_destroy, + .set_attr = kvm_ipi_set_attr, + .get_attr = kvm_ipi_get_attr, +}; + +int kvm_loongarch_register_ipi_device(void) +{ + return kvm_register_device_ops(&kvm_ipi_dev_ops, KVM_DEV_TYPE_LOONGARCH_IPI); +} diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c new file mode 100644 index 000000000000..08fce845f668 --- /dev/null +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include <asm/kvm_eiointc.h> +#include <asm/kvm_pch_pic.h> +#include <asm/kvm_vcpu.h> +#include <linux/count_zeros.h> + +/* update the isr according to irq level and route irq to eiointc */ +static void pch_pic_update_irq(struct loongarch_pch_pic *s, int irq, int level) +{ + u64 mask = BIT(irq); + + /* + * set isr and route irq to eiointc; + * the route table is in htmsi_vector[] + */ + if (level) { + if (mask & s->irr & ~s->mask) { + s->isr |= mask; + irq = s->htmsi_vector[irq]; + eiointc_set_irq(s->kvm->arch.eiointc, irq, level); + } + } else { + if (mask & s->isr & ~s->irr) { + s->isr &= ~mask; + irq = s->htmsi_vector[irq]; + eiointc_set_irq(s->kvm->arch.eiointc, irq, level); + } + } +} + +/* update batch irqs, the irq_mask is a bitmap of irqs */ +static void pch_pic_update_batch_irqs(struct loongarch_pch_pic *s, u64 irq_mask, int level) +{ + int irq, bits; + + /* find each irq by irqs bitmap and update each irq */ + bits = sizeof(irq_mask) * 8; + irq = find_first_bit((void *)&irq_mask, bits); + while (irq < bits) { + pch_pic_update_irq(s, irq, level); + bitmap_clear((void *)&irq_mask, irq, 1); + irq = find_first_bit((void *)&irq_mask, bits); + } +} + +/* called when an irq is triggered in pch pic */ +void pch_pic_set_irq(struct loongarch_pch_pic *s, int irq, int level) +{ + u64 mask = BIT(irq); + + spin_lock(&s->lock); + if (level) + s->irr |= mask; /* set irr */ + else { + /* + * In edge triggered mode, 0 does not mean to clear irq. + * The irr register variable is
cleared when cpu writes to the + * PCH_PIC_CLEAR_START address area + */ + if (s->edge & mask) { + spin_unlock(&s->lock); + return; + } + s->irr &= ~mask; + } + pch_pic_update_irq(s, irq, level); + spin_unlock(&s->lock); +} + +/* msi irq handler */ +void pch_msi_set_irq(struct kvm *kvm, int irq, int level) +{ + eiointc_set_irq(kvm->arch.eiointc, irq, level); +} + +/* + * pch pic register is 64-bit, but it is accessed by 32-bit, + * so we use high to get whether low or high 32 bits we want + * to read. + */ +static u32 pch_pic_read_reg(u64 *s, int high) +{ + u64 val = *s; + + /* read the high 32 bits when high is 1 */ + return high ? (u32)(val >> 32) : (u32)val; +} + +/* + * pch pic register is 64-bit, but it is accessed by 32-bit, + * so we use high to get whether low or high 32 bits we want + * to write. + */ +static u32 pch_pic_write_reg(u64 *s, int high, u32 v) +{ + u64 val = *s, data = v; + + if (high) { + /* + * Clear val high 32 bits + * Write the high 32 bits when the high is 1 + */ + *s = (val << 32 >> 32) | (data << 32); + val >>= 32; + } else + /* + * Clear val low 32 bits + * Write the low 32 bits when the high is 0 + */ + *s = (val >> 32 << 32) | v; + + return (u32)val; +} + +static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int len, void *val) +{ + int offset, index, ret = 0; + u32 data = 0; + u64 int_id = 0; + + offset = addr - s->pch_pic_base; + + spin_lock(&s->lock); + switch (offset) { + case PCH_PIC_INT_ID_START ... PCH_PIC_INT_ID_END: + /* int id version */ + int_id |= (u64)PCH_PIC_INT_ID_VER << 32; + /* irq number */ + int_id |= (u64)31 << (32 + 16); + /* int id value */ + int_id |= PCH_PIC_INT_ID_VAL; + *(u64 *)val = int_id; + break; + case PCH_PIC_MASK_START ... PCH_PIC_MASK_END: + offset -= PCH_PIC_MASK_START; + index = offset >> 2; + /* read mask reg */ + data = pch_pic_read_reg(&s->mask, index); + *(u32 *)val = data; + break; + case PCH_PIC_HTMSI_EN_START ... PCH_PIC_HTMSI_EN_END: + offset -= PCH_PIC_HTMSI_EN_START; + index = offset >> 2; + /* read htmsi enable reg */ + data = pch_pic_read_reg(&s->htmsi_en, index); + *(u32 *)val = data; + break; + case PCH_PIC_EDGE_START ... PCH_PIC_EDGE_END: + offset -= PCH_PIC_EDGE_START; + index = offset >> 2; + /* read edge enable reg */ + data = pch_pic_read_reg(&s->edge, index); + *(u32 *)val = data; + break; + case PCH_PIC_AUTO_CTRL0_START ... PCH_PIC_AUTO_CTRL0_END: + case PCH_PIC_AUTO_CTRL1_START ... PCH_PIC_AUTO_CTRL1_END: + /* we only use default mode: fixed interrupt distribution mode */ + *(u32 *)val = 0; + break; + case PCH_PIC_ROUTE_ENTRY_START ... PCH_PIC_ROUTE_ENTRY_END: + /* only route to int0: eiointc */ + *(u8 *)val = 1; + break; + case PCH_PIC_HTMSI_VEC_START ... PCH_PIC_HTMSI_VEC_END: + offset -= PCH_PIC_HTMSI_VEC_START; + /* read htmsi vector */ + data = s->htmsi_vector[offset]; + *(u8 *)val = data; + break; + case PCH_PIC_POLARITY_START ... 
PCH_PIC_POLARITY_END: + /* we only use default value 0: high level triggered */ + *(u32 *)val = 0; + break; + default: + ret = -EINVAL; + } + spin_unlock(&s->lock); + + return ret; +} + +static int kvm_pch_pic_read(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + int ret; + struct loongarch_pch_pic *s = vcpu->kvm->arch.pch_pic; + + if (!s) { + kvm_err("%s: pch pic irqchip not valid!\n", __func__); + return -EINVAL; + } + + /* statistics of pch pic reading */ + vcpu->kvm->stat.pch_pic_read_exits++; + ret = loongarch_pch_pic_read(s, addr, len, val); + + return ret; +} + +static int loongarch_pch_pic_write(struct loongarch_pch_pic *s, gpa_t addr, + int len, const void *val) +{ + int ret; + u32 old, data, offset, index; + u64 irq; + + ret = 0; + data = *(u32 *)val; + offset = addr - s->pch_pic_base; + + spin_lock(&s->lock); + switch (offset) { + case PCH_PIC_MASK_START ... PCH_PIC_MASK_END: + offset -= PCH_PIC_MASK_START; + /* get whether high or low 32 bits we want to write */ + index = offset >> 2; + old = pch_pic_write_reg(&s->mask, index, data); + /* enable irq when mask value changes to 0 */ + irq = (old & ~data) << (32 * index); + pch_pic_update_batch_irqs(s, irq, 1); + /* disable irq when mask value changes to 1 */ + irq = (~old & data) << (32 * index); + pch_pic_update_batch_irqs(s, irq, 0); + break; + case PCH_PIC_HTMSI_EN_START ... PCH_PIC_HTMSI_EN_END: + offset -= PCH_PIC_HTMSI_EN_START; + index = offset >> 2; + pch_pic_write_reg(&s->htmsi_en, index, data); + break; + case PCH_PIC_EDGE_START ... PCH_PIC_EDGE_END: + offset -= PCH_PIC_EDGE_START; + index = offset >> 2; + /* 1: edge triggered, 0: level triggered */ + pch_pic_write_reg(&s->edge, index, data); + break; + case PCH_PIC_CLEAR_START ... PCH_PIC_CLEAR_END: + offset -= PCH_PIC_CLEAR_START; + index = offset >> 2; + /* write 1 to clear edge irq */ + old = pch_pic_read_reg(&s->irr, index); + /* + * get the irq bitmap which is edge triggered and + * already set and to be cleared + */ + irq = old & pch_pic_read_reg(&s->edge, index) & data; + /* write irr to the new state where irqs have been cleared */ + pch_pic_write_reg(&s->irr, index, old & ~irq); + /* update cleared irqs */ + pch_pic_update_batch_irqs(s, irq, 0); + break; + case PCH_PIC_AUTO_CTRL0_START ... PCH_PIC_AUTO_CTRL0_END: + offset -= PCH_PIC_AUTO_CTRL0_START; + index = offset >> 2; + /* we only use default mode: fixed interrupt distribution mode */ + pch_pic_write_reg(&s->auto_ctrl0, index, 0); + break; + case PCH_PIC_AUTO_CTRL1_START ... PCH_PIC_AUTO_CTRL1_END: + offset -= PCH_PIC_AUTO_CTRL1_START; + index = offset >> 2; + /* we only use default mode: fixed interrupt distribution mode */ + pch_pic_write_reg(&s->auto_ctrl1, index, 0); + break; + case PCH_PIC_ROUTE_ENTRY_START ... PCH_PIC_ROUTE_ENTRY_END: + offset -= PCH_PIC_ROUTE_ENTRY_START; + /* only route to int0: eiointc */ + s->route_entry[offset] = 1; + break; + case PCH_PIC_HTMSI_VEC_START ... PCH_PIC_HTMSI_VEC_END: + /* route table to eiointc */ + offset -= PCH_PIC_HTMSI_VEC_START; + s->htmsi_vector[offset] = (u8)data; + break; + case PCH_PIC_POLARITY_START ...
PCH_PIC_POLARITY_END: + offset -= PCH_PIC_POLARITY_START; + index = offset >> 2; + /* we only use default value 0: high level triggered */ + pch_pic_write_reg(&s->polarity, index, 0); + break; + default: + ret = -EINVAL; + break; + } + spin_unlock(&s->lock); + + return ret; +} + +static int kvm_pch_pic_write(struct kvm_vcpu *vcpu, + struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + int ret; + struct loongarch_pch_pic *s = vcpu->kvm->arch.pch_pic; + + if (!s) { + kvm_err("%s: pch pic irqchip not valid!\n", __func__); + return -EINVAL; + } + + /* statistics of pch pic writing */ + vcpu->kvm->stat.pch_pic_write_exits++; + ret = loongarch_pch_pic_write(s, addr, len, val); + + return ret; +} + +static const struct kvm_io_device_ops kvm_pch_pic_ops = { + .read = kvm_pch_pic_read, + .write = kvm_pch_pic_write, +}; + +static int kvm_pch_pic_init(struct kvm_device *dev, u64 addr) +{ + int ret; + struct kvm *kvm = dev->kvm; + struct kvm_io_device *device; + struct loongarch_pch_pic *s = dev->kvm->arch.pch_pic; + + s->pch_pic_base = addr; + device = &s->device; + /* init device with pch pic reading and writing ops */ + kvm_iodevice_init(device, &kvm_pch_pic_ops); + mutex_lock(&kvm->slots_lock); + /* register pch pic device */ + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, addr, PCH_PIC_SIZE, device); + mutex_unlock(&kvm->slots_lock); + + return (ret < 0) ? -EFAULT : 0; +} + +/* used by user space to get or set pch pic registers */ +static int kvm_pch_pic_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + bool is_write) +{ + int addr, offset, len = 8, ret = 0; + void __user *data; + void *p = NULL; + struct loongarch_pch_pic *s; + + s = dev->kvm->arch.pch_pic; + addr = attr->attr; + data = (void __user *)attr->addr; + + /* get pointer to pch pic register by addr */ + switch (addr) { + case PCH_PIC_MASK_START: + p = &s->mask; + break; + case PCH_PIC_HTMSI_EN_START: + p = &s->htmsi_en; + break; + case PCH_PIC_EDGE_START: + p = &s->edge; + break; + case PCH_PIC_AUTO_CTRL0_START: + p = &s->auto_ctrl0; + break; + case PCH_PIC_AUTO_CTRL1_START: + p = &s->auto_ctrl1; + break; + case PCH_PIC_ROUTE_ENTRY_START ... PCH_PIC_ROUTE_ENTRY_END: + offset = addr - PCH_PIC_ROUTE_ENTRY_START; + p = &s->route_entry[offset]; + len = 1; + break; + case PCH_PIC_HTMSI_VEC_START ... 
PCH_PIC_HTMSI_VEC_END: + offset = addr - PCH_PIC_HTMSI_VEC_START; + p = &s->htmsi_vector[offset]; + len = 1; + break; + case PCH_PIC_INT_IRR_START: + p = &s->irr; + break; + case PCH_PIC_INT_ISR_START: + p = &s->isr; + break; + case PCH_PIC_POLARITY_START: + p = &s->polarity; + break; + default: + return -EINVAL; + } + + spin_lock(&s->lock); + /* write or read value according to is_write */ + if (is_write) { + if (copy_from_user(p, data, len)) + ret = -EFAULT; + } else { + if (copy_to_user(data, p, len)) + ret = -EFAULT; + } + spin_unlock(&s->lock); + + return ret; +} + +static int kvm_pch_pic_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS: + return kvm_pch_pic_regs_access(dev, attr, false); + default: + return -EINVAL; + } +} + +static int kvm_pch_pic_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + u64 addr; + void __user *uaddr = (void __user *)(long)attr->addr; + + switch (attr->group) { + case KVM_DEV_LOONGARCH_PCH_PIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_LOONGARCH_PCH_PIC_CTRL_INIT: + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + if (!dev->kvm->arch.pch_pic) { + kvm_err("%s: please create pch_pic irqchip first!\n", __func__); + return -ENODEV; + } + + return kvm_pch_pic_init(dev, addr); + default: + kvm_err("%s: unknown group (%d) attr (%lld)\n", __func__, attr->group, + attr->attr); + return -EINVAL; + } + case KVM_DEV_LOONGARCH_PCH_PIC_GRP_REGS: + return kvm_pch_pic_regs_access(dev, attr, true); + default: + return -EINVAL; + } +} + +static int kvm_setup_default_irq_routing(struct kvm *kvm) +{ + int i, ret; + u32 nr = KVM_IRQCHIP_NUM_PINS; + struct kvm_irq_routing_entry *entries; + + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + for (i = 0; i < nr; i++) { + entries[i].gsi = i; + entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; + entries[i].u.irqchip.irqchip = 0; + entries[i].u.irqchip.pin = i; + } + ret = kvm_set_irq_routing(kvm, entries, nr, 0); + kfree(entries); + + return ret; +} + +static int kvm_pch_pic_create(struct kvm_device *dev, u32 type) +{ + int ret; + struct kvm *kvm = dev->kvm; + struct loongarch_pch_pic *s; + + /* pch pic should not have been created */ + if (kvm->arch.pch_pic) + return -EINVAL; + + ret = kvm_setup_default_irq_routing(kvm); + if (ret) + return -ENOMEM; + + s = kzalloc(sizeof(struct loongarch_pch_pic), GFP_KERNEL); + if (!s) + return -ENOMEM; + + spin_lock_init(&s->lock); + s->kvm = kvm; + kvm->arch.pch_pic = s; + + return 0; +} + +static void kvm_pch_pic_destroy(struct kvm_device *dev) +{ + struct kvm *kvm; + struct loongarch_pch_pic *s; + + if (!dev || !dev->kvm || !dev->kvm->arch.pch_pic) + return; + + kvm = dev->kvm; + s = kvm->arch.pch_pic; + /* unregister pch pic device and free its memory */ + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &s->device); + kfree(s); +} + +static struct kvm_device_ops kvm_pch_pic_dev_ops = { + .name = "kvm-loongarch-pch-pic", + .create = kvm_pch_pic_create, + .destroy = kvm_pch_pic_destroy, + .set_attr = kvm_pch_pic_set_attr, + .get_attr = kvm_pch_pic_get_attr, +}; + +int kvm_loongarch_register_pch_pic_device(void) +{ + return kvm_register_device_ops(&kvm_pch_pic_dev_ops, KVM_DEV_TYPE_LOONGARCH_PCHPIC); +} diff --git a/arch/loongarch/kvm/irqfd.c b/arch/loongarch/kvm/irqfd.c new file mode 100644 index 000000000000..9a39627aecf0 --- /dev/null +++ b/arch/loongarch/kvm/irqfd.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 
Copyright (C) 2024 Loongson Technology Corporation Limited + */ + +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> +#include <asm/kvm_pch_pic.h> + +static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + /* PCH-PIC pin (0 ~ 64) <---> GSI (0 ~ 64) */ + pch_pic_set_irq(kvm->arch.pch_pic, e->irqchip.pin, level); + + return 0; +} + +/* + * kvm_set_msi: inject the MSI corresponding to the + * MSI routing entry + * + * This is the entry point for irqfd MSI injection + * and userspace MSI injection. + */ +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + if (!level) + return -1; + + pch_msi_set_irq(kvm, e->msi.data, level); + + return 0; +} + +/* + * kvm_set_routing_entry: populate a kvm routing entry + * from a user routing entry + * + * @kvm: the VM this entry is applied to + * @e: kvm kernel routing entry handle + * @ue: user api routing entry handle + * return 0 on success, -EINVAL on errors. + */ +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = kvm_set_pic_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + + if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) + return -EINVAL; + + return 0; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + return 0; + default: + return -EINVAL; + } +} + +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + switch (e->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + pch_pic_set_irq(kvm->arch.pch_pic, e->irqchip.pin, level); + return 0; + case KVM_IRQ_ROUTING_MSI: + pch_msi_set_irq(kvm, e->msi.data, level); + return 0; + default: + return -EWOULDBLOCK; + } +} + +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ + return kvm_arch_irqchip_in_kernel(kvm); +} diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 27e9b94c0a0b..396fed2665a5 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -9,6 +9,8 @@ #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/kvm_csr.h> +#include <asm/kvm_eiointc.h> +#include <asm/kvm_pch_pic.h> #include "trace.h" unsigned long vpid_mask; @@ -313,7 +315,7 @@ void kvm_arch_disable_virtualization_cpu(void) static int kvm_loongarch_env_init(void) { - int cpu, order; + int cpu, order, ret; void *addr; struct kvm_context *context; @@ -368,7 +370,20 @@ static int kvm_loongarch_env_init(void) kvm_init_gcsr_flag(); - return 0; + /* Register LoongArch IPI interrupt controller interface. */ + ret = kvm_loongarch_register_ipi_device(); + if (ret) + return ret; + + /* Register LoongArch EIOINTC interrupt controller interface. */ + ret = kvm_loongarch_register_eiointc_device(); + if (ret) + return ret; + + /* Register LoongArch PCH-PIC interrupt controller interface. 
*/ + ret = kvm_loongarch_register_pch_pic_device(); + + return ret; } static void kvm_loongarch_env_exit(void) diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 28681dfb4b85..4d203294767c 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -552,12 +552,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) { int ret = 0; - kvm_pfn_t pfn = 0; kvm_pte_t *ptep, changed, new; gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_memory_slot *slot; - struct page *page; spin_lock(&kvm->mmu_lock); @@ -570,8 +568,6 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ /* Track access to pages marked old */ new = kvm_pte_mkyoung(*ptep); - /* call kvm_set_pfn_accessed() after unlock */ - if (write && !kvm_pte_dirty(new)) { if (!kvm_pte_write(new)) { ret = -EFAULT; @@ -595,26 +591,14 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ } changed = new ^ (*ptep); - if (changed) { + if (changed) kvm_set_pte(ptep, new); - pfn = kvm_pte_pfn(new); - page = kvm_pfn_to_refcounted_page(pfn); - if (page) - get_page(page); - } + spin_unlock(&kvm->mmu_lock); - if (changed) { - if (kvm_pte_young(changed)) - kvm_set_pfn_accessed(pfn); + if (kvm_pte_dirty(changed)) + mark_page_dirty(kvm, gfn); - if (kvm_pte_dirty(changed)) { - mark_page_dirty(kvm, gfn); - kvm_set_pfn_dirty(pfn); - } - if (page) - put_page(page); - } return ret; out: spin_unlock(&kvm->mmu_lock); @@ -796,6 +780,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) struct kvm *kvm = vcpu->kvm; struct kvm_memory_slot *memslot; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + struct page *page; /* Try the fast path to handle old / clean pages */ srcu_idx = srcu_read_lock(&kvm->srcu); @@ -823,7 +808,7 @@ retry: mmu_seq = kvm->mmu_invalidate_seq; /* * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in - * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * kvm_faultin_pfn() (which calls get_user_pages()), so that we don't * risk the page we get a reference to getting unmapped before we have a * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. * @@ -835,7 +820,7 @@ retry: smp_rmb(); /* Slow path - ask KVM core whether we can access this GPA */ - pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable); + pfn = kvm_faultin_pfn(vcpu, gfn, write, &writeable, &page); if (is_error_noslot_pfn(pfn)) { err = -EFAULT; goto out; @@ -847,10 +832,10 @@ retry: /* * This can happen when mappings are changed asynchronously, but * also synchronously if a COW is triggered by - * gfn_to_pfn_prot(). + * kvm_faultin_pfn(). 
*/ spin_unlock(&kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_page_unused(page); if (retry_no > 100) { retry_no = 0; schedule(); @@ -915,14 +900,13 @@ retry: else ++kvm->stat.pages; kvm_set_pte(ptep, new_pte); + + kvm_release_faultin_page(kvm, page, false, writeable); spin_unlock(&kvm->mmu_lock); - if (prot_bits & _PAGE_DIRTY) { + if (prot_bits & _PAGE_DIRTY) mark_page_dirty_in_slot(kvm, memslot, gfn); - kvm_set_pfn_dirty(pfn); - } - kvm_release_pfn_clean(pfn); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 174734a23d0a..cab1818be68d 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1475,6 +1475,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Init */ vcpu->arch.last_sched_cpu = -1; + /* Init ipi_state lock */ + spin_lock_init(&vcpu->arch.ipi_state.lock); + /* * Initialize guest register state to valid architectural reset state. */ diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 4ba734aaef87..b8b3e1972d6e 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -6,6 +6,8 @@ #include <linux/kvm_host.h> #include <asm/kvm_mmu.h> #include <asm/kvm_vcpu.h> +#include <asm/kvm_eiointc.h> +#include <asm/kvm_pch_pic.h> const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -76,6 +78,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int r; switch (ext) { + case KVM_CAP_IRQCHIP: case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_READONLY_MEM: @@ -161,6 +164,8 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm_device_attr attr; switch (ioctl) { + case KVM_CREATE_IRQCHIP: + return 0; case KVM_HAS_DEVICE_ATTR: if (copy_from_user(&attr, argp, sizeof(attr))) return -EFAULT; @@ -170,3 +175,19 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return -ENOIOCTLCMD; } } + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) +{ + if (!kvm_arch_irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, line_status); + + return 0; +} + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return (kvm->arch.ipi && kvm->arch.eiointc && kvm->arch.pch_pic); +} diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c index ffd8d76021d4..bf8678248444 100644 --- a/arch/loongarch/mm/pageattr.c +++ b/arch/loongarch/mm/pageattr.c @@ -216,3 +216,22 @@ int set_direct_map_invalid_noflush(struct page *page) return __set_memory(addr, 1, __pgprot(0), __pgprot(_PAGE_PRESENT | _PAGE_VALID)); } + +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long addr = (unsigned long)page_address(page); + pgprot_t set, clear; + + if (addr < vm_map_base) + return 0; + + if (valid) { + set = PAGE_KERNEL; + clear = __pgprot(0); + } else { + set = __pgprot(0); + clear = __pgprot(_PAGE_PRESENT | _PAGE_VALID); + } + + return __set_memory(addr, 1, set, clear); +} diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c index 5ac9beb5f093..3b427b319db2 100644 --- a/arch/loongarch/mm/tlb.c +++ b/arch/loongarch/mm/tlb.c @@ -289,7 +289,7 @@ static void setup_tlb_handler(int cpu) /* Avoid lockdep warning */ rcutree_report_cpu_starting(cpu); -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) && !defined(CONFIG_PREEMPT_RT) vec_sz = sizeof(exception_handlers); if (pcpu_handlers[cpu]) diff 
--git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 7dbefd4ba210..dd350cba1252 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -179,7 +179,7 @@ static void __build_epilogue(struct jit_ctx *ctx, bool is_tail_call) if (!is_tail_call) { /* Set return value */ - move_reg(ctx, LOONGARCH_GPR_A0, regmap[BPF_REG_0]); + emit_insn(ctx, addiw, LOONGARCH_GPR_A0, regmap[BPF_REG_0], 0); /* Return to the caller */ emit_insn(ctx, jirl, LOONGARCH_GPR_RA, LOONGARCH_GPR_ZERO, 0); } else { diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index 40c1175823d6..fdde1bcd4e26 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -19,7 +19,7 @@ ccflags-vdso := \ cflags-vdso := $(ccflags-vdso) \ -isystem $(shell $(CC) -print-file-name=include) \ $(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \ - -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \ + -std=gnu11 -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \ -fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \ $(call cc-option, -fno-asynchronous-unwind-tables) \ $(call cc-option, -fno-stack-protector) diff --git a/arch/m68k/coldfire/device.c b/arch/m68k/coldfire/device.c index 7dab46728aed..b6958ec2a220 100644 --- a/arch/m68k/coldfire/device.c +++ b/arch/m68k/coldfire/device.c @@ -93,7 +93,7 @@ static struct platform_device mcf_uart = { .dev.platform_data = mcf_uart_platform_data, }; -#if IS_ENABLED(CONFIG_FEC) +#ifdef MCFFEC_BASE0 #ifdef CONFIG_M5441x #define FEC_NAME "enet-fec" @@ -145,6 +145,7 @@ static struct platform_device mcf_fec0 = { .platform_data = FEC_PDATA, } }; +#endif /* MCFFEC_BASE0 */ #ifdef MCFFEC_BASE1 static struct resource mcf_fec1_resources[] = { @@ -182,7 +183,6 @@ static struct platform_device mcf_fec1 = { } }; #endif /* MCFFEC_BASE1 */ -#endif /* CONFIG_FEC */ #if IS_ENABLED(CONFIG_SPI_COLDFIRE_QSPI) /* @@ -624,12 +624,12 @@ static struct platform_device mcf_flexcan0 = { static struct platform_device *mcf_devices[] __initdata = { &mcf_uart, -#if IS_ENABLED(CONFIG_FEC) +#ifdef MCFFEC_BASE0 &mcf_fec0, +#endif #ifdef MCFFEC_BASE1 &mcf_fec1, #endif -#endif #if IS_ENABLED(CONFIG_SPI_COLDFIRE_QSPI) &mcf_qspi, #endif diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index a70aec9a05c4..c705247e7b5b 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -449,7 +449,6 @@ CONFIG_RTC_DRV_RP5C01=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 312853f3d26a..6d62b9187a58 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -406,7 +406,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 0853e4358de9..c3c644df852d 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -426,7 +426,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 
f738202d1f36..20261f819691 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -398,7 +398,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 74f74e03ccc9..ce4fe93a0f70 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -408,7 +408,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 14c8f1b374aa..040ae75f47c3 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -425,7 +425,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 41c8112c6d0d..f8edc9082724 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -511,7 +511,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index e72d37ee90a7..71fc71bb660e 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -397,7 +397,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 733f1fc9a50a..41072e68028e 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -398,7 +398,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 3efe25435561..e4c30e2b9bbb 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -415,7 +415,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 1b8ea0e7acb4..980843a9ea1e 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -396,7 +396,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 5bda93f6a200..38681cc6b598 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -396,7 +396,6 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # 
CONFIG_OCFS2_DEBUG_MASKLOG is not set diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 0dbf9c5c6fae..b282e0dd8dc1 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -4,3 +4,4 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += spinlock.h +generic-y += text-patching.h diff --git a/arch/m68k/include/asm/mcfgpio.h b/arch/m68k/include/asm/mcfgpio.h index 019f24439546..9c91ecdafc45 100644 --- a/arch/m68k/include/asm/mcfgpio.h +++ b/arch/m68k/include/asm/mcfgpio.h @@ -136,7 +136,7 @@ static inline void gpio_free(unsigned gpio) * read-modify-write as well as those controlled by the EPORT and GPIO modules. */ #define MCFGPIO_SCR_START 40 -#elif defined(CONFIGM5441x) +#elif defined(CONFIG_M5441x) /* The m5441x EPORT doesn't have its own GPIO port, uses PORT C */ #define MCFGPIO_SCR_START 0 #else diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index af3a10973233..63c0e706084b 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -14,7 +14,7 @@ extern unsigned long memory_end; #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index a055f5dbe00a..7178f990e8b3 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -8,3 +8,4 @@ generic-y += parport.h generic-y += syscalls.h generic-y += tlb.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 935585d8bb26..8e98c0796437 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -275,7 +275,6 @@ CONFIG_DM9102=m CONFIG_ULI526X=m CONFIG_PCMCIA_XIRCOM=m CONFIG_DL2K=m -CONFIG_SUNDANCE=m CONFIG_PCMCIA_FMVJ18X=m CONFIG_E100=m CONFIG_E1000=m diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 7ba67a0d6c97..684569b2ecd6 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -13,3 +13,4 @@ generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index fd69c8808554..d0a86ce83de9 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -17,12 +17,7 @@ static inline int prepare_hugepage_range(struct file *file, unsigned long len) { unsigned long task_size = STACK_TOP; - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; if (len > task_size) return -ENOMEM; if (task_size - len < addr) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 401c1d9e4409..6e854bb11f37 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -317,7 +317,9 @@ static inline pmd_t *pud_pgtable(pud_t pud) */ extern void pgd_init(void *addr); extern void pud_init(void *addr); +#define pud_init pud_init extern void pmd_init(void *addr); +#define pmd_init pmd_init /* * Encode/decode swap entries and swap PTEs. 
Swap PTEs are all PTEs that diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 9c48d9a21aa0..b700dae28c48 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -105,6 +105,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 8ab7582291ab..d118d4731580 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -157,6 +157,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index c17157e700c0..d2c3b6b41f18 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -484,8 +484,6 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, struct kvm *kvm = vcpu->kvm; gfn_t gfn = gpa >> PAGE_SHIFT; pte_t *ptep; - kvm_pfn_t pfn = 0; /* silence bogus GCC warning */ - bool pfn_valid = false; int ret = 0; spin_lock(&kvm->mmu_lock); @@ -498,12 +496,9 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, } /* Track access to pages marked old */ - if (!pte_young(*ptep)) { + if (!pte_young(*ptep)) set_pte(ptep, pte_mkyoung(*ptep)); - pfn = pte_pfn(*ptep); - pfn_valid = true; - /* call kvm_set_pfn_accessed() after unlock */ - } + if (write_fault && !pte_dirty(*ptep)) { if (!pte_write(*ptep)) { ret = -EFAULT; @@ -512,9 +507,7 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, /* Track dirtying of writeable pages */ set_pte(ptep, pte_mkdirty(*ptep)); - pfn = pte_pfn(*ptep); mark_page_dirty(kvm, gfn); - kvm_set_pfn_dirty(pfn); } if (out_entry) @@ -524,8 +517,6 @@ static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, out: spin_unlock(&kvm->mmu_lock); - if (pfn_valid) - kvm_set_pfn_accessed(pfn); return ret; } @@ -566,6 +557,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool writeable; unsigned long prot_bits; unsigned long mmu_seq; + struct page *page; /* Try the fast path to handle old / clean pages */ srcu_idx = srcu_read_lock(&kvm->srcu); @@ -587,7 +579,7 @@ retry: mmu_seq = kvm->mmu_invalidate_seq; /* * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads - * in gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * in kvm_faultin_pfn() (which calls get_user_pages()), so that we don't * risk the page we get a reference to getting unmapped before we have a * chance to grab the mmu_lock without mmu_invalidate_retry() noticing. * @@ -599,7 +591,7 @@ retry: smp_rmb(); /* Slow path - ask KVM core whether we can access this GPA */ - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writeable); + pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writeable, &page); if (is_error_noslot_pfn(pfn)) { err = -EFAULT; goto out; @@ -611,10 +603,10 @@ retry: /* * This can happen when mappings are changed asynchronously, but * also synchronously if a COW is triggered by - * gfn_to_pfn_prot(). + * kvm_faultin_pfn(). 
*/ spin_unlock(&kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_page_unused(page); goto retry; } @@ -628,7 +620,6 @@ retry: if (write_fault) { prot_bits |= __WRITEABLE; mark_page_dirty(kvm, gfn); - kvm_set_pfn_dirty(pfn); } } entry = pfn_pte(pfn, __pgprot(prot_bits)); @@ -642,9 +633,8 @@ retry: if (out_buddy) *out_buddy = *ptep_buddy(ptep); + kvm_release_faultin_page(kvm, page, false, writeable); spin_unlock(&kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - kvm_set_pfn_accessed(pfn); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 0d09829ed144..28004301c236 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -7,3 +7,4 @@ generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += spinlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index cef49d60d74c..2b1a6b00cdac 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -9,3 +9,4 @@ generic-y += spinlock.h generic-y += qrwlock_types.h generic-y += qrwlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 72daacc472a0..5b3a5429f71b 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -12,21 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/parisc/include/asm/patch.h b/arch/parisc/include/asm/text-patching.h index 400d84c6e504..400d84c6e504 100644 --- a/arch/parisc/include/asm/patch.h +++ b/arch/parisc/include/asm/text-patching.h diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 68c44f99bc93..b6a709506987 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -75,6 +75,9 @@ #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 38fc0b188e08..d268d69bfcd2 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -138,6 +138,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 0x404C + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index c91f9c2e61ed..10fd5b3e63e7 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -20,7 +20,7 @@ #include <asm/assembly.h> #include <asm/sections.h> #include <asm/ftrace.h> -#include 
<asm/patch.h> +#include <asm/text-patching.h> #define __hot __section(".text.hot") @@ -87,7 +87,7 @@ int ftrace_enable_ftrace_graph_caller(void) int ftrace_disable_ftrace_graph_caller(void) { - static_key_enable(&ftrace_graph_enable.key); + static_key_disable(&ftrace_graph_enable.key); return 0; } #endif diff --git a/arch/parisc/kernel/jump_label.c b/arch/parisc/kernel/jump_label.c index e253b134500d..ea51f15bf0e6 100644 --- a/arch/parisc/kernel/jump_label.c +++ b/arch/parisc/kernel/jump_label.c @@ -8,7 +8,7 @@ #include <linux/jump_label.h> #include <linux/bug.h> #include <asm/alternative.h> -#include <asm/patch.h> +#include <asm/text-patching.h> static inline int reassemble_17(int as17) { diff --git a/arch/parisc/kernel/kgdb.c b/arch/parisc/kernel/kgdb.c index b16fa9bac5f4..fee81f877525 100644 --- a/arch/parisc/kernel/kgdb.c +++ b/arch/parisc/kernel/kgdb.c @@ -16,7 +16,7 @@ #include <asm/ptrace.h> #include <asm/traps.h> #include <asm/processor.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> const struct kgdb_arch arch_kgdb_ops = { diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 6e0b86652f30..9255adba67a3 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -12,7 +12,7 @@ #include <linux/kprobes.h> #include <linux/slab.h> #include <asm/cacheflush.h> -#include <asm/patch.h> +#include <asm/text-patching.h> DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index e59574f65e64..35dd764b871e 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -13,7 +13,7 @@ #include <asm/cacheflush.h> #include <asm/fixmap.h> -#include <asm/patch.h> +#include <asm/text-patching.h> struct patch { void *addr; diff --git a/arch/parisc/lib/checksum.c b/arch/parisc/lib/checksum.c index 4818f3db84a5..59d8c15d81bd 100644 --- a/arch/parisc/lib/checksum.c +++ b/arch/parisc/lib/checksum.c @@ -25,15 +25,6 @@ : "=r"(_t) \ : "r"(_r), "0"(_t)); -static inline unsigned short from32to16(unsigned int x) -{ - /* 32 bits --> 16 bits + carry */ - x = (x & 0xffff) + (x >> 16); - /* 16 bits + carry --> 16 bits including carry */ - x = (x & 0xffff) + (x >> 16); - return (unsigned short)x; -} - static inline unsigned int do_csum(const unsigned char * buff, int len) { int odd, count; @@ -85,7 +76,7 @@ static inline unsigned int do_csum(const unsigned char * buff, int len) } if (len & 1) result += le16_to_cpu(*buff); - result = from32to16(result); + result = csum_from32to16(result); if (odd) result = swab16(result); out: @@ -102,7 +93,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) { unsigned int result = do_csum(buff, len); addc(result, sum); - return (__force __wsum)from32to16(result); + return (__force __wsum)csum_from32to16(result); } EXPORT_SYMBOL(csum_partial); diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa664f7ddb63..e9d18cf25b79 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -21,27 +21,6 @@ #include <asm/mmu_context.h> -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE) - return -ENOMEM; - - if (flags & MAP_FIXED) - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - - if (addr) - addr 
= ALIGN(addr, huge_page_size(h)); - - /* we need to make sure the colouring is OK */ - return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); -} pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild index 571f260b0842..b010ccb071b6 100644 --- a/arch/powerpc/Kbuild +++ b/arch/powerpc/Kbuild @@ -19,4 +19,4 @@ obj-$(CONFIG_KEXEC_CORE) += kexec/ obj-$(CONFIG_KEXEC_FILE) += purgatory/ # for cleaning -subdir- += boot +subdir- += boot tools diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 568560671cf4..a0ce777f9706 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -234,6 +234,8 @@ config PPC select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_ARGS if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32 + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if PPC_FTRACE_OUT_OF_LINE || (PPC32 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY) + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_REGS if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -243,7 +245,7 @@ config PPC select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_TRACER if PPC64 || (PPC32 && CC_IS_GCC) + select HAVE_FUNCTION_TRACER if !COMPILE_TEST && (PPC64 || (PPC32 && CC_IS_GCC)) select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC select HAVE_GENERIC_VDSO select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC_BOOK3S_64 && SMP @@ -273,10 +275,12 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECT if HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_SETUP_PER_CPU_AREA if PPC64 select HAVE_SOFTIRQ_ON_OWN_STACK - select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) - select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) + select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,$(m32-flag) -mstack-protector-guard=tls -mstack-protector-guard-reg=r2 -mstack-protector-guard-offset=0) + select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,$(m64-flag) -mstack-protector-guard=tls -mstack-protector-guard-reg=r13 -mstack-protector-guard-offset=0) select HAVE_STATIC_CALL if PPC32 select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING @@ -569,6 +573,22 @@ config ARCH_USING_PATCHABLE_FUNCTION_ENTRY def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh $(CC) -mlittle-endian) if PPC64 && CPU_LITTLE_ENDIAN def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh $(CC) -mbig-endian) if PPC64 && CPU_BIG_ENDIAN +config PPC_FTRACE_OUT_OF_LINE + def_bool PPC64 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY + select ARCH_WANTS_PRE_LINK_VMLINUX + +config PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE + int "Number of ftrace out-of-line stubs to reserve within .text" + depends on PPC_FTRACE_OUT_OF_LINE + default 32768 + help + Number of stubs to reserve for use by ftrace. This space is + reserved within .text, and is distinct from any additional space + added at the end of .text before the final vmlinux link. 
Set to + zero to have stubs only be generated at the end of vmlinux (only + if the size of vmlinux is less than 32MB). Set to a higher value + if building vmlinux larger than 48MB. + config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" depends on SMP && (PPC_PSERIES || \ diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 0bbec4afc0d5..20d05605fa83 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -223,12 +223,6 @@ config PPC_EARLY_DEBUG_RTAS_CONSOLE help Select this to enable early debugging via the RTAS console. -config PPC_EARLY_DEBUG_MAPLE - bool "Maple real mode" - depends on PPC_MAPLE - help - Select this to enable early debugging for Maple. - config PPC_EARLY_DEBUG_PAS_REALMODE bool "PA Semi real mode" depends on PPC_PASEMI diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index bbfe4a1f06ef..41489483a602 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -62,14 +62,14 @@ KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o endif ifdef CONFIG_CPU_LITTLE_ENDIAN -KBUILD_CFLAGS += -mlittle-endian +KBUILD_CPPFLAGS += -mlittle-endian KBUILD_LDFLAGS += -EL LDEMULATION := lppc GNUTARGET := powerpcle MULTIPLEWORD := -mno-multiple KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-save-toc-indirect) else -KBUILD_CFLAGS += $(call cc-option,-mbig-endian) +KBUILD_CPPFLAGS += $(call cc-option,-mbig-endian) KBUILD_LDFLAGS += -EB LDEMULATION := ppc GNUTARGET := powerpc @@ -95,18 +95,11 @@ aflags-$(CONFIG_CPU_BIG_ENDIAN) += $(call cc-option,-mbig-endian) aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian ifeq ($(HAS_BIARCH),y) -KBUILD_CFLAGS += -m$(BITS) +KBUILD_CPPFLAGS += -m$(BITS) KBUILD_AFLAGS += -m$(BITS) KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION) endif -cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard=tls -ifdef CONFIG_PPC64 -cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard-reg=r13 -else -cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard-reg=r2 -endif - LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) += -z notext @@ -155,7 +148,15 @@ CC_FLAGS_NO_FPU := $(call cc-option,-msoft-float) ifdef CONFIG_FUNCTION_TRACER ifdef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY +ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +CC_FLAGS_FTRACE := -fpatchable-function-entry=1 +else +ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS # PPC32 only +CC_FLAGS_FTRACE := -fpatchable-function-entry=3,1 +else CC_FLAGS_FTRACE := -fpatchable-function-entry=2 +endif +endif else CC_FLAGS_FTRACE := -pg ifdef CONFIG_MPROFILE_KERNEL @@ -175,7 +176,6 @@ KBUILD_CPPFLAGS += -I $(srctree)/arch/powerpc $(asinstr) KBUILD_AFLAGS += $(AFLAGS-y) KBUILD_CFLAGS += $(CC_FLAGS_NO_FPU) KBUILD_CFLAGS += $(CFLAGS-y) -CPP = $(CC) -E $(KBUILD_CFLAGS) CHECKFLAGS += -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__ ifdef CONFIG_CPU_BIG_ENDIAN @@ -359,7 +359,7 @@ define archhelp echo ' install - Install kernel using' echo ' (your) ~/bin/$(INSTALLKERNEL) or' echo ' (distribution) /sbin/$(INSTALLKERNEL) or' - echo ' install to $$(INSTALL_PATH) and run lilo' + echo ' install to $$(INSTALL_PATH)' echo ' *_defconfig - Select default config from arch/powerpc/configs' echo '' echo ' Targets with <dt> embed a device tree blob inside the image' @@ -402,9 +402,11 @@ prepare: stack_protector_prepare PHONY += stack_protector_prepare stack_protector_prepare: prepare0 ifdef CONFIG_PPC64 - $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk 
'{if ($$2 == "PACA_CANARY") print $$3;}' include/generated/asm-offsets.h)) + $(eval KBUILD_CFLAGS += -mstack-protector-guard=tls -mstack-protector-guard-reg=r13 \ + -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "PACA_CANARY") print $$3;}' include/generated/asm-offsets.h)) else - $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h)) + $(eval KBUILD_CFLAGS += -mstack-protector-guard=tls -mstack-protector-guard-reg=r2 \ + -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h)) endif endif diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index ae5a4256b03d..bb601be36173 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -24,6 +24,9 @@ else $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" endif +quiet_cmd_ftrace_check = CHKFTRC $@ + cmd_ftrace_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/ftrace_check.sh "$(NM)" "$@" + # `@true` prevents complaint when there is nothing to be done vmlinux: FORCE @@ -34,6 +37,11 @@ endif ifdef CONFIG_RELOCATABLE $(call if_changed,relocs_check) endif +ifdef CONFIG_FUNCTION_TRACER +ifndef CONFIG_PPC64_ELF_ABI_V1 + $(call cmd,ftrace_check) +endif +endif clean: rm -f .tmp_symbols.txt diff --git a/arch/powerpc/boot/.gitignore b/arch/powerpc/boot/.gitignore index a4716d138cfc..5a867f23fe7f 100644 --- a/arch/powerpc/boot/.gitignore +++ b/arch/powerpc/boot/.gitignore @@ -30,7 +30,6 @@ zImage.coff zImage.epapr zImage.holly zImage.*lds -zImage.maple zImage.miboot zImage.pmac zImage.pseries diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index fa8518067d38..1ff6ad4f6cd2 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -276,7 +276,6 @@ quiet_cmd_wrap = WRAP $@ image-$(CONFIG_PPC_PSERIES) += zImage.pseries image-$(CONFIG_PPC_POWERNV) += zImage.pseries -image-$(CONFIG_PPC_MAPLE) += zImage.maple image-$(CONFIG_PPC_IBM_CELL_BLADE) += zImage.pseries image-$(CONFIG_PPC_PS3) += dtbImage.ps3 image-$(CONFIG_PPC_CHRP) += zImage.chrp @@ -444,7 +443,7 @@ $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ zImage zImage.initrd zImage.chrp zImage.coff zImage.holly \ zImage.miboot zImage.pmac zImage.pseries \ - zImage.maple simpleImage.* otheros.bld + simpleImage.* otheros.bld # clean up files cached by wrapper clean-kernel-base := vmlinux.strip vmlinux.bin diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index b1f5549a3c9c..1db60fe13802 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -271,11 +271,6 @@ pseries) fi make_space=n ;; -maple) - platformo="$object/of.o $object/epapr.o" - link_address='0x400000' - make_space=n - ;; pmac|chrp) platformo="$object/of.o $object/epapr.o" make_space=n @@ -517,7 +512,7 @@ fi # post-processing needed for some platforms case "$platform" in -pseries|chrp|maple) +pseries|chrp) $objbin/addnote "$ofile" ;; coff) diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig deleted file mode 100644 index c821a97f4a89..000000000000 --- a/arch/powerpc/configs/maple_defconfig +++ /dev/null @@ -1,111 +0,0 @@ -CONFIG_PPC64=y -CONFIG_SMP=y -CONFIG_NR_CPUS=4 -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -# CONFIG_COMPAT_BRK is not set 
-CONFIG_PROFILING=y -CONFIG_KPROBES=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -CONFIG_MAC_PARTITION=y -# CONFIG_PPC_POWERNV is not set -# CONFIG_PPC_PSERIES is not set -# CONFIG_PPC_PMAC is not set -CONFIG_PPC_MAPLE=y -CONFIG_UDBG_RTAS_CONSOLE=y -CONFIG_GEN_RTC=y -CONFIG_KEXEC=y -CONFIG_IRQ_ALL_CPUS=y -CONFIG_PPC_4K_PAGES=y -CONFIG_PCI_MSI=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_XFRM_USER=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -# CONFIG_IPV6 is not set -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=8192 -# CONFIG_SCSI_PROC_FS is not set -CONFIG_BLK_DEV_SD=y -CONFIG_BLK_DEV_SR=y -CONFIG_CHR_DEV_SG=y -CONFIG_SCSI_IPR=y -CONFIG_ATA=y -CONFIG_PATA_AMD=y -CONFIG_ATA_GENERIC=y -CONFIG_NETDEVICES=y -CONFIG_AMD8111_ETH=y -CONFIG_TIGON3=y -CONFIG_E1000=y -CONFIG_USB_PEGASUS=y -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_HVC_RTAS=y -# CONFIG_HW_RANDOM is not set -CONFIG_I2C=y -CONFIG_I2C_CHARDEV=y -CONFIG_I2C_AMD8111=y -# CONFIG_VGA_CONSOLE is not set -CONFIG_HID_GYRATION=y -CONFIG_HID_PANTHERLORD=y -CONFIG_HID_PETALYNX=y -CONFIG_HID_SAMSUNG=y -CONFIG_HID_SUNPLUS=y -CONFIG_USB=y -CONFIG_USB_MON=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_EHCI_ROOT_HUB_TT=y -# CONFIG_USB_EHCI_HCD_PPC_OF is not set -CONFIG_USB_OHCI_HCD=y -CONFIG_USB_UHCI_HCD=y -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_CYPRESS_M8=m -CONFIG_USB_SERIAL_GARMIN=m -CONFIG_USB_SERIAL_IPW=m -CONFIG_USB_SERIAL_KEYSPAN=y -CONFIG_USB_SERIAL_TI=m -CONFIG_EXT2_FS=y -CONFIG_EXT4_FS=y -CONFIG_FS_DAX=y -CONFIG_MSDOS_FS=y -CONFIG_VFAT_FS=y -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_HUGETLBFS=y -CONFIG_CRAMFS=y -CONFIG_NFS_FS=y -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=y -CONFIG_ROOT_NFS=y -CONFIG_NLS_DEFAULT="utf-8" -CONFIG_NLS_UTF8=y -CONFIG_CRC_CCITT=y -CONFIG_CRC_T10DIF=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_STACK_USAGE=y -CONFIG_DEBUG_STACKOVERFLOW=y -CONFIG_XMON=y -CONFIG_XMON_DEFAULT=y -CONFIG_BOOTX_TEXT=y -CONFIG_CRYPTO_ECB=m -CONFIG_CRYPTO_PCBC=m -# CONFIG_CRYPTO_HW is not set -CONFIG_PRINTK_TIME=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index a5e3e7f97f4d..f39c0d000c43 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -44,7 +44,6 @@ CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y CONFIG_PAPR_SCM=m CONFIG_PPC_SVM=y -CONFIG_PPC_MAPLE=y CONFIG_PPC_PASEMI=y CONFIG_PPC_PASEMI_IOMMU=y CONFIG_PPC_PS3=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index c06344db0eb3..4d77e17541e9 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -435,7 +435,6 @@ CONFIG_DM9102=m CONFIG_ULI526X=m CONFIG_PCMCIA_XIRCOM=m CONFIG_DL2K=m -CONFIG_SUNDANCE=m CONFIG_S2IO=m CONFIG_FEC_MPC52xx=m CONFIG_GIANFAR=m diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 201218faed61..29a529d2ab8b 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -193,6 +193,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_ARCH_31 LONG_ASM_CONST(0x0004000000000000) #define CPU_FTR_DAWR1 LONG_ASM_CONST(0x0008000000000000) #define CPU_FTR_DEXCR_NPHIE LONG_ASM_CONST(0x0010000000000000) +#define 
CPU_FTR_P11_PVR LONG_ASM_CONST(0x0020000000000000) #ifndef __ASSEMBLY__ @@ -454,7 +455,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_DAWR | CPU_FTR_DAWR1 | \ CPU_FTR_DEXCR_NPHIE) -#define CPU_FTRS_POWER11 CPU_FTRS_POWER10 +#define CPU_FTRS_POWER11 (CPU_FTRS_POWER10 | CPU_FTR_P11_PVR) #define CPU_FTRS_CELL (CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ @@ -475,7 +476,7 @@ static inline void cpu_feature_keys_init(void) { } (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \ CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | \ - CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10) + CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10 | CPU_FTRS_POWER11) #else #define CPU_FTRS_POSSIBLE \ (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \ @@ -483,7 +484,7 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \ CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \ CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | \ - CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10) + CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10 | CPU_FTRS_POWER11) #endif /* CONFIG_CPU_LITTLE_ENDIAN */ #endif #else @@ -547,7 +548,7 @@ enum { (CPU_FTRS_POSSIBLE & ~CPU_FTR_HVMODE & ~CPU_FTR_DBELL & \ CPU_FTRS_POWER7 & CPU_FTRS_POWER8E & CPU_FTRS_POWER8 & \ CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & \ - CPU_FTRS_POWER10 & CPU_FTRS_DT_CPU_BASE) + CPU_FTRS_POWER10 & CPU_FTRS_POWER11 & CPU_FTRS_DT_CPU_BASE) #else #define CPU_FTRS_ALWAYS \ (CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & \ @@ -555,7 +556,7 @@ enum { CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \ ~CPU_FTR_HVMODE & ~CPU_FTR_DBELL & CPU_FTRS_POSSIBLE & \ CPU_FTRS_POWER9 & CPU_FTRS_POWER9_DD2_1 & CPU_FTRS_POWER9_DD2_2 & \ - CPU_FTRS_POWER10 & CPU_FTRS_DT_CPU_BASE) + CPU_FTRS_POWER10 & CPU_FTRS_POWER11 & CPU_FTRS_DT_CPU_BASE) #endif /* CONFIG_CPU_LITTLE_ENDIAN */ #endif #else diff --git a/arch/powerpc/include/asm/dtl.h b/arch/powerpc/include/asm/dtl.h index d6f43d149f8d..a5c21bc623cb 100644 --- a/arch/powerpc/include/asm/dtl.h +++ b/arch/powerpc/include/asm/dtl.h @@ -1,8 +1,8 @@ #ifndef _ASM_POWERPC_DTL_H #define _ASM_POWERPC_DTL_H +#include <linux/rwsem.h> #include <asm/lppaca.h> -#include <linux/spinlock_types.h> /* * Layout of entries in the hypervisor's dispatch trace log buffer. 
@@ -35,7 +35,7 @@ struct dtl_entry { #define DTL_LOG_ALL (DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT) extern struct kmem_cache *dtl_cache; -extern rwlock_t dtl_access_lock; +extern struct rw_semaphore dtl_access_lock; extern void register_dtl_buffer(int cpu); extern void alloc_dtl_buffers(unsigned long *time_limit); diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index ef40c9b6972a..a48f54dde4f6 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -19,6 +19,7 @@ extern int is_fadump_active(void); extern int should_fadump_crash(void); extern void crash_fadump(struct pt_regs *, const char *); extern void fadump_cleanup(void); +void fadump_setup_param_area(void); extern void fadump_append_bootargs(void); #else /* CONFIG_FA_DUMP */ @@ -26,6 +27,7 @@ static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } static inline void fadump_cleanup(void) { } +static inline void fadump_setup_param_area(void) { } static inline void fadump_append_bootargs(void) { } #endif /* !CONFIG_FA_DUMP */ @@ -34,4 +36,11 @@ extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); extern int fadump_reserve_mem(void); #endif + +#if defined(CONFIG_FA_DUMP) && defined(CONFIG_CMA) +void fadump_cma_init(void); +#else +static inline void fadump_cma_init(void) { } +#endif + #endif /* _ASM_POWERPC_FADUMP_H */ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 0edfb874eb02..db481b336bca 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -24,7 +24,10 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, struct module; struct dyn_ftrace; struct dyn_arch_ftrace { - struct module *mod; +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + /* pointer to the associated out-of-line stub */ + unsigned long ool_stub; +#endif }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS @@ -110,8 +113,36 @@ static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; } #ifdef CONFIG_FUNCTION_TRACER extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +struct ftrace_ool_stub { +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + struct ftrace_ops *ftrace_op; +#endif + u32 insn[4]; +} __aligned(sizeof(unsigned long)); +extern struct ftrace_ool_stub ftrace_ool_stub_text_end[], ftrace_ool_stub_text[], + ftrace_ool_stub_inittext[]; +extern unsigned int ftrace_ool_stub_text_end_count, ftrace_ool_stub_text_count, + ftrace_ool_stub_inittext_count; +#endif void ftrace_free_init_tramp(void); unsigned long ftrace_call_adjust(unsigned long addr); + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When an ftrace registered caller is tracing a function that is also set by a + * register_ftrace_direct() call, it needs to be differentiated in the + * ftrace_caller trampoline so that the direct call can be invoked after the + * other ftrace ops. To do this, place the direct caller in the orig_gpr3 field + * of pt_regs. This tells ftrace_caller that there's a direct caller. 
+ */ +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) +{ + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; + + regs->orig_gpr3 = addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #else static inline void ftrace_free_init_tramp(void) { } static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 7a8495660c2f..65d1f291393d 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -495,6 +495,7 @@ #define H_GUEST_CAP_COPY_MEM (1UL<<(63-0)) #define H_GUEST_CAP_POWER9 (1UL<<(63-1)) #define H_GUEST_CAP_POWER10 (1UL<<(63-2)) +#define H_GUEST_CAP_POWER11 (1UL<<(63-3)) #define H_GUEST_CAP_BITMAP2 (1UL<<(63-63)) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/kfence.h b/arch/powerpc/include/asm/kfence.h index fab124ada1c7..1f7cab58ab2c 100644 --- a/arch/powerpc/include/asm/kfence.h +++ b/arch/powerpc/include/asm/kfence.h @@ -15,7 +15,7 @@ #define ARCH_FUNC_PREFIX "." #endif -#ifdef CONFIG_KFENCE +extern bool kfence_early_init; extern bool kfence_disabled; static inline void disable_kfence(void) @@ -27,7 +27,11 @@ static inline bool arch_kfence_init_pool(void) { return !kfence_disabled; } -#endif + +static inline bool kfence_early_init_enabled(void) +{ + return IS_ENABLED(CONFIG_KFENCE) && kfence_early_init; +} #ifdef CONFIG_PPC64 static inline bool kfence_protect_page(unsigned long addr, bool protect) diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 4525a9c68260..dfe2e5ad3b21 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -21,7 +21,7 @@ #include <linux/percpu.h> #include <linux/module.h> #include <asm/probes.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #ifdef CONFIG_KPROBES #define __ARCH_WANT_KPROBES_INSN_SLOT diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 10618622d7ef..e1ff291ba891 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -203,7 +203,7 @@ extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long gpa, struct kvm_memory_slot *memslot, - bool writing, bool kvm_ro, + bool writing, pte_t *inserted_pte, unsigned int *levelp); extern int kvmppc_init_vm_radix(struct kvm *kvm); extern void kvmppc_free_radix(struct kvm *kvm); @@ -235,7 +235,7 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_vcpu *vcpu); extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, - bool writing, bool *writable); + bool writing, bool *writable, struct page **page); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); extern void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot, diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 2ef9a5f4e5d1..b936e174eefd 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -684,10 +684,16 @@ int kvmhv_nestedv2_set_ptbl_entry(unsigned long lpid, u64 dw0, u64 dw1); int kvmhv_nestedv2_parse_output(struct kvm_vcpu *vcpu); int 
kvmhv_nestedv2_set_vpa(struct kvm_vcpu *vcpu, unsigned long vpa); -int kmvhv_counters_tracepoint_regfunc(void); -void kmvhv_counters_tracepoint_unregfunc(void); +int kvmhv_counters_tracepoint_regfunc(void); +void kvmhv_counters_tracepoint_unregfunc(void); int kvmhv_get_l2_counters_status(void); void kvmhv_set_l2_counters_status(int cpu, bool status); +u64 kvmhv_get_l1_to_l2_cs_time(void); +u64 kvmhv_get_l2_to_l1_cs_time(void); +u64 kvmhv_get_l2_runtime_agg(void); +u64 kvmhv_get_l1_to_l2_cs_time_vcpu(void); +u64 kvmhv_get_l2_to_l1_cs_time_vcpu(void); +u64 kvmhv_get_l2_runtime_agg_vcpu(void); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 37e581c5b201..6e1108f8fce6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -871,6 +871,11 @@ struct kvm_vcpu_arch { struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ #endif #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + u64 l1_to_l2_cs; + u64 l2_to_l1_cs; + u64 l2_runtime_agg; +#endif }; #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET] diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 1862f94335ee..3298eec123a3 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -4,20 +4,24 @@ #ifdef __KERNEL__ #include <linux/compiler.h> -#include <linux/seq_file.h> #include <linux/init.h> -#include <linux/dma-mapping.h> #include <linux/export.h> +#include <linux/time64.h> + +#include <asm/page.h> struct pt_regs; struct pci_bus; +struct device; struct device_node; struct iommu_table; struct rtc_time; struct file; +struct pci_dev; struct pci_controller; struct kimage; struct pci_host_bridge; +struct seq_file; struct machdep_calls { const char *name; diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index 300c777cc307..e1ee5026ac4a 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -35,9 +35,11 @@ struct mod_arch_specific { bool toc_fixed; /* Have we fixed up .TOC.? */ #endif +#ifdef CONFIG_PPC64_ELF_ABI_V1 /* For module function descriptor dereference */ unsigned long start_opd; unsigned long end_opd; +#endif #else /* powerpc64 */ /* Indices of PLT sections within module. 
*/ unsigned int core_plt_section; @@ -47,6 +49,11 @@ struct mod_arch_specific { #ifdef CONFIG_DYNAMIC_FTRACE unsigned long tramp; unsigned long tramp_regs; +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + struct ftrace_ool_stub *ool_stubs; + unsigned int ool_stub_count; + unsigned int ool_stub_index; +#endif #endif }; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index b98a9e982c03..4312bcb913a4 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -587,12 +587,26 @@ #define PPC_RAW_MTSPR(spr, d) (0x7c0003a6 | ___PPC_RS(d) | __PPC_SPR(spr)) #define PPC_RAW_EIEIO() (0x7c0006ac) +/* bcl 20,31,$+4 */ +#define PPC_RAW_BCL4() (0x429f0005) #define PPC_RAW_BRANCH(offset) (0x48000000 | PPC_LI(offset)) #define PPC_RAW_BL(offset) (0x48000001 | PPC_LI(offset)) #define PPC_RAW_TW(t0, a, b) (0x7c000008 | ___PPC_RS(t0) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_TRAP() PPC_RAW_TW(31, 0, 0) #define PPC_RAW_SETB(t, bfa) (0x7c000100 | ___PPC_RT(t) | ___PPC_RA((bfa) << 2)) +#ifdef CONFIG_PPC32 +#define PPC_RAW_STL PPC_RAW_STW +#define PPC_RAW_STLU PPC_RAW_STWU +#define PPC_RAW_LL PPC_RAW_LWZ +#define PPC_RAW_CMPLI PPC_RAW_CMPWI +#else +#define PPC_RAW_STL PPC_RAW_STD +#define PPC_RAW_STLU PPC_RAW_STDU +#define PPC_RAW_LL PPC_RAW_LD +#define PPC_RAW_CMPLI PPC_RAW_CMPDI +#endif + /* Deal with instructions that older assemblers aren't aware of */ #define PPC_BCCTR_FLUSH stringify_in_c(.long PPC_INST_BCCTR_FLUSH) #define PPC_CP_ABORT stringify_in_c(.long PPC_RAW_CP_ABORT) diff --git a/arch/powerpc/include/asm/set_memory.h b/arch/powerpc/include/asm/set_memory.h index 9a025b776a4b..9c8d5747755d 100644 --- a/arch/powerpc/include/asm/set_memory.h +++ b/arch/powerpc/include/asm/set_memory.h @@ -12,37 +12,37 @@ int change_memory_attr(unsigned long addr, int numpages, long action); -static inline int set_memory_ro(unsigned long addr, int numpages) +static inline int __must_check set_memory_ro(unsigned long addr, int numpages) { return change_memory_attr(addr, numpages, SET_MEMORY_RO); } -static inline int set_memory_rw(unsigned long addr, int numpages) +static inline int __must_check set_memory_rw(unsigned long addr, int numpages) { return change_memory_attr(addr, numpages, SET_MEMORY_RW); } -static inline int set_memory_nx(unsigned long addr, int numpages) +static inline int __must_check set_memory_nx(unsigned long addr, int numpages) { return change_memory_attr(addr, numpages, SET_MEMORY_NX); } -static inline int set_memory_x(unsigned long addr, int numpages) +static inline int __must_check set_memory_x(unsigned long addr, int numpages) { return change_memory_attr(addr, numpages, SET_MEMORY_X); } -static inline int set_memory_np(unsigned long addr, int numpages) +static inline int __must_check set_memory_np(unsigned long addr, int numpages) { return change_memory_attr(addr, numpages, SET_MEMORY_NP); } -static inline int set_memory_p(unsigned long addr, int numpages) +static inline int __must_check set_memory_p(unsigned long addr, int numpages) { return change_memory_attr(addr, numpages, SET_MEMORY_P); } -static inline int set_memory_rox(unsigned long addr, int numpages) +static inline int __must_check set_memory_rox(unsigned long addr, int numpages) { return change_memory_attr(addr, numpages, SET_MEMORY_ROX); } diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h index 08243338069d..391fc19f7272 100644 --- a/arch/powerpc/include/asm/simple_spinlock_types.h +++ 
b/arch/powerpc/include/asm/simple_spinlock_types.h @@ -3,7 +3,7 @@ #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index 40b01446cf75..569765fa16bc 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define _ASM_POWERPC_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #ifdef CONFIG_PPC_QUEUED_SPINLOCKS diff --git a/arch/powerpc/include/asm/spu_priv1.h b/arch/powerpc/include/asm/spu_priv1.h index 2167d756e6d5..6fee411d973d 100644 --- a/arch/powerpc/include/asm/spu_priv1.h +++ b/arch/powerpc/include/asm/spu_priv1.h @@ -216,7 +216,6 @@ spu_disable_spu (struct spu_context *ctx) */ extern const struct spu_priv1_ops spu_priv1_mmio_ops; -extern const struct spu_priv1_ops spu_priv1_beat_ops; extern const struct spu_management_ops spu_management_of_ops; diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index 50950deedb87..e3d0e714ff28 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -173,9 +173,4 @@ int emulate_step(struct pt_regs *regs, ppc_inst_t instr); */ extern int emulate_loadstore(struct pt_regs *regs, struct instruction_op *op); -extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, - const void *mem, bool cross_endian); -extern void emulate_vsx_store(struct instruction_op *op, - const union vsx_reg *reg, void *mem, - bool cross_endian); extern int emulate_dcbz(unsigned long ea, struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/text-patching.h index e7f14720f630..e7f14720f630 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/text-patching.h diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 289023f7a656..a8681b12864f 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -38,7 +38,6 @@ void __init udbg_early_init(void); void __init udbg_init_debug_lpar(void); void __init udbg_init_debug_lpar_hvsi(void); void __init udbg_init_pmac_realmode(void); -void __init udbg_init_maple_realmode(void); void __init udbg_init_pas_realmode(void); void __init udbg_init_rtas_panel(void); void __init udbg_init_rtas_console(void); diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 7650b6ce14c8..8d972bc98b55 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -25,6 +25,7 @@ int vdso_getcpu_init(void); #ifdef __VDSO64__ #define V_FUNCTION_BEGIN(name) \ .globl name; \ + .type name,@function; \ name: \ #define V_FUNCTION_END(name) \ diff --git a/arch/powerpc/include/asm/vdso/getrandom.h b/arch/powerpc/include/asm/vdso/getrandom.h index 501d6bb14e8a..80ce0709725e 100644 --- a/arch/powerpc/include/asm/vdso/getrandom.h +++ b/arch/powerpc/include/asm/vdso/getrandom.h @@ -7,6 +7,8 @@ #ifndef __ASSEMBLY__ +#include <asm/vdso_datapage.h> + static __always_inline int do_syscall_3(const unsigned long _r0, const unsigned long _r3, const unsigned long _r4, const unsigned long _r5) { @@ -43,11 +45,21 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, 
unsig static __always_inline struct vdso_rng_data *__arch_get_vdso_rng_data(void) { - return NULL; + struct vdso_arch_data *data; + + asm ( + " bcl 20, 31, .+4 ;" + "0: mflr %0 ;" + " addis %0, %0, (_vdso_datapage - 0b)@ha ;" + " addi %0, %0, (_vdso_datapage - 0b)@l ;" + : "=r" (data) : : "lr" + ); + + return &data->rng_data; } ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, - size_t opaque_len, const struct vdso_rng_data *vd); + size_t opaque_len); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index a9686310be2c..a202f5b63479 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -28,8 +28,9 @@ struct vdso_arch_data { __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ __u32 compat_syscall_map[SYSCALL_MAP_SIZE]; /* Map of compat syscalls */ - struct vdso_data data[CS_BASES]; struct vdso_rng_data rng_data; + + struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT); }; #else /* CONFIG_PPC64 */ @@ -38,8 +39,9 @@ struct vdso_arch_data { __u64 tb_ticks_per_sec; /* Timebase tics / sec */ __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ __u32 compat_syscall_map[0]; /* No compat syscalls on PPC32 */ - struct vdso_data data[CS_BASES]; struct vdso_rng_data rng_data; + + struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT); }; #endif /* CONFIG_PPC64 */ @@ -48,29 +50,17 @@ extern struct vdso_arch_data *vdso_data; #else /* __ASSEMBLY__ */ -.macro get_datapage ptr +.macro get_datapage ptr offset=0 bcl 20, 31, .+4 999: mflr \ptr - addis \ptr, \ptr, (_vdso_datapage - 999b)@ha - addi \ptr, \ptr, (_vdso_datapage - 999b)@l + addis \ptr, \ptr, (_vdso_datapage - 999b + \offset)@ha + addi \ptr, \ptr, (_vdso_datapage - 999b + \offset)@l .endm #include <asm/asm-offsets.h> #include <asm/page.h> -.macro get_realdatapage ptr scratch - get_datapage \ptr -#ifdef CONFIG_TIME_NS - lwz \scratch, VDSO_CLOCKMODE_OFFSET(\ptr) - xoris \scratch, \scratch, VDSO_CLOCKMODE_TIMENS@h - xori \scratch, \scratch, VDSO_CLOCKMODE_TIMENS@l - cntlzw \scratch, \scratch - rlwinm \scratch, \scratch, PAGE_SHIFT - 5, 1 << PAGE_SHIFT - add \ptr, \ptr, \scratch -#endif -.endm - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 131a8cc10dbe..7a390bd4f4af 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -335,7 +335,6 @@ int main(void) /* datapage offsets for use by vdso */ OFFSET(VDSO_DATA_OFFSET, vdso_arch_data, data); - OFFSET(VDSO_RNG_DATA_OFFSET, vdso_arch_data, rng_data); OFFSET(CFG_TB_TICKS_PER_SEC, vdso_arch_data, tb_ticks_per_sec); #ifdef CONFIG_PPC64 OFFSET(CFG_ICACHE_BLOCKSZ, vdso_arch_data, icache_block_size); @@ -347,8 +346,6 @@ int main(void) #else OFFSET(CFG_SYSCALL_MAP32, vdso_arch_data, syscall_map); #endif - OFFSET(VDSO_CLOCKMODE_OFFSET, vdso_arch_data, data[0].clock_mode); - DEFINE(VDSO_CLOCKMODE_TIMENS, VDSO_CLOCKMODE_TIMENS); #ifdef CONFIG_BUG DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); @@ -597,7 +594,6 @@ int main(void) HSTATE_FIELD(HSTATE_DABR, dabr); HSTATE_FIELD(HSTATE_DECEXP, dec_expires); HSTATE_FIELD(HSTATE_SPLIT_MODE, kvm_split_mode); - DEFINE(IPI_PRIORITY, IPI_PRIORITY); OFFSET(KVM_SPLIT_RPR, kvm_split_mode, rpr); OFFSET(KVM_SPLIT_PMMAR, kvm_split_mode, pmmar); OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); @@ -677,5 +673,16 @@ int main(void) DEFINE(BPT_SIZE, BPT_SIZE); 
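The vdso_arch_data reshuffle above appears intended to keep the arch fields (syscall maps, rng_data) on the leading page while the generic vdso_data starts on its own page boundary, so the time-namespace page can be substituted without disturbing rng_data. A host-side model of that layout constraint, assuming a 4 KiB page and hypothetical stand-in types:

/* Illustrative host-side model, not part of this patch. */
#include <stddef.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u	/* assumed page size for illustration */

struct model_rng_data { unsigned long generation; unsigned int is_ready; };
struct model_vdso_data { unsigned long placeholder[64]; };

struct model_arch_data {
	unsigned long long tb_ticks_per_sec;
	unsigned int syscall_map[28];
	struct model_rng_data rng_data;
	/* page-aligned so the following page(s) can be remapped independently */
	struct model_vdso_data data[2] __attribute__((aligned(MODEL_PAGE_SIZE)));
};

_Static_assert(offsetof(struct model_arch_data, rng_data) < MODEL_PAGE_SIZE,
	       "rng_data must stay on the first (arch) page");
_Static_assert(offsetof(struct model_arch_data, data) % MODEL_PAGE_SIZE == 0,
	       "vdso_data must start on a page boundary");

int main(void)
{
	printf("rng_data at offset %zu, data[] at offset %zu\n",
	       offsetof(struct model_arch_data, rng_data),
	       offsetof(struct model_arch_data, data));
	return 0;
}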
#endif +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + DEFINE(FTRACE_OOL_STUB_SIZE, sizeof(struct ftrace_ool_stub)); +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + OFFSET(FTRACE_OPS_FUNC, ftrace_ops, func); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + OFFSET(FTRACE_OPS_DIRECT_CALL, ftrace_ops, direct_call); +#endif +#endif + return 0; } diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index 2086fa6cdc25..103b6605dd68 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -13,7 +13,7 @@ #include <linux/io.h> #include <linux/memblock.h> #include <linux/of.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/kdump.h> #include <asm/firmware.h> #include <linux/uio.h> diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c index d4b8aff20815..247ab2acaccc 100644 --- a/arch/powerpc/kernel/epapr_paravirt.c +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -9,7 +9,7 @@ #include <linux/of_fdt.h> #include <asm/epapr_hcalls.h> #include <asm/cacheflush.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/machdep.h> #include <asm/inst.h> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a612e7513a4f..4b371c738213 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -78,26 +78,38 @@ static struct cma *fadump_cma; * But for some reason even if it fails we still have the memory reservation * with us and we can still continue doing fadump. */ -static int __init fadump_cma_init(void) +void __init fadump_cma_init(void) { - unsigned long long base, size; + unsigned long long base, size, end; int rc; - if (!fw_dump.fadump_enabled) - return 0; - + if (!fw_dump.fadump_supported || !fw_dump.fadump_enabled || + fw_dump.dump_active) + return; /* * Do not use CMA if user has provided fadump=nocma kernel parameter. - * Return 1 to continue with fadump old behaviour. */ - if (fw_dump.nocma) - return 1; + if (fw_dump.nocma || !fw_dump.boot_memory_size) + return; + /* + * [base, end) should be reserved during early init in + * fadump_reserve_mem(). No need to check this here as + * cma_init_reserved_mem() already checks for overlap. + * Here we give the aligned chunk of this reserved memory to CMA. + */ base = fw_dump.reserve_dump_area_start; size = fw_dump.boot_memory_size; + end = base + size; - if (!size) - return 0; + base = ALIGN(base, CMA_MIN_ALIGNMENT_BYTES); + end = ALIGN_DOWN(end, CMA_MIN_ALIGNMENT_BYTES); + size = end - base; + + if (end <= base) { + pr_warn("%s: Too less memory to give to CMA\n", __func__); + return; + } rc = cma_init_reserved_mem(base, size, 0, "fadump_cma", &fadump_cma); if (rc) { @@ -108,7 +120,7 @@ static int __init fadump_cma_init(void) * blocked from production system usage. Hence return 1, * so that we can continue with fadump. */ - return 1; + return; } /* @@ -120,15 +132,13 @@ static int __init fadump_cma_init(void) /* * So we now have successfully initialized cma area for fadump. 
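As a rough illustration of the alignment arithmetic used when handing the reserved dump region over to CMA (align the base up, align the end down, bail out if nothing usable remains), here is a small host-side sketch; the 16 MiB alignment stands in for CMA_MIN_ALIGNMENT_BYTES and the addresses are made up.

/* Illustrative host-side sketch, not part of this patch. */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)   (((x) + ((a) - 1)) & ~((uint64_t)(a) - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))

#define CMA_MIN_ALIGNMENT (16ull << 20)	/* assumed value for illustration */

int main(void)
{
	uint64_t base = 0x2080000;	/* made-up start of the reserved dump area */
	uint64_t size = 0x10000000;	/* made-up boot memory size */
	uint64_t end  = base + size;

	base = ALIGN_UP(base, CMA_MIN_ALIGNMENT);
	end  = ALIGN_DOWN(end, CMA_MIN_ALIGNMENT);

	if (end <= base) {
		/* Nothing usable remains after trimming; skip CMA, keep the reservation. */
		puts("too little memory to give to CMA");
		return 0;
	}

	printf("CMA chunk: [%#llx, %#llx), %llu MiB\n",
	       (unsigned long long)base, (unsigned long long)end,
	       (unsigned long long)((end - base) >> 20));
	return 0;
}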
*/ - pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx " + pr_info("Initialized [0x%llx, %luMB] cma area from [0x%lx, %luMB] " "bytes of memory reserved for firmware-assisted dump\n", - cma_get_size(fadump_cma), - (unsigned long)cma_get_base(fadump_cma) >> 20, - fw_dump.reserve_dump_area_size); - return 1; + cma_get_base(fadump_cma), cma_get_size(fadump_cma) >> 20, + fw_dump.reserve_dump_area_start, + fw_dump.boot_memory_size >> 20); + return; } -#else -static int __init fadump_cma_init(void) { return 1; } #endif /* CONFIG_CMA */ /* @@ -143,7 +153,7 @@ void __init fadump_append_bootargs(void) if (!fw_dump.dump_active || !fw_dump.param_area_supported || !fw_dump.param_area) return; - if (fw_dump.param_area >= fw_dump.boot_mem_top) { + if (fw_dump.param_area < fw_dump.boot_mem_top) { if (memblock_reserve(fw_dump.param_area, COMMAND_LINE_SIZE)) { pr_warn("WARNING: Can't use additional parameters area!\n"); fw_dump.param_area = 0; @@ -558,13 +568,6 @@ int __init fadump_reserve_mem(void) if (!fw_dump.dump_active) { fw_dump.boot_memory_size = PAGE_ALIGN(fadump_calculate_reserve_size()); -#ifdef CONFIG_CMA - if (!fw_dump.nocma) { - fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, - CMA_MIN_ALIGNMENT_BYTES); - } -#endif bootmem_min = fw_dump.ops->fadump_get_bootmem_min(); if (fw_dump.boot_memory_size < bootmem_min) { @@ -637,8 +640,6 @@ int __init fadump_reserve_mem(void) pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n", (size >> 20), base, (memblock_phys_mem_size() >> 20)); - - ret = fadump_cma_init(); } return ret; @@ -1586,6 +1587,12 @@ static void __init fadump_init_files(void) return; } + if (fw_dump.param_area) { + rc = sysfs_create_file(fadump_kobj, &bootargs_append_attr.attr); + if (rc) + pr_err("unable to create bootargs_append sysfs file (%d)\n", rc); + } + debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL, &fadump_region_fops); @@ -1740,7 +1747,7 @@ err_out: * Reserve memory to store additional parameters to be passed * for fadump/capture kernel. */ -static void __init fadump_setup_param_area(void) +void __init fadump_setup_param_area(void) { phys_addr_t range_start, range_end; @@ -1748,7 +1755,7 @@ static void __init fadump_setup_param_area(void) return; /* This memory can't be used by PFW or bootloader as it is shared across kernels */ - if (radix_enabled()) { + if (early_radix_enabled()) { /* * Anywhere in the upper half should be good enough as all memory * is accessible in real mode. 
@@ -1776,12 +1783,12 @@ static void __init fadump_setup_param_area(void) COMMAND_LINE_SIZE, range_start, range_end); - if (!fw_dump.param_area || sysfs_create_file(fadump_kobj, &bootargs_append_attr.attr)) { + if (!fw_dump.param_area) { pr_warn("WARNING: Could not setup area to pass additional parameters!\n"); return; } - memset(phys_to_virt(fw_dump.param_area), 0, COMMAND_LINE_SIZE); + memset((void *)fw_dump.param_area, 0, COMMAND_LINE_SIZE); } /* @@ -1807,7 +1814,6 @@ int __init setup_fadump(void) } /* Initialize the kernel dump memory structure and register with f/w */ else if (fw_dump.reserve_dump_area_size) { - fadump_setup_param_area(); fw_dump.ops->fadump_init_mem_struct(&fw_dump); register_fadump(); } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 2e1600a8bbbb..a0e8b998c9b5 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -89,69 +89,69 @@ int arch_show_interrupts(struct seq_file *p, int prec) #if defined(CONFIG_PPC32) && defined(CONFIG_TAU_INT) if (tau_initialized) { - seq_printf(p, "%*s: ", prec, "TAU"); + seq_printf(p, "%*s:", prec, "TAU"); for_each_online_cpu(j) - seq_printf(p, "%10u ", tau_interrupts(j)); + seq_put_decimal_ull_width(p, " ", tau_interrupts(j), 10); seq_puts(p, " PowerPC Thermal Assist (cpu temp)\n"); } #endif /* CONFIG_PPC32 && CONFIG_TAU_INT */ - seq_printf(p, "%*s: ", prec, "LOC"); + seq_printf(p, "%*s:", prec, "LOC"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).timer_irqs_event, 10); seq_printf(p, " Local timer interrupts for timer event device\n"); - seq_printf(p, "%*s: ", prec, "BCT"); + seq_printf(p, "%*s:", prec, "BCT"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).broadcast_irqs_event); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).broadcast_irqs_event, 10); seq_printf(p, " Broadcast timer interrupts for timer event device\n"); - seq_printf(p, "%*s: ", prec, "LOC"); + seq_printf(p, "%*s:", prec, "LOC"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).timer_irqs_others, 10); seq_printf(p, " Local timer interrupts for others\n"); - seq_printf(p, "%*s: ", prec, "SPU"); + seq_printf(p, "%*s:", prec, "SPU"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).spurious_irqs); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).spurious_irqs, 10); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "PMI"); + seq_printf(p, "%*s:", prec, "PMI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).pmu_irqs); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).pmu_irqs, 10); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "MCE"); + seq_printf(p, "%*s:", prec, "MCE"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).mce_exceptions, 10); seq_printf(p, " Machine check exceptions\n"); #ifdef CONFIG_PPC_BOOK3S_64 if (cpu_has_feature(CPU_FTR_HVMODE)) { - seq_printf(p, "%*s: ", prec, "HMI"); + seq_printf(p, "%*s:", prec, "HMI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", paca_ptrs[j]->hmi_irqs); + seq_put_decimal_ull_width(p, " ", paca_ptrs[j]->hmi_irqs, 10); seq_printf(p, " Hypervisor Maintenance Interrupts\n"); } #endif - seq_printf(p, "%*s: ", prec, "NMI"); + seq_printf(p, "%*s:", prec, 
"NMI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).sreset_irqs, 10); seq_printf(p, " System Reset interrupts\n"); #ifdef CONFIG_PPC_WATCHDOG - seq_printf(p, "%*s: ", prec, "WDG"); + seq_printf(p, "%*s:", prec, "WDG"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).soft_nmi_irqs, 10); seq_printf(p, " Watchdog soft-NMI interrupts\n"); #endif #ifdef CONFIG_PPC_DOORBELL if (cpu_has_feature(CPU_FTR_DBELL)) { - seq_printf(p, "%*s: ", prec, "DBL"); + seq_printf(p, "%*s:", prec, "DBL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).doorbell_irqs); + seq_put_decimal_ull_width(p, " ", per_cpu(irq_stat, j).doorbell_irqs, 10); seq_printf(p, " Doorbell interrupts\n"); } #endif diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c index 5277cf582c16..2659e1ac8604 100644 --- a/arch/powerpc/kernel/jump_label.c +++ b/arch/powerpc/kernel/jump_label.c @@ -5,7 +5,7 @@ #include <linux/kernel.h> #include <linux/jump_label.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/inst.h> void arch_jump_label_transform(struct jump_entry *entry, diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 7a8bc03a00af..5081334b7bd2 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -21,7 +21,7 @@ #include <asm/processor.h> #include <asm/machdep.h> #include <asm/debug.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <linux/slab.h> #include <asm/inst.h> diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index f8aa91bc3b17..c0d9f12cb441 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -21,7 +21,7 @@ #include <linux/slab.h> #include <linux/set_memory.h> #include <linux/execmem.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/sstep.h> #include <asm/sections.h> @@ -105,24 +105,22 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) return addr; } -static bool arch_kprobe_on_func_entry(unsigned long offset) +static bool arch_kprobe_on_func_entry(unsigned long addr, unsigned long offset) { -#ifdef CONFIG_PPC64_ELF_ABI_V2 -#ifdef CONFIG_KPROBES_ON_FTRACE - return offset <= 16; -#else - return offset <= 8; -#endif -#else + unsigned long ip = ftrace_location(addr); + + if (ip) + return offset <= (ip - addr); + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + return offset <= 8; return !offset; -#endif } /* XXX try and fold the magic of kprobe_lookup_name() in this */ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - *on_func_entry = arch_kprobe_on_func_entry(offset); + *on_func_entry = arch_kprobe_on_func_entry(addr, offset); return (kprobe_opcode_t *)(addr + offset); } diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 91123e102db4..a997c7f43dc0 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -74,7 +74,7 @@ _GLOBAL(rmci_off) blr #endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */ -#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) +#ifdef CONFIG_PPC_PMAC /* * Do an IO access in real mode @@ -137,7 +137,7 @@ _GLOBAL(real_writeb) sync isync blr -#endif /* defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) */ 
+#endif // CONFIG_PPC_PMAC #ifdef CONFIG_PPC_PASEMI @@ -174,7 +174,7 @@ _GLOBAL(real_205_writeb) #endif /* CONFIG_PPC_PASEMI */ -#if defined(CONFIG_CPU_FREQ_PMAC64) || defined(CONFIG_CPU_FREQ_MAPLE) +#ifdef CONFIG_CPU_FREQ_PMAC64 /* * SCOM access functions for 970 (FX only for now) * @@ -243,7 +243,7 @@ _GLOBAL(scom970_write) /* restore interrupts */ mtmsrd r5,1 blr -#endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */ +#endif // CONFIG_CPU_FREQ_PMAC64 /* kexec_wait(phys_cpu) * diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 816a63fd71fb..f930e3395a7f 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -18,7 +18,7 @@ #include <linux/bug.h> #include <linux/sort.h> #include <asm/setup.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> /* Count how many different relocations (different symbol, different addend) */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index e9bab599d0c2..45dac7b46aa3 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -17,7 +17,7 @@ #include <linux/kernel.h> #include <asm/module.h> #include <asm/firmware.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <linux/sort.h> #include <asm/setup.h> #include <asm/sections.h> @@ -205,7 +205,9 @@ static int relacmp(const void *_x, const void *_y) /* Get size of potential trampolines required. */ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, - const Elf64_Shdr *sechdrs) + const Elf64_Shdr *sechdrs, + char *secstrings, + struct module *me) { /* One extra reloc so it's always 0-addr terminated */ unsigned long relocs = 1; @@ -241,13 +243,25 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, } } -#ifdef CONFIG_DYNAMIC_FTRACE - /* make the trampoline to the ftrace_caller */ - relocs++; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - /* an additional one for ftrace_regs_caller */ - relocs++; -#endif + /* stubs for ftrace_caller and ftrace_regs_caller */ + relocs += IS_ENABLED(CONFIG_DYNAMIC_FTRACE) + IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS); + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + /* stubs for the function tracer */ + for (i = 1; i < hdr->e_shnum; i++) { + if (!strcmp(secstrings + sechdrs[i].sh_name, "__patchable_function_entries")) { + me->arch.ool_stub_count = sechdrs[i].sh_size / sizeof(unsigned long); + me->arch.ool_stub_index = 0; + relocs += roundup(me->arch.ool_stub_count * sizeof(struct ftrace_ool_stub), + sizeof(struct ppc64_stub_entry)) / + sizeof(struct ppc64_stub_entry); + break; + } + } + if (i == hdr->e_shnum) { + pr_err("%s: doesn't contain __patchable_function_entries.\n", me->name); + return -ENOEXEC; + } #endif pr_debug("Looks like a total of %lu stubs, max\n", relocs); @@ -460,7 +474,7 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr, #endif /* Override the stubs size */ - sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs); + sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs, secstrings, me); return 0; } @@ -1085,6 +1099,37 @@ int module_trampoline_target(struct module *mod, unsigned long addr, return 0; } +static int setup_ftrace_ool_stubs(const Elf64_Shdr *sechdrs, unsigned long addr, struct module *me) +{ +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + unsigned int i, total_stubs, num_stubs; + struct ppc64_stub_entry *stub; + + total_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stub); + num_stubs = roundup(me->arch.ool_stub_count * sizeof(struct 
ftrace_ool_stub), + sizeof(struct ppc64_stub_entry)) / sizeof(struct ppc64_stub_entry); + + /* Find the next available entry */ + stub = (void *)sechdrs[me->arch.stubs_section].sh_addr; + for (i = 0; stub_func_addr(stub[i].funcdata); i++) + if (WARN_ON(i >= total_stubs)) + return -1; + + if (WARN_ON(i + num_stubs > total_stubs)) + return -1; + + stub += i; + me->arch.ool_stubs = (struct ftrace_ool_stub *)stub; + + /* reserve stubs */ + for (i = 0; i < num_stubs; i++) + if (patch_u32((void *)&stub->funcdata, PPC_RAW_NOP())) + return -1; +#endif + + return 0; +} + int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sechdrs) { mod->arch.tramp = stub_for_addr(sechdrs, @@ -1103,6 +1148,9 @@ int module_finalize_ftrace(struct module *mod, const Elf_Shdr *sechdrs) if (!mod->arch.tramp) return -ENOENT; + if (setup_ftrace_ool_stubs(sechdrs, mod->arch.tramp, mod)) + return -ENOENT; + return 0; } #endif diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index c0b351d61058..2e83702bf9ba 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -13,7 +13,7 @@ #include <asm/kprobes.h> #include <asm/ptrace.h> #include <asm/cacheflush.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/sstep.h> #include <asm/ppc-opcode.h> #include <asm/inst.h> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ff61a3e7984c..7b739b9a91ab 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -54,7 +54,7 @@ #include <asm/firmware.h> #include <asm/hw_irq.h> #endif -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/exec.h> #include <asm/livepatch.h> #include <asm/cpu_has_feature.h> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 88cbe432cad5..e0059842a1c6 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -908,6 +908,9 @@ void __init early_init_devtree(void *params) mmu_early_init_devtree(); + /* Setup param area for passing additional parameters to fadump capture kernel. */ + fadump_setup_param_area(); + #ifdef CONFIG_PPC_POWERNV /* Scan and build the list of machine check recoverable ranges */ of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL); diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index fbb68fc28ed3..73210e5bcfa7 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2792,90 +2792,6 @@ static void __init flatten_device_tree(void) dt_struct_start, dt_struct_end); } -#ifdef CONFIG_PPC_MAPLE -/* PIBS Version 1.05.0000 04/26/2005 has an incorrect /ht/isa/ranges property. - * The values are bad, and it doesn't even have the right number of cells. 
*/ -static void __init fixup_device_tree_maple(void) -{ - phandle isa; - u32 rloc = 0x01002000; /* IO space; PCI device = 4 */ - u32 isa_ranges[6]; - char *name; - - name = "/ht@0/isa@4"; - isa = call_prom("finddevice", 1, 1, ADDR(name)); - if (!PHANDLE_VALID(isa)) { - name = "/ht@0/isa@6"; - isa = call_prom("finddevice", 1, 1, ADDR(name)); - rloc = 0x01003000; /* IO space; PCI device = 6 */ - } - if (!PHANDLE_VALID(isa)) - return; - - if (prom_getproplen(isa, "ranges") != 12) - return; - if (prom_getprop(isa, "ranges", isa_ranges, sizeof(isa_ranges)) - == PROM_ERROR) - return; - - if (isa_ranges[0] != 0x1 || - isa_ranges[1] != 0xf4000000 || - isa_ranges[2] != 0x00010000) - return; - - prom_printf("Fixing up bogus ISA range on Maple/Apache...\n"); - - isa_ranges[0] = 0x1; - isa_ranges[1] = 0x0; - isa_ranges[2] = rloc; - isa_ranges[3] = 0x0; - isa_ranges[4] = 0x0; - isa_ranges[5] = 0x00010000; - prom_setprop(isa, name, "ranges", - isa_ranges, sizeof(isa_ranges)); -} - -#define CPC925_MC_START 0xf8000000 -#define CPC925_MC_LENGTH 0x1000000 -/* The values for memory-controller don't have right number of cells */ -static void __init fixup_device_tree_maple_memory_controller(void) -{ - phandle mc; - u32 mc_reg[4]; - char *name = "/hostbridge@f8000000"; - u32 ac, sc; - - mc = call_prom("finddevice", 1, 1, ADDR(name)); - if (!PHANDLE_VALID(mc)) - return; - - if (prom_getproplen(mc, "reg") != 8) - return; - - prom_getprop(prom.root, "#address-cells", &ac, sizeof(ac)); - prom_getprop(prom.root, "#size-cells", &sc, sizeof(sc)); - if ((ac != 2) || (sc != 2)) - return; - - if (prom_getprop(mc, "reg", mc_reg, sizeof(mc_reg)) == PROM_ERROR) - return; - - if (mc_reg[0] != CPC925_MC_START || mc_reg[1] != CPC925_MC_LENGTH) - return; - - prom_printf("Fixing up bogus hostbridge on Maple...\n"); - - mc_reg[0] = 0x0; - mc_reg[1] = CPC925_MC_START; - mc_reg[2] = 0x0; - mc_reg[3] = CPC925_MC_LENGTH; - prom_setprop(mc, name, "reg", mc_reg, sizeof(mc_reg)); -} -#else -#define fixup_device_tree_maple() -#define fixup_device_tree_maple_memory_controller() -#endif - #ifdef CONFIG_PPC_CHRP /* * Pegasos and BriQ lacks the "ranges" property in the isa node @@ -3193,8 +3109,6 @@ static inline void fixup_device_tree_pasemi(void) { } static void __init fixup_device_tree(void) { - fixup_device_tree_maple(); - fixup_device_tree_maple_memory_controller(); fixup_device_tree_chrp(); fixup_device_tree_pmac(); fixup_device_tree_efika(); diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c index 9e0efb657f39..3a28795b4ed8 100644 --- a/arch/powerpc/kernel/secure_boot.c +++ b/arch/powerpc/kernel/secure_boot.c @@ -5,6 +5,7 @@ */ #include <linux/types.h> #include <linux/of.h> +#include <linux/string_choices.h> #include <asm/secure_boot.h> static struct device_node *get_ppc_fw_sb_node(void) @@ -38,7 +39,7 @@ bool is_ppc_secureboot_enabled(void) of_node_put(node); out: - pr_info("Secure boot mode %s\n", enabled ? "enabled" : "disabled"); + pr_info("Secure boot mode %s\n", str_enabled_disabled(enabled)); return enabled; } @@ -62,7 +63,7 @@ bool is_ppc_trustedboot_enabled(void) of_node_put(node); out: - pr_info("Trusted boot mode %s\n", enabled ? 
"enabled" : "disabled"); + pr_info("Trusted boot mode %s\n", str_enabled_disabled(enabled)); return enabled; } diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 4856e1a5161c..fbb7ebd8aa08 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -14,7 +14,7 @@ #include <linux/debugfs.h> #include <asm/asm-prototypes.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/security_features.h> #include <asm/sections.h> #include <asm/setup.h> diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 0b732d3b283b..6fa179448c33 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -1000,9 +1000,11 @@ void __init setup_arch(char **cmdline_p) initmem_init(); /* - * Reserve large chunks of memory for use by CMA for KVM and hugetlb. These must - * be called after initmem_init(), so that pageblock_order is initialised. + * Reserve large chunks of memory for use by CMA for fadump, KVM and + * hugetlb. These must be called after initmem_init(), so that + * pageblock_order is initialised. */ + fadump_cma_init(); kvm_cma_reserve(); gigantic_hugetlb_cma_reserve(); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index e515c1f7d8d3..75dbf3e0d9c4 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -40,7 +40,7 @@ #include <asm/time.h> #include <asm/serial.h> #include <asm/udbg.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/cpu_has_feature.h> #include <asm/asm-prototypes.h> #include <asm/kdump.h> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 22f83fbbc762..e67f3048611f 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -60,7 +60,7 @@ #include <asm/xmon.h> #include <asm/udbg.h> #include <asm/kexec.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/opal.h> #include <asm/cputhreads.h> @@ -920,6 +920,7 @@ static int __init disable_hardlockup_detector(void) hardlockup_detector_disable(); #else if (firmware_has_feature(FW_FEATURE_LPAR)) { + check_kvm_guest(); if (is_kvm_guest()) hardlockup_detector_disable(); } diff --git a/arch/powerpc/kernel/static_call.c b/arch/powerpc/kernel/static_call.c index 1502b7e439ca..7cfd0710e757 100644 --- a/arch/powerpc/kernel/static_call.c +++ b/arch/powerpc/kernel/static_call.c @@ -2,7 +2,7 @@ #include <linux/memory.h> #include <linux/static_call.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) { diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index b842c83ab497..6b3dd6decdf9 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -17,6 +17,7 @@ #include <asm/hvcall.h> #include <asm/machdep.h> #include <asm/smp.h> +#include <asm/time.h> #include <asm/pmc.h> #include <asm/firmware.h> #include <asm/idle.h> diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index 125f4ca588b9..d6c3885453bd 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -9,12 +9,15 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ftrace_64_pg.o = $(CC_FLAGS_FTRACE) endif -obj32-$(CONFIG_FUNCTION_TRACER) += ftrace.o ftrace_entry.o -ifdef CONFIG_MPROFILE_KERNEL -obj64-$(CONFIG_FUNCTION_TRACER) += ftrace.o ftrace_entry.o +ifdef 
CONFIG_FUNCTION_TRACER +obj32-y += ftrace.o ftrace_entry.o +ifeq ($(CONFIG_MPROFILE_KERNEL)$(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY),) +obj64-y += ftrace_64_pg.o ftrace_64_pg_entry.o else -obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_pg.o ftrace_64_pg_entry.o +obj64-y += ftrace.o ftrace_entry.o +endif endif + obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index df41f4a7c738..5ccd791761e8 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -23,7 +23,7 @@ #include <linux/list.h> #include <asm/cacheflush.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/syscall.h> #include <asm/inst.h> @@ -37,8 +37,12 @@ unsigned long ftrace_call_adjust(unsigned long addr) if (addr >= (unsigned long)__exittext_begin && addr < (unsigned long)__exittext_end) return 0; - if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY)) + if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY) && + !IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) { addr += MCOUNT_INSN_SIZE; + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + addr += MCOUNT_INSN_SIZE; + } return addr; } @@ -82,7 +86,7 @@ static inline int ftrace_modify_code(unsigned long ip, ppc_inst_t old, ppc_inst_ { int ret = ftrace_validate_inst(ip, old); - if (!ret) + if (!ret && !ppc_inst_equal(old, new)) ret = patch_instruction((u32 *)ip, new); return ret; @@ -106,28 +110,68 @@ static unsigned long find_ftrace_tramp(unsigned long ip) return 0; } +#ifdef CONFIG_MODULES +static unsigned long ftrace_lookup_module_stub(unsigned long ip, unsigned long addr) +{ + struct module *mod = NULL; + + preempt_disable(); + mod = __module_text_address(ip); + preempt_enable(); + + if (!mod) + pr_err("No module loaded at addr=%lx\n", ip); + + return (addr == (unsigned long)ftrace_caller ? mod->arch.tramp : mod->arch.tramp_regs); +} +#else +static unsigned long ftrace_lookup_module_stub(unsigned long ip, unsigned long addr) +{ + return 0; +} +#endif + +static unsigned long ftrace_get_ool_stub(struct dyn_ftrace *rec) +{ +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + return rec->arch.ool_stub; +#else + BUILD_BUG(); +#endif +} + static int ftrace_get_call_inst(struct dyn_ftrace *rec, unsigned long addr, ppc_inst_t *call_inst) { - unsigned long ip = rec->ip; + unsigned long ip; unsigned long stub; - if (is_offset_in_branch_range(addr - ip)) { + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + ip = ftrace_get_ool_stub(rec) + MCOUNT_INSN_SIZE; /* second instruction in stub */ + else + ip = rec->ip; + + if (!is_offset_in_branch_range(addr - ip) && addr != FTRACE_ADDR && + addr != FTRACE_REGS_ADDR) { + /* This can only happen with ftrace direct */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS)) { + pr_err("0x%lx (0x%lx): Unexpected target address 0x%lx\n", + ip, rec->ip, addr); + return -EINVAL; + } + addr = FTRACE_ADDR; + } + + if (is_offset_in_branch_range(addr - ip)) /* Within range */ stub = addr; -#ifdef CONFIG_MODULES - } else if (rec->arch.mod) { - /* Module code would be going to one of the module stubs */ - stub = (addr == (unsigned long)ftrace_caller ? 
rec->arch.mod->arch.tramp : - rec->arch.mod->arch.tramp_regs); -#endif - } else if (core_kernel_text(ip)) { + else if (core_kernel_text(ip)) /* We would be branching to one of our ftrace stubs */ stub = find_ftrace_tramp(ip); - if (!stub) { - pr_err("0x%lx: No ftrace stubs reachable\n", ip); - return -EINVAL; - } - } else { + else + stub = ftrace_lookup_module_stub(ip, addr); + + if (!stub) { + pr_err("0x%lx (0x%lx): No ftrace stubs reachable\n", ip, rec->ip); return -EINVAL; } @@ -135,6 +179,145 @@ static int ftrace_get_call_inst(struct dyn_ftrace *rec, unsigned long addr, ppc_ return 0; } +static int ftrace_init_ool_stub(struct module *mod, struct dyn_ftrace *rec) +{ +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + static int ool_stub_text_index, ool_stub_text_end_index, ool_stub_inittext_index; + int ret = 0, ool_stub_count, *ool_stub_index; + ppc_inst_t inst; + /* + * See ftrace_entry.S if changing the below instruction sequence, as we rely on + * decoding the last branch instruction here to recover the correct function ip. + */ + struct ftrace_ool_stub *ool_stub, ool_stub_template = { + .insn = { + PPC_RAW_MFLR(_R0), + PPC_RAW_NOP(), /* bl ftrace_caller */ + PPC_RAW_MTLR(_R0), + PPC_RAW_NOP() /* b rec->ip + 4 */ + } + }; + + WARN_ON(rec->arch.ool_stub); + + if (is_kernel_inittext(rec->ip)) { + ool_stub = ftrace_ool_stub_inittext; + ool_stub_index = &ool_stub_inittext_index; + ool_stub_count = ftrace_ool_stub_inittext_count; + } else if (is_kernel_text(rec->ip)) { + /* + * ftrace records are sorted, so we first use up the stub area within .text + * (ftrace_ool_stub_text) before using the area at the end of .text + * (ftrace_ool_stub_text_end), unless the stub is out of range of the record. + */ + if (ool_stub_text_index >= ftrace_ool_stub_text_count || + !is_offset_in_branch_range((long)rec->ip - + (long)&ftrace_ool_stub_text[ool_stub_text_index])) { + ool_stub = ftrace_ool_stub_text_end; + ool_stub_index = &ool_stub_text_end_index; + ool_stub_count = ftrace_ool_stub_text_end_count; + } else { + ool_stub = ftrace_ool_stub_text; + ool_stub_index = &ool_stub_text_index; + ool_stub_count = ftrace_ool_stub_text_count; + } +#ifdef CONFIG_MODULES + } else if (mod) { + ool_stub = mod->arch.ool_stubs; + ool_stub_index = &mod->arch.ool_stub_index; + ool_stub_count = mod->arch.ool_stub_count; +#endif + } else { + return -EINVAL; + } + + ool_stub += (*ool_stub_index)++; + + if (WARN_ON(*ool_stub_index > ool_stub_count)) + return -EINVAL; + + if (!is_offset_in_branch_range((long)rec->ip - (long)&ool_stub->insn[0]) || + !is_offset_in_branch_range((long)(rec->ip + MCOUNT_INSN_SIZE) - + (long)&ool_stub->insn[3])) { + pr_err("%s: ftrace ool stub out of range (%p -> %p).\n", + __func__, (void *)rec->ip, (void *)&ool_stub->insn[0]); + return -EINVAL; + } + + rec->arch.ool_stub = (unsigned long)&ool_stub->insn[0]; + + /* bl ftrace_caller */ + if (!mod) + ret = ftrace_get_call_inst(rec, (unsigned long)ftrace_caller, &inst); +#ifdef CONFIG_MODULES + else + /* + * We can't use ftrace_get_call_inst() since that uses + * __module_text_address(rec->ip) to look up the module. + * But, since the module is not fully formed at this stage, + * the lookup fails. We know the target though, so generate + * the branch inst directly. 
+ */ + inst = ftrace_create_branch_inst(ftrace_get_ool_stub(rec) + MCOUNT_INSN_SIZE, + mod->arch.tramp, 1); +#endif + ool_stub_template.insn[1] = ppc_inst_val(inst); + + /* b rec->ip + 4 */ + if (!ret && create_branch(&inst, &ool_stub->insn[3], rec->ip + MCOUNT_INSN_SIZE, 0)) + return -EINVAL; + ool_stub_template.insn[3] = ppc_inst_val(inst); + + if (!ret) + ret = patch_instructions((u32 *)ool_stub, (u32 *)&ool_stub_template, + sizeof(ool_stub_template), false); + + return ret; +#else /* !CONFIG_PPC_FTRACE_OUT_OF_LINE */ + BUILD_BUG(); +#endif +} + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +static const struct ftrace_ops *powerpc_rec_get_ops(struct dyn_ftrace *rec) +{ + const struct ftrace_ops *ops = NULL; + + if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + ops = ftrace_find_unique_ops(rec); + WARN_ON_ONCE(!ops); + } + + if (!ops) + ops = &ftrace_list_ops; + + return ops; +} + +static int ftrace_rec_set_ops(struct dyn_ftrace *rec, const struct ftrace_ops *ops) +{ + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + return patch_ulong((void *)(ftrace_get_ool_stub(rec) - sizeof(unsigned long)), + (unsigned long)ops); + else + return patch_ulong((void *)(rec->ip - MCOUNT_INSN_SIZE - sizeof(unsigned long)), + (unsigned long)ops); +} + +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, &ftrace_nop_ops); +} + +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, powerpc_rec_get_ops(rec)); +} +#else +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; } +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; } +#endif + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { @@ -147,18 +330,33 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { ppc_inst_t old, new; - int ret; + unsigned long ip = rec->ip; + int ret = 0; /* This can only ever be called during module load */ - if (WARN_ON(!IS_ENABLED(CONFIG_MODULES) || core_kernel_text(rec->ip))) + if (WARN_ON(!IS_ENABLED(CONFIG_MODULES) || core_kernel_text(ip))) return -EINVAL; old = ppc_inst(PPC_RAW_NOP()); - ret = ftrace_get_call_inst(rec, addr, &new); + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) { + ip = ftrace_get_ool_stub(rec) + MCOUNT_INSN_SIZE; /* second instruction in stub */ + ret = ftrace_get_call_inst(rec, (unsigned long)ftrace_caller, &old); + } + + ret |= ftrace_get_call_inst(rec, addr, &new); + + if (!ret) + ret = ftrace_modify_code(ip, old, new); + + ret = ftrace_rec_update_ops(rec); if (ret) return ret; - return ftrace_modify_code(rec->ip, old, new); + if (!ret && IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + ret = ftrace_modify_code(rec->ip, ppc_inst(PPC_RAW_NOP()), + ppc_inst(PPC_RAW_BRANCH((long)ftrace_get_ool_stub(rec) - (long)rec->ip))); + + return ret; } int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) @@ -191,6 +389,13 @@ void ftrace_replace_code(int enable) new_addr = ftrace_get_addr_new(rec); update = ftrace_update_record(rec, enable); + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) && update != FTRACE_UPDATE_IGNORE) { + ip = ftrace_get_ool_stub(rec) + MCOUNT_INSN_SIZE; + ret = ftrace_get_call_inst(rec, (unsigned long)ftrace_caller, &nop_inst); + if (ret) + goto out; + } + switch (update) { case FTRACE_UPDATE_IGNORE: default: @@ -198,16 +403,19 @@ void ftrace_replace_code(int enable) case 
FTRACE_UPDATE_MODIFY_CALL: ret = ftrace_get_call_inst(rec, new_addr, &new_call_inst); ret |= ftrace_get_call_inst(rec, addr, &call_inst); + ret |= ftrace_rec_update_ops(rec); old = call_inst; new = new_call_inst; break; case FTRACE_UPDATE_MAKE_NOP: ret = ftrace_get_call_inst(rec, addr, &call_inst); + ret |= ftrace_rec_set_nop_ops(rec); old = call_inst; new = nop_inst; break; case FTRACE_UPDATE_MAKE_CALL: ret = ftrace_get_call_inst(rec, new_addr, &call_inst); + ret |= ftrace_rec_update_ops(rec); old = nop_inst; new = call_inst; break; @@ -215,6 +423,24 @@ void ftrace_replace_code(int enable) if (!ret) ret = ftrace_modify_code(ip, old, new); + + if (!ret && IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) && + (update == FTRACE_UPDATE_MAKE_NOP || update == FTRACE_UPDATE_MAKE_CALL)) { + /* Update the actual ftrace location */ + call_inst = ppc_inst(PPC_RAW_BRANCH((long)ftrace_get_ool_stub(rec) - + (long)rec->ip)); + nop_inst = ppc_inst(PPC_RAW_NOP()); + ip = rec->ip; + + if (update == FTRACE_UPDATE_MAKE_NOP) + ret = ftrace_modify_code(ip, call_inst, nop_inst); + else + ret = ftrace_modify_code(ip, nop_inst, call_inst); + + if (ret) + goto out; + } + if (ret) goto out; } @@ -234,20 +460,27 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) /* Verify instructions surrounding the ftrace location */ if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY)) { /* Expect nops */ - ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_NOP())); + if (!IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_NOP())); if (!ret) ret = ftrace_validate_inst(ip, ppc_inst(PPC_RAW_NOP())); } else if (IS_ENABLED(CONFIG_PPC32)) { /* Expected sequence: 'mflr r0', 'stw r0,4(r1)', 'bl _mcount' */ ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); - if (!ret) - ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4))); + if (ret) + return ret; + ret = ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4)), + ppc_inst(PPC_RAW_NOP())); } else if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) { /* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl _mcount' */ ret = ftrace_read_inst(ip - 4, &old); if (!ret && !ppc_inst_equal(old, ppc_inst(PPC_RAW_MFLR(_R0)))) { + /* Gcc v5.x emit the additional 'std' instruction, gcc v6.x don't */ ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); - ret |= ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16))); + if (ret) + return ret; + ret = ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16)), + ppc_inst(PPC_RAW_NOP())); } } else { return -EINVAL; @@ -256,13 +489,9 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) if (ret) return ret; - if (!core_kernel_text(ip)) { - if (!mod) { - pr_err("0x%lx: No module provided for non-kernel address\n", ip); - return -EFAULT; - } - rec->arch.mod = mod; - } + /* Set up out-of-line stub */ + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + return ftrace_init_ool_stub(mod, rec); /* Nop-out the ftrace location */ new = ppc_inst(PPC_RAW_NOP()); @@ -302,6 +531,13 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ppc_inst_t old, new; int ret; + /* + * When using CALL_OPS, the function to call is associated with the + * call site, and we don't have a global function pointer to update. 
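To make the CALL_OPS point concrete: each patch site (or out-of-line stub) carries a pointer to its ftrace_ops immediately before it, and the entry code dispatches through ops->func read from that per-site slot rather than through a single patched ftrace_call. The following host-side model uses hypothetical names only:

/* Illustrative host-side model, not part of this patch. */
#include <stdio.h>

struct model_ops {
	void (*func)(unsigned long ip, const char *name);
};

static void trace_a(unsigned long ip, const char *name)
{
	printf("tracer A: %s at %#lx\n", name, ip);
}

static void trace_b(unsigned long ip, const char *name)
{
	printf("tracer B: %s at %#lx\n", name, ip);
}

/* One "call site" per traced function: its ops pointer sits right next to it. */
struct model_site {
	struct model_ops *ops;	/* the slot the kernel patches per site */
	unsigned long ip;
	const char *name;
};

static struct model_ops ops_a = { .func = trace_a };
static struct model_ops ops_b = { .func = trace_b };

static struct model_site sites[] = {
	{ &ops_a, 0x1000, "foo" },
	{ &ops_b, 0x2000, "bar" },
};

int main(void)
{
	/* The trampoline model: load ops from the site, then call ops->func. */
	for (unsigned int i = 0; i < sizeof(sites) / sizeof(sites[0]); i++)
		sites[i].ops->func(sites[i].ip, sites[i].name);
	return 0;
}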
+ */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return 0; + old = ppc_inst_read((u32 *)&ftrace_call); new = ftrace_create_branch_inst(ip, ppc_function_entry(func), 1); ret = ftrace_modify_code(ip, old, new); diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c index d3c5552e4984..98787376eb87 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.c +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c @@ -23,7 +23,7 @@ #include <linux/list.h> #include <asm/cacheflush.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/syscall.h> #include <asm/inst.h> @@ -116,6 +116,20 @@ static unsigned long find_bl_target(unsigned long ip, ppc_inst_t op) } #ifdef CONFIG_MODULES +static struct module *ftrace_lookup_module(struct dyn_ftrace *rec) +{ + struct module *mod; + + preempt_disable(); + mod = __module_text_address(rec->ip); + preempt_enable(); + + if (!mod) + pr_err("No module loaded at addr=%lx\n", rec->ip); + + return mod; +} + static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) @@ -124,6 +138,12 @@ __ftrace_make_nop(struct module *mod, unsigned long ip = rec->ip; ppc_inst_t op, pop; + if (!mod) { + mod = ftrace_lookup_module(rec); + if (!mod) + return -EINVAL; + } + /* read where this goes */ if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { pr_err("Fetching opcode failed.\n"); @@ -366,27 +386,6 @@ int ftrace_make_nop(struct module *mod, return -EINVAL; } - /* - * Out of range jumps are called from modules. - * We should either already have a pointer to the module - * or it has been passed in. - */ - if (!rec->arch.mod) { - if (!mod) { - pr_err("No module loaded addr=%lx\n", addr); - return -EFAULT; - } - rec->arch.mod = mod; - } else if (mod) { - if (mod != rec->arch.mod) { - pr_err("Record mod %p not equal to passed in mod %p\n", - rec->arch.mod, mod); - return -EINVAL; - } - /* nothing to do if mod == rec->arch.mod */ - } else - mod = rec->arch.mod; - return __ftrace_make_nop(mod, rec, addr); } @@ -411,7 +410,10 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) ppc_inst_t op[2]; void *ip = (void *)rec->ip; unsigned long entry, ptr, tramp; - struct module *mod = rec->arch.mod; + struct module *mod = ftrace_lookup_module(rec); + + if (!mod) + return -EINVAL; /* read where this goes */ if (copy_inst_from_kernel_nofault(op, ip)) @@ -533,16 +535,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return -EINVAL; } - /* - * Out of range jumps are called from modules. - * Being that we are converting from nop, it had better - * already have a module defined. - */ - if (!rec->arch.mod) { - pr_err("No module loaded\n"); - return -EINVAL; - } - return __ftrace_make_call(rec, addr); } @@ -555,7 +547,10 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, ppc_inst_t op; unsigned long ip = rec->ip; unsigned long entry, ptr, tramp; - struct module *mod = rec->arch.mod; + struct module *mod = ftrace_lookup_module(rec); + + if (!mod) + return -EINVAL; /* If we never set up ftrace trampolines, then bail */ if (!mod->arch.tramp || !mod->arch.tramp_regs) { @@ -668,14 +663,6 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return -EINVAL; } - /* - * Out of range jumps are called from modules. 
- */ - if (!rec->arch.mod) { - pr_err("No module loaded\n"); - return -EINVAL; - } - return __ftrace_modify_call(rec, old_addr, addr); } #endif diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S index 76dbe9fd2c0f..2c1b24100eca 100644 --- a/arch/powerpc/kernel/trace/ftrace_entry.S +++ b/arch/powerpc/kernel/trace/ftrace_entry.S @@ -39,13 +39,37 @@ /* Create our stack frame + pt_regs */ PPC_STLU r1,-SWITCH_FRAME_SIZE(r1) + .if \allregs == 1 + SAVE_GPRS(11, 12, r1) + .endif + + /* Get the _mcount() call site out of LR */ + mflr r11 + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* Load the ftrace_op */ + PPC_LL r12, -(MCOUNT_INSN_SIZE*2 + SZL)(r11) + + /* Load direct_call from the ftrace_op */ + PPC_LL r12, FTRACE_OPS_DIRECT_CALL(r12) + PPC_LCMPI r12, 0 + .if \allregs == 1 + bne .Lftrace_direct_call_regs + .else + bne .Lftrace_direct_call + .endif +#endif + + /* Save the previous LR in pt_regs->link */ + PPC_STL r0, _LINK(r1) + /* Also save it in A's stack frame */ + PPC_STL r0, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE+LRSAVE(r1) + /* Save all gprs to pt_regs */ SAVE_GPR(0, r1) SAVE_GPRS(3, 10, r1) #ifdef CONFIG_PPC64 - /* Save the original return address in A's stack frame */ - std r0, LRSAVE+SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE(r1) /* Ok to continue? */ lbz r3, PACA_FTRACE_ENABLED(r13) cmpdi r3, 0 @@ -54,9 +78,9 @@ .if \allregs == 1 SAVE_GPR(2, r1) - SAVE_GPRS(11, 31, r1) + SAVE_GPRS(13, 31, r1) .else -#ifdef CONFIG_LIVEPATCH_64 +#if defined(CONFIG_LIVEPATCH_64) || defined(CONFIG_PPC_FTRACE_OUT_OF_LINE) SAVE_GPR(14, r1) #endif .endif @@ -67,80 +91,143 @@ .if \allregs == 1 /* Load special regs for save below */ + mfcr r7 mfmsr r8 mfctr r9 mfxer r10 - mfcr r11 .else /* Clear MSR to flag as ftrace_caller versus frace_regs_caller */ li r8, 0 .endif - /* Get the _mcount() call site out of LR */ - mflr r7 - /* Save it as pt_regs->nip */ - PPC_STL r7, _NIP(r1) - /* Also save it in B's stackframe header for proper unwind */ - PPC_STL r7, LRSAVE+SWITCH_FRAME_SIZE(r1) - /* Save the read LR in pt_regs->link */ - PPC_STL r0, _LINK(r1) - #ifdef CONFIG_PPC64 /* Save callee's TOC in the ABI compliant location */ std r2, STK_GOT(r1) LOAD_PACA_TOC() /* get kernel TOC in r2 */ +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + /* r11 points to the instruction following the call to ftrace */ + PPC_LL r5, -(MCOUNT_INSN_SIZE*2 + SZL)(r11) + PPC_LL r12, FTRACE_OPS_FUNC(r5) + mtctr r12 +#else /* !CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS */ +#ifdef CONFIG_PPC64 LOAD_REG_ADDR(r3, function_trace_op) ld r5,0(r3) #else lis r3,function_trace_op@ha lwz r5,function_trace_op@l(r3) #endif - -#ifdef CONFIG_LIVEPATCH_64 - mr r14, r7 /* remember old NIP */ #endif - /* Calculate ip from nip-4 into r3 for call below */ - subi r3, r7, MCOUNT_INSN_SIZE - - /* Put the original return address in r4 as parent_ip */ - mr r4, r0 - /* Save special regs */ PPC_STL r8, _MSR(r1) .if \allregs == 1 + PPC_STL r7, _CCR(r1) PPC_STL r9, _CTR(r1) PPC_STL r10, _XER(r1) - PPC_STL r11, _CCR(r1) .endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* Clear orig_gpr3 to later detect ftrace_direct call */ + li r7, 0 + PPC_STL r7, ORIG_GPR3(r1) +#endif + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + /* Save our real return address in nvr for return */ + .if \allregs == 0 + SAVE_GPR(15, r1) + .endif + mr r15, r11 + /* + * We want the ftrace location in the function, but our lr (in r11) + * points at the 'mtlr r0' instruction in the out of line stub. 
To + * recover the ftrace location, we read the branch instruction in the + * stub, and adjust our lr by the branch offset. + * + * See ftrace_init_ool_stub() for the profile sequence. + */ + lwz r8, MCOUNT_INSN_SIZE(r11) + slwi r8, r8, 6 + srawi r8, r8, 6 + add r3, r11, r8 + /* + * Override our nip to point past the branch in the original function. + * This allows reliable stack trace and the ftrace stack tracer to work as-is. + */ + addi r11, r3, MCOUNT_INSN_SIZE +#else + /* Calculate ip from nip-4 into r3 for call below */ + subi r3, r11, MCOUNT_INSN_SIZE +#endif + + /* Save NIP as pt_regs->nip */ + PPC_STL r11, _NIP(r1) + /* Also save it in B's stackframe header for proper unwind */ + PPC_STL r11, LRSAVE+SWITCH_FRAME_SIZE(r1) +#if defined(CONFIG_LIVEPATCH_64) || defined(CONFIG_PPC_FTRACE_OUT_OF_LINE) + mr r14, r11 /* remember old NIP */ +#endif + + /* Put the original return address in r4 as parent_ip */ + mr r4, r0 + /* Load &pt_regs in r6 for call below */ addi r6, r1, STACK_INT_FRAME_REGS .endm .macro ftrace_regs_exit allregs - /* Load ctr with the possibly modified NIP */ - PPC_LL r3, _NIP(r1) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* Check orig_gpr3 to detect ftrace_direct call */ + PPC_LL r3, ORIG_GPR3(r1) + PPC_LCMPI cr1, r3, 0 mtctr r3 +#endif + /* Restore possibly modified LR */ + PPC_LL r0, _LINK(r1) + +#ifndef CONFIG_PPC_FTRACE_OUT_OF_LINE + /* Load ctr with the possibly modified NIP */ + PPC_LL r3, _NIP(r1) #ifdef CONFIG_LIVEPATCH_64 cmpd r14, r3 /* has NIP been altered? */ #endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + beq cr1,2f + mtlr r3 + b 3f +#endif +2: mtctr r3 + mtlr r0 +3: + +#else /* !CONFIG_PPC_FTRACE_OUT_OF_LINE */ + /* Load LR with the possibly modified NIP */ + PPC_LL r3, _NIP(r1) + cmpd r14, r3 /* has NIP been altered? */ + bne- 1f + + mr r3, r15 + .if \allregs == 0 + REST_GPR(15, r1) + .endif +1: mtlr r3 +#endif + /* Restore gprs */ .if \allregs == 1 REST_GPRS(2, 31, r1) .else REST_GPRS(3, 10, r1) -#ifdef CONFIG_LIVEPATCH_64 +#if defined(CONFIG_LIVEPATCH_64) || defined(CONFIG_PPC_FTRACE_OUT_OF_LINE) REST_GPR(14, r1) #endif .endif - /* Restore possibly modified LR */ - PPC_LL r0, _LINK(r1) - mtlr r0 - #ifdef CONFIG_PPC64 /* Restore callee's TOC */ ld r2, STK_GOT(r1) @@ -153,23 +240,46 @@ /* Based on the cmpd above, if the NIP was altered handle livepatch */ bne- livepatch_handler #endif - bctr /* jump after _mcount site */ + + /* jump after _mcount site */ +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bnectr cr1 +#endif + /* + * Return with blr to keep the link stack balanced. The function profiling sequence + * uses 'mtlr r0' to restore LR. 
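The lwz/slwi/srawi sequence above recovers the ftrace call site from the out-of-line stub by sign-extending the low 26 bits of the stub's branch-back instruction. A minimal C sketch of the same computation follows; it is illustrative only, assumes an I-form PowerPC branch with AA and LK clear (so the low 26 bits are exactly the signed byte offset), and the helper name is not part of the patch.

/*
 * Sketch: equivalent of the "slwi r8, r8, 6; srawi r8, r8, 6" pair above.
 * Shifting out the 6-bit opcode and shifting back arithmetically
 * sign-extends the 26-bit displacement of a "b target" instruction.
 */
static long branch_byte_offset(unsigned int insn)
{
	return (int)(insn << 6) >> 6;
}

/* as above: r3 = r11 (stub LR) + branch_byte_offset(instruction at r11 + MCOUNT_INSN_SIZE) */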
+ */ + blr +#else + bctr +#endif .endm -_GLOBAL(ftrace_regs_caller) - ftrace_regs_entry 1 - /* ftrace_call(r3, r4, r5, r6) */ +.macro ftrace_regs_func allregs +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + bctrl +#else + .if \allregs == 1 .globl ftrace_regs_call ftrace_regs_call: + .else +.globl ftrace_call +ftrace_call: + .endif + /* ftrace_call(r3, r4, r5, r6) */ bl ftrace_stub +#endif +.endm + +_GLOBAL(ftrace_regs_caller) + ftrace_regs_entry 1 + ftrace_regs_func 1 ftrace_regs_exit 1 _GLOBAL(ftrace_caller) ftrace_regs_entry 0 - /* ftrace_call(r3, r4, r5, r6) */ -.globl ftrace_call -ftrace_call: - bl ftrace_stub + ftrace_regs_func 0 ftrace_regs_exit 0 _GLOBAL(ftrace_stub) @@ -177,6 +287,11 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_PPC64 ftrace_no_trace: +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + REST_GPR(3, r1) + addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE + blr +#else mflr r3 mtctr r3 REST_GPR(3, r1) @@ -184,6 +299,22 @@ ftrace_no_trace: mtlr r0 bctr #endif +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +.Lftrace_direct_call_regs: + mtctr r12 + REST_GPRS(11, 12, r1) + addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE + bctr +.Lftrace_direct_call: + mtctr r12 + addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE + bctr +SYM_FUNC_START(ftrace_stub_direct_tramp) + blr +SYM_FUNC_END(ftrace_stub_direct_tramp) +#endif #ifdef CONFIG_LIVEPATCH_64 /* @@ -194,11 +325,17 @@ ftrace_no_trace: * We get here when a function A, calls another function B, but B has * been live patched with a new function C. * - * On entry: - * - we have no stack frame and can not allocate one + * On entry, we have no stack frame and can not allocate one. + * + * With PPC_FTRACE_OUT_OF_LINE=n, on entry: * - LR points back to the original caller (in A) * - CTR holds the new NIP in C * - r0, r11 & r12 are free + * + * With PPC_FTRACE_OUT_OF_LINE=y, on entry: + * - r0 points back to the original caller (in A) + * - LR holds the new NIP in C + * - r11 & r12 are free */ livepatch_handler: ld r12, PACA_THREAD_INFO(r13) @@ -208,18 +345,23 @@ livepatch_handler: addi r11, r11, 24 std r11, TI_livepatch_sp(r12) - /* Save toc & real LR on livepatch stack */ - std r2, -24(r11) - mflr r12 - std r12, -16(r11) - /* Store stack end marker */ lis r12, STACK_END_MAGIC@h ori r12, r12, STACK_END_MAGIC@l std r12, -8(r11) - /* Put ctr in r12 for global entry and branch there */ + /* Save toc & real LR on livepatch stack */ + std r2, -24(r11) +#ifndef CONFIG_PPC_FTRACE_OUT_OF_LINE + mflr r12 + std r12, -16(r11) mfctr r12 +#else + std r0, -16(r11) + mflr r12 + /* Put ctr in r12 for global entry and branch there */ + mtctr r12 +#endif bctrl /* @@ -308,6 +450,14 @@ _GLOBAL(return_to_handler) blr #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +SYM_DATA(ftrace_ool_stub_text_count, .long CONFIG_PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE) + +SYM_START(ftrace_ool_stub_text, SYM_L_GLOBAL, .balign SZL) + .space CONFIG_PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE * FTRACE_OOL_STUB_SIZE +SYM_CODE_END(ftrace_ool_stub_text) +#endif + .pushsection ".tramp.ftrace.text","aw",@progbits; .globl ftrace_tramp_text ftrace_tramp_text: diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 4b99208f5adc..0a72a537f879 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -39,9 +39,6 @@ void __init udbg_early_init(void) #elif defined(CONFIG_PPC_EARLY_DEBUG_RTAS_CONSOLE) /* RTAS console debug */ udbg_init_rtas_console(); -#elif defined(CONFIG_PPC_EARLY_DEBUG_MAPLE) - /* Maple real mode debug */ - 
udbg_init_maple_realmode(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE) udbg_init_pas_realmode(); #elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c index 313802aff571..dfe8ed2192e8 100644 --- a/arch/powerpc/kernel/udbg_16550.c +++ b/arch/powerpc/kernel/udbg_16550.c @@ -205,29 +205,6 @@ void __init udbg_uart_init_mmio(void __iomem *addr, unsigned int stride) udbg_use_uart(); } -#ifdef CONFIG_PPC_MAPLE - -#define UDBG_UART_MAPLE_ADDR ((void __iomem *)0xf40003f8) - -static u8 udbg_uart_in_maple(unsigned int reg) -{ - return real_readb(UDBG_UART_MAPLE_ADDR + reg); -} - -static void udbg_uart_out_maple(unsigned int reg, u8 val) -{ - real_writeb(val, UDBG_UART_MAPLE_ADDR + reg); -} - -void __init udbg_init_maple_realmode(void) -{ - udbg_uart_in = udbg_uart_in_maple; - udbg_uart_out = udbg_uart_out_maple; - udbg_use_uart(); -} - -#endif /* CONFIG_PPC_MAPLE */ - #ifdef CONFIG_PPC_PASEMI #define UDBG_UART_PAS_ADDR ((void __iomem *)0xfcff03f8UL) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 924f7f4fa597..43379365ce1b 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -47,12 +47,13 @@ long sys_ni_syscall(void); */ static union { struct vdso_arch_data data; - u8 page[PAGE_SIZE]; + u8 page[2 * PAGE_SIZE]; } vdso_data_store __page_aligned_data; struct vdso_arch_data *vdso_data = &vdso_data_store.data; enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, + VVAR_BASE_PAGE_OFFSET, + VVAR_TIME_PAGE_OFFSET, VVAR_TIMENS_PAGE_OFFSET, VVAR_NR_PAGES, }; @@ -118,7 +119,7 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { #ifdef CONFIG_TIME_NS struct vdso_data *arch_get_vdso_data(void *vvar_page) { - return ((struct vdso_arch_data *)vvar_page)->data; + return vvar_page; } /* @@ -152,11 +153,14 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, unsigned long pfn; switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: + case VVAR_BASE_PAGE_OFFSET: + pfn = virt_to_pfn(vdso_data); + break; + case VVAR_TIME_PAGE_OFFSET: if (timens_page) pfn = page_to_pfn(timens_page); else - pfn = virt_to_pfn(vdso_data); + pfn = virt_to_pfn(vdso_data->data); break; #ifdef CONFIG_TIME_NS case VVAR_TIMENS_PAGE_OFFSET: @@ -169,7 +173,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, */ if (!timens_page) return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data); + pfn = virt_to_pfn(vdso_data->data); break; #endif /* CONFIG_TIME_NS */ default: diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 31ca5a547004..0e3ed6fb199f 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -50,14 +50,18 @@ ldflags-$(CONFIG_LD_IS_LLD) += $(call cc-option,--ld-path=$(LD),-fuse-ld=lld) ldflags-$(CONFIG_LD_ORPHAN_WARN) += -Wl,--orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) # Filter flags that clang will warn are unused for linking -ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CFLAGS)) +ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) CC32FLAGS := -m32 CC32FLAGSREMOVE := -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc - # This flag is supported by clang for 64-bit but not 32-bit so it will cause - # an unused command line flag warning for this file. 
ifdef CONFIG_CC_IS_CLANG +# This flag is supported by clang for 64-bit but not 32-bit so it will cause +# an unused command line flag warning for this file. CC32FLAGSREMOVE += -fno-stack-clash-protection +# -mstack-protector-guard values from the 64-bit build are not valid for the +# 32-bit one. clang validates the values passed to these arguments during +# parsing, even when -fno-stack-protector is passed afterwards. +CC32FLAGSREMOVE += -mstack-protector-guard% endif LD32FLAGS := -Wl,-soname=linux-vdso32.so.1 AS32FLAGS := -D__VDSO32__ diff --git a/arch/powerpc/kernel/vdso/cacheflush.S b/arch/powerpc/kernel/vdso/cacheflush.S index 3b2479bd2f9a..0085ae464dac 100644 --- a/arch/powerpc/kernel/vdso/cacheflush.S +++ b/arch/powerpc/kernel/vdso/cacheflush.S @@ -30,7 +30,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) #ifdef CONFIG_PPC64 mflr r12 .cfi_register lr,r12 - get_realdatapage r10, r11 + get_datapage r10 mtlr r12 .cfi_restore lr #endif diff --git a/arch/powerpc/kernel/vdso/datapage.S b/arch/powerpc/kernel/vdso/datapage.S index 2b19b6201a33..db8e167f0166 100644 --- a/arch/powerpc/kernel/vdso/datapage.S +++ b/arch/powerpc/kernel/vdso/datapage.S @@ -28,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) mflr r12 .cfi_register lr,r12 mr. r4,r3 - get_realdatapage r3, r11 + get_datapage r3 mtlr r12 #ifdef __powerpc64__ addi r3,r3,CFG_SYSCALL_MAP64 @@ -52,7 +52,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 .cfi_register lr,r12 - get_realdatapage r3, r11 + get_datapage r3 #ifndef __powerpc64__ lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) #endif diff --git a/arch/powerpc/kernel/vdso/getrandom.S b/arch/powerpc/kernel/vdso/getrandom.S index f3bbf931931c..a80d9fb436f7 100644 --- a/arch/powerpc/kernel/vdso/getrandom.S +++ b/arch/powerpc/kernel/vdso/getrandom.S @@ -31,8 +31,6 @@ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif - get_realdatapage r8, r11 - addi r8, r8, VDSO_RNG_DATA_OFFSET bl CFUNC(DOTSYM(\funct)) PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) #ifdef __powerpc64__ diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index 5540d7021fa2..5333848322ca 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -32,11 +32,10 @@ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif - get_datapage r5 .ifeq \call_time - addi r5, r5, VDSO_DATA_OFFSET + get_datapage r5 VDSO_DATA_OFFSET .else - addi r4, r5, VDSO_DATA_OFFSET + get_datapage r4 VDSO_DATA_OFFSET .endif bl CFUNC(DOTSYM(\funct)) PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) diff --git a/arch/powerpc/kernel/vdso/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S index 7b41d5d256e8..1a1b0b6d681a 100644 --- a/arch/powerpc/kernel/vdso/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso/vdso32.lds.S @@ -16,7 +16,7 @@ OUTPUT_ARCH(powerpc:common) SECTIONS { - PROVIDE(_vdso_datapage = . - 2 * PAGE_SIZE); + PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE); . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vdso/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S index 9481e4b892ed..e21b5506cad6 100644 --- a/arch/powerpc/kernel/vdso/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso/vdso64.lds.S @@ -16,7 +16,7 @@ OUTPUT_ARCH(powerpc:common64) SECTIONS { - PROVIDE(_vdso_datapage = . - 2 * PAGE_SIZE); + PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE); . 
= SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vdso/vgetrandom.c b/arch/powerpc/kernel/vdso/vgetrandom.c index 5f855d45fb7b..cc79b960a541 100644 --- a/arch/powerpc/kernel/vdso/vgetrandom.c +++ b/arch/powerpc/kernel/vdso/vgetrandom.c @@ -8,7 +8,7 @@ #include <linux/types.h> ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, - size_t opaque_len, const struct vdso_rng_data *vd) + size_t opaque_len) { - return __cvdso_getrandom_data(vd, buffer, len, flags, opaque_state, opaque_len); + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); } diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7ab4e2fb28b1..b4c9decc7a75 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -265,14 +265,13 @@ SECTIONS .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; INIT_TEXT - + *(.tramp.ftrace.init); /* *.init.text might be RO so we must ensure this section ends on * a page boundary. */ . = ALIGN(PAGE_SIZE); _einittext = .; - *(.tramp.ftrace.init); } :text /* .exit.text is discarded at runtime, not link time, diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 9738adabeb1f..dc65c1391157 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -736,13 +736,18 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code, if (dn) { u64 val; - of_property_read_u64(dn, "opal-base-address", &val); + ret = of_property_read_u64(dn, "opal-base-address", &val); + if (ret) + goto out; + ret = kexec_purgatory_get_set_symbol(image, "opal_base", &val, sizeof(val), false); if (ret) goto out; - of_property_read_u64(dn, "opal-entry-address", &val); + ret = of_property_read_u64(dn, "opal-entry-address", &val); + if (ret) + goto out; ret = kexec_purgatory_get_set_symbol(image, "opal_entry", &val, sizeof(val), false); } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index ff6c38373957..d79c5d1098c0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -422,7 +422,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, - bool *writable) + bool *writable, struct page **page) { ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -437,13 +437,14 @@ kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, kvm_pfn_t pfn; pfn = (kvm_pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; - get_page(pfn_to_page(pfn)); + *page = pfn_to_page(pfn); + get_page(*page); if (writable) *writable = true; return pfn; } - return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable); + return kvm_faultin_pfn(vcpu, gfn, writing, writable, page); } EXPORT_SYMBOL_GPL(kvmppc_gpa_to_pfn); diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 4b3a8d80cfa3..5b7212edbb13 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -130,6 +130,7 @@ extern char etext[]; int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, bool iswrite) { + struct page *page; kvm_pfn_t hpaddr; u64 vpn; u64 vsid; @@ -145,7 +146,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, bool writable; /* Get host physical address for gpa */ - hpaddr = kvmppc_gpa_to_pfn(vcpu, 
orig_pte->raddr, iswrite, &writable); + hpaddr = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable, &page); if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n", orig_pte->raddr); @@ -232,7 +233,7 @@ next_pteg: pte = kvmppc_mmu_hpte_cache_next(vcpu); if (!pte) { - kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + kvm_release_page_unused(page); r = -EAGAIN; goto out; } @@ -250,7 +251,7 @@ next_pteg: kvmppc_mmu_hpte_cache_map(vcpu, pte); - kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + kvm_release_page_clean(page); out: return r; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index bc6a381b5346..be20aee6fd7d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -88,13 +88,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, struct hpte_cache *cpte; unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; unsigned long pfn; + struct page *page; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* Get host physical address for gpa */ - pfn = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable); + pfn = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable, &page); if (is_error_noslot_pfn(pfn)) { printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n", orig_pte->raddr); @@ -121,13 +122,10 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); - kvm_set_pfn_accessed(pfn); if (!orig_pte->may_write || !writable) rflags |= PP_RXRX; - else { + else mark_page_dirty(vcpu->kvm, gfn); - kvm_set_pfn_dirty(pfn); - } if (!orig_pte->may_execute) rflags |= HPTE_R_N; @@ -202,8 +200,10 @@ map_again: } out_unlock: + /* FIXME: Don't unconditionally pass unused=false. */ + kvm_release_faultin_page(kvm, page, false, + orig_pte->may_write && writable); spin_unlock(&kvm->mmu_lock); - kvm_release_pfn_clean(pfn); if (cpte) kvmppc_mmu_hpte_cache_free(cpte); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1b51b1c4713b..f305395cf26e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -603,27 +603,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - /* - * Do a fast check first, since __gfn_to_pfn_memslot doesn't - * do it with !atomic && !async, which is how we call it. - * We always ask for write permission since the common case - * is that the page is writable. - */ - if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { - write_ok = true; - } else { - /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, - writing, &write_ok, NULL); - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - page = NULL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } - } + pfn = __kvm_faultin_pfn(memslot, gfn, writing ? 
FOLL_WRITE : 0, + &write_ok, &page); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; /* * Read the PTE from the process' radix tree and use that diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 408d98f8a514..b3e6e73d6a08 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -821,7 +821,7 @@ bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing, int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long gpa, struct kvm_memory_slot *memslot, - bool writing, bool kvm_ro, + bool writing, pte_t *inserted_pte, unsigned int *levelp) { struct kvm *kvm = vcpu->kvm; @@ -829,40 +829,21 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long mmu_seq; unsigned long hva, gfn = gpa >> PAGE_SHIFT; bool upgrade_write = false; - bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; unsigned int shift, level; int ret; bool large_enable; + kvm_pfn_t pfn; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); - /* - * Do a fast check first, since __gfn_to_pfn_memslot doesn't - * do it with !atomic && !async, which is how we call it. - * We always ask for write permission since the common case - * is that the page is writable. - */ hva = gfn_to_hva_memslot(memslot, gfn); - if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) { - upgrade_write = true; - } else { - unsigned long pfn; - - /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, - writing, upgrade_p, NULL); - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - page = NULL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } - } + pfn = __kvm_faultin_pfn(memslot, gfn, writing ? 
FOLL_WRITE : 0, + &upgrade_write, &page); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; /* * Read the PTE from the process' radix tree and use that @@ -950,7 +931,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot; long ret; bool writing = !!(dsisr & DSISR_ISSTORE); - bool kvm_ro = false; /* Check for unusual errors */ if (dsisr & DSISR_UNSUPP_MMU) { @@ -1003,7 +983,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, ea, DSISR_ISSTORE | DSISR_PROTFAULT); return RESUME_GUEST; } - kvm_ro = true; } /* Failed to set the reference/change bits */ @@ -1021,7 +1000,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, /* Try to insert a pte */ ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing, - kvm_ro, NULL, NULL); + NULL, NULL); if (ret == 0 || ret == -EAGAIN) ret = RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ad8dc4ccdaab..25429905ae90 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -400,7 +400,10 @@ static inline unsigned long map_pcr_to_cap(unsigned long pcr) cap = H_GUEST_CAP_POWER9; break; case PCR_ARCH_31: - cap = H_GUEST_CAP_POWER10; + if (cpu_has_feature(CPU_FTR_P11_PVR)) + cap = H_GUEST_CAP_POWER11; + else + cap = H_GUEST_CAP_POWER10; break; default: break; @@ -415,7 +418,7 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) struct kvmppc_vcore *vc = vcpu->arch.vcore; /* We can (emulate) our own architecture version and anything older */ - if (cpu_has_feature(CPU_FTR_ARCH_31)) + if (cpu_has_feature(CPU_FTR_P11_PVR) || cpu_has_feature(CPU_FTR_ARCH_31)) host_pcr_bit = PCR_ARCH_31; else if (cpu_has_feature(CPU_FTR_ARCH_300)) host_pcr_bit = PCR_ARCH_300; @@ -2060,36 +2063,9 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) fallthrough; /* go to facility unavailable handler */ #endif - case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: { - u64 cause = vcpu->arch.hfscr >> 56; - - /* - * Only pass HFU interrupts to the L1 if the facility is - * permitted but disabled by the L1's HFSCR, otherwise - * the interrupt does not make sense to the L1 so turn - * it into a HEAI. - */ - if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) || - (vcpu->arch.nested_hfscr & (1UL << cause))) { - ppc_inst_t pinst; - vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; - - /* - * If the fetch failed, return to guest and - * try executing it again. 
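The Book3S hash, radix and HV fault paths above are converted from open-coded get_user_page_fast_only()/__gfn_to_pfn_memslot() to the common KVM fault-in helpers. A rough sketch of the resulting flow, using only the calls visible in these hunks; the function name is illustrative, and the memslot lookup, the mmu_seq recheck and the actual PTE insertion are elided.

/*
 * Sketch only: fault in a guest page, map it, then release the page
 * reference while holding kvm->mmu_lock so MMU notifiers stay in sync.
 */
static int faultin_and_map(struct kvm *kvm, struct kvm_memory_slot *memslot,
			   gfn_t gfn, bool writing)
{
	unsigned long mmu_seq = kvm->mmu_invalidate_seq;
	struct page *page;
	bool writable;
	kvm_pfn_t pfn;

	smp_rmb();

	pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0,
				&writable, &page);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	spin_lock(&kvm->mmu_lock);
	/* ... recheck mmu_seq and install the translation for pfn ... */
	kvm_release_faultin_page(kvm, page, false, writing && writable);
	spin_unlock(&kvm->mmu_lock);

	return 0;
}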
- */ - r = kvmppc_get_last_inst(vcpu, INST_GENERIC, &pinst); - vcpu->arch.emul_inst = ppc_inst_val(pinst); - if (r != EMULATE_DONE) - r = RESUME_GUEST; - else - r = RESUME_HOST; - } else { - r = RESUME_HOST; - } - + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + r = RESUME_HOST; break; - } case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; @@ -4153,8 +4129,9 @@ void kvmhv_set_l2_counters_status(int cpu, bool status) else lppaca_of(cpu).l2_counters_enable = 0; } +EXPORT_SYMBOL(kvmhv_set_l2_counters_status); -int kmvhv_counters_tracepoint_regfunc(void) +int kvmhv_counters_tracepoint_regfunc(void) { int cpu; @@ -4164,7 +4141,7 @@ int kmvhv_counters_tracepoint_regfunc(void) return 0; } -void kmvhv_counters_tracepoint_unregfunc(void) +void kvmhv_counters_tracepoint_unregfunc(void) { int cpu; @@ -4190,7 +4167,73 @@ static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu) *l1_to_l2_cs_ptr = l1_to_l2_ns; *l2_to_l1_cs_ptr = l2_to_l1_ns; *l2_runtime_agg_ptr = l2_runtime_ns; + vcpu->arch.l1_to_l2_cs = l1_to_l2_ns; + vcpu->arch.l2_to_l1_cs = l2_to_l1_ns; + vcpu->arch.l2_runtime_agg = l2_runtime_ns; +} + +u64 kvmhv_get_l1_to_l2_cs_time(void) +{ + return tb_to_ns(be64_to_cpu(get_lppaca()->l1_to_l2_cs_tb)); +} +EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time); + +u64 kvmhv_get_l2_to_l1_cs_time(void) +{ + return tb_to_ns(be64_to_cpu(get_lppaca()->l2_to_l1_cs_tb)); +} +EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time); + +u64 kvmhv_get_l2_runtime_agg(void) +{ + return tb_to_ns(be64_to_cpu(get_lppaca()->l2_runtime_tb)); +} +EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg); + +u64 kvmhv_get_l1_to_l2_cs_time_vcpu(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_arch *arch; + + vcpu = local_paca->kvm_hstate.kvm_vcpu; + if (vcpu) { + arch = &vcpu->arch; + return arch->l1_to_l2_cs; + } else { + return 0; + } } +EXPORT_SYMBOL(kvmhv_get_l1_to_l2_cs_time_vcpu); + +u64 kvmhv_get_l2_to_l1_cs_time_vcpu(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_arch *arch; + + vcpu = local_paca->kvm_hstate.kvm_vcpu; + if (vcpu) { + arch = &vcpu->arch; + return arch->l2_to_l1_cs; + } else { + return 0; + } +} +EXPORT_SYMBOL(kvmhv_get_l2_to_l1_cs_time_vcpu); + +u64 kvmhv_get_l2_runtime_agg_vcpu(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_arch *arch; + + vcpu = local_paca->kvm_hstate.kvm_vcpu; + if (vcpu) { + arch = &vcpu->arch; + return arch->l2_runtime_agg; + } else { + return 0; + } +} +EXPORT_SYMBOL(kvmhv_get_l2_runtime_agg_vcpu); #else int kvmhv_get_l2_counters_status(void) @@ -4310,6 +4353,15 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns hvregs.hdec_expiry = time_limit; /* + * hvregs has the doorbell status, so zero it here which + * enables us to receive doorbells when H_ENTER_NESTED is + * in progress for this vCPU + */ + + if (vcpu->arch.doorbell_request) + vcpu->arch.doorbell_request = 0; + + /* * When setting DEC, we must always deal with irq_work_raise * via NMI vs setting DEC. 
The problem occurs right as we * switch into guest mode if a NMI hits and sets pending work @@ -4912,7 +4964,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, lpcr &= ~LPCR_MER; } } else if (vcpu->arch.pending_exceptions || - vcpu->arch.doorbell_request || xive_interrupt_pending(vcpu)) { vcpu->arch.ret = RESUME_HOST; goto out; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 05f5220960c6..5f8c2321cfb5 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -32,7 +32,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) struct kvmppc_vcore *vc = vcpu->arch.vcore; hr->pcr = vc->pcr | PCR_MASK; - hr->dpdes = vc->dpdes; + hr->dpdes = vcpu->arch.doorbell_request; hr->hfscr = vcpu->arch.hfscr; hr->tb_offset = vc->tb_offset; hr->dawr0 = vcpu->arch.dawr0; @@ -105,7 +105,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, { struct kvmppc_vcore *vc = vcpu->arch.vcore; - hr->dpdes = vc->dpdes; + hr->dpdes = vcpu->arch.doorbell_request; hr->purr = vcpu->arch.purr; hr->spurr = vcpu->arch.spurr; hr->ic = vcpu->arch.ic; @@ -143,7 +143,7 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state * struct kvmppc_vcore *vc = vcpu->arch.vcore; vc->pcr = hr->pcr | PCR_MASK; - vc->dpdes = hr->dpdes; + vcpu->arch.doorbell_request = hr->dpdes; vcpu->arch.hfscr = hr->hfscr; vcpu->arch.dawr0 = hr->dawr0; vcpu->arch.dawrx0 = hr->dawrx0; @@ -170,7 +170,13 @@ void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, { struct kvmppc_vcore *vc = vcpu->arch.vcore; - vc->dpdes = hr->dpdes; + /* + * This L2 vCPU might have received a doorbell while H_ENTER_NESTED was being handled. + * Make sure we preserve the doorbell if it was either: + * a) Sent after H_ENTER_NESTED was called on this vCPU (arch.doorbell_request would be 1) + * b) Doorbell was not handled and L2 exited for some other reason (hr->dpdes would be 1) + */ + vcpu->arch.doorbell_request = vcpu->arch.doorbell_request | hr->dpdes; vcpu->arch.hfscr = hr->hfscr; vcpu->arch.purr = hr->purr; vcpu->arch.spurr = hr->spurr; @@ -445,6 +451,8 @@ long kvmhv_nested_init(void) if (rc == H_SUCCESS) { unsigned long capabilities = 0; + if (cpu_has_feature(CPU_FTR_P11_PVR)) + capabilities |= H_GUEST_CAP_POWER11; if (cpu_has_feature(CPU_FTR_ARCH_31)) capabilities |= H_GUEST_CAP_POWER10; if (cpu_has_feature(CPU_FTR_ARCH_300)) @@ -1527,7 +1535,6 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, unsigned long n_gpa, gpa, gfn, perm = 0UL; unsigned int shift, l1_shift, level; bool writing = !!(dsisr & DSISR_ISSTORE); - bool kvm_ro = false; long int ret; if (!gp->l1_gr_to_hr) { @@ -1607,7 +1614,6 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, ea, DSISR_ISSTORE | DSISR_PROTFAULT); return RESUME_GUEST; } - kvm_ro = true; } /* 2. 
Find the host pte for this L1 guest real address */ @@ -1629,7 +1635,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) { /* No suitable pte found -> try to insert a mapping */ ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, - writing, kvm_ro, &pte, &level); + writing, &pte, &level); if (ret == -EAGAIN) return RESUME_GUEST; else if (ret) diff --git a/arch/powerpc/kvm/book3s_hv_nestedv2.c b/arch/powerpc/kvm/book3s_hv_nestedv2.c index eeecea8f202b..e5c7ce1fb761 100644 --- a/arch/powerpc/kvm/book3s_hv_nestedv2.c +++ b/arch/powerpc/kvm/book3s_hv_nestedv2.c @@ -370,7 +370,9 @@ static int gs_msg_ops_vcpu_fill_info(struct kvmppc_gs_buff *gsb, * default to L1's PVR. */ if (!vcpu->arch.vcore->arch_compat) { - if (cpu_has_feature(CPU_FTR_ARCH_31)) + if (cpu_has_feature(CPU_FTR_P11_PVR)) + arch_compat = PVR_ARCH_31_P11; + else if (cpu_has_feature(CPU_FTR_ARCH_31)) arch_compat = PVR_ARCH_31; else if (cpu_has_feature(CPU_FTR_ARCH_300)) arch_compat = PVR_ARCH_300; diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 92f33115144b..3a6592a31a10 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -879,9 +879,8 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa, { int ret = H_PARAMETER; - struct page *uvmem_page; + struct page *page, *uvmem_page; struct kvmppc_uvmem_page_pvt *pvt; - unsigned long pfn; unsigned long gfn = gpa >> page_shift; int srcu_idx; unsigned long uvmem_pfn; @@ -901,8 +900,8 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa, retry: mutex_unlock(&kvm->arch.uvmem_lock); - pfn = gfn_to_pfn(kvm, gfn); - if (is_error_noslot_pfn(pfn)) + page = gfn_to_page(kvm, gfn); + if (!page) goto out; mutex_lock(&kvm->arch.uvmem_lock); @@ -911,16 +910,16 @@ retry: pvt = uvmem_page->zone_device_data; pvt->skip_page_out = true; pvt->remove_gfn = false; /* it continues to be a valid GFN */ - kvm_release_pfn_clean(pfn); + kvm_release_page_unused(page); goto retry; } - if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, + if (!uv_page_in(kvm->arch.lpid, page_to_pfn(page) << page_shift, gpa, 0, page_shift)) { kvmppc_gfn_shared(gfn, kvm); ret = H_SUCCESS; } - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); mutex_unlock(&kvm->arch.uvmem_lock); out: srcu_read_unlock(&kvm->srcu, srcu_idx); @@ -1083,21 +1082,21 @@ out: int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) { - unsigned long pfn; + struct page *page; int ret = U_SUCCESS; - pfn = gfn_to_pfn(kvm, gfn); - if (is_error_noslot_pfn(pfn)) + page = gfn_to_page(kvm, gfn); + if (!page) return -EFAULT; mutex_lock(&kvm->arch.uvmem_lock); if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL)) goto out; - ret = uv_page_in(kvm->arch.lpid, pfn << PAGE_SHIFT, gfn << PAGE_SHIFT, - 0, PAGE_SHIFT); + ret = uv_page_in(kvm->arch.lpid, page_to_pfn(page) << PAGE_SHIFT, + gfn << PAGE_SHIFT, 0, PAGE_SHIFT); out: - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); mutex_unlock(&kvm->arch.uvmem_lock); return (ret == U_SUCCESS) ? 
RESUME_GUEST : -EFAULT; } diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index ce79ac33e8d3..d904e13e069b 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -92,12 +92,6 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) spin_unlock(&vcpu3s->mmu_lock); } -static void free_pte_rcu(struct rcu_head *head) -{ - struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head); - kmem_cache_free(hpte_cache, pte); -} - static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); @@ -126,7 +120,7 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) spin_unlock(&vcpu3s->mmu_lock); - call_rcu(&pte->rcu_head, free_pte_rcu); + kfree_rcu(pte, rcu_head); } static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 7b8ae509328f..83bcdc80ce51 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -639,29 +639,27 @@ static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) */ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) { - struct page *hpage; + struct kvm_host_map map; u64 hpage_offset; u32 *page; - int i; + int i, r; - hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); - if (is_error_page(hpage)) + r = kvm_vcpu_map(vcpu, pte->raddr >> PAGE_SHIFT, &map); + if (r) return; hpage_offset = pte->raddr & ~PAGE_MASK; hpage_offset &= ~0xFFFULL; hpage_offset /= 4; - get_page(hpage); - page = kmap_atomic(hpage); + page = map.hva; /* patch dcbz into reserved instruction, so we trap */ for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ) page[i] &= cpu_to_be32(0xfffffff7); - kunmap_atomic(page); - put_page(hpage); + kvm_vcpu_unmap(vcpu, &map); } static bool kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 6e2ebbd8aaac..d9bf1bc3ff61 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -654,7 +654,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, } page = gfn_to_page(kvm, gfn); - if (is_error_page(page)) { + if (!page) { srcu_read_unlock(&kvm->srcu, srcu_idx); pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr); return -EINVAL; diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c664fdec75b1..e5a145b578a4 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -242,7 +242,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } -static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, +static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref, struct kvm_book3e_206_tlb_entry *gtlbe, kvm_pfn_t pfn, unsigned int wimg) { @@ -252,11 +252,7 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, /* Use guest supplied MAS2_G and MAS2_E */ ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; - /* Mark the page accessed */ - kvm_set_pfn_accessed(pfn); - - if (tlbe_is_writable(gtlbe)) - kvm_set_pfn_dirty(pfn); + return tlbe_is_writable(gtlbe); } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) @@ -326,6 +322,7 @@ static inline int kvmppc_e500_shadow_map(struct 
kvmppc_vcpu_e500 *vcpu_e500, { struct kvm_memory_slot *slot; unsigned long pfn = 0; /* silence GCC warning */ + struct page *page = NULL; unsigned long hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; @@ -337,6 +334,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned int wimg = 0; pgd_t *pgdir; unsigned long flags; + bool writable = false; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_invalidate_seq; @@ -446,7 +444,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); - pfn = gfn_to_pfn_memslot(slot, gfn); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) pr_err("%s: real page not found for gfn %lx\n", @@ -490,7 +488,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, goto out; } } - kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); @@ -499,11 +497,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_mmu_flush_icache(pfn); out: + kvm_release_faultin_page(kvm, page, !!ret, writable); spin_unlock(&kvm->mmu_lock); - - /* Drop refcount on page, so that mmu notifiers can clear it */ - kvm_release_pfn_clean(pfn); - return ret; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index b3b37ea77849..ce1d91eed231 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -612,9 +612,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 8 | 4 | 2 | 1; } break; - case KVM_CAP_PPC_RMA: - r = 0; - break; case KVM_CAP_PPC_HWRNG: r = kvmppc_hwrng_present(); break; diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index 77ebc724e6cd..35fccaa575cc 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -538,7 +538,7 @@ TRACE_EVENT_FN_COND(kvmppc_vcpu_stats, TP_printk("VCPU %d: l1_to_l2_cs_time=%llu ns l2_to_l1_cs_time=%llu ns l2_runtime=%llu ns", __entry->vcpu_id, __entry->l1_to_l2_cs, __entry->l2_to_l1_cs, __entry->l2_runtime), - kmvhv_counters_tracepoint_regfunc, kmvhv_counters_tracepoint_unregfunc + kvmhv_counters_tracepoint_regfunc, kvmhv_counters_tracepoint_unregfunc ); #endif #endif /* _TRACE_KVM_HV_H */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index acdab294b340..af97fbb3c257 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -17,7 +17,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/page.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/inst.h> static int __patch_mem(void *exec_addr, unsigned long val, void *patch_addr, bool is_dword) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index b7201ba50b2e..587c8cf1230f 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -16,7 +16,7 @@ #include <linux/sched/mm.h> #include <linux/stop_machine.h> #include <asm/cputable.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/interrupt.h> #include <asm/page.h> #include <asm/sections.h> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index e65f3fb68d06..ac3ee19531d8 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -780,8 +780,8 @@ static 
nokprobe_inline int emulate_stq(struct pt_regs *regs, unsigned long ea, #endif /* __powerpc64 */ #ifdef CONFIG_VSX -void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, - const void *mem, bool rev) +static nokprobe_inline void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, + const void *mem, bool rev) { int size, read_size; int i, j; @@ -863,11 +863,9 @@ void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg, break; } } -EXPORT_SYMBOL_GPL(emulate_vsx_load); -NOKPROBE_SYMBOL(emulate_vsx_load); -void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, - void *mem, bool rev) +static nokprobe_inline void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, + void *mem, bool rev) { int size, write_size; int i, j; @@ -955,8 +953,6 @@ void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg, break; } } -EXPORT_SYMBOL_GPL(emulate_vsx_store); -NOKPROBE_SYMBOL(emulate_vsx_store); static nokprobe_inline int do_vsx_load(struct instruction_op *op, unsigned long ea, struct pt_regs *regs, diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c index 8cd3b32f805b..1440d99630b3 100644 --- a/arch/powerpc/lib/test-code-patching.c +++ b/arch/powerpc/lib/test-code-patching.c @@ -6,7 +6,7 @@ #include <linux/vmalloc.h> #include <linux/init.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> static int __init instr_is_branch_to_addr(const u32 *instr, unsigned long addr) { diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 23c7805fb7b3..66b5b4fa1686 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -11,7 +11,7 @@ #include <asm/cpu_has_feature.h> #include <asm/sstep.h> #include <asm/ppc-opcode.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/inst.h> #define MAX_SUBTESTS 16 diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 2db167f4233f..6978344edcb4 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -25,7 +25,7 @@ #include <asm/mmu.h> #include <asm/machdep.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/sections.h> #include <mm/mmu_decl.h> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index e1eadd03f133..c8b4fa71d4a7 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -40,6 +40,7 @@ #include <linux/random.h> #include <linux/elf-randomize.h> #include <linux/of_fdt.h> +#include <linux/kfence.h> #include <asm/interrupt.h> #include <asm/processor.h> @@ -57,7 +58,7 @@ #include <asm/sections.h> #include <asm/copro.h> #include <asm/udbg.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/fadump.h> #include <asm/firmware.h> #include <asm/tm.h> @@ -66,6 +67,7 @@ #include <asm/pte-walk.h> #include <asm/asm-prototypes.h> #include <asm/ultravisor.h> +#include <asm/kfence.h> #include <mm/mmu_decl.h> @@ -123,8 +125,6 @@ EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif -static u8 *linear_map_hash_slots; -static unsigned long linear_map_hash_count; struct mmu_hash_ops mmu_hash_ops __ro_after_init; EXPORT_SYMBOL(mmu_hash_ops); @@ -273,6 +273,270 @@ void hash__tlbiel_all(unsigned int action) WARN(1, "%s called on pre-POWER7 CPU\n", __func__); } +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) 
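The debug_pagealloc/KFENCE rework that follows keeps one byte of state per linear-map (or KFENCE pool) page: bit 0x80 marks that a bolted HPTE is currently installed, and the low seven bits hold the slot value returned by hpte_insert_repeating(). A small sketch of that encoding; the macro and helper names are illustrative, the patch itself open-codes the bit operations.

#define LINEAR_MAP_SLOT_BOLTED	0x80	/* HPTE present for this page */
#define LINEAR_MAP_SLOT_MASK	0x7f	/* hash slot index (hidx) */

static inline unsigned char linear_map_slot_encode(long hidx)
{
	return (hidx & LINEAR_MAP_SLOT_MASK) | LINEAR_MAP_SLOT_BOLTED;
}

static inline bool linear_map_slot_bolted(unsigned char slot)
{
	return slot & LINEAR_MAP_SLOT_BOLTED;
}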
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx, + u8 *slots, raw_spinlock_t *lock) +{ + unsigned long hash; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); + long ret; + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; + + if (slots[idx] & 0x80) + return; + + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, + HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); + + BUG_ON (ret < 0); + raw_spin_lock(lock); + BUG_ON(slots[idx] & 0x80); + slots[idx] = ret | 0x80; + raw_spin_unlock(lock); +} + +static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long idx, + u8 *slots, raw_spinlock_t *lock) +{ + unsigned long hash, hslot, slot; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + raw_spin_lock(lock); + if (!(slots[idx] & 0x80)) { + raw_spin_unlock(lock); + return; + } + hslot = slots[idx] & 0x7f; + slots[idx] = 0; + raw_spin_unlock(lock); + if (hslot & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hslot & _PTEIDX_GROUP_IX; + mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, + mmu_linear_psize, + mmu_kernel_ssize, 0); +} +#endif + +static inline bool hash_supports_debug_pagealloc(void) +{ + unsigned long max_hash_count = ppc64_rma_size / 4; + unsigned long linear_map_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + + if (!debug_pagealloc_enabled() || linear_map_count > max_hash_count) + return false; + return true; +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); +static void hash_debug_pagealloc_alloc_slots(void) +{ + if (!hash_supports_debug_pagealloc()) + return; + + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = memblock_alloc_try_nid( + linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!linear_map_hash_slots) + panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", + __func__, linear_map_hash_count, &ppc64_rma_size); +} + +static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, + int slot) +{ + if (!debug_pagealloc_enabled() || !linear_map_hash_count) + return; + if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80; +} + +static int hash_debug_pagealloc_map_pages(struct page *page, int numpages, + int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + if (!debug_pagealloc_enabled() || !linear_map_hash_count) + return 0; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = __pa(vaddr) >> PAGE_SHIFT; + if (lmi >= linear_map_hash_count) + continue; + if (enable) + kernel_map_linear_page(vaddr, lmi, + linear_map_hash_slots, &linear_map_hash_lock); + else + kernel_unmap_linear_page(vaddr, lmi, + linear_map_hash_slots, &linear_map_hash_lock); + } + local_irq_restore(flags); + return 0; +} + +#else /* CONFIG_DEBUG_PAGEALLOC */ +static inline void hash_debug_pagealloc_alloc_slots(void) {} +static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) {} +static 
int __maybe_unused +hash_debug_pagealloc_map_pages(struct page *page, int numpages, int enable) +{ + return 0; +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +#ifdef CONFIG_KFENCE +static u8 *linear_map_kf_hash_slots; +static unsigned long linear_map_kf_hash_count; +static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock); + +static phys_addr_t kfence_pool; + +static inline void hash_kfence_alloc_pool(void) +{ + if (!kfence_early_init_enabled()) + goto err; + + /* allocate linear map for kfence within RMA region */ + linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT; + linear_map_kf_hash_slots = memblock_alloc_try_nid( + linear_map_kf_hash_count, 1, + MEMBLOCK_LOW_LIMIT, ppc64_rma_size, + NUMA_NO_NODE); + if (!linear_map_kf_hash_slots) { + pr_err("%s: memblock for linear map (%lu) failed\n", __func__, + linear_map_kf_hash_count); + goto err; + } + + /* allocate kfence pool early */ + kfence_pool = memblock_phys_alloc_range(KFENCE_POOL_SIZE, PAGE_SIZE, + MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ANYWHERE); + if (!kfence_pool) { + pr_err("%s: memblock for kfence pool (%lu) failed\n", __func__, + KFENCE_POOL_SIZE); + memblock_free(linear_map_kf_hash_slots, + linear_map_kf_hash_count); + linear_map_kf_hash_count = 0; + goto err; + } + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + + return; +err: + pr_info("Disabling kfence\n"); + disable_kfence(); +} + +static inline void hash_kfence_map_pool(void) +{ + unsigned long kfence_pool_start, kfence_pool_end; + unsigned long prot = pgprot_val(PAGE_KERNEL); + + if (!kfence_pool) + return; + + kfence_pool_start = (unsigned long) __va(kfence_pool); + kfence_pool_end = kfence_pool_start + KFENCE_POOL_SIZE; + __kfence_pool = (char *) kfence_pool_start; + BUG_ON(htab_bolt_mapping(kfence_pool_start, kfence_pool_end, + kfence_pool, prot, mmu_linear_psize, + mmu_kernel_ssize)); + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); +} + +static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) +{ + unsigned long vaddr = (unsigned long) __va(paddr); + unsigned long lmi = (vaddr - (unsigned long)__kfence_pool) + >> PAGE_SHIFT; + + if (!kfence_pool) + return; + BUG_ON(!is_kfence_address((void *)vaddr)); + BUG_ON(lmi >= linear_map_kf_hash_count); + linear_map_kf_hash_slots[lmi] = slot | 0x80; +} + +static int hash_kfence_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + WARN_ON_ONCE(!linear_map_kf_hash_count); + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = (vaddr - (unsigned long)__kfence_pool) >> PAGE_SHIFT; + + /* Ideally this should never happen */ + if (lmi >= linear_map_kf_hash_count) { + WARN_ON_ONCE(1); + continue; + } + + if (enable) + kernel_map_linear_page(vaddr, lmi, + linear_map_kf_hash_slots, + &linear_map_kf_hash_lock); + else + kernel_unmap_linear_page(vaddr, lmi, + linear_map_kf_hash_slots, + &linear_map_kf_hash_lock); + } + local_irq_restore(flags); + return 0; +} +#else +static inline void hash_kfence_alloc_pool(void) {} +static inline void hash_kfence_map_pool(void) {} +static inline void hash_kfence_add_slot(phys_addr_t paddr, int slot) {} +static int __maybe_unused +hash_kfence_map_pages(struct page *page, int numpages, int enable) +{ + return 0; +} +#endif + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) +int hash__kernel_map_pages(struct page *page, int numpages, int enable) +{ + void *vaddr = page_address(page); + + if (is_kfence_address(vaddr)) + return hash_kfence_map_pages(page, 
numpages, enable); + else + return hash_debug_pagealloc_map_pages(page, numpages, enable); +} + +static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) +{ + if (is_kfence_address(__va(paddr))) + hash_kfence_add_slot(paddr, slot); + else + hash_debug_pagealloc_add_slot(paddr, slot); +} +#else +static void hash_linear_map_add_slot(phys_addr_t paddr, int slot) {} +#endif + /* * 'R' and 'C' update notes: * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* @@ -431,9 +695,8 @@ repeat: break; cond_resched(); - if (debug_pagealloc_enabled_or_kfence() && - (paddr >> PAGE_SHIFT) < linear_map_hash_count) - linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; + /* add slot info in debug_pagealloc / kfence linear map */ + hash_linear_map_add_slot(paddr, ret); } return ret < 0 ? ret : 0; } @@ -814,7 +1077,7 @@ static void __init htab_init_page_sizes(void) bool aligned = true; init_hpte_page_sizes(); - if (!debug_pagealloc_enabled_or_kfence()) { + if (!hash_supports_debug_pagealloc() && !kfence_early_init_enabled()) { /* * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default @@ -1134,16 +1397,8 @@ static void __init htab_initialize(void) prot = pgprot_val(PAGE_KERNEL); - if (debug_pagealloc_enabled_or_kfence()) { - linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = memblock_alloc_try_nid( - linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT, - ppc64_rma_size, NUMA_NO_NODE); - if (!linear_map_hash_slots) - panic("%s: Failed to allocate %lu bytes max_addr=%pa\n", - __func__, linear_map_hash_count, &ppc64_rma_size); - } - + hash_debug_pagealloc_alloc_slots(); + hash_kfence_alloc_pool(); /* create bolted the linear mapping in the hash table */ for_each_mem_range(i, &base, &end) { size = end - base; @@ -1160,6 +1415,7 @@ static void __init htab_initialize(void) BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); } + hash_kfence_map_pool(); memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); /* @@ -2120,82 +2376,6 @@ void hpt_do_stress(unsigned long ea, unsigned long hpte_group) } } -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) -static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); - -static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); - long ret; - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - - /* Don't create HPTE entries for bad address */ - if (!vsid) - return; - - if (linear_map_hash_slots[lmi] & 0x80) - return; - - ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, - HPTE_V_BOLTED, - mmu_linear_psize, mmu_kernel_ssize); - - BUG_ON (ret < 0); - raw_spin_lock(&linear_map_hash_lock); - BUG_ON(linear_map_hash_slots[lmi] & 0x80); - linear_map_hash_slots[lmi] = ret | 0x80; - raw_spin_unlock(&linear_map_hash_lock); -} - -static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) -{ - unsigned long hash, hidx, slot; - unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); - unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - - hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - raw_spin_lock(&linear_map_hash_lock); - if (!(linear_map_hash_slots[lmi] & 0x80)) { - raw_spin_unlock(&linear_map_hash_lock); - return; - } - hidx 
= linear_map_hash_slots[lmi] & 0x7f; - linear_map_hash_slots[lmi] = 0; - raw_spin_unlock(&linear_map_hash_lock); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, - mmu_linear_psize, - mmu_kernel_ssize, 0); -} - -int hash__kernel_map_pages(struct page *page, int numpages, int enable) -{ - unsigned long flags, vaddr, lmi; - int i; - - local_irq_save(flags); - for (i = 0; i < numpages; i++, page++) { - vaddr = (unsigned long)page_address(page); - lmi = __pa(vaddr) >> PAGE_SHIFT; - if (lmi >= linear_map_hash_count) - continue; - if (enable) - kernel_map_linear_page(vaddr, lmi); - else - kernel_unmap_linear_page(vaddr, lmi); - } - local_irq_restore(flags); - return 0; -} -#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */ - void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 5a4a75369043..374542528080 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -37,6 +37,19 @@ EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; EXPORT_SYMBOL(__pmd_frag_size_shift); +#ifdef CONFIG_KFENCE +extern bool kfence_early_init; +static int __init parse_kfence_early_init(char *arg) +{ + int val; + + if (get_option(&arg, &val)) + kfence_early_init = !!val; + return 0; +} +early_param("kfence.sample_interval", parse_kfence_early_init); +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b0d927009af8..311e2112d782 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -363,18 +363,6 @@ static int __meminit create_physical_mapping(unsigned long start, } #ifdef CONFIG_KFENCE -static bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL; - -static int __init parse_kfence_early_init(char *arg) -{ - int val; - - if (get_option(&arg, &val)) - kfence_early_init = !!val; - return 0; -} -early_param("kfence.sample_interval", parse_kfence_early_init); - static inline phys_addr_t alloc_kfence_pool(void) { phys_addr_t kfence_pool; diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index f2708c8629a5..6b783552403c 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -24,7 +24,7 @@ #include <linux/pgtable.h> #include <asm/udbg.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include "internal.h" diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index 87307d0fc3b8..bc9a39821d1c 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -633,6 +633,20 @@ return_addr: } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); +#ifdef CONFIG_HUGETLB_PAGE +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + + return shift_to_mmu_psize(huge_page_shift(hstate)); +} +#else +static int file_to_psize(struct file *file) +{ + return 0; +} +#endif + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, @@ -640,11 +654,17 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags, vm_flags_t vm_flags) { + unsigned int psize; + if 
(radix_enabled()) return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); - return slice_get_unmapped_area(addr, len, flags, - mm_ctx_user_psize(&current->mm->context), 0); + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(&current->mm->context); + + return slice_get_unmapped_area(addr, len, flags, psize, 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -654,11 +674,17 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags, vm_flags_t vm_flags) { + unsigned int psize; + if (radix_enabled()) return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags); - return slice_get_unmapped_area(addr0, len, flags, - mm_ctx_user_psize(&current->mm->context), 1); + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(&current->mm->context); + + return slice_get_unmapped_area(addr0, len, flags, psize, 1); } unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -788,20 +814,4 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); } - -static int file_to_psize(struct file *file) -{ - struct hstate *hstate = hstate_file(file); - return shift_to_mmu_psize(huge_page_shift(hstate)); -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - if (radix_enabled()) - return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); - - return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); -} #endif diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 81c77ddce2e3..c156fe0d53c3 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -439,10 +439,16 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, /* * The kernel should never take an execute fault nor should it * take a page fault to a kernel address or a page fault to a user - * address outside of dedicated places + * address outside of dedicated places. + * + * Rather than kfence directly reporting false negatives, search whether + * the NIP belongs to the fixup table for cases where fault could come + * from functions like copy_from_kernel_nofault().
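For context, a minimal kernel-side sketch of the kind of caller the new exception-table check protects (assumes kernel context; peek_kernel_long() is a hypothetical helper, not part of this patch). copy_from_kernel_nofault() deliberately tolerates faulting on a bad address and recovers through an exception-table fixup, so a KFENCE report for such a NIP would be noise:

#include <linux/uaccess.h>

/*
 * Probe an arbitrary kernel address without risking an oops: a bad access
 * is caught by the exception-table fixup and reported as -EFAULT -- the
 * same fixup entry that search_exception_tables() finds in the hunk above.
 */
static long peek_kernel_long(const void *addr, unsigned long *val)
{
	return copy_from_kernel_nofault(val, addr, sizeof(*val));
}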
*/ if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) { - if (kfence_handle_page_fault(address, is_write, regs)) + if (is_kfence_address((void *)address) && + !search_exception_tables(instruction_pointer(regs)) && + kfence_handle_page_fault(address, is_write, regs)) return 0; return SIGSEGV; diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 2978fcbe307e..745097554bea 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -33,6 +33,7 @@ bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP); bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP); #ifdef CONFIG_KFENCE bool __ro_after_init kfence_disabled; +bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL; #endif static int __init parse_nosmep(char *p) diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c index aa9aa11927b2..03666d790a53 100644 --- a/arch/powerpc/mm/kasan/init_32.c +++ b/arch/powerpc/mm/kasan/init_32.c @@ -7,7 +7,7 @@ #include <linux/memblock.h> #include <linux/sched/task.h> #include <asm/pgalloc.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <mm/mmu_decl.h> static pgprot_t __init kasan_prot_ro(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 1221c561b43a..c7708c8fad29 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -26,7 +26,7 @@ #include <asm/svm.h> #include <asm/mmzone.h> #include <asm/ftrace.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/setup.h> #include <asm/fixmap.h> diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c index 1beae802bb1c..6d10c6d8be71 100644 --- a/arch/powerpc/mm/nohash/44x.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -24,7 +24,7 @@ #include <asm/mmu.h> #include <asm/page.h> #include <asm/cacheflush.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/smp.h> #include <mm/mmu_decl.h> diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index ad2a7c26f2a0..062e8785c1bb 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -10,7 +10,7 @@ #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/dma.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <mm/mmu_decl.h> diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index b653a7be4cb1..0a650742f3a0 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -37,7 +37,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/cputhreads.h> #include <asm/hugetlb.h> #include <asm/paca.h> diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c index d26656b07b72..4f925adf2695 100644 --- a/arch/powerpc/mm/nohash/tlb_64e.c +++ b/arch/powerpc/mm/nohash/tlb_64e.c @@ -24,7 +24,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/cputhreads.h> #include <mm/mmu_decl.h> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 7316396e452d..61df5aed7989 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -398,7 +398,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); + pte = 
pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); BUG_ON(!pte); assert_spin_locked(ptl); pte_unmap(pte); diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index cdea5dccaefe..6beacaec63d3 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -12,6 +12,7 @@ #include <asm/types.h> #include <asm/ppc-opcode.h> +#include <linux/build_bug.h> #ifdef CONFIG_PPC64_ELF_ABI_V1 #define FUNCTION_DESCR_SIZE 24 @@ -21,6 +22,9 @@ #define CTX_NIA(ctx) ((unsigned long)ctx->idx * 4) +#define SZL sizeof(unsigned long) +#define BPF_INSN_SAFETY 64 + #define PLANT_INSTR(d, idx, instr) \ do { if (d) { (d)[idx] = instr; } idx++; } while (0) #define EMIT(instr) PLANT_INSTR(image, ctx->idx, instr) @@ -81,6 +85,18 @@ EMIT(PPC_RAW_ORI(d, d, (uintptr_t)(i) & \ 0xffff)); \ } } while (0) +#define PPC_LI_ADDR PPC_LI64 + +#ifndef CONFIG_PPC_KERNEL_PCREL +#define PPC64_LOAD_PACA() \ + EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))) +#else +#define PPC64_LOAD_PACA() do {} while (0) +#endif +#else +#define PPC_LI64(d, i) BUILD_BUG() +#define PPC_LI_ADDR PPC_LI32 +#define PPC64_LOAD_PACA() BUILD_BUG() #endif /* @@ -165,6 +181,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code u32 *addrs, int pass, bool extra_pass); void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx); void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx); +void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx); void bpf_jit_realloc_regs(struct codegen_context *ctx); int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, long exit_addr); diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 2a36cc2e7e9e..2991bb171a9b 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -18,15 +18,85 @@ #include <linux/bpf.h> #include <asm/kprobes.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include "bpf_jit.h" +/* These offsets are from bpf prog end and stay the same across progs */ +static int bpf_jit_ool_stub, bpf_jit_long_branch_stub; + static void bpf_jit_fill_ill_insns(void *area, unsigned int size) { memset32(area, BREAKPOINT_INSTRUCTION, size / 4); } +void dummy_tramp(void); + +asm ( +" .pushsection .text, \"ax\", @progbits ;" +" .global dummy_tramp ;" +" .type dummy_tramp, @function ;" +"dummy_tramp: ;" +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +" blr ;" +#else +/* LR is always in r11, so we don't need a 'mflr r11' here */ +" mtctr 11 ;" +" mtlr 0 ;" +" bctr ;" +#endif +" .size dummy_tramp, .-dummy_tramp ;" +" .popsection ;" +); + +void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx) +{ + int ool_stub_idx, long_branch_stub_idx; + + /* + * Out-of-line stub: + * mflr r0 + * [b|bl] tramp + * mtlr r0 // only with CONFIG_PPC_FTRACE_OUT_OF_LINE + * b bpf_func + 4 + */ + ool_stub_idx = ctx->idx; + EMIT(PPC_RAW_MFLR(_R0)); + EMIT(PPC_RAW_NOP()); + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + EMIT(PPC_RAW_MTLR(_R0)); + WARN_ON_ONCE(!is_offset_in_branch_range(4 - (long)ctx->idx * 4)); + EMIT(PPC_RAW_BRANCH(4 - (long)ctx->idx * 4)); + + /* + * Long branch stub: + * .long <dummy_tramp_addr> + * mflr r11 + * bcl 20,31,$+4 + * mflr r12 + * ld r12, -8-SZL(r12) + * mtctr r12 + * mtlr r11 // needed to retain ftrace ABI + * bctr + */ + if (image) + *((unsigned long *)&image[ctx->idx]) = (unsigned long)dummy_tramp; + ctx->idx += SZL / 4; + long_branch_stub_idx = ctx->idx; + EMIT(PPC_RAW_MFLR(_R11)); + 
EMIT(PPC_RAW_BCL4()); + EMIT(PPC_RAW_MFLR(_R12)); + EMIT(PPC_RAW_LL(_R12, _R12, -8-SZL)); + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_MTLR(_R11)); + EMIT(PPC_RAW_BCTR()); + + if (!bpf_jit_ool_stub) { + bpf_jit_ool_stub = (ctx->idx - ool_stub_idx) * 4; + bpf_jit_long_branch_stub = (ctx->idx - long_branch_stub_idx) * 4; + } +} + int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, long exit_addr) { if (!exit_addr || is_offset_in_branch_range(exit_addr - (ctx->idx * 4))) { @@ -222,7 +292,7 @@ skip_init_ctx: fp->bpf_func = (void *)fimage; fp->jited = 1; - fp->jited_len = proglen + FUNCTION_DESCR_SIZE; + fp->jited_len = cgctx.idx * 4 + FUNCTION_DESCR_SIZE; if (!fp->is_func || extra_pass) { if (bpf_jit_binary_pack_finalize(fhdr, hdr)) { @@ -369,3 +439,778 @@ bool bpf_jit_supports_far_kfunc_call(void) { return IS_ENABLED(CONFIG_PPC64); } + +void *arch_alloc_bpf_trampoline(unsigned int size) +{ + return bpf_prog_pack_alloc(size, bpf_jit_fill_ill_insns); +} + +void arch_free_bpf_trampoline(void *image, unsigned int size) +{ + bpf_prog_pack_free(image, size); +} + +int arch_protect_bpf_trampoline(void *image, unsigned int size) +{ + return 0; +} + +static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ctx, + struct bpf_tramp_link *l, int regs_off, int retval_off, + int run_ctx_off, bool save_ret) +{ + struct bpf_prog *p = l->link.prog; + ppc_inst_t branch_insn; + u32 jmp_idx; + int ret = 0; + + /* Save cookie */ + if (IS_ENABLED(CONFIG_PPC64)) { + PPC_LI64(_R3, l->cookie); + EMIT(PPC_RAW_STD(_R3, _R1, run_ctx_off + offsetof(struct bpf_tramp_run_ctx, + bpf_cookie))); + } else { + PPC_LI32(_R3, l->cookie >> 32); + PPC_LI32(_R4, l->cookie); + EMIT(PPC_RAW_STW(_R3, _R1, + run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie))); + EMIT(PPC_RAW_STW(_R4, _R1, + run_ctx_off + offsetof(struct bpf_tramp_run_ctx, bpf_cookie) + 4)); + } + + /* __bpf_prog_enter(p, &bpf_tramp_run_ctx) */ + PPC_LI_ADDR(_R3, p); + EMIT(PPC_RAW_MR(_R25, _R3)); + EMIT(PPC_RAW_ADDI(_R4, _R1, run_ctx_off)); + ret = bpf_jit_emit_func_call_rel(image, ro_image, ctx, + (unsigned long)bpf_trampoline_enter(p)); + if (ret) + return ret; + + /* Remember prog start time returned by __bpf_prog_enter */ + EMIT(PPC_RAW_MR(_R26, _R3)); + + /* + * if (__bpf_prog_enter(p) == 0) + * goto skip_exec_of_prog; + * + * Emit a nop to be later patched with conditional branch, once offset is known + */ + EMIT(PPC_RAW_CMPLI(_R3, 0)); + jmp_idx = ctx->idx; + EMIT(PPC_RAW_NOP()); + + /* p->bpf_func(ctx) */ + EMIT(PPC_RAW_ADDI(_R3, _R1, regs_off)); + if (!p->jited) + PPC_LI_ADDR(_R4, (unsigned long)p->insnsi); + if (!create_branch(&branch_insn, (u32 *)&ro_image[ctx->idx], (unsigned long)p->bpf_func, + BRANCH_SET_LINK)) { + if (image) + image[ctx->idx] = ppc_inst_val(branch_insn); + ctx->idx++; + } else { + EMIT(PPC_RAW_LL(_R12, _R25, offsetof(struct bpf_prog, bpf_func))); + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_BCTRL()); + } + + if (save_ret) + EMIT(PPC_RAW_STL(_R3, _R1, retval_off)); + + /* Fix up branch */ + if (image) { + if (create_cond_branch(&branch_insn, &image[jmp_idx], + (unsigned long)&image[ctx->idx], COND_EQ << 16)) + return -EINVAL; + image[jmp_idx] = ppc_inst_val(branch_insn); + } + + /* __bpf_prog_exit(p, start_time, &bpf_tramp_run_ctx) */ + EMIT(PPC_RAW_MR(_R3, _R25)); + EMIT(PPC_RAW_MR(_R4, _R26)); + EMIT(PPC_RAW_ADDI(_R5, _R1, run_ctx_off)); + ret = bpf_jit_emit_func_call_rel(image, ro_image, ctx, + (unsigned long)bpf_trampoline_exit(p)); + + return ret; +} + +static int 
invoke_bpf_mod_ret(u32 *image, u32 *ro_image, struct codegen_context *ctx, + struct bpf_tramp_links *tl, int regs_off, int retval_off, + int run_ctx_off, u32 *branches) +{ + int i; + + /* + * The first fmod_ret program will receive a garbage return value. + * Set this to 0 to avoid confusing the program. + */ + EMIT(PPC_RAW_LI(_R3, 0)); + EMIT(PPC_RAW_STL(_R3, _R1, retval_off)); + for (i = 0; i < tl->nr_links; i++) { + if (invoke_bpf_prog(image, ro_image, ctx, tl->links[i], regs_off, retval_off, + run_ctx_off, true)) + return -EINVAL; + + /* + * mod_ret prog stored return value after prog ctx. Emit: + * if (*(u64 *)(ret_val) != 0) + * goto do_fexit; + */ + EMIT(PPC_RAW_LL(_R3, _R1, retval_off)); + EMIT(PPC_RAW_CMPLI(_R3, 0)); + + /* + * Save the location of the branch and generate a nop, which is + * replaced with a conditional jump once do_fexit (i.e. the + * start of the fexit invocation) is finalized. + */ + branches[i] = ctx->idx; + EMIT(PPC_RAW_NOP()); + } + + return 0; +} + +static void bpf_trampoline_setup_tail_call_cnt(u32 *image, struct codegen_context *ctx, + int func_frame_offset, int r4_off) +{ + if (IS_ENABLED(CONFIG_PPC64)) { + /* See bpf_jit_stack_tailcallcnt() */ + int tailcallcnt_offset = 6 * 8; + + EMIT(PPC_RAW_LL(_R3, _R1, func_frame_offset - tailcallcnt_offset)); + EMIT(PPC_RAW_STL(_R3, _R1, -tailcallcnt_offset)); + } else { + /* See bpf_jit_stack_offsetof() and BPF_PPC_TC */ + EMIT(PPC_RAW_LL(_R4, _R1, r4_off)); + } +} + +static void bpf_trampoline_restore_tail_call_cnt(u32 *image, struct codegen_context *ctx, + int func_frame_offset, int r4_off) +{ + if (IS_ENABLED(CONFIG_PPC64)) { + /* See bpf_jit_stack_tailcallcnt() */ + int tailcallcnt_offset = 6 * 8; + + EMIT(PPC_RAW_LL(_R3, _R1, -tailcallcnt_offset)); + EMIT(PPC_RAW_STL(_R3, _R1, func_frame_offset - tailcallcnt_offset)); + } else { + /* See bpf_jit_stack_offsetof() and BPF_PPC_TC */ + EMIT(PPC_RAW_STL(_R4, _R1, r4_off)); + } +} + +static void bpf_trampoline_save_args(u32 *image, struct codegen_context *ctx, int func_frame_offset, + int nr_regs, int regs_off) +{ + int param_save_area_offset; + + param_save_area_offset = func_frame_offset; /* the two frames we alloted */ + param_save_area_offset += STACK_FRAME_MIN_SIZE; /* param save area is past frame header */ + + for (int i = 0; i < nr_regs; i++) { + if (i < 8) { + EMIT(PPC_RAW_STL(_R3 + i, _R1, regs_off + i * SZL)); + } else { + EMIT(PPC_RAW_LL(_R3, _R1, param_save_area_offset + i * SZL)); + EMIT(PPC_RAW_STL(_R3, _R1, regs_off + i * SZL)); + } + } +} + +/* Used when restoring just the register parameters when returning back */ +static void bpf_trampoline_restore_args_regs(u32 *image, struct codegen_context *ctx, + int nr_regs, int regs_off) +{ + for (int i = 0; i < nr_regs && i < 8; i++) + EMIT(PPC_RAW_LL(_R3 + i, _R1, regs_off + i * SZL)); +} + +/* Used when we call into the traced function. 
Replicate parameter save area */ +static void bpf_trampoline_restore_args_stack(u32 *image, struct codegen_context *ctx, + int func_frame_offset, int nr_regs, int regs_off) +{ + int param_save_area_offset; + + param_save_area_offset = func_frame_offset; /* the two frames we alloted */ + param_save_area_offset += STACK_FRAME_MIN_SIZE; /* param save area is past frame header */ + + for (int i = 8; i < nr_regs; i++) { + EMIT(PPC_RAW_LL(_R3, _R1, param_save_area_offset + i * SZL)); + EMIT(PPC_RAW_STL(_R3, _R1, STACK_FRAME_MIN_SIZE + i * SZL)); + } + bpf_trampoline_restore_args_regs(image, ctx, nr_regs, regs_off); +} + +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image, + void *rw_image_end, void *ro_image, + const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + int regs_off, nregs_off, ip_off, run_ctx_off, retval_off, nvr_off, alt_lr_off, r4_off = 0; + int i, ret, nr_regs, bpf_frame_size = 0, bpf_dummy_frame_size = 0, func_frame_offset; + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct codegen_context codegen_ctx, *ctx; + u32 *image = (u32 *)rw_image; + ppc_inst_t branch_insn; + u32 *branches = NULL; + bool save_ret; + + if (IS_ENABLED(CONFIG_PPC32)) + return -EOPNOTSUPP; + + nr_regs = m->nr_args; + /* Extra registers for struct arguments */ + for (i = 0; i < m->nr_args; i++) + if (m->arg_size[i] > SZL) + nr_regs += round_up(m->arg_size[i], SZL) / SZL - 1; + + if (nr_regs > MAX_BPF_FUNC_ARGS) + return -EOPNOTSUPP; + + ctx = &codegen_ctx; + memset(ctx, 0, sizeof(*ctx)); + + /* + * Generated stack layout: + * + * func prev back chain [ back chain ] + * [ ] + * bpf prog redzone/tailcallcnt [ ... ] 64 bytes (64-bit powerpc) + * [ ] -- + * LR save area [ r0 save (64-bit) ] | header + * [ r0 save (32-bit) ] | + * dummy frame for unwind [ back chain 1 ] -- + * [ padding ] align stack frame + * r4_off [ r4 (tailcallcnt) ] optional - 32-bit powerpc + * alt_lr_off [ real lr (ool stub)] optional - actual lr + * [ r26 ] + * nvr_off [ r25 ] nvr save area + * retval_off [ return value ] + * [ reg argN ] + * [ ... ] + * regs_off [ reg_arg1 ] prog ctx context + * nregs_off [ args count ] + * ip_off [ traced function ] + * [ ... ] + * run_ctx_off [ bpf_tramp_run_ctx ] + * [ reg argN ] + * [ ... ] + * param_save_area [ reg_arg1 ] min 8 doublewords, per ABI + * [ TOC save (64-bit) ] -- + * [ LR save (64-bit) ] | header + * [ LR save (32-bit) ] | + * bpf trampoline frame [ back chain 2 ] -- + * + */ + + /* Minimum stack frame header */ + bpf_frame_size = STACK_FRAME_MIN_SIZE; + + /* + * Room for parameter save area. + * + * As per the ABI, this is required if we call into the traced + * function (BPF_TRAMP_F_CALL_ORIG): + * - if the function takes more than 8 arguments for the rest to spill onto the stack + * - or, if the function has variadic arguments + * - or, if this functions's prototype was not available to the caller + * + * Reserve space for at least 8 registers for now. This can be optimized later. + */ + bpf_frame_size += (nr_regs > 8 ? 
nr_regs : 8) * SZL; + + /* Room for struct bpf_tramp_run_ctx */ + run_ctx_off = bpf_frame_size; + bpf_frame_size += round_up(sizeof(struct bpf_tramp_run_ctx), SZL); + + /* Room for IP address argument */ + ip_off = bpf_frame_size; + if (flags & BPF_TRAMP_F_IP_ARG) + bpf_frame_size += SZL; + + /* Room for args count */ + nregs_off = bpf_frame_size; + bpf_frame_size += SZL; + + /* Room for args */ + regs_off = bpf_frame_size; + bpf_frame_size += nr_regs * SZL; + + /* Room for return value of func_addr or fentry prog */ + retval_off = bpf_frame_size; + save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); + if (save_ret) + bpf_frame_size += SZL; + + /* Room for nvr save area */ + nvr_off = bpf_frame_size; + bpf_frame_size += 2 * SZL; + + /* Optional save area for actual LR in case of ool ftrace */ + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) { + alt_lr_off = bpf_frame_size; + bpf_frame_size += SZL; + } + + if (IS_ENABLED(CONFIG_PPC32)) { + if (nr_regs < 2) { + r4_off = bpf_frame_size; + bpf_frame_size += SZL; + } else { + r4_off = regs_off + SZL; + } + } + + /* Padding to align stack frame, if any */ + bpf_frame_size = round_up(bpf_frame_size, SZL * 2); + + /* Dummy frame size for proper unwind - includes 64-bytes red zone for 64-bit powerpc */ + bpf_dummy_frame_size = STACK_FRAME_MIN_SIZE + 64; + + /* Offset to the traced function's stack frame */ + func_frame_offset = bpf_dummy_frame_size + bpf_frame_size; + + /* Create dummy frame for unwind, store original return value */ + EMIT(PPC_RAW_STL(_R0, _R1, PPC_LR_STKOFF)); + /* Protect red zone where tail call count goes */ + EMIT(PPC_RAW_STLU(_R1, _R1, -bpf_dummy_frame_size)); + + /* Create our stack frame */ + EMIT(PPC_RAW_STLU(_R1, _R1, -bpf_frame_size)); + + /* 64-bit: Save TOC and load kernel TOC */ + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) { + EMIT(PPC_RAW_STD(_R2, _R1, 24)); + PPC64_LOAD_PACA(); + } + + /* 32-bit: save tail call count in r4 */ + if (IS_ENABLED(CONFIG_PPC32) && nr_regs < 2) + EMIT(PPC_RAW_STL(_R4, _R1, r4_off)); + + bpf_trampoline_save_args(image, ctx, func_frame_offset, nr_regs, regs_off); + + /* Save our return address */ + EMIT(PPC_RAW_MFLR(_R3)); + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + EMIT(PPC_RAW_STL(_R3, _R1, alt_lr_off)); + else + EMIT(PPC_RAW_STL(_R3, _R1, bpf_frame_size + PPC_LR_STKOFF)); + + /* + * Save ip address of the traced function. + * We could recover this from LR, but we will need to address for OOL trampoline, + * and optional GEP area. 
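As a worked example of the frame-size arithmetic above, the following standalone user-space C program reproduces the offset computation for a small trampoline. It is illustrative only: STACK_FRAME_MIN_SIZE and the size of struct bpf_tramp_run_ctx are assumed values, not taken from this patch.

#include <stdbool.h>
#include <stdio.h>

#define SZL			sizeof(unsigned long)	/* 8 on 64-bit powerpc */
#define STACK_FRAME_MIN_SIZE	32			/* assumed ELFv2 minimum frame */
#define RUN_CTX_SIZE		24			/* assumed sizeof(struct bpf_tramp_run_ctx) */
#define ROUND_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long nr_regs = 3;	/* traced function takes three register args */
	bool ip_arg = true;		/* BPF_TRAMP_F_IP_ARG */
	bool save_ret = true;		/* BPF_TRAMP_F_CALL_ORIG or BPF_TRAMP_F_RET_FENTRY_RET */
	unsigned long sz = STACK_FRAME_MIN_SIZE;

	sz += (nr_regs > 8 ? nr_regs : 8) * SZL;	/* parameter save area */
	unsigned long run_ctx_off = sz;	sz += ROUND_UP(RUN_CTX_SIZE, SZL);
	unsigned long ip_off = sz;	if (ip_arg) sz += SZL;
	unsigned long nregs_off = sz;	sz += SZL;
	unsigned long regs_off = sz;	sz += nr_regs * SZL;
	unsigned long retval_off = sz;	if (save_ret) sz += SZL;
	unsigned long nvr_off = sz;	sz += 2 * SZL;
	sz = ROUND_UP(sz, SZL * 2);			/* align the frame */

	printf("run_ctx=%lu ip=%lu nregs=%lu regs=%lu retval=%lu nvr=%lu frame=%lu\n",
	       run_ctx_off, ip_off, nregs_off, regs_off, retval_off, nvr_off, sz);
	return 0;
}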
+ */ + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) || flags & BPF_TRAMP_F_IP_ARG) { + EMIT(PPC_RAW_LWZ(_R4, _R3, 4)); + EMIT(PPC_RAW_SLWI(_R4, _R4, 6)); + EMIT(PPC_RAW_SRAWI(_R4, _R4, 6)); + EMIT(PPC_RAW_ADD(_R3, _R3, _R4)); + EMIT(PPC_RAW_ADDI(_R3, _R3, 4)); + } + + if (flags & BPF_TRAMP_F_IP_ARG) + EMIT(PPC_RAW_STL(_R3, _R1, ip_off)); + + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) + /* Fake our LR for unwind */ + EMIT(PPC_RAW_STL(_R3, _R1, bpf_frame_size + PPC_LR_STKOFF)); + + /* Save function arg count -- see bpf_get_func_arg_cnt() */ + EMIT(PPC_RAW_LI(_R3, nr_regs)); + EMIT(PPC_RAW_STL(_R3, _R1, nregs_off)); + + /* Save nv regs */ + EMIT(PPC_RAW_STL(_R25, _R1, nvr_off)); + EMIT(PPC_RAW_STL(_R26, _R1, nvr_off + SZL)); + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + PPC_LI_ADDR(_R3, (unsigned long)im); + ret = bpf_jit_emit_func_call_rel(image, ro_image, ctx, + (unsigned long)__bpf_tramp_enter); + if (ret) + return ret; + } + + for (i = 0; i < fentry->nr_links; i++) + if (invoke_bpf_prog(image, ro_image, ctx, fentry->links[i], regs_off, retval_off, + run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) + return -EINVAL; + + if (fmod_ret->nr_links) { + branches = kcalloc(fmod_ret->nr_links, sizeof(u32), GFP_KERNEL); + if (!branches) + return -ENOMEM; + + if (invoke_bpf_mod_ret(image, ro_image, ctx, fmod_ret, regs_off, retval_off, + run_ctx_off, branches)) { + ret = -EINVAL; + goto cleanup; + } + } + + /* Call the traced function */ + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* + * The address in LR save area points to the correct point in the original function + * with both PPC_FTRACE_OUT_OF_LINE as well as with traditional ftrace instruction + * sequence + */ + EMIT(PPC_RAW_LL(_R3, _R1, bpf_frame_size + PPC_LR_STKOFF)); + EMIT(PPC_RAW_MTCTR(_R3)); + + /* Replicate tail_call_cnt before calling the original BPF prog */ + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + bpf_trampoline_setup_tail_call_cnt(image, ctx, func_frame_offset, r4_off); + + /* Restore args */ + bpf_trampoline_restore_args_stack(image, ctx, func_frame_offset, nr_regs, regs_off); + + /* Restore TOC for 64-bit */ + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + EMIT(PPC_RAW_LD(_R2, _R1, 24)); + EMIT(PPC_RAW_BCTRL()); + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + PPC64_LOAD_PACA(); + + /* Store return value for bpf prog to access */ + EMIT(PPC_RAW_STL(_R3, _R1, retval_off)); + + /* Restore updated tail_call_cnt */ + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + bpf_trampoline_restore_tail_call_cnt(image, ctx, func_frame_offset, r4_off); + + /* Reserve space to patch branch instruction to skip fexit progs */ + im->ip_after_call = &((u32 *)ro_image)[ctx->idx]; + EMIT(PPC_RAW_NOP()); + } + + /* Update branches saved in invoke_bpf_mod_ret with address of do_fexit */ + for (i = 0; i < fmod_ret->nr_links && image; i++) { + if (create_cond_branch(&branch_insn, &image[branches[i]], + (unsigned long)&image[ctx->idx], COND_NE << 16)) { + ret = -EINVAL; + goto cleanup; + } + + image[branches[i]] = ppc_inst_val(branch_insn); + } + + for (i = 0; i < fexit->nr_links; i++) + if (invoke_bpf_prog(image, ro_image, ctx, fexit->links[i], regs_off, retval_off, + run_ctx_off, false)) { + ret = -EINVAL; + goto cleanup; + } + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = &((u32 *)ro_image)[ctx->idx]; + PPC_LI_ADDR(_R3, im); + ret = bpf_jit_emit_func_call_rel(image, ro_image, ctx, + (unsigned long)__bpf_tramp_exit); + if (ret) + goto cleanup; + } + + if (flags & 
BPF_TRAMP_F_RESTORE_REGS) + bpf_trampoline_restore_args_regs(image, ctx, nr_regs, regs_off); + + /* Restore return value of func_addr or fentry prog */ + if (save_ret) + EMIT(PPC_RAW_LL(_R3, _R1, retval_off)); + + /* Restore nv regs */ + EMIT(PPC_RAW_LL(_R26, _R1, nvr_off + SZL)); + EMIT(PPC_RAW_LL(_R25, _R1, nvr_off)); + + /* Epilogue */ + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + EMIT(PPC_RAW_LD(_R2, _R1, 24)); + if (flags & BPF_TRAMP_F_SKIP_FRAME) { + /* Skip the traced function and return to parent */ + EMIT(PPC_RAW_ADDI(_R1, _R1, func_frame_offset)); + EMIT(PPC_RAW_LL(_R0, _R1, PPC_LR_STKOFF)); + EMIT(PPC_RAW_MTLR(_R0)); + EMIT(PPC_RAW_BLR()); + } else { + if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) { + EMIT(PPC_RAW_LL(_R0, _R1, alt_lr_off)); + EMIT(PPC_RAW_MTLR(_R0)); + EMIT(PPC_RAW_ADDI(_R1, _R1, func_frame_offset)); + EMIT(PPC_RAW_LL(_R0, _R1, PPC_LR_STKOFF)); + EMIT(PPC_RAW_BLR()); + } else { + EMIT(PPC_RAW_LL(_R0, _R1, bpf_frame_size + PPC_LR_STKOFF)); + EMIT(PPC_RAW_MTCTR(_R0)); + EMIT(PPC_RAW_ADDI(_R1, _R1, func_frame_offset)); + EMIT(PPC_RAW_LL(_R0, _R1, PPC_LR_STKOFF)); + EMIT(PPC_RAW_MTLR(_R0)); + EMIT(PPC_RAW_BCTR()); + } + } + + /* Make sure the trampoline generation logic doesn't overflow */ + if (image && WARN_ON_ONCE(&image[ctx->idx] > (u32 *)rw_image_end - BPF_INSN_SAFETY)) { + ret = -EFAULT; + goto cleanup; + } + ret = ctx->idx * 4 + BPF_INSN_SAFETY * 4; + +cleanup: + kfree(branches); + return ret; +} + +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr) +{ + struct bpf_tramp_image im; + void *image; + int ret; + + /* + * Allocate a temporary buffer for __arch_prepare_bpf_trampoline(). + * This will NOT cause fragmentation in direct map, as we do not + * call set_memory_*() on this buffer. + * + * We cannot use kvmalloc here, because we need image to be in + * module memory range. + */ + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) + return -ENOMEM; + + ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image, + m, flags, tlinks, func_addr); + bpf_jit_free_exec(image); + + return ret; +} + +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, + const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, + void *func_addr) +{ + u32 size = image_end - image; + void *rw_image, *tmp; + int ret; + + /* + * rw_image doesn't need to be in module memory range, so we can + * use kvmalloc. 
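The PLANT_INSTR()/EMIT() macros added in bpf_jit.h only store an instruction when a destination buffer is present, which is what lets the same generator run once to measure code size and again to emit it. Below is a standalone user-space sketch of that two-pass pattern; the opcodes are real ppc64 encodings, but the toy emitter and buffer handling are illustrative, not the kernel's.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	uint32_t *buf;		/* NULL during the sizing pass */
	unsigned int idx;
};

static void emit(struct ctx *c, uint32_t insn)
{
	if (c->buf)
		c->buf[c->idx] = insn;	/* only write when a buffer exists */
	c->idx++;			/* always advance, so idx == code size */
}

static void build(struct ctx *c)
{
	emit(c, 0x60000000);	/* nop */
	emit(c, 0x7c0802a6);	/* mflr r0 */
	emit(c, 0x4e800020);	/* blr */
}

int main(void)
{
	struct ctx c = { 0 };

	build(&c);				/* pass 1: count instructions */
	c.buf = calloc(c.idx, sizeof(uint32_t));
	if (!c.buf)
		return 1;
	unsigned int sized = c.idx;
	c.idx = 0;
	build(&c);				/* pass 2: actually emit */
	printf("emitted %u instructions (%u bytes)\n", sized, sized * 4);
	free(c.buf);
	return 0;
}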
+ */ + rw_image = kvmalloc(size, GFP_KERNEL); + if (!rw_image) + return -ENOMEM; + + ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m, + flags, tlinks, func_addr); + if (ret < 0) + goto out; + + if (bpf_jit_enable > 1) + bpf_jit_dump(1, ret - BPF_INSN_SAFETY * 4, 1, rw_image); + + tmp = bpf_arch_text_copy(image, rw_image, size); + if (IS_ERR(tmp)) + ret = PTR_ERR(tmp); + +out: + kvfree(rw_image); + return ret; +} + +static int bpf_modify_inst(void *ip, ppc_inst_t old_inst, ppc_inst_t new_inst) +{ + ppc_inst_t org_inst; + + if (copy_inst_from_kernel_nofault(&org_inst, ip)) { + pr_err("0x%lx: fetching instruction failed\n", (unsigned long)ip); + return -EFAULT; + } + + if (!ppc_inst_equal(org_inst, old_inst)) { + pr_err("0x%lx: expected (%08lx) != found (%08lx)\n", + (unsigned long)ip, ppc_inst_as_ulong(old_inst), ppc_inst_as_ulong(org_inst)); + return -EINVAL; + } + + if (ppc_inst_equal(old_inst, new_inst)) + return 0; + + return patch_instruction(ip, new_inst); +} + +static void do_isync(void *info __maybe_unused) +{ + isync(); +} + +/* + * A 3-step process for bpf prog entry: + * 1. At bpf prog entry, a single nop/b: + * bpf_func: + * [nop|b] ool_stub + * 2. Out-of-line stub: + * ool_stub: + * mflr r0 + * [b|bl] <bpf_prog>/<long_branch_stub> + * mtlr r0 // CONFIG_PPC_FTRACE_OUT_OF_LINE only + * b bpf_func + 4 + * 3. Long branch stub: + * long_branch_stub: + * .long <branch_addr>/<dummy_tramp> + * mflr r11 + * bcl 20,31,$+4 + * mflr r12 + * ld r12, -16(r12) + * mtctr r12 + * mtlr r11 // needed to retain ftrace ABI + * bctr + * + * dummy_tramp is used to reduce synchronization requirements. + * + * When attaching a bpf trampoline to a bpf prog, we do not need any + * synchronization here since we always have a valid branch target regardless + * of the order in which the above stores are seen. dummy_tramp ensures that + * the long_branch stub goes to a valid destination on other cpus, even when + * the branch to the long_branch stub is seen before the updated trampoline + * address. + * + * However, when detaching a bpf trampoline from a bpf prog, or if changing + * the bpf trampoline address, we need synchronization to ensure that other + * cpus can no longer branch into the older trampoline so that it can be + * safely freed. bpf_tramp_image_put() uses rcu_tasks to ensure all cpus + * make forward progress, but we still need to ensure that other cpus + * execute isync (or some CSI) so that they don't go back into the + * trampoline again. + */ +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, + void *old_addr, void *new_addr) +{ + unsigned long bpf_func, bpf_func_end, size, offset; + ppc_inst_t old_inst, new_inst; + int ret = 0, branch_flags; + char name[KSYM_NAME_LEN]; + + if (IS_ENABLED(CONFIG_PPC32)) + return -EOPNOTSUPP; + + bpf_func = (unsigned long)ip; + branch_flags = poke_type == BPF_MOD_CALL ? 
BRANCH_SET_LINK : 0; + + /* We currently only support poking bpf programs */ + if (!__bpf_address_lookup(bpf_func, &size, &offset, name)) { + pr_err("%s (0x%lx): kernel/modules are not supported\n", __func__, bpf_func); + return -EOPNOTSUPP; + } + + /* + * If we are not poking at bpf prog entry, then we are simply patching in/out + * an unconditional branch instruction at im->ip_after_call + */ + if (offset) { + if (poke_type != BPF_MOD_JUMP) { + pr_err("%s (0x%lx): calls are not supported in bpf prog body\n", __func__, + bpf_func); + return -EOPNOTSUPP; + } + old_inst = ppc_inst(PPC_RAW_NOP()); + if (old_addr) + if (create_branch(&old_inst, ip, (unsigned long)old_addr, 0)) + return -ERANGE; + new_inst = ppc_inst(PPC_RAW_NOP()); + if (new_addr) + if (create_branch(&new_inst, ip, (unsigned long)new_addr, 0)) + return -ERANGE; + mutex_lock(&text_mutex); + ret = bpf_modify_inst(ip, old_inst, new_inst); + mutex_unlock(&text_mutex); + + /* Make sure all cpus see the new instruction */ + smp_call_function(do_isync, NULL, 1); + return ret; + } + + bpf_func_end = bpf_func + size; + + /* Address of the jmp/call instruction in the out-of-line stub */ + ip = (void *)(bpf_func_end - bpf_jit_ool_stub + 4); + + if (!is_offset_in_branch_range((long)ip - 4 - bpf_func)) { + pr_err("%s (0x%lx): bpf prog too large, ool stub out of branch range\n", __func__, + bpf_func); + return -ERANGE; + } + + old_inst = ppc_inst(PPC_RAW_NOP()); + if (old_addr) { + if (is_offset_in_branch_range(ip - old_addr)) + create_branch(&old_inst, ip, (unsigned long)old_addr, branch_flags); + else + create_branch(&old_inst, ip, bpf_func_end - bpf_jit_long_branch_stub, + branch_flags); + } + new_inst = ppc_inst(PPC_RAW_NOP()); + if (new_addr) { + if (is_offset_in_branch_range(ip - new_addr)) + create_branch(&new_inst, ip, (unsigned long)new_addr, branch_flags); + else + create_branch(&new_inst, ip, bpf_func_end - bpf_jit_long_branch_stub, + branch_flags); + } + + mutex_lock(&text_mutex); + + /* + * 1. Update the address in the long branch stub: + * If new_addr is out of range, we will have to use the long branch stub, so patch new_addr + * here. Otherwise, revert to dummy_tramp, but only if we had patched old_addr here. + */ + if ((new_addr && !is_offset_in_branch_range(new_addr - ip)) || + (old_addr && !is_offset_in_branch_range(old_addr - ip))) + ret = patch_ulong((void *)(bpf_func_end - bpf_jit_long_branch_stub - SZL), + (new_addr && !is_offset_in_branch_range(new_addr - ip)) ? + (unsigned long)new_addr : (unsigned long)dummy_tramp); + if (ret) + goto out; + + /* 2. Update the branch/call in the out-of-line stub */ + ret = bpf_modify_inst(ip, old_inst, new_inst); + if (ret) + goto out; + + /* 3. Update instruction at bpf prog entry */ + ip = (void *)bpf_func; + if (!old_addr || !new_addr) { + if (!old_addr) { + old_inst = ppc_inst(PPC_RAW_NOP()); + create_branch(&new_inst, ip, bpf_func_end - bpf_jit_ool_stub, 0); + } else { + new_inst = ppc_inst(PPC_RAW_NOP()); + create_branch(&old_inst, ip, bpf_func_end - bpf_jit_ool_stub, 0); + } + ret = bpf_modify_inst(ip, old_inst, new_inst); + } + +out: + mutex_unlock(&text_mutex); + + /* + * Sync only if we are not attaching a trampoline to a bpf prog so the older + * trampoline can be freed safely. 
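A quick way to see why the long-branch stub and dummy_tramp exist: a direct PowerPC branch only reaches +/- 32 MB, so bpf_arch_text_poke() must fall back to the stub (which carries an absolute target) whenever the trampoline lands outside that window. The user-space check below mirrors is_offset_in_branch_range() for an I-form branch; it is an illustrative copy, not the kernel function itself.

#include <stdbool.h>
#include <stdio.h>

/*
 * PowerPC 'b'/'bl' encode a 24-bit signed word offset (LI << 2), giving a
 * reach of +/- 32 MB from the branch instruction.
 */
static bool fits_direct_branch(long offset)
{
	return offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3);
}

int main(void)
{
	long offsets[] = { 0x100, 0x1fffffc, -0x2000000, 0x2000000, 0x7ffffff0 };

	for (unsigned int i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
		printf("offset %11ld -> %s\n", offsets[i],
		       fits_direct_branch(offsets[i]) ?
		       "direct branch" : "long branch stub (absolute target)");
	return 0;
}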
+ */ + if (old_addr) + smp_call_function(do_isync, NULL, 1); + + return ret; +} diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index a0c4f1bde83e..c4db278dae36 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -127,13 +127,16 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; + /* Instruction for trampoline attach */ + EMIT(PPC_RAW_NOP()); + /* Initialize tail_call_cnt, to be skipped if we do tail calls. */ if (ctx->seen & SEEN_TAILCALL) EMIT(PPC_RAW_LI(_R4, 0)); else EMIT(PPC_RAW_NOP()); -#define BPF_TAILCALL_PROLOGUE_SIZE 4 +#define BPF_TAILCALL_PROLOGUE_SIZE 8 if (bpf_has_stack_frame(ctx)) EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx))); @@ -198,6 +201,8 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) bpf_jit_emit_common_epilogue(image, ctx); EMIT(PPC_RAW_BLR()); + + bpf_jit_build_fentry_stubs(image, ctx); } /* Relative offset needs to be calculated based on final image location */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 2cbcdf93cc19..233703b06d7c 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -84,7 +84,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) } /* - * When not setting up our own stackframe, the redzone usage is: + * When not setting up our own stackframe, the redzone (288 bytes) usage is: * * [ prev sp ] <------------- * [ ... ] | @@ -92,7 +92,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) * [ nv gpr save area ] 5*8 * [ tail_call_cnt ] 8 * [ local_tmp_var ] 16 - * [ unused red zone ] 208 bytes protected + * [ unused red zone ] 224 */ static int bpf_jit_stack_local(struct codegen_context *ctx) { @@ -126,6 +126,9 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) { int i; + /* Instruction for trampoline attach */ + EMIT(PPC_RAW_NOP()); + #ifndef CONFIG_PPC_KERNEL_PCREL if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); @@ -200,16 +203,26 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) EMIT(PPC_RAW_MR(_R3, bpf_to_ppc(BPF_REG_0))); EMIT(PPC_RAW_BLR()); + + bpf_jit_build_fentry_stubs(image, ctx); } -static int -bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) +int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) { unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0; long reladdr; - if (WARN_ON_ONCE(!kernel_text_address(func_addr))) - return -EINVAL; + /* bpf to bpf call, func is not known in the initial pass. Emit 5 nops as a placeholder */ + if (!func) { + for (int i = 0; i < 5; i++) + EMIT(PPC_RAW_NOP()); + /* elfv1 needs an additional instruction to load addr from descriptor */ + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1)) + EMIT(PPC_RAW_NOP()); + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_BCTRL()); + return 0; + } #ifdef CONFIG_PPC_KERNEL_PCREL reladdr = func_addr - local_paca->kernelbase; @@ -266,7 +279,8 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, * We can clobber r2 since we get called through a * function pointer (so caller will save/restore r2). 
*/ - EMIT(PPC_RAW_LD(_R2, bpf_to_ppc(TMP_REG_2), 8)); + if (is_module_text_address(func_addr)) + EMIT(PPC_RAW_LD(_R2, bpf_to_ppc(TMP_REG_2), 8)); } else { PPC_LI64(_R12, func); EMIT(PPC_RAW_MTCTR(_R12)); @@ -276,46 +290,14 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, * Load r2 with kernel TOC as kernel TOC is used if function address falls * within core kernel text. */ - EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); + if (is_module_text_address(func_addr)) + EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); } #endif return 0; } -int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) -{ - unsigned int i, ctx_idx = ctx->idx; - - if (WARN_ON_ONCE(func && is_module_text_address(func))) - return -EINVAL; - - /* skip past descriptor if elf v1 */ - func += FUNCTION_DESCR_SIZE; - - /* Load function address into r12 */ - PPC_LI64(_R12, func); - - /* For bpf-to-bpf function calls, the callee's address is unknown - * until the last extra pass. As seen above, we use PPC_LI64() to - * load the callee's address, but this may optimize the number of - * instructions required based on the nature of the address. - * - * Since we don't want the number of instructions emitted to increase, - * we pad the optimized PPC_LI64() call with NOPs to guarantee that - * we always have a five-instruction sequence, which is the maximum - * that PPC_LI64() can emit. - */ - if (!image) - for (i = ctx->idx - ctx_idx; i < 5; i++) - EMIT(PPC_RAW_NOP()); - - EMIT(PPC_RAW_MTCTR(_R12)); - EMIT(PPC_RAW_BCTRL()); - - return 0; -} - static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) { /* @@ -326,7 +308,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o */ int b2p_bpf_array = bpf_to_ppc(BPF_REG_2); int b2p_index = bpf_to_ppc(BPF_REG_3); - int bpf_tailcall_prologue_size = 8; + int bpf_tailcall_prologue_size = 12; if (!IS_ENABLED(CONFIG_PPC_KERNEL_PCREL) && IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) bpf_tailcall_prologue_size += 4; /* skip past the toc load */ @@ -1102,11 +1084,7 @@ emit_clear: if (ret < 0) return ret; - if (func_addr_fixed) - ret = bpf_jit_emit_func_call_hlp(image, fimage, ctx, func_addr); - else - ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr); - + ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr); if (ret) return ret; diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c index 308a2e40d7be..1d2972229e3a 100644 --- a/arch/powerpc/perf/8xx-pmu.c +++ b/arch/powerpc/perf/8xx-pmu.c @@ -14,7 +14,7 @@ #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/ptrace.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/inst.h> #define PERF_8xx_ID_CPU_CYCLES 1 diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 4f53d0b97539..ac2cf58d62db 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -16,6 +16,8 @@ obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o +obj-$(CONFIG_VPA_PMU) += vpa-pmu.o + obj-$(CONFIG_PPC_8xx) += 8xx-pmu.o obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index dc01aa604cc1..2b79171ee185 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -16,7 +16,7 @@ #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/ptrace.h> 
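The removed bpf_jit_emit_func_call_rel() comment above explains padding an optimized PPC_LI64() sequence with nops so every call site occupies a fixed five-instruction slot; the new helper keeps the same property by emitting the nops up front for unknown callees. A standalone sketch of that fixed-slot idea (toy code; the 0xdeadbeef words merely stand in for the real immediate-load instructions):

#include <stdint.h>
#include <stdio.h>

#define PPC_NOP		0x60000000u	/* ori 0,0,0 */
#define CALL_SLOT_INSNS	5		/* worst-case length of a 64-bit immediate load */

/* Pad the emitted body to a fixed slot so a later pass can patch the real
 * target in place without shifting any code that follows the call site. */
static unsigned int emit_call_slot(uint32_t *buf, unsigned int idx, int body_insns)
{
	unsigned int start = idx;

	for (int i = 0; i < body_insns; i++)
		buf[idx++] = 0xdeadbeef;	/* stand-in for li/lis/ori/... */
	while (idx - start < CALL_SLOT_INSNS)
		buf[idx++] = PPC_NOP;		/* pad to the fixed width */
	return idx;
}

int main(void)
{
	uint32_t buf[CALL_SLOT_INSNS];
	unsigned int end = emit_call_slot(buf, 0, 2);	/* e.g. lis + ori */

	printf("call slot is %u instructions, always %d\n", end, CALL_SLOT_INSNS);
	return 0;
}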
-#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/hw_irq.h> #include <asm/interrupt.h> diff --git a/arch/powerpc/perf/vpa-pmu.c b/arch/powerpc/perf/vpa-pmu.c new file mode 100644 index 000000000000..6a5bfd2a13b5 --- /dev/null +++ b/arch/powerpc/perf/vpa-pmu.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Performance monitoring support for Virtual Processor Area(VPA) based counters + * + * Copyright (C) 2024 IBM Corporation + */ +#define pr_fmt(fmt) "vpa_pmu: " fmt + +#include <linux/module.h> +#include <linux/perf_event.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s_64.h> + +#define MODULE_VERS "1.0" +#define MODULE_NAME "pseries_vpa_pmu" + +#define EVENT(_name, _code) enum{_name = _code} + +#define VPA_PMU_EVENT_VAR(_id) event_attr_##_id +#define VPA_PMU_EVENT_PTR(_id) (&event_attr_##_id.attr.attr) + +static ssize_t vpa_pmu_events_sysfs_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + + return sprintf(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define VPA_PMU_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR(_name, VPA_PMU_EVENT_VAR(_id), _id, \ + vpa_pmu_events_sysfs_show) + +EVENT(L1_TO_L2_CS_LAT, 0x1); +EVENT(L2_TO_L1_CS_LAT, 0x2); +EVENT(L2_RUNTIME_AGG, 0x3); + +VPA_PMU_EVENT_ATTR(l1_to_l2_lat, L1_TO_L2_CS_LAT); +VPA_PMU_EVENT_ATTR(l2_to_l1_lat, L2_TO_L1_CS_LAT); +VPA_PMU_EVENT_ATTR(l2_runtime_agg, L2_RUNTIME_AGG); + +static struct attribute *vpa_pmu_events_attr[] = { + VPA_PMU_EVENT_PTR(L1_TO_L2_CS_LAT), + VPA_PMU_EVENT_PTR(L2_TO_L1_CS_LAT), + VPA_PMU_EVENT_PTR(L2_RUNTIME_AGG), + NULL +}; + +static const struct attribute_group vpa_pmu_events_group = { + .name = "events", + .attrs = vpa_pmu_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-31"); +static struct attribute *vpa_pmu_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group vpa_pmu_format_group = { + .name = "format", + .attrs = vpa_pmu_format_attr, +}; + +static const struct attribute_group *vpa_pmu_attr_groups[] = { + &vpa_pmu_events_group, + &vpa_pmu_format_group, + NULL +}; + +static int vpa_pmu_event_init(struct perf_event *event) +{ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* it does not support event sampling mode */ + if (is_sampling_event(event)) + return -EOPNOTSUPP; + + /* no branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + /* Invalid event code */ + if ((event->attr.config <= 0) || (event->attr.config > 3)) + return -EINVAL; + + return 0; +} + +static unsigned long get_counter_data(struct perf_event *event) +{ + unsigned int config = event->attr.config; + u64 data; + + switch (config) { + case L1_TO_L2_CS_LAT: + if (event->attach_state & PERF_ATTACH_TASK) + data = kvmhv_get_l1_to_l2_cs_time_vcpu(); + else + data = kvmhv_get_l1_to_l2_cs_time(); + break; + case L2_TO_L1_CS_LAT: + if (event->attach_state & PERF_ATTACH_TASK) + data = kvmhv_get_l2_to_l1_cs_time_vcpu(); + else + data = kvmhv_get_l2_to_l1_cs_time(); + break; + case L2_RUNTIME_AGG: + if (event->attach_state & PERF_ATTACH_TASK) + data = kvmhv_get_l2_runtime_agg_vcpu(); + else + data = kvmhv_get_l2_runtime_agg(); + break; + default: + data = 0; + break; + } + + return data; +} + +static int vpa_pmu_add(struct perf_event *event, int flags) +{ + u64 data; + + kvmhv_set_l2_counters_status(smp_processor_id(), true); + + data = get_counter_data(event); + 
local64_set(&event->hw.prev_count, data); + + return 0; +} + +static void vpa_pmu_read(struct perf_event *event) +{ + u64 prev_data, new_data, final_data; + + prev_data = local64_read(&event->hw.prev_count); + new_data = get_counter_data(event); + final_data = new_data - prev_data; + + local64_add(final_data, &event->count); +} + +static void vpa_pmu_del(struct perf_event *event, int flags) +{ + vpa_pmu_read(event); + + /* + * Disable vpa counter accumulation + */ + kvmhv_set_l2_counters_status(smp_processor_id(), false); +} + +static struct pmu vpa_pmu = { + .task_ctx_nr = perf_sw_context, + .name = "vpa_pmu", + .event_init = vpa_pmu_event_init, + .add = vpa_pmu_add, + .del = vpa_pmu_del, + .read = vpa_pmu_read, + .attr_groups = vpa_pmu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init pseries_vpa_pmu_init(void) +{ + /* + * List of current Linux on Power platforms and + * this driver is supported only in PowerVM LPAR + * (L1) platform. + * + * Enabled Linux on Power Platforms + * ---------------------------------------- + * [X] PowerVM LPAR (L1) + * [ ] KVM Guest On PowerVM KoP(L2) + * [ ] Baremetal(PowerNV) + * [ ] KVM Guest On PowerNV + */ + if (!firmware_has_feature(FW_FEATURE_LPAR) || is_kvm_guest()) + return -ENODEV; + + perf_pmu_register(&vpa_pmu, vpa_pmu.name, -1); + pr_info("Virtual Processor Area PMU registered.\n"); + + return 0; +} + +static void __exit pseries_vpa_pmu_cleanup(void) +{ + perf_pmu_unregister(&vpa_pmu); + pr_info("Virtual Processor Area PMU unregistered.\n"); +} + +module_init(pseries_vpa_pmu_init); +module_exit(pseries_vpa_pmu_cleanup); +MODULE_DESCRIPTION("Perf Driver for pSeries VPA pmu counter"); +MODULE_AUTHOR("Kajol Jain <kjain@linux.ibm.com>"); +MODULE_AUTHOR("Madhavan Srinivasan <maddy@linux.ibm.com>"); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/44x/pci.c b/arch/powerpc/platforms/44x/pci.c index db6d33ca753f..364aeb86ab64 100644 --- a/arch/powerpc/platforms/44x/pci.c +++ b/arch/powerpc/platforms/44x/pci.c @@ -94,10 +94,8 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, struct resource *res) { u64 size; - const u32 *ranges; - int rlen; - int pna = of_n_addr_cells(hose->dn); - int np = pna + 5; + struct of_range_parser parser; + struct of_range range; /* Default */ res->start = 0; @@ -105,18 +103,15 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, res->end = size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH; - /* Get dma-ranges property */ - ranges = of_get_property(hose->dn, "dma-ranges", &rlen); - if (ranges == NULL) + if (of_pci_dma_range_parser_init(&parser, hose->dn)) goto out; - /* Walk it */ - while ((rlen -= np * 4) >= 0) { - u32 pci_space = ranges[0]; - u64 pci_addr = of_read_number(ranges + 1, 2); - u64 cpu_addr = of_translate_dma_address(hose->dn, ranges + 3); - size = of_read_number(ranges + pna + 3, 2); - ranges += np; + for_each_of_range(&parser, &range) { + u32 pci_space = range.flags; + u64 pci_addr = range.bus_addr; + u64 cpu_addr = range.cpu_addr; + size = range.size; + if (cpu_addr == OF_BAD_ADDR || size == 0) continue; diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index 37a67120f257..a7172f9ebaad 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -13,6 +13,7 @@ #include <generated/utsrelease.h> #include <linux/pci.h> #include <linux/of.h> +#include <linux/seq_file.h> #include <asm/dma.h> #include <asm/time.h> #include 
<asm/machdep.h> diff --git a/arch/powerpc/platforms/82xx/ep8248e.c b/arch/powerpc/platforms/82xx/ep8248e.c index 3dc65ce1f175..8f918916e631 100644 --- a/arch/powerpc/platforms/82xx/ep8248e.c +++ b/arch/powerpc/platforms/82xx/ep8248e.c @@ -128,7 +128,7 @@ static int ep8248e_mdio_probe(struct platform_device *ofdev) bus->name = "ep8248e-mdio-bitbang"; bus->parent = &ofdev->dev; - snprintf(bus->id, MII_BUS_ID_SIZE, "%x", res.start); + snprintf(bus->id, MII_BUS_ID_SIZE, "%pa", &res.start); ret = of_mdiobus_register(bus, ofdev->dev.of_node); if (ret) diff --git a/arch/powerpc/platforms/82xx/km82xx.c b/arch/powerpc/platforms/82xx/km82xx.c index c86da3f2b74b..99f0f0f41876 100644 --- a/arch/powerpc/platforms/82xx/km82xx.c +++ b/arch/powerpc/platforms/82xx/km82xx.c @@ -27,15 +27,15 @@ static void __init km82xx_pic_init(void) { - struct device_node *np = of_find_compatible_node(NULL, NULL, - "fsl,pq2-pic"); + struct device_node *np __free(device_node); + np = of_find_compatible_node(NULL, NULL, "fsl,pq2-pic"); + if (!np) { pr_err("PIC init: can not find cpm-pic node\n"); return; } cpm2_pic_init(np); - of_node_put(np); } struct cpm_pin { diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index 9315a3b69d6d..604c1b4b6d45 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -40,27 +40,6 @@ config BSC9132_QDS and dual StarCore SC3850 DSP cores. Manufacturer : Freescale Semiconductor, Inc -config MPC8540_ADS - bool "Freescale MPC8540 ADS" - select DEFAULT_UIMAGE - help - This option enables support for the MPC 8540 ADS board - -config MPC8560_ADS - bool "Freescale MPC8560 ADS" - select DEFAULT_UIMAGE - select CPM2 - help - This option enables support for the MPC 8560 ADS board - -config MPC85xx_CDS - bool "Freescale MPC85xx CDS" - select DEFAULT_UIMAGE - select PPC_I8259 - select HAVE_RAPIDIO - help - This option enables support for the MPC85xx CDS board - config MPC85xx_MDS bool "Freescale MPC8568 MDS / MPC8569 MDS / P1021 MDS" select DEFAULT_UIMAGE diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index e52b848b64b7..32fa5fb557c0 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -23,7 +23,7 @@ #include <asm/mpic.h> #include <asm/cacheflush.h> #include <asm/dbell.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/cputhreads.h> #include <asm/fsl_pm.h> diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c index 8a7e55acf090..9be33e41af6d 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c @@ -12,7 +12,7 @@ #include <linux/delay.h> #include <linux/pgtable.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/page.h> #include <asm/pci-bridge.h> #include <asm/mpic.h> diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 1112a5831619..a454149ae02f 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -7,7 +7,6 @@ source "arch/powerpc/platforms/chrp/Kconfig" source "arch/powerpc/platforms/512x/Kconfig" source "arch/powerpc/platforms/52xx/Kconfig" source "arch/powerpc/platforms/powermac/Kconfig" -source "arch/powerpc/platforms/maple/Kconfig" source "arch/powerpc/platforms/pasemi/Kconfig" source "arch/powerpc/platforms/ps3/Kconfig" source "arch/powerpc/platforms/cell/Kconfig" diff --git a/arch/powerpc/platforms/Makefile b/arch/powerpc/platforms/Makefile 
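The km82xx hunk above switches to the scope-based cleanup idiom from <linux/cleanup.h>: declaring the node with __free(device_node) makes of_node_put() run automatically on every exit path. A minimal kernel-context sketch of the same pattern (example_find_pic() is a hypothetical function, not part of this patch):

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/of.h>

static int example_find_pic(void)
{
	struct device_node *np __free(device_node) =
		of_find_compatible_node(NULL, NULL, "fsl,pq2-pic");

	if (!np)
		return -ENODEV;

	/* use np here; of_node_put(np) runs automatically when np goes out of scope */
	return 0;
}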
index 786d374bff31..3cee4a842736 100644 --- a/arch/powerpc/platforms/Makefile +++ b/arch/powerpc/platforms/Makefile @@ -14,7 +14,6 @@ obj-$(CONFIG_FSL_SOC_BOOKE) += 85xx/ obj-$(CONFIG_PPC_86xx) += 86xx/ obj-$(CONFIG_PPC_POWERNV) += powernv/ obj-$(CONFIG_PPC_PSERIES) += pseries/ -obj-$(CONFIG_PPC_MAPLE) += maple/ obj-$(CONFIG_PPC_PASEMI) += pasemi/ obj-$(CONFIG_PPC_CELL) += cell/ obj-$(CONFIG_PPC_PS3) += ps3/ diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 4cd9c0de22c2..62c9679b8ca3 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -779,58 +779,41 @@ static int __init cell_iommu_init_disabled(void) static u64 cell_iommu_get_fixed_address(struct device *dev) { - u64 cpu_addr, size, best_size, dev_addr = OF_BAD_ADDR; + u64 best_size, dev_addr = OF_BAD_ADDR; struct device_node *np; - const u32 *ranges = NULL; - int i, len, best, naddr, nsize, pna, range_size; + struct of_range_parser parser; + struct of_range range; /* We can be called for platform devices that have no of_node */ np = of_node_get(dev->of_node); if (!np) goto out; - while (1) { - naddr = of_n_addr_cells(np); - nsize = of_n_size_cells(np); - np = of_get_next_parent(np); - if (!np) - break; - - ranges = of_get_property(np, "dma-ranges", &len); + while ((np = of_get_next_parent(np))) { + if (of_pci_dma_range_parser_init(&parser, np)) + continue; - /* Ignore empty ranges, they imply no translation required */ - if (ranges && len > 0) + if (of_range_count(&parser)) break; } - if (!ranges) { + if (!np) { dev_dbg(dev, "iommu: no dma-ranges found\n"); goto out; } - len /= sizeof(u32); - - pna = of_n_addr_cells(np); - range_size = naddr + nsize + pna; - - /* dma-ranges format: - * child addr : naddr cells - * parent addr : pna cells - * size : nsize cells - */ - for (i = 0, best = -1, best_size = 0; i < len; i += range_size) { - cpu_addr = of_translate_dma_address(np, ranges + i + naddr); - size = of_read_number(ranges + i + naddr + pna, nsize); + best_size = 0; + for_each_of_range(&parser, &range) { + if (!range.cpu_addr) + continue; - if (cpu_addr == 0 && size > best_size) { - best = i; - best_size = size; + if (range.size > best_size) { + best_size = range.size; + dev_addr = range.bus_addr; } } - if (best >= 0) { - dev_addr = of_read_number(ranges + best, naddr); - } else + if (!best_size) dev_dbg(dev, "iommu: no suitable range found!\n"); out: diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index fee638fd8970..0e8f20ecca08 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -35,7 +35,7 @@ #include <asm/firmware.h> #include <asm/rtas.h> #include <asm/cputhreads.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include "interrupt.h" #include <asm/udbg.h> diff --git a/arch/powerpc/platforms/embedded6xx/linkstation.c b/arch/powerpc/platforms/embedded6xx/linkstation.c index e265f026eee2..4012f206ec63 100644 --- a/arch/powerpc/platforms/embedded6xx/linkstation.c +++ b/arch/powerpc/platforms/embedded6xx/linkstation.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/initrd.h> #include <linux/of_platform.h> +#include <linux/seq_file.h> #include <asm/time.h> #include <asm/mpic.h> diff --git a/arch/powerpc/platforms/embedded6xx/mvme5100.c b/arch/powerpc/platforms/embedded6xx/mvme5100.c index 00bec0f051be..5ca41972ef22 100644 --- a/arch/powerpc/platforms/embedded6xx/mvme5100.c +++ b/arch/powerpc/platforms/embedded6xx/mvme5100.c @@ -14,6 +14,7 @@ #include 
<linux/of_irq.h> #include <linux/of_platform.h> +#include <linux/seq_file.h> #include <asm/i8259.h> #include <asm/pci-bridge.h> diff --git a/arch/powerpc/platforms/maple/Kconfig b/arch/powerpc/platforms/maple/Kconfig deleted file mode 100644 index 4c058cc57c90..000000000000 --- a/arch/powerpc/platforms/maple/Kconfig +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config PPC_MAPLE - depends on PPC64 && PPC_BOOK3S && CPU_BIG_ENDIAN - bool "Maple 970FX Evaluation Board" - select FORCE_PCI - select MPIC - select U3_DART - select MPIC_U3_HT_IRQS - select GENERIC_TBSYNC - select PPC_UDBG_16550 - select PPC_970_NAP - select PPC_64S_HASH_MMU - select PPC_HASH_MMU_NATIVE - select PPC_RTAS - select MMIO_NVRAM - select ATA_NONSTANDARD if ATA - help - This option enables support for the Maple 970FX Evaluation Board. - For more information, refer to <http://www.970eval.com> diff --git a/arch/powerpc/platforms/maple/maple.h b/arch/powerpc/platforms/maple/maple.h deleted file mode 100644 index 8ddbaa4ebd0b..000000000000 --- a/arch/powerpc/platforms/maple/maple.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Declarations for maple-specific code. - * - * Maple is the name of a PPC970 evaluation board. - */ -extern int maple_set_rtc_time(struct rtc_time *tm); -extern void maple_get_rtc_time(struct rtc_time *tm); -extern time64_t maple_get_boot_time(void); -extern void maple_pci_init(void); -extern void maple_pci_irq_fixup(struct pci_dev *dev); -extern int maple_pci_get_legacy_ide_irq(struct pci_dev *dev, int channel); - -extern struct pci_controller_ops maple_pci_controller_ops; diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c deleted file mode 100644 index b9ff37c7f6f0..000000000000 --- a/arch/powerpc/platforms/maple/pci.c +++ /dev/null @@ -1,672 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), - * IBM Corp. - */ - -#undef DEBUG - -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/delay.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/irq.h> -#include <linux/of_irq.h> - -#include <asm/sections.h> -#include <asm/io.h> -#include <asm/pci-bridge.h> -#include <asm/machdep.h> -#include <asm/iommu.h> -#include <asm/ppc-pci.h> -#include <asm/isa-bridge.h> - -#include "maple.h" - -#ifdef DEBUG -#define DBG(x...) printk(x) -#else -#define DBG(x...) -#endif - -static struct pci_controller *u3_agp, *u3_ht, *u4_pcie; - -static int __init fixup_one_level_bus_range(struct device_node *node, int higher) -{ - for (; node; node = node->sibling) { - const int *bus_range; - const unsigned int *class_code; - int len; - - /* For PCI<->PCI bridges or CardBus bridges, we go down */ - class_code = of_get_property(node, "class-code", NULL); - if (!class_code || ((*class_code >> 8) != PCI_CLASS_BRIDGE_PCI && - (*class_code >> 8) != PCI_CLASS_BRIDGE_CARDBUS)) - continue; - bus_range = of_get_property(node, "bus-range", &len); - if (bus_range != NULL && len > 2 * sizeof(int)) { - if (bus_range[1] > higher) - higher = bus_range[1]; - } - higher = fixup_one_level_bus_range(node->child, higher); - } - return higher; -} - -/* This routine fixes the "bus-range" property of all bridges in the - * system since they tend to have their "last" member wrong on macs - * - * Note that the bus numbers manipulated here are OF bus numbers, they - * are not Linux bus numbers. 
- */ -static void __init fixup_bus_range(struct device_node *bridge) -{ - int *bus_range; - struct property *prop; - int len; - - /* Lookup the "bus-range" property for the hose */ - prop = of_find_property(bridge, "bus-range", &len); - if (prop == NULL || prop->value == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %pOF\n", - bridge); - return; - } - bus_range = prop->value; - bus_range[1] = fixup_one_level_bus_range(bridge->child, bus_range[1]); -} - - -static unsigned long u3_agp_cfa0(u8 devfn, u8 off) -{ - return (1 << (unsigned long)PCI_SLOT(devfn)) | - ((unsigned long)PCI_FUNC(devfn) << 8) | - ((unsigned long)off & 0xFCUL); -} - -static unsigned long u3_agp_cfa1(u8 bus, u8 devfn, u8 off) -{ - return ((unsigned long)bus << 16) | - ((unsigned long)devfn << 8) | - ((unsigned long)off & 0xFCUL) | - 1UL; -} - -static volatile void __iomem *u3_agp_cfg_access(struct pci_controller* hose, - u8 bus, u8 dev_fn, u8 offset) -{ - unsigned int caddr; - - if (bus == hose->first_busno) { - if (dev_fn < (11 << 3)) - return NULL; - caddr = u3_agp_cfa0(dev_fn, offset); - } else - caddr = u3_agp_cfa1(bus, dev_fn, offset); - - /* Uninorth will return garbage if we don't read back the value ! */ - do { - out_le32(hose->cfg_addr, caddr); - } while (in_le32(hose->cfg_addr) != caddr); - - offset &= 0x07; - return hose->cfg_data + offset; -} - -static int u3_agp_read_config(struct pci_bus *bus, unsigned int devfn, - int offset, int len, u32 *val) -{ - struct pci_controller *hose; - volatile void __iomem *addr; - - hose = pci_bus_to_host(bus); - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - - addr = u3_agp_cfg_access(hose, bus->number, devfn, offset); - if (!addr) - return PCIBIOS_DEVICE_NOT_FOUND; - /* - * Note: the caller has already checked that offset is - * suitably aligned and that len is 1, 2 or 4. - */ - switch (len) { - case 1: - *val = in_8(addr); - break; - case 2: - *val = in_le16(addr); - break; - default: - *val = in_le32(addr); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -static int u3_agp_write_config(struct pci_bus *bus, unsigned int devfn, - int offset, int len, u32 val) -{ - struct pci_controller *hose; - volatile void __iomem *addr; - - hose = pci_bus_to_host(bus); - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - - addr = u3_agp_cfg_access(hose, bus->number, devfn, offset); - if (!addr) - return PCIBIOS_DEVICE_NOT_FOUND; - /* - * Note: the caller has already checked that offset is - * suitably aligned and that len is 1, 2 or 4. 
- */ - switch (len) { - case 1: - out_8(addr, val); - break; - case 2: - out_le16(addr, val); - break; - default: - out_le32(addr, val); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -static struct pci_ops u3_agp_pci_ops = -{ - .read = u3_agp_read_config, - .write = u3_agp_write_config, -}; - -static unsigned long u3_ht_cfa0(u8 devfn, u8 off) -{ - return (devfn << 8) | off; -} - -static unsigned long u3_ht_cfa1(u8 bus, u8 devfn, u8 off) -{ - return u3_ht_cfa0(devfn, off) + (bus << 16) + 0x01000000UL; -} - -static volatile void __iomem *u3_ht_cfg_access(struct pci_controller* hose, - u8 bus, u8 devfn, u8 offset) -{ - if (bus == hose->first_busno) { - if (PCI_SLOT(devfn) == 0) - return NULL; - return hose->cfg_data + u3_ht_cfa0(devfn, offset); - } else - return hose->cfg_data + u3_ht_cfa1(bus, devfn, offset); -} - -static int u3_ht_root_read_config(struct pci_controller *hose, u8 offset, - int len, u32 *val) -{ - volatile void __iomem *addr; - - addr = hose->cfg_addr; - addr += ((offset & ~3) << 2) + (4 - len - (offset & 3)); - - switch (len) { - case 1: - *val = in_8(addr); - break; - case 2: - *val = in_be16(addr); - break; - default: - *val = in_be32(addr); - break; - } - - return PCIBIOS_SUCCESSFUL; -} - -static int u3_ht_root_write_config(struct pci_controller *hose, u8 offset, - int len, u32 val) -{ - volatile void __iomem *addr; - - addr = hose->cfg_addr + ((offset & ~3) << 2) + (4 - len - (offset & 3)); - - if (offset >= PCI_BASE_ADDRESS_0 && offset < PCI_CAPABILITY_LIST) - return PCIBIOS_SUCCESSFUL; - - switch (len) { - case 1: - out_8(addr, val); - break; - case 2: - out_be16(addr, val); - break; - default: - out_be32(addr, val); - break; - } - - return PCIBIOS_SUCCESSFUL; -} - -static int u3_ht_read_config(struct pci_bus *bus, unsigned int devfn, - int offset, int len, u32 *val) -{ - struct pci_controller *hose; - volatile void __iomem *addr; - - hose = pci_bus_to_host(bus); - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - - if (bus->number == hose->first_busno && devfn == PCI_DEVFN(0, 0)) - return u3_ht_root_read_config(hose, offset, len, val); - - if (offset > 0xff) - return PCIBIOS_BAD_REGISTER_NUMBER; - - addr = u3_ht_cfg_access(hose, bus->number, devfn, offset); - if (!addr) - return PCIBIOS_DEVICE_NOT_FOUND; - - /* - * Note: the caller has already checked that offset is - * suitably aligned and that len is 1, 2 or 4. - */ - switch (len) { - case 1: - *val = in_8(addr); - break; - case 2: - *val = in_le16(addr); - break; - default: - *val = in_le32(addr); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -static int u3_ht_write_config(struct pci_bus *bus, unsigned int devfn, - int offset, int len, u32 val) -{ - struct pci_controller *hose; - volatile void __iomem *addr; - - hose = pci_bus_to_host(bus); - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - - if (bus->number == hose->first_busno && devfn == PCI_DEVFN(0, 0)) - return u3_ht_root_write_config(hose, offset, len, val); - - if (offset > 0xff) - return PCIBIOS_BAD_REGISTER_NUMBER; - - addr = u3_ht_cfg_access(hose, bus->number, devfn, offset); - if (!addr) - return PCIBIOS_DEVICE_NOT_FOUND; - /* - * Note: the caller has already checked that offset is - * suitably aligned and that len is 1, 2 or 4. 
- */ - switch (len) { - case 1: - out_8(addr, val); - break; - case 2: - out_le16(addr, val); - break; - default: - out_le32(addr, val); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -static struct pci_ops u3_ht_pci_ops = -{ - .read = u3_ht_read_config, - .write = u3_ht_write_config, -}; - -static unsigned int u4_pcie_cfa0(unsigned int devfn, unsigned int off) -{ - return (1 << PCI_SLOT(devfn)) | - (PCI_FUNC(devfn) << 8) | - ((off >> 8) << 28) | - (off & 0xfcu); -} - -static unsigned int u4_pcie_cfa1(unsigned int bus, unsigned int devfn, - unsigned int off) -{ - return (bus << 16) | - (devfn << 8) | - ((off >> 8) << 28) | - (off & 0xfcu) | 1u; -} - -static volatile void __iomem *u4_pcie_cfg_access(struct pci_controller* hose, - u8 bus, u8 dev_fn, int offset) -{ - unsigned int caddr; - - if (bus == hose->first_busno) - caddr = u4_pcie_cfa0(dev_fn, offset); - else - caddr = u4_pcie_cfa1(bus, dev_fn, offset); - - /* Uninorth will return garbage if we don't read back the value ! */ - do { - out_le32(hose->cfg_addr, caddr); - } while (in_le32(hose->cfg_addr) != caddr); - - offset &= 0x03; - return hose->cfg_data + offset; -} - -static int u4_pcie_read_config(struct pci_bus *bus, unsigned int devfn, - int offset, int len, u32 *val) -{ - struct pci_controller *hose; - volatile void __iomem *addr; - - hose = pci_bus_to_host(bus); - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - if (offset >= 0x1000) - return PCIBIOS_BAD_REGISTER_NUMBER; - addr = u4_pcie_cfg_access(hose, bus->number, devfn, offset); - if (!addr) - return PCIBIOS_DEVICE_NOT_FOUND; - /* - * Note: the caller has already checked that offset is - * suitably aligned and that len is 1, 2 or 4. - */ - switch (len) { - case 1: - *val = in_8(addr); - break; - case 2: - *val = in_le16(addr); - break; - default: - *val = in_le32(addr); - break; - } - return PCIBIOS_SUCCESSFUL; -} -static int u4_pcie_write_config(struct pci_bus *bus, unsigned int devfn, - int offset, int len, u32 val) -{ - struct pci_controller *hose; - volatile void __iomem *addr; - - hose = pci_bus_to_host(bus); - if (hose == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; - if (offset >= 0x1000) - return PCIBIOS_BAD_REGISTER_NUMBER; - addr = u4_pcie_cfg_access(hose, bus->number, devfn, offset); - if (!addr) - return PCIBIOS_DEVICE_NOT_FOUND; - /* - * Note: the caller has already checked that offset is - * suitably aligned and that len is 1, 2 or 4. - */ - switch (len) { - case 1: - out_8(addr, val); - break; - case 2: - out_le16(addr, val); - break; - default: - out_le32(addr, val); - break; - } - return PCIBIOS_SUCCESSFUL; -} - -static struct pci_ops u4_pcie_pci_ops = -{ - .read = u4_pcie_read_config, - .write = u4_pcie_write_config, -}; - -static void __init setup_u3_agp(struct pci_controller* hose) -{ - /* On G5, we move AGP up to high bus number so we don't need - * to reassign bus numbers for HT. If we ever have P2P bridges - * on AGP, we'll have to move pci_assign_all_buses to the - * pci_controller structure so we enable it for AGP and not for - * HT childs. 
- * We hard code the address because of the different size of - * the reg address cell, we shall fix that by killing struct - * reg_property and using some accessor functions instead - */ - hose->first_busno = 0xf0; - hose->last_busno = 0xff; - hose->ops = &u3_agp_pci_ops; - hose->cfg_addr = ioremap(0xf0000000 + 0x800000, 0x1000); - hose->cfg_data = ioremap(0xf0000000 + 0xc00000, 0x1000); - - u3_agp = hose; -} - -static void __init setup_u4_pcie(struct pci_controller* hose) -{ - /* We currently only implement the "non-atomic" config space, to - * be optimised later. - */ - hose->ops = &u4_pcie_pci_ops; - hose->cfg_addr = ioremap(0xf0000000 + 0x800000, 0x1000); - hose->cfg_data = ioremap(0xf0000000 + 0xc00000, 0x1000); - - u4_pcie = hose; -} - -static void __init setup_u3_ht(struct pci_controller* hose) -{ - hose->ops = &u3_ht_pci_ops; - - /* We hard code the address because of the different size of - * the reg address cell, we shall fix that by killing struct - * reg_property and using some accessor functions instead - */ - hose->cfg_data = ioremap(0xf2000000, 0x02000000); - hose->cfg_addr = ioremap(0xf8070000, 0x1000); - - hose->first_busno = 0; - hose->last_busno = 0xef; - - u3_ht = hose; -} - -static int __init maple_add_bridge(struct device_node *dev) -{ - int len; - struct pci_controller *hose; - char* disp_name; - const int *bus_range; - int primary = 1; - - DBG("Adding PCI host bridge %pOF\n", dev); - - bus_range = of_get_property(dev, "bus-range", &len); - if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %pOF, assume bus 0\n", - dev); - } - - hose = pcibios_alloc_controller(dev); - if (hose == NULL) - return -ENOMEM; - hose->first_busno = bus_range ? bus_range[0] : 0; - hose->last_busno = bus_range ? bus_range[1] : 0xff; - hose->controller_ops = maple_pci_controller_ops; - - disp_name = NULL; - if (of_device_is_compatible(dev, "u3-agp")) { - setup_u3_agp(hose); - disp_name = "U3-AGP"; - primary = 0; - } else if (of_device_is_compatible(dev, "u3-ht")) { - setup_u3_ht(hose); - disp_name = "U3-HT"; - primary = 1; - } else if (of_device_is_compatible(dev, "u4-pcie")) { - setup_u4_pcie(hose); - disp_name = "U4-PCIE"; - primary = 0; - } - printk(KERN_INFO "Found %s PCI host bridge. 
Firmware bus number: %d->%d\n", - disp_name, hose->first_busno, hose->last_busno); - - /* Interpret the "ranges" property */ - /* This also maps the I/O region and sets isa_io/mem_base */ - pci_process_bridge_OF_ranges(hose, dev, primary); - - /* Fixup "bus-range" OF property */ - fixup_bus_range(dev); - - /* Check for legacy IOs */ - isa_bridge_find_early(hose); - - /* create pci_dn's for DT nodes under this PHB */ - pci_devs_phb_init_dynamic(hose); - - return 0; -} - - -void maple_pci_irq_fixup(struct pci_dev *dev) -{ - DBG(" -> maple_pci_irq_fixup\n"); - - /* Fixup IRQ for PCIe host */ - if (u4_pcie != NULL && dev->bus->number == 0 && - pci_bus_to_host(dev->bus) == u4_pcie) { - printk(KERN_DEBUG "Fixup U4 PCIe IRQ\n"); - dev->irq = irq_create_mapping(NULL, 1); - if (dev->irq) - irq_set_irq_type(dev->irq, IRQ_TYPE_LEVEL_LOW); - } - - /* Hide AMD8111 IDE interrupt when in legacy mode so - * the driver calls pci_get_legacy_ide_irq() - */ - if (dev->vendor == PCI_VENDOR_ID_AMD && - dev->device == PCI_DEVICE_ID_AMD_8111_IDE && - (dev->class & 5) != 5) { - dev->irq = 0; - } - - DBG(" <- maple_pci_irq_fixup\n"); -} - -static int maple_pci_root_bridge_prepare(struct pci_host_bridge *bridge) -{ - struct pci_controller *hose = pci_bus_to_host(bridge->bus); - struct device_node *np, *child; - - if (hose != u3_agp) - return 0; - - /* Fixup the PCI<->OF mapping for U3 AGP due to bus renumbering. We - * assume there is no P2P bridge on the AGP bus, which should be a - * safe assumptions hopefully. - */ - np = hose->dn; - PCI_DN(np)->busno = 0xf0; - for_each_child_of_node(np, child) - PCI_DN(child)->busno = 0xf0; - - return 0; -} - -void __init maple_pci_init(void) -{ - struct device_node *np, *root; - struct device_node *ht = NULL; - - /* Probe root PCI hosts, that is on U3 the AGP host and the - * HyperTransport host. That one is actually "kept" around - * and actually added last as its resource management relies - * on the AGP resources to have been setup first - */ - root = of_find_node_by_path("/"); - if (root == NULL) { - printk(KERN_CRIT "maple_find_bridges: can't find root of device tree\n"); - return; - } - for_each_child_of_node(root, np) { - if (!of_node_is_type(np, "pci") && !of_node_is_type(np, "ht")) - continue; - if ((of_device_is_compatible(np, "u4-pcie") || - of_device_is_compatible(np, "u3-agp")) && - maple_add_bridge(np) == 0) - of_node_get(np); - - if (of_device_is_compatible(np, "u3-ht")) { - of_node_get(np); - ht = np; - } - } - of_node_put(root); - - /* Now setup the HyperTransport host if we found any - */ - if (ht && maple_add_bridge(ht) != 0) - of_node_put(ht); - - ppc_md.pcibios_root_bridge_prepare = maple_pci_root_bridge_prepare; - - /* Tell pci.c to not change any resource allocations. */ - pci_add_flags(PCI_PROBE_ONLY); -} - -int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel) -{ - struct device_node *np; - unsigned int defirq = channel ? 15 : 14; - unsigned int irq; - - if (pdev->vendor != PCI_VENDOR_ID_AMD || - pdev->device != PCI_DEVICE_ID_AMD_8111_IDE) - return defirq; - - np = pci_device_to_OF_node(pdev); - if (np == NULL) { - printk("Failed to locate OF node for IDE %s\n", - pci_name(pdev)); - return defirq; - } - irq = irq_of_parse_and_map(np, channel & 0x1); - if (!irq) { - printk("Failed to map onboard IDE interrupt for channel %d\n", - channel); - return defirq; - } - return irq; -} - -static void quirk_ipr_msi(struct pci_dev *dev) -{ - /* Something prevents MSIs from the IPR from working on Bimini, - * and the driver has no smarts to recover. 
So disable MSI - * on it for now. */ - - if (machine_is(maple)) { - dev->no_msi = 1; - dev_info(&dev->dev, "Quirk disabled MSI\n"); - } -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_OBSIDIAN, - quirk_ipr_msi); - -struct pci_controller_ops maple_pci_controller_ops = { -}; diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c deleted file mode 100644 index f329a03edf4a..000000000000 --- a/arch/powerpc/platforms/maple/setup.c +++ /dev/null @@ -1,363 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Maple (970 eval board) setup code - * - * (c) Copyright 2004 Benjamin Herrenschmidt (benh@kernel.crashing.org), - * IBM Corp. - */ - -#undef DEBUG - -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/tty.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/ioport.h> -#include <linux/major.h> -#include <linux/initrd.h> -#include <linux/vt_kern.h> -#include <linux/console.h> -#include <linux/pci.h> -#include <linux/adb.h> -#include <linux/cuda.h> -#include <linux/pmu.h> -#include <linux/irq.h> -#include <linux/seq_file.h> -#include <linux/root_dev.h> -#include <linux/serial.h> -#include <linux/smp.h> -#include <linux/bitops.h> -#include <linux/of.h> -#include <linux/of_address.h> -#include <linux/platform_device.h> -#include <linux/memblock.h> - -#include <asm/processor.h> -#include <asm/sections.h> -#include <asm/io.h> -#include <asm/pci-bridge.h> -#include <asm/iommu.h> -#include <asm/machdep.h> -#include <asm/dma.h> -#include <asm/cputable.h> -#include <asm/time.h> -#include <asm/mpic.h> -#include <asm/rtas.h> -#include <asm/udbg.h> -#include <asm/nvram.h> - -#include "maple.h" - -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) 
-#endif - -static unsigned long maple_find_nvram_base(void) -{ - struct device_node *rtcs; - unsigned long result = 0; - - /* find NVRAM device */ - rtcs = of_find_compatible_node(NULL, "nvram", "AMD8111"); - if (rtcs) { - struct resource r; - if (of_address_to_resource(rtcs, 0, &r)) { - printk(KERN_EMERG "Maple: Unable to translate NVRAM" - " address\n"); - goto bail; - } - if (!(r.flags & IORESOURCE_IO)) { - printk(KERN_EMERG "Maple: NVRAM address isn't PIO!\n"); - goto bail; - } - result = r.start; - } else - printk(KERN_EMERG "Maple: Unable to find NVRAM\n"); - bail: - of_node_put(rtcs); - return result; -} - -static void __noreturn maple_restart(char *cmd) -{ - unsigned int maple_nvram_base; - const unsigned int *maple_nvram_offset, *maple_nvram_command; - struct device_node *sp; - - maple_nvram_base = maple_find_nvram_base(); - if (maple_nvram_base == 0) - goto fail; - - /* find service processor device */ - sp = of_find_node_by_name(NULL, "service-processor"); - if (!sp) { - printk(KERN_EMERG "Maple: Unable to find Service Processor\n"); - goto fail; - } - maple_nvram_offset = of_get_property(sp, "restart-addr", NULL); - maple_nvram_command = of_get_property(sp, "restart-value", NULL); - of_node_put(sp); - - /* send command */ - outb_p(*maple_nvram_command, maple_nvram_base + *maple_nvram_offset); - for (;;) ; - fail: - printk(KERN_EMERG "Maple: Manual Restart Required\n"); - for (;;) ; -} - -static void __noreturn maple_power_off(void) -{ - unsigned int maple_nvram_base; - const unsigned int *maple_nvram_offset, *maple_nvram_command; - struct device_node *sp; - - maple_nvram_base = maple_find_nvram_base(); - if (maple_nvram_base == 0) - goto fail; - - /* find service processor device */ - sp = of_find_node_by_name(NULL, "service-processor"); - if (!sp) { - printk(KERN_EMERG "Maple: Unable to find Service Processor\n"); - goto fail; - } - maple_nvram_offset = of_get_property(sp, "power-off-addr", NULL); - maple_nvram_command = of_get_property(sp, "power-off-value", NULL); - of_node_put(sp); - - /* send command */ - outb_p(*maple_nvram_command, maple_nvram_base + *maple_nvram_offset); - for (;;) ; - fail: - printk(KERN_EMERG "Maple: Manual Power-Down Required\n"); - for (;;) ; -} - -static void __noreturn maple_halt(void) -{ - maple_power_off(); -} - -#ifdef CONFIG_SMP -static struct smp_ops_t maple_smp_ops = { - .probe = smp_mpic_probe, - .message_pass = smp_mpic_message_pass, - .kick_cpu = smp_generic_kick_cpu, - .setup_cpu = smp_mpic_setup_cpu, - .give_timebase = smp_generic_give_timebase, - .take_timebase = smp_generic_take_timebase, -}; -#endif /* CONFIG_SMP */ - -static void __init maple_use_rtas_reboot_and_halt_if_present(void) -{ - if (rtas_function_implemented(RTAS_FN_SYSTEM_REBOOT) && - rtas_function_implemented(RTAS_FN_POWER_OFF)) { - ppc_md.restart = rtas_restart; - pm_power_off = rtas_power_off; - ppc_md.halt = rtas_halt; - } -} - -static void __init maple_setup_arch(void) -{ - /* init to some ~sane value until calibrate_delay() runs */ - loops_per_jiffy = 50000000; - - /* Setup SMP callback */ -#ifdef CONFIG_SMP - smp_ops = &maple_smp_ops; -#endif - maple_use_rtas_reboot_and_halt_if_present(); - - printk(KERN_DEBUG "Using native/NAP idle loop\n"); - - mmio_nvram_init(); -} - -/* - * This is almost identical to pSeries and CHRP. 
We need to make that - * code generic at one point, with appropriate bits in the device-tree to - * identify the presence of an HT APIC - */ -static void __init maple_init_IRQ(void) -{ - struct device_node *root, *np, *mpic_node = NULL; - const unsigned int *opprop; - unsigned long openpic_addr = 0; - int naddr, n, i, opplen, has_isus = 0; - struct mpic *mpic; - unsigned int flags = 0; - - /* Locate MPIC in the device-tree. Note that there is a bug - * in Maple device-tree where the type of the controller is - * open-pic and not interrupt-controller - */ - - for_each_node_by_type(np, "interrupt-controller") - if (of_device_is_compatible(np, "open-pic")) { - mpic_node = np; - break; - } - if (mpic_node == NULL) - for_each_node_by_type(np, "open-pic") { - mpic_node = np; - break; - } - if (mpic_node == NULL) { - printk(KERN_ERR - "Failed to locate the MPIC interrupt controller\n"); - return; - } - - /* Find address list in /platform-open-pic */ - root = of_find_node_by_path("/"); - naddr = of_n_addr_cells(root); - opprop = of_get_property(root, "platform-open-pic", &opplen); - if (opprop) { - openpic_addr = of_read_number(opprop, naddr); - has_isus = (opplen > naddr); - printk(KERN_DEBUG "OpenPIC addr: %lx, has ISUs: %d\n", - openpic_addr, has_isus); - } - - BUG_ON(openpic_addr == 0); - - /* Check for a big endian MPIC */ - if (of_property_read_bool(np, "big-endian")) - flags |= MPIC_BIG_ENDIAN; - - /* XXX Maple specific bits */ - flags |= MPIC_U3_HT_IRQS; - /* All U3/U4 are big-endian, older SLOF firmware doesn't encode this */ - flags |= MPIC_BIG_ENDIAN; - - /* Setup the openpic driver. More device-tree junks, we hard code no - * ISUs for now. I'll have to revisit some stuffs with the folks doing - * the firmware for those - */ - mpic = mpic_alloc(mpic_node, openpic_addr, flags, - /*has_isus ? 16 :*/ 0, 0, " MPIC "); - BUG_ON(mpic == NULL); - - /* Add ISUs */ - opplen /= sizeof(u32); - for (n = 0, i = naddr; i < opplen; i += naddr, n++) { - unsigned long isuaddr = of_read_number(opprop + i, naddr); - mpic_assign_isu(mpic, n, isuaddr); - } - - /* All ISUs are setup, complete initialization */ - mpic_init(mpic); - ppc_md.get_irq = mpic_get_irq; - of_node_put(mpic_node); - of_node_put(root); -} - -static void __init maple_progress(char *s, unsigned short hex) -{ - printk("*** %04x : %s\n", hex, s ? s : ""); -} - - -/* - * Called very early, MMU is off, device-tree isn't unflattened - */ -static int __init maple_probe(void) -{ - if (!of_machine_is_compatible("Momentum,Maple") && - !of_machine_is_compatible("Momentum,Apache")) - return 0; - - pm_power_off = maple_power_off; - - iommu_init_early_dart(&maple_pci_controller_ops); - - return 1; -} - -#ifdef CONFIG_EDAC -/* - * Register a platform device for CPC925 memory controller on - * all boards with U3H (CPC925) bridge. 
- */ -static int __init maple_cpc925_edac_setup(void) -{ - struct platform_device *pdev; - struct device_node *np = NULL; - struct resource r; - int ret; - volatile void __iomem *mem; - u32 rev; - - np = of_find_node_by_type(NULL, "memory-controller"); - if (!np) { - printk(KERN_ERR "%s: Unable to find memory-controller node\n", - __func__); - return -ENODEV; - } - - ret = of_address_to_resource(np, 0, &r); - of_node_put(np); - - if (ret < 0) { - printk(KERN_ERR "%s: Unable to get memory-controller reg\n", - __func__); - return -ENODEV; - } - - mem = ioremap(r.start, resource_size(&r)); - if (!mem) { - printk(KERN_ERR "%s: Unable to map memory-controller memory\n", - __func__); - return -ENOMEM; - } - - rev = __raw_readl(mem); - iounmap(mem); - - if (rev < 0x34 || rev > 0x3f) { /* U3H */ - printk(KERN_ERR "%s: Non-CPC925(U3H) bridge revision: %02x\n", - __func__, rev); - return 0; - } - - pdev = platform_device_register_simple("cpc925_edac", 0, &r, 1); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - printk(KERN_INFO "%s: CPC925 platform device created\n", __func__); - - return 0; -} -machine_device_initcall(maple, maple_cpc925_edac_setup); -#endif - -define_machine(maple) { - .name = "Maple", - .probe = maple_probe, - .setup_arch = maple_setup_arch, - .discover_phbs = maple_pci_init, - .init_IRQ = maple_init_IRQ, - .pci_irq_fixup = maple_pci_irq_fixup, - .pci_get_legacy_ide_irq = maple_pci_get_legacy_ide_irq, - .restart = maple_restart, - .halt = maple_halt, - .get_boot_time = maple_get_boot_time, - .set_rtc_time = maple_set_rtc_time, - .get_rtc_time = maple_get_rtc_time, - .progress = maple_progress, - .power_save = power4_idle, -}; diff --git a/arch/powerpc/platforms/maple/time.c b/arch/powerpc/platforms/maple/time.c deleted file mode 100644 index 91606411d2e0..000000000000 --- a/arch/powerpc/platforms/maple/time.c +++ /dev/null @@ -1,170 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * (c) Copyright 2004 Benjamin Herrenschmidt (benh@kernel.crashing.org), - * IBM Corp. - */ - -#undef DEBUG - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/time.h> -#include <linux/adb.h> -#include <linux/pmu.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/bcd.h> -#include <linux/of_address.h> - -#include <asm/sections.h> -#include <asm/io.h> -#include <asm/machdep.h> -#include <asm/time.h> - -#include "maple.h" - -#ifdef DEBUG -#define DBG(x...) printk(x) -#else -#define DBG(x...) 
-#endif - -static int maple_rtc_addr; - -static int maple_clock_read(int addr) -{ - outb_p(addr, maple_rtc_addr); - return inb_p(maple_rtc_addr+1); -} - -static void maple_clock_write(unsigned long val, int addr) -{ - outb_p(addr, maple_rtc_addr); - outb_p(val, maple_rtc_addr+1); -} - -void maple_get_rtc_time(struct rtc_time *tm) -{ - do { - tm->tm_sec = maple_clock_read(RTC_SECONDS); - tm->tm_min = maple_clock_read(RTC_MINUTES); - tm->tm_hour = maple_clock_read(RTC_HOURS); - tm->tm_mday = maple_clock_read(RTC_DAY_OF_MONTH); - tm->tm_mon = maple_clock_read(RTC_MONTH); - tm->tm_year = maple_clock_read(RTC_YEAR); - } while (tm->tm_sec != maple_clock_read(RTC_SECONDS)); - - if (!(maple_clock_read(RTC_CONTROL) & RTC_DM_BINARY) - || RTC_ALWAYS_BCD) { - tm->tm_sec = bcd2bin(tm->tm_sec); - tm->tm_min = bcd2bin(tm->tm_min); - tm->tm_hour = bcd2bin(tm->tm_hour); - tm->tm_mday = bcd2bin(tm->tm_mday); - tm->tm_mon = bcd2bin(tm->tm_mon); - tm->tm_year = bcd2bin(tm->tm_year); - } - if ((tm->tm_year + 1900) < 1970) - tm->tm_year += 100; - - tm->tm_wday = -1; -} - -int maple_set_rtc_time(struct rtc_time *tm) -{ - unsigned char save_control, save_freq_select; - int sec, min, hour, mon, mday, year; - - spin_lock(&rtc_lock); - - save_control = maple_clock_read(RTC_CONTROL); /* tell the clock it's being set */ - - maple_clock_write((save_control|RTC_SET), RTC_CONTROL); - - save_freq_select = maple_clock_read(RTC_FREQ_SELECT); /* stop and reset prescaler */ - - maple_clock_write((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); - - sec = tm->tm_sec; - min = tm->tm_min; - hour = tm->tm_hour; - mon = tm->tm_mon; - mday = tm->tm_mday; - year = tm->tm_year; - - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - sec = bin2bcd(sec); - min = bin2bcd(min); - hour = bin2bcd(hour); - mon = bin2bcd(mon); - mday = bin2bcd(mday); - year = bin2bcd(year); - } - maple_clock_write(sec, RTC_SECONDS); - maple_clock_write(min, RTC_MINUTES); - maple_clock_write(hour, RTC_HOURS); - maple_clock_write(mon, RTC_MONTH); - maple_clock_write(mday, RTC_DAY_OF_MONTH); - maple_clock_write(year, RTC_YEAR); - - /* The following flags have to be released exactly in this order, - * otherwise the DS12887 (popular MC146818A clone with integrated - * battery and quartz) will not reset the oscillator and will not - * update precisely 500 ms later. You won't find this mentioned in - * the Dallas Semiconductor data sheets, but who believes data - * sheets anyway ... 
-- Markus Kuhn - */ - maple_clock_write(save_control, RTC_CONTROL); - maple_clock_write(save_freq_select, RTC_FREQ_SELECT); - - spin_unlock(&rtc_lock); - - return 0; -} - -static struct resource rtc_iores = { - .name = "rtc", - .flags = IORESOURCE_IO | IORESOURCE_BUSY, -}; - -time64_t __init maple_get_boot_time(void) -{ - struct rtc_time tm; - struct device_node *rtcs; - - rtcs = of_find_compatible_node(NULL, "rtc", "pnpPNP,b00"); - if (rtcs) { - struct resource r; - if (of_address_to_resource(rtcs, 0, &r)) { - printk(KERN_EMERG "Maple: Unable to translate RTC" - " address\n"); - goto bail; - } - if (!(r.flags & IORESOURCE_IO)) { - printk(KERN_EMERG "Maple: RTC address isn't PIO!\n"); - goto bail; - } - maple_rtc_addr = r.start; - printk(KERN_INFO "Maple: Found RTC at IO 0x%x\n", - maple_rtc_addr); - } - bail: - of_node_put(rtcs); - if (maple_rtc_addr == 0) { - maple_rtc_addr = RTC_PORT(0); /* legacy address */ - printk(KERN_INFO "Maple: No device node for RTC, assuming " - "legacy address (0x%x)\n", maple_rtc_addr); - } - - rtc_iores.start = maple_rtc_addr; - rtc_iores.end = maple_rtc_addr + 7; - request_resource(&ioport_resource, &rtc_iores); - - maple_get_rtc_time(&tm); - return rtc_tm_to_time64(&tm); -} - diff --git a/arch/powerpc/platforms/powermac/backlight.c b/arch/powerpc/platforms/powermac/backlight.c index 12bc01353bd3..79741370c40c 100644 --- a/arch/powerpc/platforms/powermac/backlight.c +++ b/arch/powerpc/platforms/powermac/backlight.c @@ -57,18 +57,10 @@ struct backlight_device *pmac_backlight; int pmac_has_backlight_type(const char *type) { struct device_node* bk_node = of_find_node_by_name(NULL, "backlight"); + int i = of_property_match_string(bk_node, "backlight-control", type); - if (bk_node) { - const char *prop = of_get_property(bk_node, - "backlight-control", NULL); - if (prop && strncmp(prop, type, strlen(type)) == 0) { - of_node_put(bk_node); - return 1; - } - of_node_put(bk_node); - } - - return 0; + of_node_put(bk_node); + return i >= 0; } static void pmac_backlight_key_worker(struct work_struct *work) diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index d21b681f52fb..09e7fe24fac1 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -35,7 +35,7 @@ #include <asm/ptrace.h> #include <linux/atomic.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/irq.h> #include <asm/page.h> #include <asm/sections.h> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index ad41dffe4d92..d98b933e4984 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -18,7 +18,7 @@ #include <asm/opal.h> #include <asm/cputhreads.h> #include <asm/cpuidle.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/smp.h> #include <asm/runlatch.h> #include <asm/dbell.h> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 2e9da58195f5..8f41ef364fc6 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -28,7 +28,7 @@ #include <asm/xive.h> #include <asm/opal.h> #include <asm/runlatch.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/dbell.h> #include <asm/kvm_ppc.h> #include <asm/ppc-opcode.h> diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index b18e1c92e554..61722133eb2d 100644 --- a/arch/powerpc/platforms/ps3/device-init.c 
+++ b/arch/powerpc/platforms/ps3/device-init.c @@ -178,7 +178,7 @@ fail_malloc: return result; } -static int __ref ps3_setup_uhc_device( +static int __init ps3_setup_uhc_device( const struct ps3_repository_device *repo, enum ps3_match_id match_id, enum ps3_interrupt_type interrupt_type, enum ps3_reg_type reg_type) { diff --git a/arch/powerpc/platforms/ps3/interrupt.c b/arch/powerpc/platforms/ps3/interrupt.c index 49871427f599..af3fe9f04f24 100644 --- a/arch/powerpc/platforms/ps3/interrupt.c +++ b/arch/powerpc/platforms/ps3/interrupt.c @@ -378,9 +378,9 @@ int ps3_send_event_locally(unsigned int virq) /** * ps3_sb_event_receive_port_setup - Setup a system bus event receive port. + * @dev: The system bus device instance. * @cpu: enum ps3_cpu_binding indicating the cpu the interrupt should be * serviced on. - * @dev: The system bus device instance. * @virq: The assigned Linux virq. * * An event irq represents a virtual device interrupt. The interrupt_id diff --git a/arch/powerpc/platforms/ps3/repository.c b/arch/powerpc/platforms/ps3/repository.c index 1abe33fbe529..b8c030eab138 100644 --- a/arch/powerpc/platforms/ps3/repository.c +++ b/arch/powerpc/platforms/ps3/repository.c @@ -940,7 +940,7 @@ int __init ps3_repository_read_vuart_sysmgr_port(unsigned int *port) /** * ps3_repository_read_boot_dat_info - Get address and size of cell_ext_os_area. - * address: lpar address of cell_ext_os_area + * @lpar_addr: lpar address of cell_ext_os_area * @size: size of cell_ext_os_area */ diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index b9a7d9bae687..afbaabf182d0 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -453,10 +453,9 @@ static ssize_t modalias_show(struct device *_dev, struct device_attribute *a, char *buf) { struct ps3_system_bus_device *dev = ps3_dev_to_system_bus_dev(_dev); - int len = snprintf(buf, PAGE_SIZE, "ps3:%d:%d\n", dev->match_id, - dev->match_sub_id); - return (len >= PAGE_SIZE) ? (PAGE_SIZE - 1) : len; + return sysfs_emit(buf, "ps3:%d:%d\n", dev->match_id, + dev->match_sub_id); } static DEVICE_ATTR_RO(modalias); diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index afc0f6a61337..42fc66e97539 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -140,6 +140,20 @@ config HV_PERF_CTRS If unsure, select Y. +config VPA_PMU + tristate "VPA PMU events" + depends on KVM_BOOK3S_64_HV && HV_PERF_CTRS + help + Enable access to the VPA PMU counters via perf. This enables + code that supports measurement for the KVM on PowerVM (KoP) feature. + The PAPR hypervisor has introduced three new counters in the VPA area + of LPAR CPUs for KVM L2 guest observability: two for context switches + from host to guest and vice versa, and one for the total time spent + inside the KVM guest. This config enables code that accesses these + software counters via perf. + + If unsure, select N.
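As an aside, the modalias_show() hunk above is the stock conversion from a hand-rolled, PAGE_SIZE-clamped snprintf() to sysfs_emit() in a sysfs show() callback. A minimal sketch of the idiom, with a purely hypothetical attribute name and value:

#include <linux/device.h>
#include <linux/sysfs.h>

/* Hypothetical read-only sysfs attribute using the sysfs_emit() idiom. */
static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	/*
	 * sysfs_emit() knows a show() buffer is exactly one page and
	 * clamps the output itself, so no manual PAGE_SIZE bookkeeping
	 * is needed; it returns the number of bytes emitted.
	 */
	return sysfs_emit(buf, "example:%d\n", 42);
}
static DEVICE_ATTR_RO(example);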
+ config IBMVIO depends on PPC_PSERIES bool diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 8cb9d36ea491..f293588b8c7b 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -191,7 +191,7 @@ static int dtl_enable(struct dtl *dtl) return -EBUSY; /* ensure there are no other conflicting dtl users */ - if (!read_trylock(&dtl_access_lock)) + if (!down_read_trylock(&dtl_access_lock)) return -EBUSY; n_entries = dtl_buf_entries; @@ -199,7 +199,7 @@ static int dtl_enable(struct dtl *dtl) if (!buf) { printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", __func__, dtl->cpu); - read_unlock(&dtl_access_lock); + up_read(&dtl_access_lock); return -ENOMEM; } @@ -217,7 +217,7 @@ static int dtl_enable(struct dtl *dtl) spin_unlock(&dtl->lock); if (rc) { - read_unlock(&dtl_access_lock); + up_read(&dtl_access_lock); kmem_cache_free(dtl_cache, buf); } @@ -232,7 +232,7 @@ static void dtl_disable(struct dtl *dtl) dtl->buf = NULL; dtl->buf_entries = 0; spin_unlock(&dtl->lock); - read_unlock(&dtl_access_lock); + up_read(&dtl_access_lock); } /* file interface */ diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index c1d8bee8f701..6a415febc53b 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -16,6 +16,7 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/delay.h> +#include <linux/seq_file.h> #include <linux/stop_machine.h> #include <linux/spinlock.h> #include <linux/cpuhotplug.h> @@ -169,7 +170,7 @@ struct vcpu_dispatch_data { */ #define NR_CPUS_H NR_CPUS -DEFINE_RWLOCK(dtl_access_lock); +DECLARE_RWSEM(dtl_access_lock); static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data); static DEFINE_PER_CPU(u64, dtl_entry_ridx); static DEFINE_PER_CPU(struct dtl_worker, dtl_workers); @@ -463,7 +464,7 @@ static int dtl_worker_enable(unsigned long *time_limit) { int rc = 0, state; - if (!write_trylock(&dtl_access_lock)) { + if (!down_write_trylock(&dtl_access_lock)) { rc = -EBUSY; goto out; } @@ -479,7 +480,7 @@ static int dtl_worker_enable(unsigned long *time_limit) pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n"); free_dtl_buffers(time_limit); reset_global_dtl_mask(); - write_unlock(&dtl_access_lock); + up_write(&dtl_access_lock); rc = -EINVAL; goto out; } @@ -494,7 +495,7 @@ static void dtl_worker_disable(unsigned long *time_limit) cpuhp_remove_state(dtl_worker_state); free_dtl_buffers(time_limit); reset_global_dtl_mask(); - write_unlock(&dtl_access_lock); + up_write(&dtl_access_lock); } static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p, diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 6dfb55b52d36..fdc2f7f38dc9 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -9,6 +9,7 @@ #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/msi.h> +#include <linux/seq_file.h> #include <asm/rtas.h> #include <asm/hw_irq.h> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 9e297f88adc5..f84ac9fbe203 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/ioport.h> +#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/ndctl.h> #include <linux/sched.h> diff --git 
a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index c597711ef20a..db99725e752b 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -39,7 +39,7 @@ #include <asm/xive.h> #include <asm/dbell.h> #include <asm/plpar_wrappers.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/svm.h> #include <asm/kvm_guest.h> diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 3b4045d508ec..c5d0f92c7969 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -8,7 +8,9 @@ #include <linux/mm.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <linux/cc_platform.h> +#include <linux/mem_encrypt.h> #include <asm/machdep.h> #include <asm/svm.h> #include <asm/swiotlb.h> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index fa01818c1972..a6c388bdf5d0 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -726,7 +726,7 @@ static int xive_irq_set_affinity(struct irq_data *d, pr_debug("%s: irq %d/0x%x\n", __func__, d->irq, hw_irq); /* Is this valid ? */ - if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) + if (!cpumask_intersects(cpumask, cpu_online_mask)) return -EINVAL; /* diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index f2fa985a2c77..5aedbe3e8e6a 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -7,6 +7,7 @@ #include <linux/types.h> #include <linux/irq.h> +#include <linux/seq_file.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/init.h> diff --git a/arch/powerpc/platforms/maple/Makefile b/arch/powerpc/tools/.gitignore index 19f35ab828a7..ec380a14a09a 100644 --- a/arch/powerpc/platforms/maple/Makefile +++ b/arch/powerpc/tools/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += setup.o pci.o time.o +/vmlinux.arch.S diff --git a/arch/powerpc/tools/Makefile b/arch/powerpc/tools/Makefile new file mode 100644 index 000000000000..e1f7afcd9fdf --- /dev/null +++ b/arch/powerpc/tools/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +quiet_cmd_gen_ftrace_ool_stubs = GEN $@ + cmd_gen_ftrace_ool_stubs = $< "$(CONFIG_PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE)" "$(CONFIG_64BIT)" \ + "$(OBJDUMP)" vmlinux.o $@ + +$(obj)/vmlinux.arch.S: $(src)/ftrace-gen-ool-stubs.sh vmlinux.o FORCE + $(call if_changed,gen_ftrace_ool_stubs) + +targets += vmlinux.arch.S diff --git a/arch/powerpc/tools/ftrace-gen-ool-stubs.sh b/arch/powerpc/tools/ftrace-gen-ool-stubs.sh new file mode 100755 index 000000000000..bac186bdf64a --- /dev/null +++ b/arch/powerpc/tools/ftrace-gen-ool-stubs.sh @@ -0,0 +1,52 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later + +# Error out on error +set -e + +num_ool_stubs_text_builtin="$1" +is_64bit="$2" +objdump="$3" +vmlinux_o="$4" +arch_vmlinux_S="$5" + +RELOCATION=R_PPC64_ADDR64 +if [ -z "$is_64bit" ]; then + RELOCATION=R_PPC_ADDR32 +fi + +num_ool_stubs_total=$($objdump -r -j __patchable_function_entries "$vmlinux_o" | + grep -c "$RELOCATION") +num_ool_stubs_inittext=$($objdump -r -j __patchable_function_entries "$vmlinux_o" | + grep -e ".init.text" -e ".text.startup" | grep -c "$RELOCATION") +num_ool_stubs_text=$((num_ool_stubs_total - num_ool_stubs_inittext)) + +if [ "$num_ool_stubs_text" -gt "$num_ool_stubs_text_builtin" ]; then + num_ool_stubs_text_end=$((num_ool_stubs_text - 
num_ool_stubs_text_builtin)) +else + num_ool_stubs_text_end=0 +fi + +cat > "$arch_vmlinux_S" <<EOF +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <linux/linkage.h> + +.pushsection .tramp.ftrace.text,"aw" +SYM_DATA(ftrace_ool_stub_text_end_count, .long $num_ool_stubs_text_end) + +SYM_START(ftrace_ool_stub_text_end, SYM_L_GLOBAL, .balign SZL) +#if $num_ool_stubs_text_end + .space $num_ool_stubs_text_end * FTRACE_OOL_STUB_SIZE +#endif +SYM_CODE_END(ftrace_ool_stub_text_end) +.popsection + +.pushsection .tramp.ftrace.init,"aw" +SYM_DATA(ftrace_ool_stub_inittext_count, .long $num_ool_stubs_inittext) + +SYM_START(ftrace_ool_stub_inittext, SYM_L_GLOBAL, .balign SZL) + .space $num_ool_stubs_inittext * FTRACE_OOL_STUB_SIZE +SYM_CODE_END(ftrace_ool_stub_inittext) +.popsection +EOF diff --git a/arch/powerpc/tools/ftrace_check.sh b/arch/powerpc/tools/ftrace_check.sh new file mode 100755 index 000000000000..405e7e306617 --- /dev/null +++ b/arch/powerpc/tools/ftrace_check.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later +# +# This script checks vmlinux to ensure that all functions can call ftrace_caller() either directly, +# or through the stub, ftrace_tramp_text, at the end of kernel text. + +# Error out if any command fails +set -e + +# Allow for verbose output +if [ "$V" = "1" ]; then + set -x +fi + +if [ $# -lt 2 ]; then + echo "$0 [path to nm] [path to vmlinux]" 1>&2 + exit 1 +fi + +# Have Kbuild supply the path to nm so we handle cross compilation. +nm="$1" +vmlinux="$2" + +stext_addr=$($nm "$vmlinux" | grep -e " [TA] _stext$" | \ + cut -d' ' -f1 | tr '[:lower:]' '[:upper:]') +ftrace_caller_addr=$($nm "$vmlinux" | grep -e " T ftrace_caller$" | \ + cut -d' ' -f1 | tr '[:lower:]' '[:upper:]') +ftrace_tramp_addr=$($nm "$vmlinux" | grep -e " T ftrace_tramp_text$" | \ + cut -d' ' -f1 | tr '[:lower:]' '[:upper:]') + +ftrace_caller_offset=$(echo "ibase=16;$ftrace_caller_addr - $stext_addr" | bc) +ftrace_tramp_offset=$(echo "ibase=16;$ftrace_tramp_addr - $ftrace_caller_addr" | bc) +sz_32m=$(printf "%d" 0x2000000) +sz_64m=$(printf "%d" 0x4000000) + +# ftrace_caller - _stext < 32M +if [ "$ftrace_caller_offset" -ge "$sz_32m" ]; then + echo "ERROR: ftrace_caller (0x$ftrace_caller_addr) is beyond 32MiB of _stext" 1>&2 + echo "ERROR: consider disabling CONFIG_FUNCTION_TRACER, or reducing the size \ + of kernel text" 1>&2 + exit 1 +fi + +# ftrace_tramp_text - ftrace_caller < 64M +if [ "$ftrace_tramp_offset" -ge "$sz_64m" ]; then + echo "ERROR: kernel text extends beyond 64MiB from ftrace_caller" 1>&2 + echo "ERROR: consider disabling CONFIG_FUNCTION_TRACER, or reducing the size \ + of kernel text" 1>&2 + exit 1 +fi diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e6cddbb2305f..f4e841a36458 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -50,7 +50,7 @@ #include <asm/xive.h> #include <asm/opal.h> #include <asm/firmware.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/sections.h> #include <asm/inst.h> #include <asm/interrupt.h> @@ -3662,7 +3662,7 @@ symbol_lookup(void) int type = inchar(); unsigned long addr, cpu; void __percpu *ptr = NULL; - static char tmp[64]; + static char tmp[KSYM_NAME_LEN]; switch (type) { case 'a': @@ -3671,7 +3671,7 @@ symbol_lookup(void) termch = 0; break; case 's': - getstring(tmp, 64); + getstring(tmp, KSYM_NAME_LEN); if (setjmp(bus_error_jmp) == 0) { catch_memory_errors = 1; sync(); @@ -3686,7 +3686,7 @@ symbol_lookup(void) termch = 0; break; case 'p': - 
getstring(tmp, 64); + getstring(tmp, KSYM_NAME_LEN); if (setjmp(bus_error_jmp) == 0) { catch_memory_errors = 1; sync(); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index ff1e353b0d6f..cc63aef41e94 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -83,6 +83,7 @@ config RISCV select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_NO_INSTR select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_WEAK_RELEASE_ACQUIRE if ARCH_USE_QUEUED_SPINLOCKS select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU select CLINT_TIMER if RISCV_M_MODE @@ -116,6 +117,7 @@ config RISCV select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU + select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_HUGE_VMAP if MMU && 64BIT @@ -507,6 +509,39 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. +choice + prompt "RISC-V spinlock type" + default RISCV_COMBO_SPINLOCKS + +config RISCV_TICKET_SPINLOCKS + bool "Using ticket spinlock" + +config RISCV_QUEUED_SPINLOCKS + bool "Using queued spinlock" + depends on SMP && MMU && NONPORTABLE + select ARCH_USE_QUEUED_SPINLOCKS + help + The queued spinlock implementation requires the forward progress + guarantee of cmpxchg()/xchg() atomic operations: CAS with Zabha or + LR/SC with Ziccrse provide such guarantee. + + Select this if and only if Zabha or Ziccrse is available on your + platform, RISCV_QUEUED_SPINLOCKS must not be selected for platforms + without one of those extensions. + + If unsure, select RISCV_COMBO_SPINLOCKS, which will use qspinlocks + when supported and otherwise ticket spinlocks. + +config RISCV_COMBO_SPINLOCKS + bool "Using combo spinlock" + depends on SMP && MMU + select ARCH_USE_QUEUED_SPINLOCKS + help + Embed both queued spinlock and ticket lock so that the spinlock + implementation can be chosen at runtime. + +endchoice + config RISCV_ALTERNATIVE bool depends on !XIP_KERNEL @@ -532,6 +567,17 @@ config RISCV_ISA_C If you don't know what to do here, say Y. +config RISCV_ISA_SUPM + bool "Supm extension for userspace pointer masking" + depends on 64BIT + default y + help + Add support for pointer masking in userspace (Supm) when the + underlying hardware extension (Smnpm or Ssnpm) is detected at boot. + + If this option is disabled, userspace will be unable to use + the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API. + config RISCV_ISA_SVNAPOT bool "Svnapot extension support for supervisor mode NAPOT pages" depends on 64BIT && MMU @@ -633,6 +679,40 @@ config RISCV_ISA_ZAWRS use of these instructions in the kernel when the Zawrs extension is detected at boot. +config TOOLCHAIN_HAS_ZABHA + bool + default y + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zabha) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zabha) + depends on AS_HAS_OPTION_ARCH + +config RISCV_ISA_ZABHA + bool "Zabha extension support for atomic byte/halfword operations" + depends on TOOLCHAIN_HAS_ZABHA + depends on RISCV_ALTERNATIVE + default y + help + Enable the use of the Zabha ISA-extension to implement kernel + byte/halfword atomic memory operations when it is detected at boot. + + If you don't know what to do here, say Y. 
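The cmpxchg.h rework later in this section guards each new Zabha/Zacas instruction twice: IS_ENABLED() compiles the path in only when the toolchain can assemble the extension, and riscv_has_extension_unlikely() (a code-patched check when alternatives are enabled) takes it only when the CPU actually reports the extension; otherwise the existing lr/sc sequences are used. A minimal sketch of that gating pattern, using hypothetical helper names (the real macros open-code the same condition inline):

#include <linux/kconfig.h>
#include <asm/cpufeature-macros.h>

/* Hypothetical wrappers around the gate used by the new cmpxchg macros. */
static __always_inline bool can_use_zabha(void)
{
	/* Compiled out unless the toolchain supports Zabha ... */
	return IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) &&
	       /* ... and taken only if the CPU reports the extension at boot. */
	       riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA);
}

static __always_inline bool can_use_zacas(void)
{
	return IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) &&
	       riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS);
}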
+ +config TOOLCHAIN_HAS_ZACAS + bool + default y + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zacas) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zacas) + depends on AS_HAS_OPTION_ARCH + +config RISCV_ISA_ZACAS + bool "Zacas extension support for atomic CAS" + depends on TOOLCHAIN_HAS_ZACAS + depends on RISCV_ALTERNATIVE + default y + help + Enable the use of the Zacas ISA-extension to implement kernel atomic + cmpxchg operations when it is detected at boot. + If you don't know what to do here, say Y. config TOOLCHAIN_HAS_ZBB @@ -786,10 +866,24 @@ config THREAD_SIZE_ORDER config RISCV_MISALIGNED bool + help + Embed support for detecting and emulating misaligned + scalar or vector loads and stores. + +config RISCV_SCALAR_MISALIGNED + bool + select RISCV_MISALIGNED select SYSCTL_ARCH_UNALIGN_ALLOW help Embed support for emulating misaligned loads and stores. +config RISCV_VECTOR_MISALIGNED + bool + select RISCV_MISALIGNED + depends on RISCV_ISA_V + help + Enable detecting support for vector misaligned loads and stores. + choice prompt "Unaligned Accesses Support" default RISCV_PROBE_UNALIGNED_ACCESS @@ -801,7 +895,7 @@ choice config RISCV_PROBE_UNALIGNED_ACCESS bool "Probe for hardware unaligned access support" - select RISCV_MISALIGNED + select RISCV_SCALAR_MISALIGNED help During boot, the kernel will run a series of tests to determine the speed of unaligned accesses. This probing will dynamically determine @@ -812,7 +906,7 @@ config RISCV_PROBE_UNALIGNED_ACCESS config RISCV_EMULATED_UNALIGNED_ACCESS bool "Emulate unaligned access where system support is missing" - select RISCV_MISALIGNED + select RISCV_SCALAR_MISALIGNED help If unaligned memory accesses trap into the kernel as they are not supported by the system, the kernel will emulate the unaligned @@ -841,6 +935,46 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS endchoice +choice + prompt "Vector unaligned Accesses Support" + depends on RISCV_ISA_V + default RISCV_PROBE_VECTOR_UNALIGNED_ACCESS + help + This determines the level of support for vector unaligned accesses. This + information is used by the kernel to perform optimizations. It is also + exposed to user space via the hwprobe syscall. The hardware will be + probed at boot by default. + +config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS + bool "Probe speed of vector unaligned accesses" + select RISCV_VECTOR_MISALIGNED + depends on RISCV_ISA_V + help + During boot, the kernel will run a series of tests to determine the + speed of vector unaligned accesses if they are supported. This probing + will dynamically determine the speed of vector unaligned accesses on + the underlying system if they are supported. + +config RISCV_SLOW_VECTOR_UNALIGNED_ACCESS + bool "Assume the system supports slow vector unaligned memory accesses" + depends on NONPORTABLE + help + Assume that the system supports slow vector unaligned memory accesses. The + kernel and userspace programs may not be able to run at all on systems + that do not support unaligned memory accesses. + +config RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS + bool "Assume the system supports fast vector unaligned memory accesses" + depends on NONPORTABLE + help + Assume that the system supports fast vector unaligned memory accesses. When + enabled, this option improves the performance of the kernel on such + systems. However, the kernel and userspace programs will run much more + slowly, or will not be able to run at all, on systems that do not + support efficient unaligned memory accesses. 
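Further down, cmpxchg.h also grows a Zacas-based 128-bit cmpxchg built on amocas.q and advertised through system_has_cmpxchg128(). A sketch of how a caller might use those primitives (hypothetical function; assumes a 64BIT kernel with CONFIG_RISCV_ISA_ZACAS and a naturally aligned u128 slot):

#include <linux/types.h>

/* Hypothetical caller of the new 128-bit cmpxchg primitives. */
static bool try_replace_pair(u128 *slot, u128 expected, u128 desired)
{
	/* amocas.q is usable only when Zacas was detected at boot. */
	if (!system_has_cmpxchg128())
		return false;

	/*
	 * arch_cmpxchg128() returns the value that was in *slot; the
	 * swap succeeded iff that matches the expected value.
	 */
	return arch_cmpxchg128(slot, expected, desired) == expected;
}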
+ +endchoice + source "arch/riscv/Kconfig.vendor" endmenu # "Platform type" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index d469db9f46f4..9fe1ee740dda 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -82,6 +82,12 @@ else riscv-march-$(CONFIG_TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI) := $(riscv-march-y)_zicsr_zifencei endif +# Check if the toolchain supports Zacas +riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZACAS) := $(riscv-march-y)_zacas + +# Check if the toolchain supports Zabha +riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZABHA) := $(riscv-march-y)_zabha + # Remove F,D,V from isa string for all. Keep extensions between "fd" and "v" by # matching non-v and non-multi-letter extensions out with the filter ([^v_]*) KBUILD_CFLAGS += -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64ima)fd([^v_]*)v?/\1\2/') diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 1d5e13b148f2..b4a37345703e 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -167,6 +167,7 @@ CONFIG_PINCTRL_SOPHGO_CV1800B=y CONFIG_PINCTRL_SOPHGO_CV1812H=y CONFIG_PINCTRL_SOPHGO_SG2000=y CONFIG_PINCTRL_SOPHGO_SG2002=y +CONFIG_GPIO_DWAPB=y CONFIG_GPIO_SIFIVE=y CONFIG_POWER_RESET_GPIO_RESTART=y CONFIG_SENSORS_SFCTEMP=m diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c index fc1a34faa5f3..dcc9d1ee5ffd 100644 --- a/arch/riscv/errata/andes/errata.c +++ b/arch/riscv/errata/andes/errata.c @@ -13,7 +13,7 @@ #include <asm/alternative.h> #include <asm/cacheflush.h> #include <asm/errata_list.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/processor.h> #include <asm/sbi.h> #include <asm/vendorid_list.h> diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index cea3b96ade11..38aac2c47845 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -8,7 +8,7 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/bug.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/vendorid_list.h> #include <asm/errata_list.h> diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index f5120e07c318..e24770a77932 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -16,7 +16,7 @@ #include <asm/errata_list.h> #include <asm/hwprobe.h> #include <asm/io.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/vendorid_list.h> #include <asm/vendor_extensions.h> diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 1461af12da6e..de13d5a234f8 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -6,10 +6,12 @@ generic-y += early_ioremap.h generic-y += flat.h generic-y += kvm_para.h generic-y += mmzone.h +generic-y += mcs_spinlock.h generic-y += parport.h -generic-y += spinlock.h generic-y += spinlock_types.h +generic-y += ticket_spinlock.h generic-y += qrwlock.h generic-y += qrwlock_types.h +generic-y += qspinlock.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index ebbce134917c..4cadc56220fe 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -12,30 +12,43 @@ #include <asm/fence.h> #include <asm/hwcap.h> #include <asm/insn-def.h> - -#define __arch_xchg_masked(sc_sfx, prepend, append, r, p, n) \ -({ \ - u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3); \ - ulong __s = 
((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE; \ - ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0) \ - << __s; \ - ulong __newx = (ulong)(n) << __s; \ - ulong __retx; \ - ulong __rc; \ - \ - __asm__ __volatile__ ( \ - prepend \ - "0: lr.w %0, %2\n" \ - " and %1, %0, %z4\n" \ - " or %1, %1, %z3\n" \ - " sc.w" sc_sfx " %1, %1, %2\n" \ - " bnez %1, 0b\n" \ - append \ - : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ - : "rJ" (__newx), "rJ" (~__mask) \ - : "memory"); \ - \ - r = (__typeof__(*(p)))((__retx & __mask) >> __s); \ +#include <asm/cpufeature-macros.h> + +#define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append, \ + swap_append, r, p, n) \ +({ \ + if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && \ + riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA)) { \ + __asm__ __volatile__ ( \ + prepend \ + " amoswap" swap_sfx " %0, %z2, %1\n" \ + swap_append \ + : "=&r" (r), "+A" (*(p)) \ + : "rJ" (n) \ + : "memory"); \ + } else { \ + u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3); \ + ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE; \ + ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0) \ + << __s; \ + ulong __newx = (ulong)(n) << __s; \ + ulong __retx; \ + ulong __rc; \ + \ + __asm__ __volatile__ ( \ + prepend \ + "0: lr.w %0, %2\n" \ + " and %1, %0, %z4\n" \ + " or %1, %1, %z3\n" \ + " sc.w" sc_sfx " %1, %1, %2\n" \ + " bnez %1, 0b\n" \ + sc_append \ + : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ + : "rJ" (__newx), "rJ" (~__mask) \ + : "memory"); \ + \ + r = (__typeof__(*(p)))((__retx & __mask) >> __s); \ + } \ }) #define __arch_xchg(sfx, prepend, append, r, p, n) \ @@ -58,8 +71,13 @@ \ switch (sizeof(*__ptr)) { \ case 1: \ + __arch_xchg_masked(sc_sfx, ".b" swap_sfx, \ + prepend, sc_append, swap_append, \ + __ret, __ptr, __new); \ + break; \ case 2: \ - __arch_xchg_masked(sc_sfx, prepend, sc_append, \ + __arch_xchg_masked(sc_sfx, ".h" swap_sfx, \ + prepend, sc_append, swap_append, \ __ret, __ptr, __new); \ break; \ case 4: \ @@ -106,55 +124,90 @@ * store NEW in MEM. Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. 
*/ - -#define __arch_cmpxchg_masked(sc_sfx, prepend, append, r, p, o, n) \ -({ \ - u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3); \ - ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE; \ - ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0) \ - << __s; \ - ulong __newx = (ulong)(n) << __s; \ - ulong __oldx = (ulong)(o) << __s; \ - ulong __retx; \ - ulong __rc; \ - \ - __asm__ __volatile__ ( \ - prepend \ - "0: lr.w %0, %2\n" \ - " and %1, %0, %z5\n" \ - " bne %1, %z3, 1f\n" \ - " and %1, %0, %z6\n" \ - " or %1, %1, %z4\n" \ - " sc.w" sc_sfx " %1, %1, %2\n" \ - " bnez %1, 0b\n" \ - append \ - "1:\n" \ - : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ - : "rJ" ((long)__oldx), "rJ" (__newx), \ - "rJ" (__mask), "rJ" (~__mask) \ - : "memory"); \ - \ - r = (__typeof__(*(p)))((__retx & __mask) >> __s); \ +#define __arch_cmpxchg_masked(sc_sfx, cas_sfx, \ + sc_prepend, sc_append, \ + cas_prepend, cas_append, \ + r, p, o, n) \ +({ \ + if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && \ + IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA) && \ + riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ + r = o; \ + \ + __asm__ __volatile__ ( \ + cas_prepend \ + " amocas" cas_sfx " %0, %z2, %1\n" \ + cas_append \ + : "+&r" (r), "+A" (*(p)) \ + : "rJ" (n) \ + : "memory"); \ + } else { \ + u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3); \ + ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE; \ + ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0) \ + << __s; \ + ulong __newx = (ulong)(n) << __s; \ + ulong __oldx = (ulong)(o) << __s; \ + ulong __retx; \ + ulong __rc; \ + \ + __asm__ __volatile__ ( \ + sc_prepend \ + "0: lr.w %0, %2\n" \ + " and %1, %0, %z5\n" \ + " bne %1, %z3, 1f\n" \ + " and %1, %0, %z6\n" \ + " or %1, %1, %z4\n" \ + " sc.w" sc_sfx " %1, %1, %2\n" \ + " bnez %1, 0b\n" \ + sc_append \ + "1:\n" \ + : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ + : "rJ" ((long)__oldx), "rJ" (__newx), \ + "rJ" (__mask), "rJ" (~__mask) \ + : "memory"); \ + \ + r = (__typeof__(*(p)))((__retx & __mask) >> __s); \ + } \ }) -#define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n) \ +#define __arch_cmpxchg(lr_sfx, sc_sfx, cas_sfx, \ + sc_prepend, sc_append, \ + cas_prepend, cas_append, \ + r, p, co, o, n) \ ({ \ - register unsigned int __rc; \ + if (IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && \ + riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS)) { \ + r = o; \ \ - __asm__ __volatile__ ( \ - prepend \ - "0: lr" lr_sfx " %0, %2\n" \ - " bne %0, %z3, 1f\n" \ - " sc" sc_sfx " %1, %z4, %2\n" \ - " bnez %1, 0b\n" \ - append \ - "1:\n" \ - : "=&r" (r), "=&r" (__rc), "+A" (*(p)) \ - : "rJ" (co o), "rJ" (n) \ - : "memory"); \ + __asm__ __volatile__ ( \ + cas_prepend \ + " amocas" cas_sfx " %0, %z2, %1\n" \ + cas_append \ + : "+&r" (r), "+A" (*(p)) \ + : "rJ" (n) \ + : "memory"); \ + } else { \ + register unsigned int __rc; \ + \ + __asm__ __volatile__ ( \ + sc_prepend \ + "0: lr" lr_sfx " %0, %2\n" \ + " bne %0, %z3, 1f\n" \ + " sc" sc_sfx " %1, %z4, %2\n" \ + " bnez %1, 0b\n" \ + sc_append \ + "1:\n" \ + : "=&r" (r), "=&r" (__rc), "+A" (*(p)) \ + : "rJ" (co o), "rJ" (n) \ + : "memory"); \ + } \ }) -#define _arch_cmpxchg(ptr, old, new, sc_sfx, prepend, append) \ +#define _arch_cmpxchg(ptr, old, new, sc_sfx, cas_sfx, \ + sc_prepend, sc_append, \ + cas_prepend, cas_append) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ __typeof__(*(__ptr)) __old = (old); \ @@ -163,17 +216,28 @@ \ switch (sizeof(*__ptr)) { \ case 1: \ + 
__arch_cmpxchg_masked(sc_sfx, ".b" cas_sfx, \ + sc_prepend, sc_append, \ + cas_prepend, cas_append, \ + __ret, __ptr, __old, __new); \ + break; \ case 2: \ - __arch_cmpxchg_masked(sc_sfx, prepend, append, \ - __ret, __ptr, __old, __new); \ + __arch_cmpxchg_masked(sc_sfx, ".h" cas_sfx, \ + sc_prepend, sc_append, \ + cas_prepend, cas_append, \ + __ret, __ptr, __old, __new); \ break; \ case 4: \ - __arch_cmpxchg(".w", ".w" sc_sfx, prepend, append, \ - __ret, __ptr, (long), __old, __new); \ + __arch_cmpxchg(".w", ".w" sc_sfx, ".w" cas_sfx, \ + sc_prepend, sc_append, \ + cas_prepend, cas_append, \ + __ret, __ptr, (long), __old, __new); \ break; \ case 8: \ - __arch_cmpxchg(".d", ".d" sc_sfx, prepend, append, \ - __ret, __ptr, /**/, __old, __new); \ + __arch_cmpxchg(".d", ".d" sc_sfx, ".d" cas_sfx, \ + sc_prepend, sc_append, \ + cas_prepend, cas_append, \ + __ret, __ptr, /**/, __old, __new); \ break; \ default: \ BUILD_BUG(); \ @@ -181,17 +245,40 @@ (__typeof__(*(__ptr)))__ret; \ }) +/* + * These macros are here to improve the readability of the arch_cmpxchg_XXX() + * macros. + */ +#define SC_SFX(x) x +#define CAS_SFX(x) x +#define SC_PREPEND(x) x +#define SC_APPEND(x) x +#define CAS_PREPEND(x) x +#define CAS_APPEND(x) x + #define arch_cmpxchg_relaxed(ptr, o, n) \ - _arch_cmpxchg((ptr), (o), (n), "", "", "") + _arch_cmpxchg((ptr), (o), (n), \ + SC_SFX(""), CAS_SFX(""), \ + SC_PREPEND(""), SC_APPEND(""), \ + CAS_PREPEND(""), CAS_APPEND("")) #define arch_cmpxchg_acquire(ptr, o, n) \ - _arch_cmpxchg((ptr), (o), (n), "", "", RISCV_ACQUIRE_BARRIER) + _arch_cmpxchg((ptr), (o), (n), \ + SC_SFX(""), CAS_SFX(""), \ + SC_PREPEND(""), SC_APPEND(RISCV_ACQUIRE_BARRIER), \ + CAS_PREPEND(""), CAS_APPEND(RISCV_ACQUIRE_BARRIER)) #define arch_cmpxchg_release(ptr, o, n) \ - _arch_cmpxchg((ptr), (o), (n), "", RISCV_RELEASE_BARRIER, "") + _arch_cmpxchg((ptr), (o), (n), \ + SC_SFX(""), CAS_SFX(""), \ + SC_PREPEND(RISCV_RELEASE_BARRIER), SC_APPEND(""), \ + CAS_PREPEND(RISCV_RELEASE_BARRIER), CAS_APPEND("")) #define arch_cmpxchg(ptr, o, n) \ - _arch_cmpxchg((ptr), (o), (n), ".rl", "", " fence rw, rw\n") + _arch_cmpxchg((ptr), (o), (n), \ + SC_SFX(".rl"), CAS_SFX(".aqrl"), \ + SC_PREPEND(""), SC_APPEND(RISCV_FULL_BARRIER), \ + CAS_PREPEND(""), CAS_APPEND("")) #define arch_cmpxchg_local(ptr, o, n) \ arch_cmpxchg_relaxed((ptr), (o), (n)) @@ -226,6 +313,44 @@ arch_cmpxchg_release((ptr), (o), (n)); \ }) +#if defined(CONFIG_64BIT) && defined(CONFIG_RISCV_ISA_ZACAS) + +#define system_has_cmpxchg128() riscv_has_extension_unlikely(RISCV_ISA_EXT_ZACAS) + +union __u128_halves { + u128 full; + struct { + u64 low, high; + }; +}; + +#define __arch_cmpxchg128(p, o, n, cas_sfx) \ +({ \ + __typeof__(*(p)) __o = (o); \ + union __u128_halves __hn = { .full = (n) }; \ + union __u128_halves __ho = { .full = (__o) }; \ + register unsigned long t1 asm ("t1") = __hn.low; \ + register unsigned long t2 asm ("t2") = __hn.high; \ + register unsigned long t3 asm ("t3") = __ho.low; \ + register unsigned long t4 asm ("t4") = __ho.high; \ + \ + __asm__ __volatile__ ( \ + " amocas.q" cas_sfx " %0, %z3, %2" \ + : "+&r" (t3), "+&r" (t4), "+A" (*(p)) \ + : "rJ" (t1), "rJ" (t2) \ + : "memory"); \ + \ + ((u128)t4 << 64) | t3; \ +}) + +#define arch_cmpxchg128(ptr, o, n) \ + __arch_cmpxchg128((ptr), (o), (n), ".aqrl") + +#define arch_cmpxchg128_local(ptr, o, n) \ + __arch_cmpxchg128((ptr), (o), (n), "") + +#endif /* CONFIG_64BIT && CONFIG_RISCV_ISA_ZACAS */ + #ifdef CONFIG_RISCV_ISA_ZAWRS /* * Despite wrs.nto being "WRS-with-no-timeout", in the absence of 
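The __arch_cmpxchg128() helper above uses amocas.q, which operates on an even/odd register pair, so the u128 operands are split into 64-bit halves and the result is reassembled afterwards. A standalone sketch of that packing (assumes a compiler providing unsigned __int128; illustration only, not part of the patch):

#include <stdint.h>

typedef unsigned __int128 u128_sketch;

union u128_halves_sketch {
        u128_sketch full;
        struct {
                uint64_t low, high;     /* little-endian: low word first */
        };
};

static inline u128_sketch reassemble(uint64_t low, uint64_t high)
{
        return ((u128_sketch)high << 64) | low; /* mirrors ((u128)t4 << 64) | t3 */
}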
changes to @@ -245,6 +370,11 @@ static __always_inline void __cmpwait(volatile void *ptr, : : : : no_zawrs); switch (size) { + case 1: + fallthrough; + case 2: + /* RISC-V doesn't have lr instructions on byte and half-word. */ + goto no_zawrs; case 4: asm volatile( " lr.w %0, %1\n" diff --git a/arch/riscv/include/asm/compat.h b/arch/riscv/include/asm/compat.h index aa103530a5c8..6081327e55f5 100644 --- a/arch/riscv/include/asm/compat.h +++ b/arch/riscv/include/asm/compat.h @@ -9,7 +9,6 @@ */ #include <linux/types.h> #include <linux/sched.h> -#include <linux/sched/task_stack.h> #include <asm-generic/compat.h> static inline int is_compat_task(void) diff --git a/arch/riscv/include/asm/cpufeature-macros.h b/arch/riscv/include/asm/cpufeature-macros.h new file mode 100644 index 000000000000..a8103edbf51f --- /dev/null +++ b/arch/riscv/include/asm/cpufeature-macros.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2022-2024 Rivos, Inc + */ + +#ifndef _ASM_CPUFEATURE_MACROS_H +#define _ASM_CPUFEATURE_MACROS_H + +#include <asm/hwcap.h> +#include <asm/alternative-macros.h> + +#define STANDARD_EXT 0 + +bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, unsigned int bit); +#define riscv_isa_extension_available(isa_bitmap, ext) \ + __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext) + +static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor, + const unsigned long ext) +{ + asm goto(ALTERNATIVE("j %l[l_no]", "nop", %[vendor], %[ext], 1) + : + : [vendor] "i" (vendor), [ext] "i" (ext) + : + : l_no); + + return true; +l_no: + return false; +} + +static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor, + const unsigned long ext) +{ + asm goto(ALTERNATIVE("nop", "j %l[l_yes]", %[vendor], %[ext], 1) + : + : [vendor] "i" (vendor), [ext] "i" (ext) + : + : l_yes); + + return false; +l_yes: + return true; +} + +static __always_inline bool riscv_has_extension_unlikely(const unsigned long ext) +{ + compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) + return __riscv_has_extension_unlikely(STANDARD_EXT, ext); + + return __riscv_isa_extension_available(NULL, ext); +} + +static __always_inline bool riscv_has_extension_likely(const unsigned long ext) +{ + compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); + + if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) + return __riscv_has_extension_likely(STANDARD_EXT, ext); + + return __riscv_isa_extension_available(NULL, ext); +} + +#endif /* _ASM_CPUFEATURE_MACROS_H */ diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 45f9c1171a48..4bd054c54c21 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -8,9 +8,12 @@ #include <linux/bitmap.h> #include <linux/jump_label.h> +#include <linux/workqueue.h> +#include <linux/kconfig.h> +#include <linux/percpu-defs.h> +#include <linux/threads.h> #include <asm/hwcap.h> -#include <asm/alternative-macros.h> -#include <asm/errno.h> +#include <asm/cpufeature-macros.h> /* * These are probed via a device_initcall(), via either the SBI or directly @@ -31,7 +34,7 @@ DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); /* Per-cpu ISA extensions. 
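The new <asm/cpufeature-macros.h> above carries riscv_has_extension_likely()/_unlikely() out of <asm/cpufeature.h> so that low-level headers such as cmpxchg.h can use them without pulling in the heavier header. A hedged sketch of the usage pattern (the wrapper function is an example, not code from this series):

#include <linux/kconfig.h>
#include <linux/types.h>
#include <asm/cpufeature-macros.h>
#include <asm/hwcap.h>

/* Guard a Zabha-specific fast path with a boot-time patchable branch,
 * mirroring the checks added to cmpxchg.h. Example only. */
static inline bool example_can_use_zabha(void)
{
        return IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) &&
               riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA);
}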
*/ extern struct riscv_isainfo hart_isa[NR_CPUS]; -void riscv_user_isa_enable(void); +void __init riscv_user_isa_enable(void); #define _RISCV_ISA_EXT_DATA(_name, _id, _subset_exts, _subset_exts_size, _validate) { \ .name = #_name, \ @@ -58,8 +61,9 @@ void riscv_user_isa_enable(void); #define __RISCV_ISA_EXT_SUPERSET_VALIDATE(_name, _id, _sub_exts, _validate) \ _RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate) -#if defined(CONFIG_RISCV_MISALIGNED) bool check_unaligned_access_emulated_all_cpus(void); +#if defined(CONFIG_RISCV_SCALAR_MISALIGNED) +void check_unaligned_access_emulated(struct work_struct *work __always_unused); void unaligned_emulation_finish(void); bool unaligned_ctl_available(void); DECLARE_PER_CPU(long, misaligned_access_speed); @@ -70,6 +74,12 @@ static inline bool unaligned_ctl_available(void) } #endif +bool check_vector_unaligned_access_emulated_all_cpus(void); +#if defined(CONFIG_RISCV_VECTOR_MISALIGNED) +void check_vector_unaligned_access_emulated(struct work_struct *work __always_unused); +DECLARE_PER_CPU(long, vector_misaligned_access); +#endif + #if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); @@ -103,61 +113,6 @@ extern const size_t riscv_isa_ext_count; extern bool riscv_isa_fallback; unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); - -#define STANDARD_EXT 0 - -bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, unsigned int bit); -#define riscv_isa_extension_available(isa_bitmap, ext) \ - __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext) - -static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor, - const unsigned long ext) -{ - asm goto(ALTERNATIVE("j %l[l_no]", "nop", %[vendor], %[ext], 1) - : - : [vendor] "i" (vendor), [ext] "i" (ext) - : - : l_no); - - return true; -l_no: - return false; -} - -static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor, - const unsigned long ext) -{ - asm goto(ALTERNATIVE("nop", "j %l[l_yes]", %[vendor], %[ext], 1) - : - : [vendor] "i" (vendor), [ext] "i" (ext) - : - : l_yes); - - return false; -l_yes: - return true; -} - -static __always_inline bool riscv_has_extension_unlikely(const unsigned long ext) -{ - compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); - - if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) - return __riscv_has_extension_unlikely(STANDARD_EXT, ext); - - return __riscv_isa_extension_available(NULL, ext); -} - -static __always_inline bool riscv_has_extension_likely(const unsigned long ext) -{ - compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); - - if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) - return __riscv_has_extension_likely(STANDARD_EXT, ext); - - return __riscv_isa_extension_available(NULL, ext); -} - static __always_inline bool riscv_cpu_has_extension_likely(int cpu, const unsigned long ext) { compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX"); diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 25966995da04..fe5d4eb9adea 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -119,6 +119,10 @@ /* HSTATUS flags */ #ifdef CONFIG_64BIT +#define HSTATUS_HUPMM _AC(0x3000000000000, UL) +#define HSTATUS_HUPMM_PMLEN_0 _AC(0x0000000000000, UL) +#define HSTATUS_HUPMM_PMLEN_7 _AC(0x2000000000000, UL) +#define HSTATUS_HUPMM_PMLEN_16 _AC(0x3000000000000, UL) #define HSTATUS_VSXL _AC(0x300000000, 
UL) #define HSTATUS_VSXL_SHIFT 32 #endif @@ -195,6 +199,10 @@ /* xENVCFG flags */ #define ENVCFG_STCE (_AC(1, ULL) << 63) #define ENVCFG_PBMTE (_AC(1, ULL) << 62) +#define ENVCFG_PMM (_AC(0x3, ULL) << 32) +#define ENVCFG_PMM_PMLEN_0 (_AC(0x0, ULL) << 32) +#define ENVCFG_PMM_PMLEN_7 (_AC(0x2, ULL) << 32) +#define ENVCFG_PMM_PMLEN_16 (_AC(0x3, ULL) << 32) #define ENVCFG_CBZE (_AC(1, UL) << 7) #define ENVCFG_CBCFE (_AC(1, UL) << 6) #define ENVCFG_CBIE_SHIFT 4 @@ -216,6 +224,12 @@ #define SMSTATEEN0_SSTATEEN0_SHIFT 63 #define SMSTATEEN0_SSTATEEN0 (_ULL(1) << SMSTATEEN0_SSTATEEN0_SHIFT) +/* mseccfg bits */ +#define MSECCFG_PMM ENVCFG_PMM +#define MSECCFG_PMM_PMLEN_0 ENVCFG_PMM_PMLEN_0 +#define MSECCFG_PMM_PMLEN_7 ENVCFG_PMM_PMLEN_7 +#define MSECCFG_PMM_PMLEN_16 ENVCFG_PMM_PMLEN_16 + /* symbolic CSR names: */ #define CSR_CYCLE 0xc00 #define CSR_TIME 0xc01 @@ -382,6 +396,8 @@ #define CSR_MIP 0x344 #define CSR_PMPCFG0 0x3a0 #define CSR_PMPADDR0 0x3b0 +#define CSR_MSECCFG 0x747 +#define CSR_MSECCFGH 0x757 #define CSR_MVENDORID 0xf11 #define CSR_MARCHID 0xf12 #define CSR_MIMPID 0xf13 diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h index 2293e535f865..b28ccc6cdeea 100644 --- a/arch/riscv/include/asm/entry-common.h +++ b/arch/riscv/include/asm/entry-common.h @@ -33,6 +33,7 @@ static inline int handle_misaligned_load(struct pt_regs *regs) { return -1; } + static inline int handle_misaligned_store(struct pt_regs *regs) { return -1; diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 46d9de54179e..08d2a5697466 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -93,6 +93,11 @@ #define RISCV_ISA_EXT_ZCMOP 84 #define RISCV_ISA_EXT_ZAWRS 85 #define RISCV_ISA_EXT_SVVPTC 86 +#define RISCV_ISA_EXT_SMMPM 87 +#define RISCV_ISA_EXT_SMNPM 88 +#define RISCV_ISA_EXT_SSNPM 89 +#define RISCV_ISA_EXT_ZABHA 90 +#define RISCV_ISA_EXT_ZICCRSE 91 #define RISCV_ISA_EXT_XLINUXENVCFG 127 @@ -101,8 +106,10 @@ #ifdef CONFIG_RISCV_M_MODE #define RISCV_ISA_EXT_SxAIA RISCV_ISA_EXT_SMAIA +#define RISCV_ISA_EXT_SUPM RISCV_ISA_EXT_SMNPM #else #define RISCV_ISA_EXT_SxAIA RISCV_ISA_EXT_SSAIA +#define RISCV_ISA_EXT_SUPM RISCV_ISA_EXT_SSNPM #endif #endif /* _ASM_RISCV_HWCAP_H */ diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index ffb9484531af..1ce1df6d0ff3 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include <uapi/asm/hwprobe.h> -#define RISCV_HWPROBE_MAX_KEY 9 +#define RISCV_HWPROBE_MAX_KEY 10 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h index 1c768d02bd0c..87a71cc6d146 100644 --- a/arch/riscv/include/asm/jump_label.h +++ b/arch/riscv/include/asm/jump_label.h @@ -16,21 +16,28 @@ #define JUMP_LABEL_NOP_SIZE 4 +#define JUMP_TABLE_ENTRY(key, label) \ + ".pushsection __jump_table, \"aw\" \n\t" \ + ".align " RISCV_LGPTR " \n\t" \ + ".long 1b - ., " label " - . \n\t" \ + "" RISCV_PTR " " key " - . \n\t" \ + ".popsection \n\t" + +/* This macro is also expanded on the Rust side. 
*/ +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + " .align 2 \n\t" \ + " .option push \n\t" \ + " .option norelax \n\t" \ + " .option norvc \n\t" \ + "1: nop \n\t" \ + " .option pop \n\t" \ + JUMP_TABLE_ENTRY(key, label) + static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm goto( - " .align 2 \n\t" - " .option push \n\t" - " .option norelax \n\t" - " .option norvc \n\t" - "1: nop \n\t" - " .option pop \n\t" - " .pushsection __jump_table, \"aw\" \n\t" - " .align " RISCV_LGPTR " \n\t" - " .long 1b - ., %l[label] - . \n\t" - " " RISCV_PTR " %0 - . \n\t" - " .popsection \n\t" + ARCH_STATIC_BRANCH_ASM("%0", "%l[label]") : : "i"(&((char *)key)[branch]) : : label); return false; @@ -38,21 +45,20 @@ label: return true; } +#define ARCH_STATIC_BRANCH_JUMP_ASM(key, label) \ + " .align 2 \n\t" \ + " .option push \n\t" \ + " .option norelax \n\t" \ + " .option norvc \n\t" \ + "1: j " label " \n\t" \ + " .option pop \n\t" \ + JUMP_TABLE_ENTRY(key, label) + static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm goto( - " .align 2 \n\t" - " .option push \n\t" - " .option norelax \n\t" - " .option norvc \n\t" - "1: j %l[label] \n\t" - " .option pop \n\t" - " .pushsection __jump_table, \"aw\" \n\t" - " .align " RISCV_LGPTR " \n\t" - " .long 1b - ., %l[label] - . \n\t" - " " RISCV_PTR " %0 - . \n\t" - " .popsection \n\t" + ARCH_STATIC_BRANCH_JUMP_ASM("%0", "%l[label]") : : "i"(&((char *)key)[branch]) : : label); return false; diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 2e2254fd2a2a..35eab6e0f4ae 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -286,6 +286,16 @@ struct kvm_vcpu_arch { } sta; }; +/* + * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, + * arrived in guest context. For riscv, any event that arrives while a vCPU is + * loaded is considered to be "in guest". + */ +static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu) +{ + return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; +} + static inline void kvm_arch_sync_events(struct kvm *kvm) {} #define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 diff --git a/arch/riscv/include/asm/kvm_nacl.h b/arch/riscv/include/asm/kvm_nacl.h new file mode 100644 index 000000000000..4124d5e06a0f --- /dev/null +++ b/arch/riscv/include/asm/kvm_nacl.h @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024 Ventana Micro Systems Inc. 
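The jump-label change above only factors the inline assembly into ARCH_STATIC_BRANCH_ASM()/ARCH_STATIC_BRANCH_JUMP_ASM() so the Rust side can expand the same template; C consumers are unchanged. For context, a typical static-key user looks roughly like this (key and function names invented for illustration):

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(example_feature_key);

void example_hot_path(void)
{
        if (static_branch_unlikely(&example_feature_key)) {
                /* slow path, reached only after the site is patched */
                return;
        }
        /* default fast path: the branch site is a single NOP */
}

void example_enable_feature(void)
{
        static_branch_enable(&example_feature_key); /* patch NOP -> jump */
}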
+ */ + +#ifndef __KVM_NACL_H +#define __KVM_NACL_H + +#include <linux/jump_label.h> +#include <linux/percpu.h> +#include <asm/byteorder.h> +#include <asm/csr.h> +#include <asm/sbi.h> + +struct kvm_vcpu_arch; + +DECLARE_STATIC_KEY_FALSE(kvm_riscv_nacl_available); +#define kvm_riscv_nacl_available() \ + static_branch_unlikely(&kvm_riscv_nacl_available) + +DECLARE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_csr_available); +#define kvm_riscv_nacl_sync_csr_available() \ + static_branch_unlikely(&kvm_riscv_nacl_sync_csr_available) + +DECLARE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_hfence_available); +#define kvm_riscv_nacl_sync_hfence_available() \ + static_branch_unlikely(&kvm_riscv_nacl_sync_hfence_available) + +DECLARE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_sret_available); +#define kvm_riscv_nacl_sync_sret_available() \ + static_branch_unlikely(&kvm_riscv_nacl_sync_sret_available) + +DECLARE_STATIC_KEY_FALSE(kvm_riscv_nacl_autoswap_csr_available); +#define kvm_riscv_nacl_autoswap_csr_available() \ + static_branch_unlikely(&kvm_riscv_nacl_autoswap_csr_available) + +struct kvm_riscv_nacl { + void *shmem; + phys_addr_t shmem_phys; +}; +DECLARE_PER_CPU(struct kvm_riscv_nacl, kvm_riscv_nacl); + +void __kvm_riscv_nacl_hfence(void *shmem, + unsigned long control, + unsigned long page_num, + unsigned long page_count); + +void __kvm_riscv_nacl_switch_to(struct kvm_vcpu_arch *vcpu_arch, + unsigned long sbi_ext_id, + unsigned long sbi_func_id); + +int kvm_riscv_nacl_enable(void); + +void kvm_riscv_nacl_disable(void); + +void kvm_riscv_nacl_exit(void); + +int kvm_riscv_nacl_init(void); + +#ifdef CONFIG_32BIT +#define lelong_to_cpu(__x) le32_to_cpu(__x) +#define cpu_to_lelong(__x) cpu_to_le32(__x) +#else +#define lelong_to_cpu(__x) le64_to_cpu(__x) +#define cpu_to_lelong(__x) cpu_to_le64(__x) +#endif + +#define nacl_shmem() \ + this_cpu_ptr(&kvm_riscv_nacl)->shmem + +#define nacl_scratch_read_long(__shmem, __offset) \ +({ \ + unsigned long *__p = (__shmem) + \ + SBI_NACL_SHMEM_SCRATCH_OFFSET + \ + (__offset); \ + lelong_to_cpu(*__p); \ +}) + +#define nacl_scratch_write_long(__shmem, __offset, __val) \ +do { \ + unsigned long *__p = (__shmem) + \ + SBI_NACL_SHMEM_SCRATCH_OFFSET + \ + (__offset); \ + *__p = cpu_to_lelong(__val); \ +} while (0) + +#define nacl_scratch_write_longs(__shmem, __offset, __array, __count) \ +do { \ + unsigned int __i; \ + unsigned long *__p = (__shmem) + \ + SBI_NACL_SHMEM_SCRATCH_OFFSET + \ + (__offset); \ + for (__i = 0; __i < (__count); __i++) \ + __p[__i] = cpu_to_lelong((__array)[__i]); \ +} while (0) + +#define nacl_sync_hfence(__e) \ + sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_SYNC_HFENCE, \ + (__e), 0, 0, 0, 0, 0) + +#define nacl_hfence_mkconfig(__type, __order, __vmid, __asid) \ +({ \ + unsigned long __c = SBI_NACL_SHMEM_HFENCE_CONFIG_PEND; \ + __c |= ((__type) & SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_MASK) \ + << SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_SHIFT; \ + __c |= (((__order) - SBI_NACL_SHMEM_HFENCE_ORDER_BASE) & \ + SBI_NACL_SHMEM_HFENCE_CONFIG_ORDER_MASK) \ + << SBI_NACL_SHMEM_HFENCE_CONFIG_ORDER_SHIFT; \ + __c |= ((__vmid) & SBI_NACL_SHMEM_HFENCE_CONFIG_VMID_MASK) \ + << SBI_NACL_SHMEM_HFENCE_CONFIG_VMID_SHIFT; \ + __c |= ((__asid) & SBI_NACL_SHMEM_HFENCE_CONFIG_ASID_MASK); \ + __c; \ +}) + +#define nacl_hfence_mkpnum(__order, __addr) \ + ((__addr) >> (__order)) + +#define nacl_hfence_mkpcount(__order, __size) \ + ((__size) >> (__order)) + +#define nacl_hfence_gvma(__shmem, __gpa, __gpsz, __order) \ +__kvm_riscv_nacl_hfence(__shmem, \ + 
nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_GVMA, \ + __order, 0, 0), \ + nacl_hfence_mkpnum(__order, __gpa), \ + nacl_hfence_mkpcount(__order, __gpsz)) + +#define nacl_hfence_gvma_all(__shmem) \ +__kvm_riscv_nacl_hfence(__shmem, \ + nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_GVMA_ALL, \ + 0, 0, 0), 0, 0) + +#define nacl_hfence_gvma_vmid(__shmem, __vmid, __gpa, __gpsz, __order) \ +__kvm_riscv_nacl_hfence(__shmem, \ + nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_GVMA_VMID, \ + __order, __vmid, 0), \ + nacl_hfence_mkpnum(__order, __gpa), \ + nacl_hfence_mkpcount(__order, __gpsz)) + +#define nacl_hfence_gvma_vmid_all(__shmem, __vmid) \ +__kvm_riscv_nacl_hfence(__shmem, \ + nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_GVMA_VMID_ALL, \ + 0, __vmid, 0), 0, 0) + +#define nacl_hfence_vvma(__shmem, __vmid, __gva, __gvsz, __order) \ +__kvm_riscv_nacl_hfence(__shmem, \ + nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_VVMA, \ + __order, __vmid, 0), \ + nacl_hfence_mkpnum(__order, __gva), \ + nacl_hfence_mkpcount(__order, __gvsz)) + +#define nacl_hfence_vvma_all(__shmem, __vmid) \ +__kvm_riscv_nacl_hfence(__shmem, \ + nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_VVMA_ALL, \ + 0, __vmid, 0), 0, 0) + +#define nacl_hfence_vvma_asid(__shmem, __vmid, __asid, __gva, __gvsz, __order)\ +__kvm_riscv_nacl_hfence(__shmem, \ + nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_VVMA_ASID, \ + __order, __vmid, __asid), \ + nacl_hfence_mkpnum(__order, __gva), \ + nacl_hfence_mkpcount(__order, __gvsz)) + +#define nacl_hfence_vvma_asid_all(__shmem, __vmid, __asid) \ +__kvm_riscv_nacl_hfence(__shmem, \ + nacl_hfence_mkconfig(SBI_NACL_SHMEM_HFENCE_TYPE_VVMA_ASID_ALL, \ + 0, __vmid, __asid), 0, 0) + +#define nacl_csr_read(__shmem, __csr) \ +({ \ + unsigned long *__a = (__shmem) + SBI_NACL_SHMEM_CSR_OFFSET; \ + lelong_to_cpu(__a[SBI_NACL_SHMEM_CSR_INDEX(__csr)]); \ +}) + +#define nacl_csr_write(__shmem, __csr, __val) \ +do { \ + void *__s = (__shmem); \ + unsigned int __i = SBI_NACL_SHMEM_CSR_INDEX(__csr); \ + unsigned long *__a = (__s) + SBI_NACL_SHMEM_CSR_OFFSET; \ + u8 *__b = (__s) + SBI_NACL_SHMEM_DBITMAP_OFFSET; \ + __a[__i] = cpu_to_lelong(__val); \ + __b[__i >> 3] |= 1U << (__i & 0x7); \ +} while (0) + +#define nacl_csr_swap(__shmem, __csr, __val) \ +({ \ + void *__s = (__shmem); \ + unsigned int __i = SBI_NACL_SHMEM_CSR_INDEX(__csr); \ + unsigned long *__a = (__s) + SBI_NACL_SHMEM_CSR_OFFSET; \ + u8 *__b = (__s) + SBI_NACL_SHMEM_DBITMAP_OFFSET; \ + unsigned long __r = lelong_to_cpu(__a[__i]); \ + __a[__i] = cpu_to_lelong(__val); \ + __b[__i >> 3] |= 1U << (__i & 0x7); \ + __r; \ +}) + +#define nacl_sync_csr(__csr) \ + sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_SYNC_CSR, \ + (__csr), 0, 0, 0, 0, 0) + +/* + * Each ncsr_xyz() macro defined below has it's own static-branch so every + * use of ncsr_xyz() macro emits a patchable direct jump. This means multiple + * back-to-back ncsr_xyz() macro usage will emit multiple patchable direct + * jumps which is sub-optimal. + * + * Based on the above, it is recommended to avoid multiple back-to-back + * ncsr_xyz() macro usage. 
+ */ + +#define ncsr_read(__csr) \ +({ \ + unsigned long __r; \ + if (kvm_riscv_nacl_available()) \ + __r = nacl_csr_read(nacl_shmem(), __csr); \ + else \ + __r = csr_read(__csr); \ + __r; \ +}) + +#define ncsr_write(__csr, __val) \ +do { \ + if (kvm_riscv_nacl_sync_csr_available()) \ + nacl_csr_write(nacl_shmem(), __csr, __val); \ + else \ + csr_write(__csr, __val); \ +} while (0) + +#define ncsr_swap(__csr, __val) \ +({ \ + unsigned long __r; \ + if (kvm_riscv_nacl_sync_csr_available()) \ + __r = nacl_csr_swap(nacl_shmem(), __csr, __val); \ + else \ + __r = csr_swap(__csr, __val); \ + __r; \ +}) + +#define nsync_csr(__csr) \ +do { \ + if (kvm_riscv_nacl_sync_csr_available()) \ + nacl_sync_csr(__csr); \ +} while (0) + +#endif diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index c9e03e9da3dc..1cc90465d75b 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -26,8 +26,15 @@ typedef struct { unsigned long exec_fdpic_loadmap; unsigned long interp_fdpic_loadmap; #endif + unsigned long flags; +#ifdef CONFIG_RISCV_ISA_SUPM + u8 pmlen; +#endif } mm_context_t; +/* Lock the pointer masking mode because this mm is multithreaded */ +#define MM_CONTEXT_LOCK_PMLEN 0 + #define cntx2asid(cntx) ((cntx) & SATP_ASID_MASK) #define cntx2version(cntx) ((cntx) & ~SATP_ASID_MASK) diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h index 7030837adc1a..8c4bc49a3a0f 100644 --- a/arch/riscv/include/asm/mmu_context.h +++ b/arch/riscv/include/asm/mmu_context.h @@ -20,6 +20,9 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { +#ifdef CONFIG_RISCV_ISA_SUPM + next->context.pmlen = 0; +#endif switch_mm(prev, next, NULL); } @@ -30,11 +33,21 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_MMU atomic_long_set(&mm->context.id, 0); #endif + if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM)) + clear_bit(MM_CONTEXT_LOCK_PMLEN, &mm->context.flags); return 0; } DECLARE_STATIC_KEY_FALSE(use_asid_allocator); +#ifdef CONFIG_RISCV_ISA_SUPM +#define mm_untag_mask mm_untag_mask +static inline unsigned long mm_untag_mask(struct mm_struct *mm) +{ + return -1UL >> mm->context.pmlen; +} +#endif + #include <asm-generic/mmu_context.h> #endif /* _ASM_RISCV_MMU_CONTEXT_H */ diff --git a/arch/riscv/include/asm/perf_event.h b/arch/riscv/include/asm/perf_event.h index 665bbc9b2f84..bcc928fd3785 100644 --- a/arch/riscv/include/asm/perf_event.h +++ b/arch/riscv/include/asm/perf_event.h @@ -8,6 +8,7 @@ #ifndef _ASM_RISCV_PERF_EVENT_H #define _ASM_RISCV_PERF_EVENT_H +#ifdef CONFIG_PERF_EVENTS #include <linux/perf_event.h> #define perf_arch_bpf_user_pt_regs(regs) (struct user_regs_struct *)regs @@ -17,4 +18,6 @@ (regs)->sp = current_stack_pointer; \ (regs)->status = SR_PP; \ } +#endif + #endif /* _ASM_RISCV_PERF_EVENT_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index e79f15293492..5d7f3e8c2e50 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -963,6 +963,25 @@ void misc_mem_init(void); extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +/* + * Use set_p*_safe(), and elide TLB flushing, when confident that *no* + * TLB flush will be required as a result of the "set". 
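One way to follow the advice above about back-to-back ncsr_xyz() uses (a sketch, not code from this patch) is to test the NACL static key once and then go through the shared-memory accessors directly for a batch of CSR reads:

static void example_read_guest_csrs(unsigned long *cause, unsigned long *tval)
{
        if (kvm_riscv_nacl_available()) {
                void *shmem = nacl_shmem();

                *cause = nacl_csr_read(shmem, CSR_SCAUSE);
                *tval  = nacl_csr_read(shmem, CSR_STVAL);
        } else {
                *cause = csr_read(CSR_SCAUSE);
                *tval  = csr_read(CSR_STVAL);
        }
}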
For example, use + * in scenarios where it is known ahead of time that the routine is + * setting non-present entries, or re-setting an existing entry to the + * same value. Otherwise, use the typical "set" helpers and flush the + * TLB. + */ +#define set_p4d_safe(p4dp, p4d) \ +({ \ + WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ + set_p4d(p4dp, p4d); \ +}) + +#define set_pgd_safe(pgdp, pgd) \ +({ \ + WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ + set_pgd(pgdp, pgd); \ +}) #endif /* !__ASSEMBLY__ */ #endif /* _ASM_RISCV_PGTABLE_H */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index efa1b3519b23..5f56eb9d114a 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -102,6 +102,7 @@ struct thread_struct { unsigned long s[12]; /* s[0]: frame pointer */ struct __riscv_d_ext_state fstate; unsigned long bad_cause; + unsigned long envcfg; u32 riscv_v_flags; u32 vstate_ctrl; struct __riscv_v_ext_state vstate; @@ -177,6 +178,14 @@ extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val); #define RISCV_SET_ICACHE_FLUSH_CTX(arg1, arg2) riscv_set_icache_flush_ctx(arg1, arg2) extern int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long per_thread); +#ifdef CONFIG_RISCV_ISA_SUPM +/* PR_{SET,GET}_TAGGED_ADDR_CTRL prctl */ +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg); +long get_tagged_addr_ctrl(struct task_struct *task); +#define SET_TAGGED_ADDR_CTRL(arg) set_tagged_addr_ctrl(current, arg) +#define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_PROCESSOR_H */ diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 98f631b051db..6c82318065cf 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -34,6 +34,7 @@ enum sbi_ext_id { SBI_EXT_PMU = 0x504D55, SBI_EXT_DBCN = 0x4442434E, SBI_EXT_STA = 0x535441, + SBI_EXT_NACL = 0x4E41434C, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x08000000, @@ -281,6 +282,125 @@ struct sbi_sta_struct { #define SBI_SHMEM_DISABLE -1 +enum sbi_ext_nacl_fid { + SBI_EXT_NACL_PROBE_FEATURE = 0x0, + SBI_EXT_NACL_SET_SHMEM = 0x1, + SBI_EXT_NACL_SYNC_CSR = 0x2, + SBI_EXT_NACL_SYNC_HFENCE = 0x3, + SBI_EXT_NACL_SYNC_SRET = 0x4, +}; + +enum sbi_ext_nacl_feature { + SBI_NACL_FEAT_SYNC_CSR = 0x0, + SBI_NACL_FEAT_SYNC_HFENCE = 0x1, + SBI_NACL_FEAT_SYNC_SRET = 0x2, + SBI_NACL_FEAT_AUTOSWAP_CSR = 0x3, +}; + +#define SBI_NACL_SHMEM_ADDR_SHIFT 12 +#define SBI_NACL_SHMEM_SCRATCH_OFFSET 0x0000 +#define SBI_NACL_SHMEM_SCRATCH_SIZE 0x1000 +#define SBI_NACL_SHMEM_SRET_OFFSET 0x0000 +#define SBI_NACL_SHMEM_SRET_SIZE 0x0200 +#define SBI_NACL_SHMEM_AUTOSWAP_OFFSET (SBI_NACL_SHMEM_SRET_OFFSET + \ + SBI_NACL_SHMEM_SRET_SIZE) +#define SBI_NACL_SHMEM_AUTOSWAP_SIZE 0x0080 +#define SBI_NACL_SHMEM_UNUSED_OFFSET (SBI_NACL_SHMEM_AUTOSWAP_OFFSET + \ + SBI_NACL_SHMEM_AUTOSWAP_SIZE) +#define SBI_NACL_SHMEM_UNUSED_SIZE 0x0580 +#define SBI_NACL_SHMEM_HFENCE_OFFSET (SBI_NACL_SHMEM_UNUSED_OFFSET + \ + SBI_NACL_SHMEM_UNUSED_SIZE) +#define SBI_NACL_SHMEM_HFENCE_SIZE 0x0780 +#define SBI_NACL_SHMEM_DBITMAP_OFFSET (SBI_NACL_SHMEM_HFENCE_OFFSET + \ + SBI_NACL_SHMEM_HFENCE_SIZE) +#define SBI_NACL_SHMEM_DBITMAP_SIZE 0x0080 +#define SBI_NACL_SHMEM_CSR_OFFSET (SBI_NACL_SHMEM_DBITMAP_OFFSET + \ + SBI_NACL_SHMEM_DBITMAP_SIZE) +#define SBI_NACL_SHMEM_CSR_SIZE ((__riscv_xlen / 8) * 1024) +#define SBI_NACL_SHMEM_SIZE 
(SBI_NACL_SHMEM_CSR_OFFSET + \ + SBI_NACL_SHMEM_CSR_SIZE) + +#define SBI_NACL_SHMEM_CSR_INDEX(__csr_num) \ + ((((__csr_num) & 0xc00) >> 2) | ((__csr_num) & 0xff)) + +#define SBI_NACL_SHMEM_HFENCE_ENTRY_SZ ((__riscv_xlen / 8) * 4) +#define SBI_NACL_SHMEM_HFENCE_ENTRY_MAX \ + (SBI_NACL_SHMEM_HFENCE_SIZE / \ + SBI_NACL_SHMEM_HFENCE_ENTRY_SZ) +#define SBI_NACL_SHMEM_HFENCE_ENTRY(__num) \ + (SBI_NACL_SHMEM_HFENCE_OFFSET + \ + (__num) * SBI_NACL_SHMEM_HFENCE_ENTRY_SZ) +#define SBI_NACL_SHMEM_HFENCE_ENTRY_CONFIG(__num) \ + SBI_NACL_SHMEM_HFENCE_ENTRY(__num) +#define SBI_NACL_SHMEM_HFENCE_ENTRY_PNUM(__num)\ + (SBI_NACL_SHMEM_HFENCE_ENTRY(__num) + (__riscv_xlen / 8)) +#define SBI_NACL_SHMEM_HFENCE_ENTRY_PCOUNT(__num)\ + (SBI_NACL_SHMEM_HFENCE_ENTRY(__num) + \ + ((__riscv_xlen / 8) * 3)) + +#define SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_BITS 1 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_SHIFT \ + (__riscv_xlen - SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_BITS) +#define SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_MASK \ + ((1UL << SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_BITS) - 1) +#define SBI_NACL_SHMEM_HFENCE_CONFIG_PEND \ + (SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_MASK << \ + SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_SHIFT) + +#define SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD1_BITS 3 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD1_SHIFT \ + (SBI_NACL_SHMEM_HFENCE_CONFIG_PEND_SHIFT - \ + SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD1_BITS) + +#define SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_BITS 4 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_SHIFT \ + (SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD1_SHIFT - \ + SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_BITS) +#define SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_MASK \ + ((1UL << SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_BITS) - 1) + +#define SBI_NACL_SHMEM_HFENCE_TYPE_GVMA 0x0 +#define SBI_NACL_SHMEM_HFENCE_TYPE_GVMA_ALL 0x1 +#define SBI_NACL_SHMEM_HFENCE_TYPE_GVMA_VMID 0x2 +#define SBI_NACL_SHMEM_HFENCE_TYPE_GVMA_VMID_ALL 0x3 +#define SBI_NACL_SHMEM_HFENCE_TYPE_VVMA 0x4 +#define SBI_NACL_SHMEM_HFENCE_TYPE_VVMA_ALL 0x5 +#define SBI_NACL_SHMEM_HFENCE_TYPE_VVMA_ASID 0x6 +#define SBI_NACL_SHMEM_HFENCE_TYPE_VVMA_ASID_ALL 0x7 + +#define SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD2_BITS 1 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD2_SHIFT \ + (SBI_NACL_SHMEM_HFENCE_CONFIG_TYPE_SHIFT - \ + SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD2_BITS) + +#define SBI_NACL_SHMEM_HFENCE_CONFIG_ORDER_BITS 7 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_ORDER_SHIFT \ + (SBI_NACL_SHMEM_HFENCE_CONFIG_RSVD2_SHIFT - \ + SBI_NACL_SHMEM_HFENCE_CONFIG_ORDER_BITS) +#define SBI_NACL_SHMEM_HFENCE_CONFIG_ORDER_MASK \ + ((1UL << SBI_NACL_SHMEM_HFENCE_CONFIG_ORDER_BITS) - 1) +#define SBI_NACL_SHMEM_HFENCE_ORDER_BASE 12 + +#if __riscv_xlen == 32 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_ASID_BITS 9 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_VMID_BITS 7 +#else +#define SBI_NACL_SHMEM_HFENCE_CONFIG_ASID_BITS 16 +#define SBI_NACL_SHMEM_HFENCE_CONFIG_VMID_BITS 14 +#endif +#define SBI_NACL_SHMEM_HFENCE_CONFIG_VMID_SHIFT \ + SBI_NACL_SHMEM_HFENCE_CONFIG_ASID_BITS +#define SBI_NACL_SHMEM_HFENCE_CONFIG_ASID_MASK \ + ((1UL << SBI_NACL_SHMEM_HFENCE_CONFIG_ASID_BITS) - 1) +#define SBI_NACL_SHMEM_HFENCE_CONFIG_VMID_MASK \ + ((1UL << SBI_NACL_SHMEM_HFENCE_CONFIG_VMID_BITS) - 1) + +#define SBI_NACL_SHMEM_AUTOSWAP_FLAG_HSTATUS BIT(0) +#define SBI_NACL_SHMEM_AUTOSWAP_HSTATUS ((__riscv_xlen / 8) * 1) + +#define SBI_NACL_SHMEM_SRET_X(__i) ((__riscv_xlen / 8) * (__i)) +#define SBI_NACL_SHMEM_SRET_X_LAST 31 + /* SBI spec version fields */ #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 diff --git 
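As a cross-check of the shared-memory layout encoded by the offsets above: the SRET area starts at 0x000, auto-swap at 0x200, the unused hole at 0x280, the HFENCE queue at 0x800 (60 entries of 32 bytes on rv64), the dirty bitmap at 0xf80 and the CSR array at 0x1000, giving a total of 0x3000 bytes on rv64 (0x2000 on rv32). The assertions below are illustrative only, not part of the patch:

static_assert(SBI_NACL_SHMEM_HFENCE_OFFSET == 0x0800);
static_assert(SBI_NACL_SHMEM_DBITMAP_OFFSET == 0x0f80);
static_assert(SBI_NACL_SHMEM_CSR_OFFSET == 0x1000);
#if __riscv_xlen == 64
static_assert(SBI_NACL_SHMEM_SIZE == 0x3000);
#endif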
a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index ab92fc84e1fc..ea263d3683ef 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -42,6 +42,7 @@ static inline int set_kernel_memory(char *startp, char *endp, int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h new file mode 100644 index 000000000000..e5121b89acea --- /dev/null +++ b/arch/riscv/include/asm/spinlock.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_RISCV_SPINLOCK_H +#define __ASM_RISCV_SPINLOCK_H + +#ifdef CONFIG_RISCV_COMBO_SPINLOCKS +#define _Q_PENDING_LOOPS (1 << 9) + +#define __no_arch_spinlock_redefine +#include <asm/ticket_spinlock.h> +#include <asm/qspinlock.h> +#include <asm/jump_label.h> + +/* + * TODO: Use an alternative instead of a static key when we are able to parse + * the extensions string earlier in the boot process. + */ +DECLARE_STATIC_KEY_TRUE(qspinlock_key); + +#define SPINLOCK_BASE_DECLARE(op, type, type_lock) \ +static __always_inline type arch_spin_##op(type_lock lock) \ +{ \ + if (static_branch_unlikely(&qspinlock_key)) \ + return queued_spin_##op(lock); \ + return ticket_spin_##op(lock); \ +} + +SPINLOCK_BASE_DECLARE(lock, void, arch_spinlock_t *) +SPINLOCK_BASE_DECLARE(unlock, void, arch_spinlock_t *) +SPINLOCK_BASE_DECLARE(is_locked, int, arch_spinlock_t *) +SPINLOCK_BASE_DECLARE(is_contended, int, arch_spinlock_t *) +SPINLOCK_BASE_DECLARE(trylock, bool, arch_spinlock_t *) +SPINLOCK_BASE_DECLARE(value_unlocked, int, arch_spinlock_t) + +#elif defined(CONFIG_RISCV_QUEUED_SPINLOCKS) + +#include <asm/qspinlock.h> + +#else + +#include <asm/ticket_spinlock.h> + +#endif + +#include <asm/qrwlock.h> + +#endif /* __ASM_RISCV_SPINLOCK_H */ diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index 7594df37cc9f..94e33216b2d9 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -70,6 +70,24 @@ static __always_inline bool has_fpu(void) { return false; } #define __switch_to_fpu(__prev, __next) do { } while (0) #endif +static inline void envcfg_update_bits(struct task_struct *task, + unsigned long mask, unsigned long val) +{ + unsigned long envcfg; + + envcfg = (task->thread.envcfg & ~mask) | val; + task->thread.envcfg = envcfg; + if (task == current) + csr_write(CSR_ENVCFG, envcfg); +} + +static inline void __switch_to_envcfg(struct task_struct *next) +{ + asm volatile (ALTERNATIVE("nop", "csrw " __stringify(CSR_ENVCFG) ", %0", + 0, RISCV_ISA_EXT_XLINUXENVCFG, 1) + :: "r" (next->thread.envcfg) : "memory"); +} + extern struct task_struct *__switch_to(struct task_struct *, struct task_struct *); @@ -103,6 +121,7 @@ do { \ __switch_to_vector(__prev, __next); \ if (switch_to_should_flush_icache(__next)) \ local_flush_icache_all(); \ + __switch_to_envcfg(__next); \ ((last) = __switch_to(__prev, __next)); \ } while (0) diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/text-patching.h index 7228e266b9a1..7228e266b9a1 100644 --- a/arch/riscv/include/asm/patch.h +++ b/arch/riscv/include/asm/text-patching.h diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 72ec1d9bd3f3..fee56b0c8058 100644 --- 
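The new <asm/spinlock.h> above dispatches every lock operation through SPINLOCK_BASE_DECLARE(); written out for a single operation, the generated wrapper is simply:

/* Expansion of SPINLOCK_BASE_DECLARE(lock, void, arch_spinlock_t *),
 * spelled out for readability; the real code is macro-generated. */
static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
        if (static_branch_unlikely(&qspinlock_key))
                return queued_spin_lock(lock);
        return ticket_spin_lock(lock);
}

The static key defaults to true and is disabled in riscv_spinlock_init() (see setup.c further down) when the platform provides neither Zabha+Zacas nor Ziccrse, so the choice costs one patched branch per operation.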
a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -9,8 +9,41 @@ #define _ASM_RISCV_UACCESS_H #include <asm/asm-extable.h> +#include <asm/cpufeature.h> #include <asm/pgtable.h> /* for TASK_SIZE */ +#ifdef CONFIG_RISCV_ISA_SUPM +static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigned long addr) +{ + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) { + u8 pmlen = mm->context.pmlen; + + /* Virtual addresses are sign-extended; physical addresses are zero-extended. */ + if (IS_ENABLED(CONFIG_MMU)) + return (long)(addr << pmlen) >> pmlen; + else + return (addr << pmlen) >> pmlen; + } + + return addr; +} + +#define untagged_addr(addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr_remote(current->mm, __addr); \ +}) + +#define untagged_addr_remote(mm, addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + mmap_assert_locked(mm); \ + (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \ +}) + +#define access_ok(addr, size) likely(__access_ok(untagged_addr(addr), size)) +#else +#define untagged_addr(addr) (addr) +#endif + /* * User space memory access functions */ @@ -130,7 +163,7 @@ do { \ */ #define __get_user(x, ptr) \ ({ \ - const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ + const __typeof__(*(ptr)) __user *__gu_ptr = untagged_addr(ptr); \ long __gu_err = 0; \ \ __chk_user_ptr(__gu_ptr); \ @@ -246,7 +279,7 @@ do { \ */ #define __put_user(x, ptr) \ ({ \ - __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_ptr = untagged_addr(ptr); \ __typeof__(*__gu_ptr) __val = (x); \ long __pu_err = 0; \ \ @@ -293,13 +326,13 @@ unsigned long __must_check __asm_copy_from_user(void *to, static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - return __asm_copy_from_user(to, from, n); + return __asm_copy_from_user(to, untagged_addr(from), n); } static inline unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - return __asm_copy_to_user(to, from, n); + return __asm_copy_to_user(untagged_addr(to), from, n); } extern long strncpy_from_user(char *dest, const char __user *src, long count); @@ -314,7 +347,7 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) { might_fault(); return access_ok(to, n) ? 
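The sign-extension trick in __untagged_addr_remote() above can be modelled in isolation. A standalone 64-bit sketch with two worked values for PMLEN=16 (not kernel code):

#include <stdint.h>

/* pmlen is the number of tag bits to discard (MMU case: addresses are
 * sign-extended, so shift the tag out and arithmetic-shift back). */
static inline uint64_t untag(uint64_t addr, unsigned int pmlen)
{
        return (uint64_t)((int64_t)(addr << pmlen) >> pmlen);
}

/*
 * untag(0x00ff000012345678, 16) == 0x0000000012345678  (user pointer, tag dropped)
 * untag(0x12ffffffc0001000, 16) == 0xffffffffc0001000  (kernel-half pointer stays canonical)
 */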
- __clear_user(to, n) : n; + __clear_user(untagged_addr(to), n) : n; } #define __get_kernel_nofault(dst, src, type, err_label) \ diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h index 3fc7deda9190..5008f76cdc27 100644 --- a/arch/riscv/include/asm/uprobes.h +++ b/arch/riscv/include/asm/uprobes.h @@ -4,7 +4,7 @@ #define _ASM_RISCV_UPROBES_H #include <asm/probes.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/bug.h> #define MAX_UINSN_BYTES 8 diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index be7d309cca8a..c7c023afbacd 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -21,6 +21,7 @@ extern unsigned long riscv_v_vsize; int riscv_v_setup_vsize(void); +bool insn_is_vector(u32 insn_buf); bool riscv_v_first_use_handler(struct pt_regs *regs); void kernel_vector_begin(void); void kernel_vector_end(void); @@ -268,6 +269,7 @@ struct pt_regs; static inline int riscv_v_setup_vsize(void) { return -EOPNOTSUPP; } static __always_inline bool has_vector(void) { return false; } +static __always_inline bool insn_is_vector(u32 insn_buf) { return false; } static inline bool riscv_v_first_use_handler(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 1e153cda57db..3af142b99f77 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -72,6 +72,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_EXT_ZCF (1ULL << 46) #define RISCV_HWPROBE_EXT_ZCMOP (1ULL << 47) #define RISCV_HWPROBE_EXT_ZAWRS (1ULL << 48) +#define RISCV_HWPROBE_EXT_SUPM (1ULL << 49) #define RISCV_HWPROBE_KEY_CPUPERF_0 5 #define RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0) #define RISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) @@ -88,6 +89,11 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW 2 #define RISCV_HWPROBE_MISALIGNED_SCALAR_FAST 3 #define RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED 4 +#define RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF 10 +#define RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN 0 +#define RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW 2 +#define RISCV_HWPROBE_MISALIGNED_VECTOR_FAST 3 +#define RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED 4 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. 
*/ /* Flags */ diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index e97db3296456..4f24201376b1 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -175,6 +175,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZCF, KVM_RISCV_ISA_EXT_ZCMOP, KVM_RISCV_ISA_EXT_ZAWRS, + KVM_RISCV_ISA_EXT_SMNPM, + KVM_RISCV_ISA_EXT_SSNPM, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 69dc8aaab3fb..063d1faf5a53 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -75,7 +75,8 @@ obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o -obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o +obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o +obj-$(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS) += vec-copy-unaligned.o obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_FPU) += kernel_mode_fpu.o diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 0128b161bfda..7eb3cb1215c6 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -18,7 +18,7 @@ #include <asm/sbi.h> #include <asm/csr.h> #include <asm/insn.h> -#include <asm/patch.h> +#include <asm/text-patching.h> struct cpu_manufacturer_info_t { unsigned long vendor_id; diff --git a/arch/riscv/kernel/copy-unaligned.h b/arch/riscv/kernel/copy-unaligned.h index e3d70d35b708..85d4d11450cb 100644 --- a/arch/riscv/kernel/copy-unaligned.h +++ b/arch/riscv/kernel/copy-unaligned.h @@ -10,4 +10,9 @@ void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size); void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size); +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS +void __riscv_copy_vec_words_unaligned(void *dst, const void *src, size_t size); +void __riscv_copy_vec_bytes_unaligned(void *dst, const void *src, size_t size); +#endif + #endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */ diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 3a8eeaa9310c..467c5c735bf5 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -20,7 +20,8 @@ #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/hwcap.h> -#include <asm/patch.h> +#include <asm/text-patching.h> +#include <asm/hwprobe.h> #include <asm/processor.h> #include <asm/sbi.h> #include <asm/vector.h> @@ -28,6 +29,8 @@ #define NUM_ALPHA_EXTS ('z' - 'a' + 1) +static bool any_cpu_has_zicboz; + unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ @@ -98,6 +101,7 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data, pr_err("Zicboz disabled as cboz-block-size present, but is not a power-of-2\n"); return -EINVAL; } + any_cpu_has_zicboz = true; return 0; } @@ -314,6 +318,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { riscv_ext_zicbom_validate), __RISCV_ISA_EXT_SUPERSET_VALIDATE(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts, riscv_ext_zicboz_validate), + __RISCV_ISA_EXT_DATA(ziccrse, RISCV_ISA_EXT_ZICCRSE), __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), __RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND), __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR), @@ -322,6 +327,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), __RISCV_ISA_EXT_DATA(zihpm, RISCV_ISA_EXT_ZIHPM), __RISCV_ISA_EXT_DATA(zimop, RISCV_ISA_EXT_ZIMOP), 
+ __RISCV_ISA_EXT_DATA(zabha, RISCV_ISA_EXT_ZABHA), __RISCV_ISA_EXT_DATA(zacas, RISCV_ISA_EXT_ZACAS), __RISCV_ISA_EXT_DATA(zawrs, RISCV_ISA_EXT_ZAWRS), __RISCV_ISA_EXT_DATA(zfa, RISCV_ISA_EXT_ZFA), @@ -374,9 +380,12 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_BUNDLE(zvksg, riscv_zvksg_bundled_exts), __RISCV_ISA_EXT_DATA(zvkt, RISCV_ISA_EXT_ZVKT), __RISCV_ISA_EXT_DATA(smaia, RISCV_ISA_EXT_SMAIA), + __RISCV_ISA_EXT_DATA(smmpm, RISCV_ISA_EXT_SMMPM), + __RISCV_ISA_EXT_SUPERSET(smnpm, RISCV_ISA_EXT_SMNPM, riscv_xlinuxenvcfg_exts), __RISCV_ISA_EXT_DATA(smstateen, RISCV_ISA_EXT_SMSTATEEN), __RISCV_ISA_EXT_DATA(ssaia, RISCV_ISA_EXT_SSAIA), __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), + __RISCV_ISA_EXT_SUPERSET(ssnpm, RISCV_ISA_EXT_SSNPM, riscv_xlinuxenvcfg_exts), __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), @@ -917,10 +926,12 @@ unsigned long riscv_get_elf_hwcap(void) return hwcap; } -void riscv_user_isa_enable(void) +void __init riscv_user_isa_enable(void) { - if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ)) - csr_set(CSR_ENVCFG, ENVCFG_CBZE); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ)) + current->thread.envcfg |= ENVCFG_CBZE; + else if (any_cpu_has_zicboz) + pr_warn("Zicboz disabled as it is unavailable on some harts\n"); } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/kernel/fpu.S b/arch/riscv/kernel/fpu.S index 327cf527dd7e..f74f6b60e347 100644 --- a/arch/riscv/kernel/fpu.S +++ b/arch/riscv/kernel/fpu.S @@ -170,7 +170,7 @@ SYM_FUNC_END(__fstate_restore) __access_func(f31) -#ifdef CONFIG_RISCV_MISALIGNED +#ifdef CONFIG_RISCV_SCALAR_MISALIGNED /* * Disable compressed instructions set to keep a constant offset between FP @@ -224,4 +224,4 @@ SYM_FUNC_START(get_f64_reg) fp_access_epilogue SYM_FUNC_END(get_f64_reg) -#endif /* CONFIG_RISCV_MISALIGNED */ +#endif /* CONFIG_RISCV_SCALAR_MISALIGNED */ diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 5081ad886841..8cb9b211611d 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -10,7 +10,7 @@ #include <linux/memory.h> #include <linux/stop_machine.h> #include <asm/cacheflush.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c index 11ad789c60c6..6eee6f736f68 100644 --- a/arch/riscv/kernel/jump_label.c +++ b/arch/riscv/kernel/jump_label.c @@ -10,7 +10,7 @@ #include <linux/mutex.h> #include <asm/bug.h> #include <asm/cacheflush.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #define RISCV_INSN_NOP 0x00000013U #define RISCV_INSN_JAL 0x0000006fU diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 34ef522f07a8..db13c9ddf9e3 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -13,7 +13,7 @@ #include <asm/cacheflush.h> #include <asm/fixmap.h> #include <asm/ftrace.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/sections.h> struct patch_insn { diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index c7468af77c66..b465bc9eb870 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -28,11 +28,21 @@ static bool fill_callchain(void *entry, unsigned long pc) void perf_callchain_user(struct 
perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + if (perf_guest_state()) { + /* TODO: We don't support guest os callchain now */ + return; + } + arch_stack_walk_user(fill_callchain, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + if (perf_guest_state()) { + /* TODO: We don't support guest os callchain now */ + return; + } + walk_stackframe(NULL, regs, fill_callchain, entry); } diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 474a65213657..380a0e8cecc0 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -12,7 +12,7 @@ #include <asm/sections.h> #include <asm/cacheflush.h> #include <asm/bug.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include "decode-insn.h" diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index e3142d8a6e28..58b6482c2bf6 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -7,6 +7,7 @@ * Copyright (C) 2017 SiFive */ +#include <linux/bitfield.h> #include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -180,6 +181,10 @@ void flush_thread(void) memset(¤t->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); clear_tsk_thread_flag(current, TIF_RISCV_V_DEFER_RESTORE); #endif +#ifdef CONFIG_RISCV_ISA_SUPM + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + envcfg_update_bits(current, ENVCFG_PMM, ENVCFG_PMM_PMLEN_0); +#endif } void arch_release_task_struct(struct task_struct *tsk) @@ -208,6 +213,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + /* Ensure all threads in this mm have the same pointer masking mode. */ + if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM) && p->mm && (clone_flags & CLONE_VM)) + set_bit(MM_CONTEXT_LOCK_PMLEN, &p->mm->context.flags); + memset(&p->thread.s, 0, sizeof(p->thread.s)); /* p->thread holds context to be restored by __switch_to() */ @@ -242,3 +251,148 @@ void __init arch_task_cache_init(void) { riscv_v_setup_ctx_cache(); } + +#ifdef CONFIG_RISCV_ISA_SUPM +enum { + PMLEN_0 = 0, + PMLEN_7 = 7, + PMLEN_16 = 16, +}; + +static bool have_user_pmlen_7; +static bool have_user_pmlen_16; + +/* + * Control the relaxed ABI allowing tagged user addresses into the kernel. + */ +static unsigned int tagged_addr_disabled; + +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) +{ + unsigned long valid_mask = PR_PMLEN_MASK | PR_TAGGED_ADDR_ENABLE; + struct thread_info *ti = task_thread_info(task); + struct mm_struct *mm = task->mm; + unsigned long pmm; + u8 pmlen; + + if (is_compat_thread(ti)) + return -EINVAL; + + if (arg & ~valid_mask) + return -EINVAL; + + /* + * Prefer the smallest PMLEN that satisfies the user's request, + * in case choosing a larger PMLEN has a performance impact. + */ + pmlen = FIELD_GET(PR_PMLEN_MASK, arg); + if (pmlen == PMLEN_0) { + pmm = ENVCFG_PMM_PMLEN_0; + } else if (pmlen <= PMLEN_7 && have_user_pmlen_7) { + pmlen = PMLEN_7; + pmm = ENVCFG_PMM_PMLEN_7; + } else if (pmlen <= PMLEN_16 && have_user_pmlen_16) { + pmlen = PMLEN_16; + pmm = ENVCFG_PMM_PMLEN_16; + } else { + return -EINVAL; + } + + /* + * Do not allow the enabling of the tagged address ABI if globally + * disabled via sysctl abi.tagged_addr_disabled, if pointer masking + * is disabled for userspace. 
+ */ + if (arg & PR_TAGGED_ADDR_ENABLE && (tagged_addr_disabled || !pmlen)) + return -EINVAL; + + if (!(arg & PR_TAGGED_ADDR_ENABLE)) + pmlen = PMLEN_0; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + if (test_bit(MM_CONTEXT_LOCK_PMLEN, &mm->context.flags) && mm->context.pmlen != pmlen) { + mmap_write_unlock(mm); + return -EBUSY; + } + + envcfg_update_bits(task, ENVCFG_PMM, pmm); + mm->context.pmlen = pmlen; + + mmap_write_unlock(mm); + + return 0; +} + +long get_tagged_addr_ctrl(struct task_struct *task) +{ + struct thread_info *ti = task_thread_info(task); + long ret = 0; + + if (is_compat_thread(ti)) + return -EINVAL; + + /* + * The mm context's pmlen is set only when the tagged address ABI is + * enabled, so the effective PMLEN must be extracted from envcfg.PMM. + */ + switch (task->thread.envcfg & ENVCFG_PMM) { + case ENVCFG_PMM_PMLEN_7: + ret = FIELD_PREP(PR_PMLEN_MASK, PMLEN_7); + break; + case ENVCFG_PMM_PMLEN_16: + ret = FIELD_PREP(PR_PMLEN_MASK, PMLEN_16); + break; + } + + if (task->mm->context.pmlen) + ret |= PR_TAGGED_ADDR_ENABLE; + + return ret; +} + +static bool try_to_set_pmm(unsigned long value) +{ + csr_set(CSR_ENVCFG, value); + return (csr_read_clear(CSR_ENVCFG, ENVCFG_PMM) & ENVCFG_PMM) == value; +} + +/* + * Global sysctl to disable the tagged user addresses support. This control + * only prevents the tagged address ABI enabling via prctl() and does not + * disable it for tasks that already opted in to the relaxed ABI. + */ + +static struct ctl_table tagged_addr_sysctl_table[] = { + { + .procname = "tagged_addr_disabled", + .mode = 0644, + .data = &tagged_addr_disabled, + .maxlen = sizeof(int), + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init tagged_addr_init(void) +{ + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return 0; + + /* + * envcfg.PMM is a WARL field. Detect which values are supported. + * Assume the supported PMLEN values are the same on all harts. 
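From userspace, the control implemented by set_tagged_addr_ctrl()/get_tagged_addr_ctrl() above is reached through prctl(). A hedged sketch, assuming PR_TAGGED_ADDR_ENABLE, PR_PMLEN_SHIFT and PR_PMLEN_MASK come from the matching uapi <linux/prctl.h>:

#include <sys/prctl.h>

/* Request at least 16 tag bits and enable the tagged-address ABI.
 * The kernel picks the smallest supported PMLEN (0, 7 or 16) that
 * satisfies the request, or the prctl() fails. */
static long example_enable_pointer_masking(void)
{
        unsigned long ctrl = PR_TAGGED_ADDR_ENABLE | (16UL << PR_PMLEN_SHIFT);

        if (prctl(PR_SET_TAGGED_ADDR_CTRL, ctrl, 0, 0, 0))
                return -1;

        /* Read back the effective setting. */
        return prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
}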
+ */ + csr_clear(CSR_ENVCFG, ENVCFG_PMM); + have_user_pmlen_7 = try_to_set_pmm(ENVCFG_PMM_PMLEN_7); + have_user_pmlen_16 = try_to_set_pmm(ENVCFG_PMM_PMLEN_16); + + if (!register_sysctl("abi", tagged_addr_sysctl_table)) + return -EINVAL; + + return 0; +} +core_initcall(tagged_addr_init); +#endif /* CONFIG_RISCV_ISA_SUPM */ diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 92731ff8c79a..ea67e9fb7a58 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -28,6 +28,9 @@ enum riscv_regset { #ifdef CONFIG_RISCV_ISA_V REGSET_V, #endif +#ifdef CONFIG_RISCV_ISA_SUPM + REGSET_TAGGED_ADDR_CTRL, +#endif }; static int riscv_gpr_get(struct task_struct *target, @@ -152,6 +155,35 @@ static int riscv_vr_set(struct task_struct *target, } #endif +#ifdef CONFIG_RISCV_ISA_SUPM +static int tagged_addr_ctrl_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long ctrl = get_tagged_addr_ctrl(target); + + if (IS_ERR_VALUE(ctrl)) + return ctrl; + + return membuf_write(&to, &ctrl, sizeof(ctrl)); +} + +static int tagged_addr_ctrl_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + long ctrl; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + return set_tagged_addr_ctrl(target, ctrl); +} +#endif + static const struct user_regset riscv_user_regset[] = { [REGSET_X] = { .core_note_type = NT_PRSTATUS, @@ -182,6 +214,16 @@ static const struct user_regset riscv_user_regset[] = { .set = riscv_vr_set, }, #endif +#ifdef CONFIG_RISCV_ISA_SUPM + [REGSET_TAGGED_ADDR_CTRL] = { + .core_note_type = NT_RISCV_TAGGED_ADDR_CTRL, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = tagged_addr_ctrl_get, + .set = tagged_addr_ctrl_set, + }, +#endif }; static const struct user_regset_view riscv_user_native_view = { diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 26c886db4fb3..016b48fcd6f2 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -244,6 +244,42 @@ static void __init parse_dtb(void) #endif } +#if defined(CONFIG_RISCV_COMBO_SPINLOCKS) +DEFINE_STATIC_KEY_TRUE(qspinlock_key); +EXPORT_SYMBOL(qspinlock_key); +#endif + +static void __init riscv_spinlock_init(void) +{ + char *using_ext = NULL; + + if (IS_ENABLED(CONFIG_RISCV_TICKET_SPINLOCKS)) { + pr_info("Ticket spinlock: enabled\n"); + return; + } + + if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && + IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) && + riscv_isa_extension_available(NULL, ZABHA) && + riscv_isa_extension_available(NULL, ZACAS)) { + using_ext = "using Zabha"; + } else if (riscv_isa_extension_available(NULL, ZICCRSE)) { + using_ext = "using Ziccrse"; + } +#if defined(CONFIG_RISCV_COMBO_SPINLOCKS) + else { + static_branch_disable(&qspinlock_key); + pr_info("Ticket spinlock: enabled\n"); + return; + } +#endif + + if (!using_ext) + pr_err("Queued spinlock without Zabha or Ziccrse"); + else + pr_info("Queued spinlock %s: enabled\n", using_ext); +} + extern void __init init_rt_signal_env(void); void __init setup_arch(char **cmdline_p) @@ -297,6 +333,7 @@ void __init setup_arch(char **cmdline_p) riscv_set_dma_cache_alignment(); riscv_user_isa_enable(); + riscv_spinlock_init(); } bool arch_cpu_is_hotpluggable(int cpu) diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 0f8f1c95ac38..e36d20205bd7 100644 --- a/arch/riscv/kernel/smpboot.c +++ 
b/arch/riscv/kernel/smpboot.c @@ -233,8 +233,6 @@ asmlinkage __visible void smp_callin(void) numa_add_cpu(curr_cpuid); set_cpu_online(curr_cpuid, true); - riscv_user_isa_enable(); - /* * Remote cache and TLB flushes are ignored while the CPU is offline, * so flush them both right now just in case. diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c index c8cec0cc5833..9a8a0dc035b2 100644 --- a/arch/riscv/kernel/suspend.c +++ b/arch/riscv/kernel/suspend.c @@ -14,7 +14,7 @@ void suspend_save_csrs(struct suspend_context *context) { - if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG)) context->envcfg = csr_read(CSR_ENVCFG); context->tvec = csr_read(CSR_TVEC); context->ie = csr_read(CSR_IE); @@ -37,7 +37,7 @@ void suspend_save_csrs(struct suspend_context *context) void suspend_restore_csrs(struct suspend_context *context) { csr_write(CSR_SCRATCH, 0); - if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG)) csr_write(CSR_ENVCFG, context->envcfg); csr_write(CSR_TVEC, context->tvec); csr_write(CSR_IE, context->ie); diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 711a31f27c3d..cb93adfffc48 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -150,6 +150,9 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, EXT_KEY(ZFH); EXT_KEY(ZFHMIN); } + + if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM)) + EXT_KEY(SUPM); #undef EXT_KEY } @@ -201,6 +204,43 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) } #endif +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) +{ + int cpu; + u64 perf = -1ULL; + + /* Return if supported or not even if speed wasn't probed */ + for_each_cpu(cpu, cpus) { + int this_perf = per_cpu(vector_misaligned_access, cpu); + + if (perf == -1ULL) + perf = this_perf; + + if (perf != this_perf) { + perf = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; + break; + } + } + + if (perf == -1ULL) + return RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; + + return perf; +} +#else +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus) +{ + if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS)) + return RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; + + if (IS_ENABLED(CONFIG_RISCV_SLOW_VECTOR_UNALIGNED_ACCESS)) + return RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; + + return RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; +} +#endif + static void hwprobe_one_pair(struct riscv_hwprobe *pair, const struct cpumask *cpus) { @@ -229,6 +269,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, pair->value = hwprobe_misaligned(cpus); break; + case RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF: + pair->value = hwprobe_vec_misaligned(cpus); + break; + case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE: pair->value = 0; if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ)) diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 1b9867136b61..7cc108aed74e 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -16,6 +16,7 @@ #include <asm/entry-common.h> #include <asm/hwprobe.h> #include <asm/cpufeature.h> +#include <asm/vector.h> #define INSN_MATCH_LB 0x3 #define INSN_MASK_LB 0x707f @@ -320,12 +321,37 @@ union reg_data { u64 data_u64; }; -static bool unaligned_ctl __read_mostly; - /* sysctl hooks */ int unaligned_enabled 
__read_mostly = 1; /* Enabled by default */ -int handle_misaligned_load(struct pt_regs *regs) +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED +static int handle_vector_misaligned_load(struct pt_regs *regs) +{ + unsigned long epc = regs->epc; + unsigned long insn; + + if (get_insn(regs, epc, &insn)) + return -1; + + /* Only return 0 when in check_vector_unaligned_access_emulated */ + if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) { + *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; + regs->epc = epc + INSN_LEN(insn); + return 0; + } + + /* If vector instruction we don't emulate it yet */ + regs->epc = epc; + return -1; +} +#else +static int handle_vector_misaligned_load(struct pt_regs *regs) +{ + return -1; +} +#endif + +static int handle_scalar_misaligned_load(struct pt_regs *regs) { union reg_data val; unsigned long epc = regs->epc; @@ -433,7 +459,7 @@ int handle_misaligned_load(struct pt_regs *regs) return 0; } -int handle_misaligned_store(struct pt_regs *regs) +static int handle_scalar_misaligned_store(struct pt_regs *regs) { union reg_data val; unsigned long epc = regs->epc; @@ -524,11 +550,96 @@ int handle_misaligned_store(struct pt_regs *regs) return 0; } -static bool check_unaligned_access_emulated(int cpu) +int handle_misaligned_load(struct pt_regs *regs) +{ + unsigned long epc = regs->epc; + unsigned long insn; + + if (IS_ENABLED(CONFIG_RISCV_VECTOR_MISALIGNED)) { + if (get_insn(regs, epc, &insn)) + return -1; + + if (insn_is_vector(insn)) + return handle_vector_misaligned_load(regs); + } + + if (IS_ENABLED(CONFIG_RISCV_SCALAR_MISALIGNED)) + return handle_scalar_misaligned_load(regs); + + return -1; +} + +int handle_misaligned_store(struct pt_regs *regs) { + if (IS_ENABLED(CONFIG_RISCV_SCALAR_MISALIGNED)) + return handle_scalar_misaligned_store(regs); + + return -1; +} + +#ifdef CONFIG_RISCV_VECTOR_MISALIGNED +void check_vector_unaligned_access_emulated(struct work_struct *work __always_unused) +{ + long *mas_ptr = this_cpu_ptr(&vector_misaligned_access); + unsigned long tmp_var; + + *mas_ptr = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; + + kernel_vector_begin(); + /* + * In pre-13.0.0 versions of GCC, vector registers cannot appear in + * the clobber list. This inline asm clobbers v0, but since we do not + * currently build the kernel with V enabled, the v0 clobber arg is not + * needed (as the compiler will not emit vector code itself). If the kernel + * is changed to build with V enabled, the clobber arg will need to be + * added here. 
+ */ + __asm__ __volatile__ ( + ".balign 4\n\t" + ".option push\n\t" + ".option arch, +zve32x\n\t" + " vsetivli zero, 1, e16, m1, ta, ma\n\t" // Vectors of 16b + " vle16.v v0, (%[ptr])\n\t" // Load bytes + ".option pop\n\t" + : : [ptr] "r" ((u8 *)&tmp_var + 1)); + kernel_vector_end(); +} + +bool check_vector_unaligned_access_emulated_all_cpus(void) +{ + int cpu; + + if (!has_vector()) { + for_each_online_cpu(cpu) + per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; + return false; + } + + schedule_on_each_cpu(check_vector_unaligned_access_emulated); + + for_each_online_cpu(cpu) + if (per_cpu(vector_misaligned_access, cpu) + == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) + return false; + + return true; +} +#else +bool check_vector_unaligned_access_emulated_all_cpus(void) +{ + return false; +} +#endif + +#ifdef CONFIG_RISCV_SCALAR_MISALIGNED + +static bool unaligned_ctl __read_mostly; + +void check_unaligned_access_emulated(struct work_struct *work __always_unused) +{ + int cpu = smp_processor_id(); long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); unsigned long tmp_var, tmp_val; - bool misaligned_emu_detected; *mas_ptr = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; @@ -536,19 +647,16 @@ static bool check_unaligned_access_emulated(int cpu) " "REG_L" %[tmp], 1(%[ptr])\n" : [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory"); - misaligned_emu_detected = (*mas_ptr == RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED); /* * If unaligned_ctl is already set, this means that we detected that all * CPUS uses emulated misaligned access at boot time. If that changed * when hotplugging the new cpu, this is something we don't handle. */ - if (unlikely(unaligned_ctl && !misaligned_emu_detected)) { + if (unlikely(unaligned_ctl && (*mas_ptr != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED))) { pr_crit("CPU misaligned accesses non homogeneous (expected all emulated)\n"); while (true) cpu_relax(); } - - return misaligned_emu_detected; } bool check_unaligned_access_emulated_all_cpus(void) @@ -560,8 +668,11 @@ bool check_unaligned_access_emulated_all_cpus(void) * accesses emulated since tasks requesting such control can run on any * CPU. 
*/ + schedule_on_each_cpu(check_unaligned_access_emulated); + for_each_online_cpu(cpu) - if (!check_unaligned_access_emulated(cpu)) + if (per_cpu(misaligned_access_speed, cpu) + != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED) return false; unaligned_ctl = true; @@ -572,3 +683,9 @@ bool unaligned_ctl_available(void) { return unaligned_ctl; } +#else +bool check_unaligned_access_emulated_all_cpus(void) +{ + return false; +} +#endif diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index 160628a2116d..91f189cf1611 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -6,11 +6,13 @@ #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/jump_label.h> +#include <linux/kthread.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/types.h> #include <asm/cpufeature.h> #include <asm/hwprobe.h> +#include <asm/vector.h> #include "copy-unaligned.h" @@ -19,7 +21,8 @@ #define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) -DEFINE_PER_CPU(long, misaligned_access_speed); +DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED; #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS static cpumask_t fast_misaligned_access; @@ -191,6 +194,7 @@ static int riscv_online_cpu(unsigned int cpu) if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) goto exit; + check_unaligned_access_emulated(NULL); buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); if (!buf) { pr_warn("Allocation failure, not measuring misaligned performance\n"); @@ -259,23 +263,159 @@ out: kfree(bufs); return 0; } +#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ +static int check_unaligned_access_speed_all_cpus(void) +{ + return 0; +} +#endif -static int check_unaligned_access_all_cpus(void) +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS +static void check_vector_unaligned_access(struct work_struct *work __always_unused) { - bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); + int cpu = smp_processor_id(); + u64 start_cycles, end_cycles; + u64 word_cycles; + u64 byte_cycles; + int ratio; + unsigned long start_jiffies, now; + struct page *page; + void *dst; + void *src; + long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW; - if (!all_cpus_emulated) - return check_unaligned_access_speed_all_cpus(); + if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) + return; + + page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); + if (!page) { + pr_warn("Allocation failure, not measuring vector misaligned performance\n"); + return; + } + + /* Make an unaligned destination buffer. */ + dst = (void *)((unsigned long)page_address(page) | 0x1); + /* Unalign src as well, but differently (off by 1 + 2 = 3). */ + src = dst + (MISALIGNED_BUFFER_SIZE / 2); + src += 2; + word_cycles = -1ULL; + + /* Do a warmup. */ + kernel_vector_begin(); + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + /* + * For a fixed amount of time, repeatedly try the function, and take + * the best time in cycles as the measurement. 
+ */ + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. */ + mb(); + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < word_cycles) + word_cycles = end_cycles - start_cycles; + } + + byte_cycles = -1ULL; + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. */ + mb(); + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < byte_cycles) + byte_cycles = end_cycles - start_cycles; + } + + kernel_vector_end(); + + /* Don't divide by zero. */ + if (!word_cycles || !byte_cycles) { + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n", + cpu); + + return; + } + + if (word_cycles < byte_cycles) + speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST; + + ratio = div_u64((byte_cycles * 100), word_cycles); + pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n", + cpu, + ratio / 100, + ratio % 100, + (speed == RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow"); + + per_cpu(vector_misaligned_access, cpu) = speed; +} + +static int riscv_online_cpu_vec(unsigned int cpu) +{ + if (!has_vector()) + return 0; + + if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED) + return 0; + + check_vector_unaligned_access_emulated(NULL); + check_vector_unaligned_access(NULL); return 0; } -#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ -static int check_unaligned_access_all_cpus(void) + +/* Measure unaligned access speed on all CPUs present at boot in parallel. */ +static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) { - check_unaligned_access_emulated_all_cpus(); + schedule_on_each_cpu(check_vector_unaligned_access); + + /* + * Setup hotplug callbacks for any new CPUs that come online or go + * offline. 
+ */ + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", + riscv_online_cpu_vec, NULL); return 0; } +#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */ +static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused) +{ + return 0; +} #endif +static int check_unaligned_access_all_cpus(void) +{ + bool all_cpus_emulated, all_cpus_vec_unsupported; + + all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); + all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus(); + + if (!all_cpus_vec_unsupported && + IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) { + kthread_run(vec_check_unaligned_access_speed_all_cpus, + NULL, "vec_check_unaligned_access_speed_all_cpus"); + } + + if (!all_cpus_emulated) + return check_unaligned_access_speed_all_cpus(); + + return 0; +} + arch_initcall(check_unaligned_access_all_cpus); diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 3f1c4b2d0b06..9a1b555e8733 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -45,7 +45,7 @@ $(obj)/vdso.o: $(obj)/vdso.so # link rule for the .so file, .lds has to be first $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE - $(call if_changed,vdsold) + $(call if_changed,vdsold_and_check) LDFLAGS_vdso.so.dbg = -shared -soname=linux-vdso.so.1 \ --build-id=sha1 --hash-style=both --eh-frame-hdr @@ -65,7 +65,8 @@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE # actual build commands # The DSO images are built using a special linker script # Make sure only to export the intended __vdso_xxx symbol offsets. -quiet_cmd_vdsold = VDSOLD $@ - cmd_vdsold = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \ +quiet_cmd_vdsold_and_check = VDSOLD $@ + cmd_vdsold_and_check = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \ $(OBJCOPY) $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ - rm $@.tmp + rm $@.tmp && \ + $(cmd_vdso_check) diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S new file mode 100644 index 000000000000..d16f19f1b3b6 --- /dev/null +++ b/arch/riscv/kernel/vec-copy-unaligned.S @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2024 Rivos Inc. */ + +#include <linux/args.h> +#include <linux/linkage.h> +#include <asm/asm.h> + + .text + +#define WORD_EEW 32 + +#define WORD_SEW CONCATENATE(e, WORD_EEW) +#define VEC_L CONCATENATE(vle, WORD_EEW).v +#define VEC_S CONCATENATE(vle, WORD_EEW).v + +/* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using word loads and stores. */ +/* Note: The size is truncated to a multiple of WORD_EEW */ +SYM_FUNC_START(__riscv_copy_vec_words_unaligned) + andi a4, a2, ~(WORD_EEW-1) + beqz a4, 2f + add a3, a1, a4 + .option push + .option arch, +zve32x +1: + vsetivli t0, 8, WORD_SEW, m8, ta, ma + VEC_L v0, (a1) + VEC_S v0, (a0) + addi a0, a0, WORD_EEW + addi a1, a1, WORD_EEW + bltu a1, a3, 1b + +2: + .option pop + ret +SYM_FUNC_END(__riscv_copy_vec_words_unaligned) + +/* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using only byte accesses. 
*/ +/* Note: The size is truncated to a multiple of 8 */ +SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned) + andi a4, a2, ~(8-1) + beqz a4, 2f + add a3, a1, a4 + .option push + .option arch, +zve32x +1: + vsetivli t0, 8, e8, m8, ta, ma + vle8.v v0, (a1) + vse8.v v0, (a0) + addi a0, a0, 8 + addi a1, a1, 8 + bltu a1, a3, 1b + +2: + .option pop + ret +SYM_FUNC_END(__riscv_copy_vec_bytes_unaligned) diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 682b3feee451..821818886fab 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -66,7 +66,7 @@ void __init riscv_v_setup_ctx_cache(void) #endif } -static bool insn_is_vector(u32 insn_buf) +bool insn_is_vector(u32 insn_buf) { u32 opcode = insn_buf & __INSN_OPCODE_MASK; u32 width, csr; diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 26d1727f0550..0c3cbb0915ff 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -32,6 +32,7 @@ config KVM select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_MMU_NOTIFIER select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS help Support hosting virtualized guest machines. diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index c2cacfbc06a0..0fb1840c3e0a 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -9,27 +9,30 @@ include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o +# Ordered alphabetically +kvm-y += aia.o +kvm-y += aia_aplic.o +kvm-y += aia_device.o +kvm-y += aia_imsic.o kvm-y += main.o -kvm-y += vm.o -kvm-y += vmid.o -kvm-y += tlb.o kvm-y += mmu.o +kvm-y += nacl.o +kvm-y += tlb.o kvm-y += vcpu.o kvm-y += vcpu_exit.o kvm-y += vcpu_fp.o -kvm-y += vcpu_vector.o kvm-y += vcpu_insn.o kvm-y += vcpu_onereg.o -kvm-y += vcpu_switch.o +kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o kvm-y += vcpu_sbi.o -kvm-$(CONFIG_RISCV_SBI_V01) += vcpu_sbi_v01.o kvm-y += vcpu_sbi_base.o -kvm-y += vcpu_sbi_replace.o kvm-y += vcpu_sbi_hsm.o +kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_sbi_pmu.o +kvm-y += vcpu_sbi_replace.o kvm-y += vcpu_sbi_sta.o +kvm-$(CONFIG_RISCV_SBI_V01) += vcpu_sbi_v01.o +kvm-y += vcpu_switch.o kvm-y += vcpu_timer.o -kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o -kvm-y += aia.o -kvm-y += aia_device.o -kvm-y += aia_aplic.o -kvm-y += aia_imsic.o +kvm-y += vcpu_vector.o +kvm-y += vm.o +kvm-y += vmid.o diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index 2967d305c442..dcced4db7fe8 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -16,6 +16,7 @@ #include <linux/percpu.h> #include <linux/spinlock.h> #include <asm/cpufeature.h> +#include <asm/kvm_nacl.h> struct aia_hgei_control { raw_spinlock_t lock; @@ -51,7 +52,7 @@ static int aia_find_hgei(struct kvm_vcpu *owner) return hgei; } -static void aia_set_hvictl(bool ext_irq_pending) +static inline unsigned long aia_hvictl_value(bool ext_irq_pending) { unsigned long hvictl; @@ -62,7 +63,7 @@ static void aia_set_hvictl(bool ext_irq_pending) hvictl = (IRQ_S_EXT << HVICTL_IID_SHIFT) & HVICTL_IID; hvictl |= ext_irq_pending; - csr_write(CSR_HVICTL, hvictl); + return hvictl; } #ifdef CONFIG_32BIT @@ -88,7 +89,7 @@ void kvm_riscv_vcpu_aia_sync_interrupts(struct kvm_vcpu *vcpu) struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; if (kvm_riscv_aia_available()) - csr->vsieh = csr_read(CSR_VSIEH); + csr->vsieh = ncsr_read(CSR_VSIEH); } #endif @@ -115,7 +116,7 @@ bool kvm_riscv_vcpu_aia_has_interrupts(struct kvm_vcpu *vcpu, u64 mask) hgei = aia_find_hgei(vcpu); if (hgei > 0) - return !!(csr_read(CSR_HGEIP) & BIT(hgei)); + return 
!!(ncsr_read(CSR_HGEIP) & BIT(hgei)); return false; } @@ -128,45 +129,73 @@ void kvm_riscv_vcpu_aia_update_hvip(struct kvm_vcpu *vcpu) return; #ifdef CONFIG_32BIT - csr_write(CSR_HVIPH, vcpu->arch.aia_context.guest_csr.hviph); + ncsr_write(CSR_HVIPH, vcpu->arch.aia_context.guest_csr.hviph); #endif - aia_set_hvictl(!!(csr->hvip & BIT(IRQ_VS_EXT))); + ncsr_write(CSR_HVICTL, aia_hvictl_value(!!(csr->hvip & BIT(IRQ_VS_EXT)))); } void kvm_riscv_vcpu_aia_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + void *nsh; if (!kvm_riscv_aia_available()) return; - csr_write(CSR_VSISELECT, csr->vsiselect); - csr_write(CSR_HVIPRIO1, csr->hviprio1); - csr_write(CSR_HVIPRIO2, csr->hviprio2); + if (kvm_riscv_nacl_sync_csr_available()) { + nsh = nacl_shmem(); + nacl_csr_write(nsh, CSR_VSISELECT, csr->vsiselect); + nacl_csr_write(nsh, CSR_HVIPRIO1, csr->hviprio1); + nacl_csr_write(nsh, CSR_HVIPRIO2, csr->hviprio2); +#ifdef CONFIG_32BIT + nacl_csr_write(nsh, CSR_VSIEH, csr->vsieh); + nacl_csr_write(nsh, CSR_HVIPH, csr->hviph); + nacl_csr_write(nsh, CSR_HVIPRIO1H, csr->hviprio1h); + nacl_csr_write(nsh, CSR_HVIPRIO2H, csr->hviprio2h); +#endif + } else { + csr_write(CSR_VSISELECT, csr->vsiselect); + csr_write(CSR_HVIPRIO1, csr->hviprio1); + csr_write(CSR_HVIPRIO2, csr->hviprio2); #ifdef CONFIG_32BIT - csr_write(CSR_VSIEH, csr->vsieh); - csr_write(CSR_HVIPH, csr->hviph); - csr_write(CSR_HVIPRIO1H, csr->hviprio1h); - csr_write(CSR_HVIPRIO2H, csr->hviprio2h); + csr_write(CSR_VSIEH, csr->vsieh); + csr_write(CSR_HVIPH, csr->hviph); + csr_write(CSR_HVIPRIO1H, csr->hviprio1h); + csr_write(CSR_HVIPRIO2H, csr->hviprio2h); #endif + } } void kvm_riscv_vcpu_aia_put(struct kvm_vcpu *vcpu) { struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + void *nsh; if (!kvm_riscv_aia_available()) return; - csr->vsiselect = csr_read(CSR_VSISELECT); - csr->hviprio1 = csr_read(CSR_HVIPRIO1); - csr->hviprio2 = csr_read(CSR_HVIPRIO2); + if (kvm_riscv_nacl_available()) { + nsh = nacl_shmem(); + csr->vsiselect = nacl_csr_read(nsh, CSR_VSISELECT); + csr->hviprio1 = nacl_csr_read(nsh, CSR_HVIPRIO1); + csr->hviprio2 = nacl_csr_read(nsh, CSR_HVIPRIO2); #ifdef CONFIG_32BIT - csr->vsieh = csr_read(CSR_VSIEH); - csr->hviph = csr_read(CSR_HVIPH); - csr->hviprio1h = csr_read(CSR_HVIPRIO1H); - csr->hviprio2h = csr_read(CSR_HVIPRIO2H); + csr->vsieh = nacl_csr_read(nsh, CSR_VSIEH); + csr->hviph = nacl_csr_read(nsh, CSR_HVIPH); + csr->hviprio1h = nacl_csr_read(nsh, CSR_HVIPRIO1H); + csr->hviprio2h = nacl_csr_read(nsh, CSR_HVIPRIO2H); #endif + } else { + csr->vsiselect = csr_read(CSR_VSISELECT); + csr->hviprio1 = csr_read(CSR_HVIPRIO1); + csr->hviprio2 = csr_read(CSR_HVIPRIO2); +#ifdef CONFIG_32BIT + csr->vsieh = csr_read(CSR_VSIEH); + csr->hviph = csr_read(CSR_HVIPH); + csr->hviprio1h = csr_read(CSR_HVIPRIO1H); + csr->hviprio2h = csr_read(CSR_HVIPRIO2H); +#endif + } } int kvm_riscv_vcpu_aia_get_csr(struct kvm_vcpu *vcpu, @@ -250,20 +279,20 @@ static u8 aia_get_iprio8(struct kvm_vcpu *vcpu, unsigned int irq) switch (bitpos / BITS_PER_LONG) { case 0: - hviprio = csr_read(CSR_HVIPRIO1); + hviprio = ncsr_read(CSR_HVIPRIO1); break; case 1: #ifndef CONFIG_32BIT - hviprio = csr_read(CSR_HVIPRIO2); + hviprio = ncsr_read(CSR_HVIPRIO2); break; #else - hviprio = csr_read(CSR_HVIPRIO1H); + hviprio = ncsr_read(CSR_HVIPRIO1H); break; case 2: - hviprio = csr_read(CSR_HVIPRIO2); + hviprio = ncsr_read(CSR_HVIPRIO2); break; case 3: - hviprio = csr_read(CSR_HVIPRIO2H); + hviprio = ncsr_read(CSR_HVIPRIO2H); 
break; #endif default: @@ -283,20 +312,20 @@ static void aia_set_iprio8(struct kvm_vcpu *vcpu, unsigned int irq, u8 prio) switch (bitpos / BITS_PER_LONG) { case 0: - hviprio = csr_read(CSR_HVIPRIO1); + hviprio = ncsr_read(CSR_HVIPRIO1); break; case 1: #ifndef CONFIG_32BIT - hviprio = csr_read(CSR_HVIPRIO2); + hviprio = ncsr_read(CSR_HVIPRIO2); break; #else - hviprio = csr_read(CSR_HVIPRIO1H); + hviprio = ncsr_read(CSR_HVIPRIO1H); break; case 2: - hviprio = csr_read(CSR_HVIPRIO2); + hviprio = ncsr_read(CSR_HVIPRIO2); break; case 3: - hviprio = csr_read(CSR_HVIPRIO2H); + hviprio = ncsr_read(CSR_HVIPRIO2H); break; #endif default: @@ -308,20 +337,20 @@ static void aia_set_iprio8(struct kvm_vcpu *vcpu, unsigned int irq, u8 prio) switch (bitpos / BITS_PER_LONG) { case 0: - csr_write(CSR_HVIPRIO1, hviprio); + ncsr_write(CSR_HVIPRIO1, hviprio); break; case 1: #ifndef CONFIG_32BIT - csr_write(CSR_HVIPRIO2, hviprio); + ncsr_write(CSR_HVIPRIO2, hviprio); break; #else - csr_write(CSR_HVIPRIO1H, hviprio); + ncsr_write(CSR_HVIPRIO1H, hviprio); break; case 2: - csr_write(CSR_HVIPRIO2, hviprio); + ncsr_write(CSR_HVIPRIO2, hviprio); break; case 3: - csr_write(CSR_HVIPRIO2H, hviprio); + ncsr_write(CSR_HVIPRIO2H, hviprio); break; #endif default: @@ -377,7 +406,7 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num, return KVM_INSN_ILLEGAL_TRAP; /* First try to emulate in kernel space */ - isel = csr_read(CSR_VSISELECT) & ISELECT_MASK; + isel = ncsr_read(CSR_VSISELECT) & ISELECT_MASK; if (isel >= ISELECT_IPRIO0 && isel <= ISELECT_IPRIO15) return aia_rmw_iprio(vcpu, isel, val, new_val, wr_mask); else if (isel >= IMSIC_FIRST && isel <= IMSIC_LAST && @@ -499,6 +528,10 @@ static int aia_hgei_init(void) hgctrl->free_bitmap = 0; } + /* Skip SGEI interrupt setup for zero guest external interrupts */ + if (!kvm_riscv_aia_nr_hgei) + goto skip_sgei_interrupt; + /* Find INTC irq domain */ domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY); @@ -522,11 +555,16 @@ static int aia_hgei_init(void) return rc; } +skip_sgei_interrupt: return 0; } static void aia_hgei_exit(void) { + /* Do nothing for zero guest external interrupts */ + if (!kvm_riscv_aia_nr_hgei) + return; + /* Free per-CPU SGEI interrupt */ free_percpu_irq(hgei_parent_irq, &aia_hgei); } @@ -536,7 +574,7 @@ void kvm_riscv_aia_enable(void) if (!kvm_riscv_aia_available()) return; - aia_set_hvictl(false); + csr_write(CSR_HVICTL, aia_hvictl_value(false)); csr_write(CSR_HVIPRIO1, 0x0); csr_write(CSR_HVIPRIO2, 0x0); #ifdef CONFIG_32BIT @@ -572,7 +610,7 @@ void kvm_riscv_aia_disable(void) csr_clear(CSR_HIE, BIT(IRQ_S_GEXT)); disable_percpu_irq(hgei_parent_irq); - aia_set_hvictl(false); + csr_write(CSR_HVICTL, aia_hvictl_value(false)); raw_spin_lock_irqsave(&hgctrl->lock, flags); diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c index da6ff1bade0d..f59d1c0c8c43 100644 --- a/arch/riscv/kvm/aia_aplic.c +++ b/arch/riscv/kvm/aia_aplic.c @@ -143,7 +143,7 @@ static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending) if (sm == APLIC_SOURCECFG_SM_LEVEL_HIGH || sm == APLIC_SOURCECFG_SM_LEVEL_LOW) { if (!pending) - goto skip_write_pending; + goto noskip_write_pending; if ((irqd->state & APLIC_IRQ_STATE_INPUT) && sm == APLIC_SOURCECFG_SM_LEVEL_LOW) goto skip_write_pending; @@ -152,6 +152,7 @@ static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending) goto skip_write_pending; } +noskip_write_pending: if (pending) irqd->state |= APLIC_IRQ_STATE_PENDING; else diff --git 
a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index f3427f6de608..1fa8be5ee509 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -10,8 +10,8 @@ #include <linux/err.h> #include <linux/module.h> #include <linux/kvm_host.h> -#include <asm/csr.h> #include <asm/cpufeature.h> +#include <asm/kvm_nacl.h> #include <asm/sbi.h> long kvm_arch_dev_ioctl(struct file *filp, @@ -22,6 +22,12 @@ long kvm_arch_dev_ioctl(struct file *filp, int kvm_arch_enable_virtualization_cpu(void) { + int rc; + + rc = kvm_riscv_nacl_enable(); + if (rc) + return rc; + csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); @@ -49,11 +55,21 @@ void kvm_arch_disable_virtualization_cpu(void) csr_write(CSR_HVIP, 0); csr_write(CSR_HEDELEG, 0); csr_write(CSR_HIDELEG, 0); + + kvm_riscv_nacl_disable(); +} + +static void kvm_riscv_teardown(void) +{ + kvm_riscv_aia_exit(); + kvm_riscv_nacl_exit(); + kvm_unregister_perf_callbacks(); } static int __init riscv_kvm_init(void) { int rc; + char slist[64]; const char *str; if (!riscv_isa_extension_available(NULL, h)) { @@ -71,16 +87,53 @@ static int __init riscv_kvm_init(void) return -ENODEV; } + rc = kvm_riscv_nacl_init(); + if (rc && rc != -ENODEV) + return rc; + kvm_riscv_gstage_mode_detect(); kvm_riscv_gstage_vmid_detect(); rc = kvm_riscv_aia_init(); - if (rc && rc != -ENODEV) + if (rc && rc != -ENODEV) { + kvm_riscv_nacl_exit(); return rc; + } kvm_info("hypervisor extension available\n"); + if (kvm_riscv_nacl_available()) { + rc = 0; + slist[0] = '\0'; + if (kvm_riscv_nacl_sync_csr_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "sync_csr"); + rc++; + } + if (kvm_riscv_nacl_sync_hfence_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "sync_hfence"); + rc++; + } + if (kvm_riscv_nacl_sync_sret_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "sync_sret"); + rc++; + } + if (kvm_riscv_nacl_autoswap_csr_available()) { + if (rc) + strcat(slist, ", "); + strcat(slist, "autoswap_csr"); + rc++; + } + kvm_info("using SBI nested acceleration with %s\n", + (rc) ? slist : "no features"); + } + switch (kvm_riscv_gstage_mode()) { case HGATP_MODE_SV32X4: str = "Sv32x4"; @@ -105,9 +158,11 @@ static int __init riscv_kvm_init(void) kvm_info("AIA available with %d guest external interrupts\n", kvm_riscv_aia_nr_hgei); + kvm_register_perf_callbacks(NULL); + rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (rc) { - kvm_riscv_aia_exit(); + kvm_riscv_teardown(); return rc; } @@ -117,7 +172,7 @@ module_init(riscv_kvm_init); static void __exit riscv_kvm_exit(void) { - kvm_riscv_aia_exit(); + kvm_riscv_teardown(); kvm_exit(); } diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index b63650f9b966..1087ea74567b 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -15,7 +15,7 @@ #include <linux/vmalloc.h> #include <linux/kvm_host.h> #include <linux/sched/signal.h> -#include <asm/csr.h> +#include <asm/kvm_nacl.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -601,6 +601,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, bool logging = (memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)) ? 
true : false; unsigned long vma_pagesize, mmu_seq; + struct page *page; /* We need minimum second+third level pages */ ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); @@ -631,7 +632,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or gfn_to_pfn_prot() become stale priort to acquiring + * vma_lookup() or __kvm_faultin_pfn() become stale prior to acquiring * kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -647,7 +648,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, return -EFAULT; } - hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable); + hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page); if (hfn == KVM_PFN_ERR_HWPOISON) { send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, vma_pageshift, current); @@ -669,7 +670,6 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, goto out_unlock; if (writable) { - kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, vma_pagesize, false, true); @@ -682,9 +682,8 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, kvm_err("Failed to map in G-stage\n"); out_unlock: + kvm_release_faultin_page(kvm, page, ret && ret != -EEXIST, writable); spin_unlock(&kvm->mmu_lock); - kvm_set_pfn_accessed(hfn); - kvm_release_pfn_clean(hfn); return ret; } @@ -732,7 +731,7 @@ void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID; hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN; - csr_write(CSR_HGATP, hgatp); + ncsr_write(CSR_HGATP, hgatp); if (!kvm_riscv_gstage_vmid_bits()) kvm_riscv_local_hfence_gvma_all(); diff --git a/arch/riscv/kvm/nacl.c b/arch/riscv/kvm/nacl.c new file mode 100644 index 000000000000..08a95ad9ada2 --- /dev/null +++ b/arch/riscv/kvm/nacl.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024 Ventana Micro Systems Inc. 
+ */ + +#include <linux/kvm_host.h> +#include <linux/vmalloc.h> +#include <asm/kvm_nacl.h> + +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_csr_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_hfence_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_sync_sret_available); +DEFINE_STATIC_KEY_FALSE(kvm_riscv_nacl_autoswap_csr_available); +DEFINE_PER_CPU(struct kvm_riscv_nacl, kvm_riscv_nacl); + +void __kvm_riscv_nacl_hfence(void *shmem, + unsigned long control, + unsigned long page_num, + unsigned long page_count) +{ + int i, ent = -1, try_count = 5; + unsigned long *entp; + +again: + for (i = 0; i < SBI_NACL_SHMEM_HFENCE_ENTRY_MAX; i++) { + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_CONFIG(i); + if (lelong_to_cpu(*entp) & SBI_NACL_SHMEM_HFENCE_CONFIG_PEND) + continue; + + ent = i; + break; + } + + if (ent < 0) { + if (try_count) { + nacl_sync_hfence(-1UL); + goto again; + } else { + pr_warn("KVM: No free entry in NACL shared memory\n"); + return; + } + } + + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_CONFIG(i); + *entp = cpu_to_lelong(control); + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_PNUM(i); + *entp = cpu_to_lelong(page_num); + entp = shmem + SBI_NACL_SHMEM_HFENCE_ENTRY_PCOUNT(i); + *entp = cpu_to_lelong(page_count); +} + +int kvm_riscv_nacl_enable(void) +{ + int rc; + struct sbiret ret; + struct kvm_riscv_nacl *nacl; + + if (!kvm_riscv_nacl_available()) + return 0; + nacl = this_cpu_ptr(&kvm_riscv_nacl); + + ret = sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_SET_SHMEM, + nacl->shmem_phys, 0, 0, 0, 0, 0); + rc = sbi_err_map_linux_errno(ret.error); + if (rc) + return rc; + + return 0; +} + +void kvm_riscv_nacl_disable(void) +{ + if (!kvm_riscv_nacl_available()) + return; + + sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_SET_SHMEM, + SBI_SHMEM_DISABLE, SBI_SHMEM_DISABLE, 0, 0, 0, 0); +} + +void kvm_riscv_nacl_exit(void) +{ + int cpu; + struct kvm_riscv_nacl *nacl; + + if (!kvm_riscv_nacl_available()) + return; + + /* Allocate per-CPU shared memory */ + for_each_possible_cpu(cpu) { + nacl = per_cpu_ptr(&kvm_riscv_nacl, cpu); + if (!nacl->shmem) + continue; + + free_pages((unsigned long)nacl->shmem, + get_order(SBI_NACL_SHMEM_SIZE)); + nacl->shmem = NULL; + nacl->shmem_phys = 0; + } +} + +static long nacl_probe_feature(long feature_id) +{ + struct sbiret ret; + + if (!kvm_riscv_nacl_available()) + return 0; + + ret = sbi_ecall(SBI_EXT_NACL, SBI_EXT_NACL_PROBE_FEATURE, + feature_id, 0, 0, 0, 0, 0); + return ret.value; +} + +int kvm_riscv_nacl_init(void) +{ + int cpu; + struct page *shmem_page; + struct kvm_riscv_nacl *nacl; + + if (sbi_spec_version < sbi_mk_version(1, 0) || + sbi_probe_extension(SBI_EXT_NACL) <= 0) + return -ENODEV; + + /* Enable NACL support */ + static_branch_enable(&kvm_riscv_nacl_available); + + /* Probe NACL features */ + if (nacl_probe_feature(SBI_NACL_FEAT_SYNC_CSR)) + static_branch_enable(&kvm_riscv_nacl_sync_csr_available); + if (nacl_probe_feature(SBI_NACL_FEAT_SYNC_HFENCE)) + static_branch_enable(&kvm_riscv_nacl_sync_hfence_available); + if (nacl_probe_feature(SBI_NACL_FEAT_SYNC_SRET)) + static_branch_enable(&kvm_riscv_nacl_sync_sret_available); + if (nacl_probe_feature(SBI_NACL_FEAT_AUTOSWAP_CSR)) + static_branch_enable(&kvm_riscv_nacl_autoswap_csr_available); + + /* Allocate per-CPU shared memory */ + for_each_possible_cpu(cpu) { + nacl = per_cpu_ptr(&kvm_riscv_nacl, cpu); + + shmem_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(SBI_NACL_SHMEM_SIZE)); + if (!shmem_page) { + kvm_riscv_nacl_exit(); + return 
-ENOMEM; + } + nacl->shmem = page_to_virt(shmem_page); + nacl->shmem_phys = page_to_phys(shmem_page); + } + + return 0; +} diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 23c0e82b5103..2f91ea5f8493 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -14,6 +14,7 @@ #include <asm/csr.h> #include <asm/cpufeature.h> #include <asm/insn-def.h> +#include <asm/kvm_nacl.h> #define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL) @@ -186,18 +187,24 @@ void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu) void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu) { - struct kvm_vmid *vmid; + struct kvm_vmid *v = &vcpu->kvm->arch.vmid; + unsigned long vmid = READ_ONCE(v->vmid); - vmid = &vcpu->kvm->arch.vmid; - kvm_riscv_local_hfence_gvma_vmid_all(READ_ONCE(vmid->vmid)); + if (kvm_riscv_nacl_available()) + nacl_hfence_gvma_vmid_all(nacl_shmem(), vmid); + else + kvm_riscv_local_hfence_gvma_vmid_all(vmid); } void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu) { - struct kvm_vmid *vmid; + struct kvm_vmid *v = &vcpu->kvm->arch.vmid; + unsigned long vmid = READ_ONCE(v->vmid); - vmid = &vcpu->kvm->arch.vmid; - kvm_riscv_local_hfence_vvma_all(READ_ONCE(vmid->vmid)); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_all(nacl_shmem(), vmid); + else + kvm_riscv_local_hfence_vvma_all(vmid); } static bool vcpu_hfence_dequeue(struct kvm_vcpu *vcpu, @@ -251,6 +258,7 @@ static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu, void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) { + unsigned long vmid; struct kvm_riscv_hfence d = { 0 }; struct kvm_vmid *v = &vcpu->kvm->arch.vmid; @@ -259,26 +267,41 @@ void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) case KVM_RISCV_HFENCE_UNKNOWN: break; case KVM_RISCV_HFENCE_GVMA_VMID_GPA: - kvm_riscv_local_hfence_gvma_vmid_gpa( - READ_ONCE(v->vmid), - d.addr, d.size, d.order); + vmid = READ_ONCE(v->vmid); + if (kvm_riscv_nacl_available()) + nacl_hfence_gvma_vmid(nacl_shmem(), vmid, + d.addr, d.size, d.order); + else + kvm_riscv_local_hfence_gvma_vmid_gpa(vmid, d.addr, + d.size, d.order); break; case KVM_RISCV_HFENCE_VVMA_ASID_GVA: kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); - kvm_riscv_local_hfence_vvma_asid_gva( - READ_ONCE(v->vmid), d.asid, - d.addr, d.size, d.order); + vmid = READ_ONCE(v->vmid); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_asid(nacl_shmem(), vmid, d.asid, + d.addr, d.size, d.order); + else + kvm_riscv_local_hfence_vvma_asid_gva(vmid, d.asid, d.addr, + d.size, d.order); break; case KVM_RISCV_HFENCE_VVMA_ASID_ALL: kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); - kvm_riscv_local_hfence_vvma_asid_all( - READ_ONCE(v->vmid), d.asid); + vmid = READ_ONCE(v->vmid); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_asid_all(nacl_shmem(), vmid, d.asid); + else + kvm_riscv_local_hfence_vvma_asid_all(vmid, d.asid); break; case KVM_RISCV_HFENCE_VVMA_GVA: kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD); - kvm_riscv_local_hfence_vvma_gva( - READ_ONCE(v->vmid), - d.addr, d.size, d.order); + vmid = READ_ONCE(v->vmid); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma(nacl_shmem(), vmid, + d.addr, d.size, d.order); + else + kvm_riscv_local_hfence_vvma_gva(vmid, d.addr, + d.size, d.order); break; default: break; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 8d7d381737ee..dc3f76f6e46c 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -17,8 +17,8 @@ #include <linux/sched/signal.h> #include <linux/fs.h> #include 
<linux/kvm_host.h> -#include <asm/csr.h> #include <asm/cacheflush.h> +#include <asm/kvm_nacl.h> #include <asm/kvm_vcpu_vector.h> #define CREATE_TRACE_POINTS @@ -226,6 +226,13 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return (vcpu->arch.guest_context.sstatus & SR_SPP) ? true : false; } +#ifdef CONFIG_GUEST_PERF_EVENTS +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_context.sepc; +} +#endif + vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) { return VM_FAULT_SIGBUS; @@ -361,10 +368,10 @@ void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu) struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; /* Read current HVIP and VSIE CSRs */ - csr->vsie = csr_read(CSR_VSIE); + csr->vsie = ncsr_read(CSR_VSIE); /* Sync-up HVIP.VSSIP bit changes does by Guest */ - hvip = csr_read(CSR_HVIP); + hvip = ncsr_read(CSR_HVIP); if ((csr->hvip ^ hvip) & (1UL << IRQ_VS_SOFT)) { if (hvip & (1UL << IRQ_VS_SOFT)) { if (!test_and_set_bit(IRQ_VS_SOFT, @@ -561,26 +568,49 @@ static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + void *nsh; struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; - csr_write(CSR_VSSTATUS, csr->vsstatus); - csr_write(CSR_VSIE, csr->vsie); - csr_write(CSR_VSTVEC, csr->vstvec); - csr_write(CSR_VSSCRATCH, csr->vsscratch); - csr_write(CSR_VSEPC, csr->vsepc); - csr_write(CSR_VSCAUSE, csr->vscause); - csr_write(CSR_VSTVAL, csr->vstval); - csr_write(CSR_HEDELEG, cfg->hedeleg); - csr_write(CSR_HVIP, csr->hvip); - csr_write(CSR_VSATP, csr->vsatp); - csr_write(CSR_HENVCFG, cfg->henvcfg); - if (IS_ENABLED(CONFIG_32BIT)) - csr_write(CSR_HENVCFGH, cfg->henvcfg >> 32); - if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { - csr_write(CSR_HSTATEEN0, cfg->hstateen0); + if (kvm_riscv_nacl_sync_csr_available()) { + nsh = nacl_shmem(); + nacl_csr_write(nsh, CSR_VSSTATUS, csr->vsstatus); + nacl_csr_write(nsh, CSR_VSIE, csr->vsie); + nacl_csr_write(nsh, CSR_VSTVEC, csr->vstvec); + nacl_csr_write(nsh, CSR_VSSCRATCH, csr->vsscratch); + nacl_csr_write(nsh, CSR_VSEPC, csr->vsepc); + nacl_csr_write(nsh, CSR_VSCAUSE, csr->vscause); + nacl_csr_write(nsh, CSR_VSTVAL, csr->vstval); + nacl_csr_write(nsh, CSR_HEDELEG, cfg->hedeleg); + nacl_csr_write(nsh, CSR_HVIP, csr->hvip); + nacl_csr_write(nsh, CSR_VSATP, csr->vsatp); + nacl_csr_write(nsh, CSR_HENVCFG, cfg->henvcfg); if (IS_ENABLED(CONFIG_32BIT)) - csr_write(CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + nacl_csr_write(nsh, CSR_HENVCFGH, cfg->henvcfg >> 32); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + nacl_csr_write(nsh, CSR_HSTATEEN0, cfg->hstateen0); + if (IS_ENABLED(CONFIG_32BIT)) + nacl_csr_write(nsh, CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + } + } else { + csr_write(CSR_VSSTATUS, csr->vsstatus); + csr_write(CSR_VSIE, csr->vsie); + csr_write(CSR_VSTVEC, csr->vstvec); + csr_write(CSR_VSSCRATCH, csr->vsscratch); + csr_write(CSR_VSEPC, csr->vsepc); + csr_write(CSR_VSCAUSE, csr->vscause); + csr_write(CSR_VSTVAL, csr->vstval); + csr_write(CSR_HEDELEG, cfg->hedeleg); + csr_write(CSR_HVIP, csr->hvip); + csr_write(CSR_VSATP, csr->vsatp); + csr_write(CSR_HENVCFG, cfg->henvcfg); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HENVCFGH, cfg->henvcfg >> 32); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + csr_write(CSR_HSTATEEN0, cfg->hstateen0); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + } } 
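The hunk above is representative of the NACL conversion applied throughout these KVM changes: when the SBI nested-acceleration "sync CSR" feature is available, guest CSR values are staged in a per-CPU shared-memory area and synchronized by a single SBI call at guest entry; otherwise the code falls back to direct csr_write()/csr_read(). The stand-alone sketch below models only that dispatch decision; every name in it (nacl_shmem_model, ncsr_write_model, the slot numbers) is an illustrative stand-in, not the kernel's API.

/*
 * Minimal userspace model of the NACL "sync CSR" dispatch: stage the
 * value in shared memory when nested acceleration is available, write
 * the CSR directly otherwise.  Illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NACL_SHMEM_CSR_SLOTS 64

struct nacl_shmem_model {
        uint64_t csr[NACL_SHMEM_CSR_SLOTS];   /* staged CSR values */
        uint64_t dirty;                       /* bitmap of slots to sync */
};

static bool nacl_sync_csr_available;          /* probed once at init */
static struct nacl_shmem_model shmem;         /* per-CPU in the kernel */

/* In the kernel this is a csrw instruction; here we just report it. */
static void csr_write_direct(unsigned int slot, uint64_t val)
{
        printf("csrw slot %u <- 0x%llx\n", slot, (unsigned long long)val);
}

/* Stage the value; the SBI call made at guest entry synchronizes it. */
static void nacl_csr_write_model(unsigned int slot, uint64_t val)
{
        shmem.csr[slot] = val;
        shmem.dirty |= 1ULL << slot;
}

static void ncsr_write_model(unsigned int slot, uint64_t val)
{
        if (nacl_sync_csr_available)
                nacl_csr_write_model(slot, val);
        else
                csr_write_direct(slot, val);
}

int main(void)
{
        nacl_sync_csr_available = true;       /* pretend probing succeeded */
        ncsr_write_model(3, 0x80000000);      /* staged for the next SBI sync */
        nacl_sync_csr_available = false;
        ncsr_write_model(3, 0x80000000);      /* falls back to a direct write */
        return 0;
}

Hiding both paths behind one ncsr_*() helper is what keeps the rest of the KVM code identical whether or not the firmware underneath provides nested acceleration.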
kvm_riscv_gstage_update_hgatp(vcpu); @@ -603,6 +633,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + void *nsh; struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; vcpu->cpu = -1; @@ -618,15 +649,28 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.isa); kvm_riscv_vcpu_host_vector_restore(&vcpu->arch.host_context); - csr->vsstatus = csr_read(CSR_VSSTATUS); - csr->vsie = csr_read(CSR_VSIE); - csr->vstvec = csr_read(CSR_VSTVEC); - csr->vsscratch = csr_read(CSR_VSSCRATCH); - csr->vsepc = csr_read(CSR_VSEPC); - csr->vscause = csr_read(CSR_VSCAUSE); - csr->vstval = csr_read(CSR_VSTVAL); - csr->hvip = csr_read(CSR_HVIP); - csr->vsatp = csr_read(CSR_VSATP); + if (kvm_riscv_nacl_available()) { + nsh = nacl_shmem(); + csr->vsstatus = nacl_csr_read(nsh, CSR_VSSTATUS); + csr->vsie = nacl_csr_read(nsh, CSR_VSIE); + csr->vstvec = nacl_csr_read(nsh, CSR_VSTVEC); + csr->vsscratch = nacl_csr_read(nsh, CSR_VSSCRATCH); + csr->vsepc = nacl_csr_read(nsh, CSR_VSEPC); + csr->vscause = nacl_csr_read(nsh, CSR_VSCAUSE); + csr->vstval = nacl_csr_read(nsh, CSR_VSTVAL); + csr->hvip = nacl_csr_read(nsh, CSR_HVIP); + csr->vsatp = nacl_csr_read(nsh, CSR_VSATP); + } else { + csr->vsstatus = csr_read(CSR_VSSTATUS); + csr->vsie = csr_read(CSR_VSIE); + csr->vstvec = csr_read(CSR_VSTVEC); + csr->vsscratch = csr_read(CSR_VSSCRATCH); + csr->vsepc = csr_read(CSR_VSEPC); + csr->vscause = csr_read(CSR_VSCAUSE); + csr->vstval = csr_read(CSR_VSTVAL); + csr->hvip = csr_read(CSR_HVIP); + csr->vsatp = csr_read(CSR_VSATP); + } } static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) @@ -681,7 +725,7 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) { struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - csr_write(CSR_HVIP, csr->hvip); + ncsr_write(CSR_HVIP, csr->hvip); kvm_riscv_vcpu_aia_update_hvip(vcpu); } @@ -691,6 +735,7 @@ static __always_inline void kvm_riscv_vcpu_swap_in_guest_state(struct kvm_vcpu * struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + vcpu->arch.host_scounteren = csr_swap(CSR_SCOUNTEREN, csr->scounteren); vcpu->arch.host_senvcfg = csr_swap(CSR_SENVCFG, csr->senvcfg); if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) @@ -704,6 +749,7 @@ static __always_inline void kvm_riscv_vcpu_swap_in_host_state(struct kvm_vcpu *v struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + csr->scounteren = csr_swap(CSR_SCOUNTEREN, vcpu->arch.host_scounteren); csr->senvcfg = csr_swap(CSR_SENVCFG, vcpu->arch.host_senvcfg); if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) @@ -718,11 +764,81 @@ static __always_inline void kvm_riscv_vcpu_swap_in_host_state(struct kvm_vcpu *v * This must be noinstr as instrumentation may make use of RCU, and this is not * safe during the EQS. 
*/ -static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) +static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct kvm_cpu_trap *trap) { + void *nsh; + struct kvm_cpu_context *gcntx = &vcpu->arch.guest_context; + struct kvm_cpu_context *hcntx = &vcpu->arch.host_context; + + /* + * We save trap CSRs (such as SEPC, SCAUSE, STVAL, HTVAL, and + * HTINST) here because we do local_irq_enable() after this + * function in kvm_arch_vcpu_ioctl_run() which can result in + * an interrupt immediately after local_irq_enable() and can + * potentially change trap CSRs. + */ + kvm_riscv_vcpu_swap_in_guest_state(vcpu); guest_state_enter_irqoff(); - __kvm_riscv_switch_to(&vcpu->arch); + + if (kvm_riscv_nacl_sync_sret_available()) { + nsh = nacl_shmem(); + + if (kvm_riscv_nacl_autoswap_csr_available()) { + hcntx->hstatus = + nacl_csr_read(nsh, CSR_HSTATUS); + nacl_scratch_write_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET + + SBI_NACL_SHMEM_AUTOSWAP_HSTATUS, + gcntx->hstatus); + nacl_scratch_write_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET, + SBI_NACL_SHMEM_AUTOSWAP_FLAG_HSTATUS); + } else if (kvm_riscv_nacl_sync_csr_available()) { + hcntx->hstatus = nacl_csr_swap(nsh, + CSR_HSTATUS, gcntx->hstatus); + } else { + hcntx->hstatus = csr_swap(CSR_HSTATUS, gcntx->hstatus); + } + + nacl_scratch_write_longs(nsh, + SBI_NACL_SHMEM_SRET_OFFSET + + SBI_NACL_SHMEM_SRET_X(1), + &gcntx->ra, + SBI_NACL_SHMEM_SRET_X_LAST); + + __kvm_riscv_nacl_switch_to(&vcpu->arch, SBI_EXT_NACL, + SBI_EXT_NACL_SYNC_SRET); + + if (kvm_riscv_nacl_autoswap_csr_available()) { + nacl_scratch_write_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET, + 0); + gcntx->hstatus = nacl_scratch_read_long(nsh, + SBI_NACL_SHMEM_AUTOSWAP_OFFSET + + SBI_NACL_SHMEM_AUTOSWAP_HSTATUS); + } else { + gcntx->hstatus = csr_swap(CSR_HSTATUS, hcntx->hstatus); + } + + trap->htval = nacl_csr_read(nsh, CSR_HTVAL); + trap->htinst = nacl_csr_read(nsh, CSR_HTINST); + } else { + hcntx->hstatus = csr_swap(CSR_HSTATUS, gcntx->hstatus); + + __kvm_riscv_switch_to(&vcpu->arch); + + gcntx->hstatus = csr_swap(CSR_HSTATUS, hcntx->hstatus); + + trap->htval = csr_read(CSR_HTVAL); + trap->htinst = csr_read(CSR_HTINST); + } + + trap->sepc = gcntx->sepc; + trap->scause = csr_read(CSR_SCAUSE); + trap->stval = csr_read(CSR_STVAL); + vcpu->arch.last_exit_cpu = vcpu->cpu; guest_state_exit_irqoff(); kvm_riscv_vcpu_swap_in_host_state(vcpu); @@ -839,22 +955,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) guest_timing_enter_irqoff(); - kvm_riscv_vcpu_enter_exit(vcpu); + kvm_riscv_vcpu_enter_exit(vcpu, &trap); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; - /* - * Save SCAUSE, STVAL, HTVAL, and HTINST because we might - * get an interrupt between __kvm_riscv_switch_to() and - * local_irq_enable() which can potentially change CSRs. 
- */ - trap.sepc = vcpu->arch.guest_context.sepc; - trap.scause = csr_read(CSR_SCAUSE); - trap.stval = csr_read(CSR_STVAL); - trap.htval = csr_read(CSR_HTVAL); - trap.htinst = csr_read(CSR_HTINST); - /* Syncup interrupts state with HW */ kvm_riscv_vcpu_sync_interrupts(vcpu); diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index b319c4c13c54..5b68490ad9b7 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -34,9 +34,11 @@ static const unsigned long kvm_isa_ext_arr[] = { [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m, [KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v, /* Multi letter extensions (alphabetically sorted) */ + [KVM_RISCV_ISA_EXT_SMNPM] = RISCV_ISA_EXT_SSNPM, KVM_ISA_EXT_ARR(SMSTATEEN), KVM_ISA_EXT_ARR(SSAIA), KVM_ISA_EXT_ARR(SSCOFPMF), + KVM_ISA_EXT_ARR(SSNPM), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), KVM_ISA_EXT_ARR(SVNAPOT), @@ -127,8 +129,10 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) case KVM_RISCV_ISA_EXT_C: case KVM_RISCV_ISA_EXT_I: case KVM_RISCV_ISA_EXT_M: + case KVM_RISCV_ISA_EXT_SMNPM: /* There is not architectural config bit to disable sscofpmf completely */ case KVM_RISCV_ISA_EXT_SSCOFPMF: + case KVM_RISCV_ISA_EXT_SSNPM: case KVM_RISCV_ISA_EXT_SSTC: case KVM_RISCV_ISA_EXT_SVINVAL: case KVM_RISCV_ISA_EXT_SVNAPOT: diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 7de128be8db9..6e704ed86a83 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -486,19 +486,22 @@ void kvm_riscv_vcpu_sbi_init(struct kvm_vcpu *vcpu) struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; const struct kvm_riscv_sbi_extension_entry *entry; const struct kvm_vcpu_sbi_extension *ext; - int i; + int idx, i; for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { entry = &sbi_ext[i]; ext = entry->ext_ptr; + idx = entry->ext_idx; + + if (idx < 0 || idx >= ARRAY_SIZE(scontext->ext_status)) + continue; if (ext->probe && !ext->probe(vcpu)) { - scontext->ext_status[entry->ext_idx] = - KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; + scontext->ext_status[idx] = KVM_RISCV_SBI_EXT_STATUS_UNAVAILABLE; continue; } - scontext->ext_status[entry->ext_idx] = ext->default_disabled ? + scontext->ext_status[idx] = ext->default_disabled ? 
KVM_RISCV_SBI_EXT_STATUS_DISABLED : KVM_RISCV_SBI_EXT_STATUS_ENABLED; } diff --git a/arch/riscv/kvm/vcpu_switch.S b/arch/riscv/kvm/vcpu_switch.S index 0c26189aa01c..47686bcb21e0 100644 --- a/arch/riscv/kvm/vcpu_switch.S +++ b/arch/riscv/kvm/vcpu_switch.S @@ -11,11 +11,7 @@ #include <asm/asm-offsets.h> #include <asm/csr.h> - .text - .altmacro - .option norelax - -SYM_FUNC_START(__kvm_riscv_switch_to) +.macro SAVE_HOST_GPRS /* Save Host GPRs (except A0 and T0-T6) */ REG_S ra, (KVM_ARCH_HOST_RA)(a0) REG_S sp, (KVM_ARCH_HOST_SP)(a0) @@ -40,39 +36,33 @@ SYM_FUNC_START(__kvm_riscv_switch_to) REG_S s9, (KVM_ARCH_HOST_S9)(a0) REG_S s10, (KVM_ARCH_HOST_S10)(a0) REG_S s11, (KVM_ARCH_HOST_S11)(a0) +.endm +.macro SAVE_HOST_AND_RESTORE_GUEST_CSRS __resume_addr /* Load Guest CSR values */ REG_L t0, (KVM_ARCH_GUEST_SSTATUS)(a0) - REG_L t1, (KVM_ARCH_GUEST_HSTATUS)(a0) - REG_L t2, (KVM_ARCH_GUEST_SCOUNTEREN)(a0) - la t4, .Lkvm_switch_return - REG_L t5, (KVM_ARCH_GUEST_SEPC)(a0) + la t1, \__resume_addr + REG_L t2, (KVM_ARCH_GUEST_SEPC)(a0) /* Save Host and Restore Guest SSTATUS */ csrrw t0, CSR_SSTATUS, t0 - /* Save Host and Restore Guest HSTATUS */ - csrrw t1, CSR_HSTATUS, t1 - - /* Save Host and Restore Guest SCOUNTEREN */ - csrrw t2, CSR_SCOUNTEREN, t2 - /* Save Host STVEC and change it to return path */ - csrrw t4, CSR_STVEC, t4 + csrrw t1, CSR_STVEC, t1 + + /* Restore Guest SEPC */ + csrw CSR_SEPC, t2 /* Save Host SSCRATCH and change it to struct kvm_vcpu_arch pointer */ csrrw t3, CSR_SSCRATCH, a0 - /* Restore Guest SEPC */ - csrw CSR_SEPC, t5 - /* Store Host CSR values */ REG_S t0, (KVM_ARCH_HOST_SSTATUS)(a0) - REG_S t1, (KVM_ARCH_HOST_HSTATUS)(a0) - REG_S t2, (KVM_ARCH_HOST_SCOUNTEREN)(a0) + REG_S t1, (KVM_ARCH_HOST_STVEC)(a0) REG_S t3, (KVM_ARCH_HOST_SSCRATCH)(a0) - REG_S t4, (KVM_ARCH_HOST_STVEC)(a0) +.endm +.macro RESTORE_GUEST_GPRS /* Restore Guest GPRs (except A0) */ REG_L ra, (KVM_ARCH_GUEST_RA)(a0) REG_L sp, (KVM_ARCH_GUEST_SP)(a0) @@ -107,13 +97,9 @@ SYM_FUNC_START(__kvm_riscv_switch_to) /* Restore Guest A0 */ REG_L a0, (KVM_ARCH_GUEST_A0)(a0) +.endm - /* Resume Guest */ - sret - - /* Back to Host */ - .align 2 -.Lkvm_switch_return: +.macro SAVE_GUEST_GPRS /* Swap Guest A0 with SSCRATCH */ csrrw a0, CSR_SSCRATCH, a0 @@ -148,39 +134,33 @@ SYM_FUNC_START(__kvm_riscv_switch_to) REG_S t4, (KVM_ARCH_GUEST_T4)(a0) REG_S t5, (KVM_ARCH_GUEST_T5)(a0) REG_S t6, (KVM_ARCH_GUEST_T6)(a0) +.endm +.macro SAVE_GUEST_AND_RESTORE_HOST_CSRS /* Load Host CSR values */ - REG_L t1, (KVM_ARCH_HOST_STVEC)(a0) - REG_L t2, (KVM_ARCH_HOST_SSCRATCH)(a0) - REG_L t3, (KVM_ARCH_HOST_SCOUNTEREN)(a0) - REG_L t4, (KVM_ARCH_HOST_HSTATUS)(a0) - REG_L t5, (KVM_ARCH_HOST_SSTATUS)(a0) - - /* Save Guest SEPC */ - csrr t0, CSR_SEPC + REG_L t0, (KVM_ARCH_HOST_STVEC)(a0) + REG_L t1, (KVM_ARCH_HOST_SSCRATCH)(a0) + REG_L t2, (KVM_ARCH_HOST_SSTATUS)(a0) /* Save Guest A0 and Restore Host SSCRATCH */ - csrrw t2, CSR_SSCRATCH, t2 + csrrw t1, CSR_SSCRATCH, t1 - /* Restore Host STVEC */ - csrw CSR_STVEC, t1 - - /* Save Guest and Restore Host SCOUNTEREN */ - csrrw t3, CSR_SCOUNTEREN, t3 + /* Save Guest SEPC */ + csrr t3, CSR_SEPC - /* Save Guest and Restore Host HSTATUS */ - csrrw t4, CSR_HSTATUS, t4 + /* Restore Host STVEC */ + csrw CSR_STVEC, t0 /* Save Guest and Restore Host SSTATUS */ - csrrw t5, CSR_SSTATUS, t5 + csrrw t2, CSR_SSTATUS, t2 /* Store Guest CSR values */ - REG_S t0, (KVM_ARCH_GUEST_SEPC)(a0) - REG_S t2, (KVM_ARCH_GUEST_A0)(a0) - REG_S t3, (KVM_ARCH_GUEST_SCOUNTEREN)(a0) - REG_S t4, (KVM_ARCH_GUEST_HSTATUS)(a0) - REG_S t5, 
(KVM_ARCH_GUEST_SSTATUS)(a0) + REG_S t1, (KVM_ARCH_GUEST_A0)(a0) + REG_S t2, (KVM_ARCH_GUEST_SSTATUS)(a0) + REG_S t3, (KVM_ARCH_GUEST_SEPC)(a0) +.endm +.macro RESTORE_HOST_GPRS /* Restore Host GPRs (except A0 and T0-T6) */ REG_L ra, (KVM_ARCH_HOST_RA)(a0) REG_L sp, (KVM_ARCH_HOST_SP)(a0) @@ -205,11 +185,68 @@ SYM_FUNC_START(__kvm_riscv_switch_to) REG_L s9, (KVM_ARCH_HOST_S9)(a0) REG_L s10, (KVM_ARCH_HOST_S10)(a0) REG_L s11, (KVM_ARCH_HOST_S11)(a0) +.endm + + .text + .altmacro + .option norelax + + /* + * Parameters: + * A0 <= Pointer to struct kvm_vcpu_arch + */ +SYM_FUNC_START(__kvm_riscv_switch_to) + SAVE_HOST_GPRS + + SAVE_HOST_AND_RESTORE_GUEST_CSRS .Lkvm_switch_return + + RESTORE_GUEST_GPRS + + /* Resume Guest using SRET */ + sret + + /* Back to Host */ + .align 2 +.Lkvm_switch_return: + SAVE_GUEST_GPRS + + SAVE_GUEST_AND_RESTORE_HOST_CSRS + + RESTORE_HOST_GPRS /* Return to C code */ ret SYM_FUNC_END(__kvm_riscv_switch_to) + /* + * Parameters: + * A0 <= Pointer to struct kvm_vcpu_arch + * A1 <= SBI extension ID + * A2 <= SBI function ID + */ +SYM_FUNC_START(__kvm_riscv_nacl_switch_to) + SAVE_HOST_GPRS + + SAVE_HOST_AND_RESTORE_GUEST_CSRS .Lkvm_nacl_switch_return + + /* Resume Guest using SBI nested acceleration */ + add a6, a2, zero + add a7, a1, zero + ecall + + /* Back to Host */ + .align 2 +.Lkvm_nacl_switch_return: + SAVE_GUEST_GPRS + + SAVE_GUEST_AND_RESTORE_HOST_CSRS + + RESTORE_HOST_GPRS + + /* Return to C code */ + ret +SYM_FUNC_END(__kvm_riscv_nacl_switch_to) + SYM_CODE_START(__kvm_riscv_unpriv_trap) /* * We assume that faulting unpriv load/store instruction is diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index 75486b25ac45..96e7a4e463f7 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -11,8 +11,8 @@ #include <linux/kvm_host.h> #include <linux/uaccess.h> #include <clocksource/timer-riscv.h> -#include <asm/csr.h> #include <asm/delay.h> +#include <asm/kvm_nacl.h> #include <asm/kvm_vcpu_timer.h> static u64 kvm_riscv_current_cycles(struct kvm_guest_timer *gt) @@ -72,12 +72,12 @@ static int kvm_riscv_vcpu_timer_cancel(struct kvm_vcpu_timer *t) static int kvm_riscv_vcpu_update_vstimecmp(struct kvm_vcpu *vcpu, u64 ncycles) { #if defined(CONFIG_32BIT) - csr_write(CSR_VSTIMECMP, ncycles & 0xFFFFFFFF); - csr_write(CSR_VSTIMECMPH, ncycles >> 32); + ncsr_write(CSR_VSTIMECMP, ncycles & 0xFFFFFFFF); + ncsr_write(CSR_VSTIMECMPH, ncycles >> 32); #else - csr_write(CSR_VSTIMECMP, ncycles); + ncsr_write(CSR_VSTIMECMP, ncycles); #endif - return 0; + return 0; } static int kvm_riscv_vcpu_update_hrtimer(struct kvm_vcpu *vcpu, u64 ncycles) @@ -289,10 +289,10 @@ static void kvm_riscv_vcpu_update_timedelta(struct kvm_vcpu *vcpu) struct kvm_guest_timer *gt = &vcpu->kvm->arch.timer; #if defined(CONFIG_32BIT) - csr_write(CSR_HTIMEDELTA, (u32)(gt->time_delta)); - csr_write(CSR_HTIMEDELTAH, (u32)(gt->time_delta >> 32)); + ncsr_write(CSR_HTIMEDELTA, (u32)(gt->time_delta)); + ncsr_write(CSR_HTIMEDELTAH, (u32)(gt->time_delta >> 32)); #else - csr_write(CSR_HTIMEDELTA, gt->time_delta); + ncsr_write(CSR_HTIMEDELTA, gt->time_delta); #endif } @@ -306,10 +306,10 @@ void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) return; #if defined(CONFIG_32BIT) - csr_write(CSR_VSTIMECMP, (u32)t->next_cycles); - csr_write(CSR_VSTIMECMPH, (u32)(t->next_cycles >> 32)); + ncsr_write(CSR_VSTIMECMP, (u32)t->next_cycles); + ncsr_write(CSR_VSTIMECMPH, (u32)(t->next_cycles >> 32)); #else - csr_write(CSR_VSTIMECMP, t->next_cycles); + ncsr_write(CSR_VSTIMECMP, t->next_cycles); 
#endif /* timer should be enabled for the remaining operations */ @@ -327,10 +327,10 @@ void kvm_riscv_vcpu_timer_sync(struct kvm_vcpu *vcpu) return; #if defined(CONFIG_32BIT) - t->next_cycles = csr_read(CSR_VSTIMECMP); - t->next_cycles |= (u64)csr_read(CSR_VSTIMECMPH) << 32; + t->next_cycles = ncsr_read(CSR_VSTIMECMP); + t->next_cycles |= (u64)ncsr_read(CSR_VSTIMECMPH) << 32; #else - t->next_cycles = csr_read(CSR_VSTIMECMP); + t->next_cycles = ncsr_read(CSR_VSTIMECMP); #endif } diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 271d01a5ba4d..d815448758a1 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -386,6 +386,21 @@ int set_direct_map_default_noflush(struct page *page) PAGE_KERNEL, __pgprot(_PAGE_EXEC)); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + pgprot_t set, clear; + + if (valid) { + set = PAGE_KERNEL; + clear = __pgprot(_PAGE_EXEC); + } else { + set = __pgprot(0); + clear = __pgprot(_PAGE_PRESENT); + } + + return __set_memory((unsigned long)page_address(page), nr, set, clear); +} + #ifdef CONFIG_DEBUG_PAGEALLOC static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data) { diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 4cc631fa7039..ca60db75199d 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -10,7 +10,7 @@ #include <linux/filter.h> #include <linux/memory.h> #include <linux/stop_machine.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/cfi.h> #include <asm/percpu.h> #include "bpf_jit.h" diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6de753c667f4..f8cd2f70a7fb 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -9,7 +9,7 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/memory.h> -#include <asm/patch.h> +#include <asm/text-patching.h> #include <asm/cfi.h> #include "bpf_jit.h" diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cf1b5d6fb1a6..6f815d4ba0ca 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -12,39 +12,26 @@ #include <linux/pgtable.h> #include <asm/page.h> -#define hugetlb_free_pgd_range free_pgd_range #define hugepages_supported() (MACHINE_HAS_EDAT1) +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, +#define __HAVE_ARCH_HUGE_PTEP_GET +extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. 
- */ -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - static inline void arch_clear_hugetlb_flags(struct folio *folio) { clear_bit(PG_arch_1, &folio->flags); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags +#define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { @@ -54,12 +41,14 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) @@ -72,6 +61,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -79,69 +69,36 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } -static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) -{ - return mk_pte(page, pgprot); -} - +#define __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } +#define __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte); } -static inline int huge_pte_write(pte_t pte) -{ - return pte_write(pte); -} - -static inline int huge_pte_dirty(pte_t pte) -{ - return pte_dirty(pte); -} - -static inline pte_t huge_pte_mkwrite(pte_t pte) -{ - return pte_mkwrite_novma(pte); -} - -static inline pte_t huge_pte_mkdirty(pte_t pte) -{ - return pte_mkdirty(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) -{ - return pte_modify(pte, newprot); -} - +#define __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return 0; } -static inline bool gigantic_page_runtime_supported(void) -{ - return true; -} +#include <asm-generic/hugetlb.h> #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 51201b4ac93a..1cd8eaebd3c0 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -356,6 +356,7 @@ struct kvm_s390_sie_block { #define ECD_MEF 0x08000000 #define ECD_ETOKENF 0x02000000 #define ECD_ECC 0x00200000 +#define ECD_HMAC 0x00004000 __u32 ecd; /* 0x01c8 */ __u8 reserved1cc[18]; /* 0x01cc */ __u64 pp; /* 0x01de */ diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index b13a46e2e931..4f43cdd9835b 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -69,7 +69,7 @@ static inline void copy_page(void *to, void *from) #define copy_user_page(to, 
from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) /* * These are used to make use of C type-checking.. diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 5013a690837e..474e1f8d1d3c 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -96,7 +96,6 @@ struct zpci_bar_struct { u8 size; /* order 2 exponent */ }; -struct s390_domain; struct kvm_zdev; #define ZPCI_FUNCTIONS_PER_BUS 256 @@ -186,9 +185,10 @@ struct zpci_dev { struct dentry *debugfs_dev; /* IOMMU and passthrough */ - struct s390_domain *s390_domain; /* s390 IOMMU domain data */ + struct iommu_domain *s390_domain; /* attached IOMMU domain */ struct kvm_zdev *kzdev; struct mutex kzdev_lock; + spinlock_t dom_lock; /* protect s390_domain change */ }; static inline bool zdev_enabled(struct zpci_dev *zdev) diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h index cb4cc0f59012..94092f4ae764 100644 --- a/arch/s390/include/asm/set_memory.h +++ b/arch/s390/include/asm/set_memory.h @@ -62,6 +62,7 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K) int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); #endif diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index b69695e39957..3653ff57d6d9 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." 
#endif typedef struct { diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h index 4d74d7e33340..827cb208de86 100644 --- a/arch/s390/include/asm/stp.h +++ b/arch/s390/include/asm/stp.h @@ -94,5 +94,6 @@ struct stp_stzi { int stp_sync_check(void); int stp_island_check(void); void stp_queue_work(void); +bool stp_enabled(void); #endif /* __S390_STP_H */ diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 8fe56456feab..a9460bd6555b 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -96,6 +96,7 @@ extern unsigned char ptff_function_mask[16]; #define PTFF_QAF 0x00 /* query available functions */ #define PTFF_QTO 0x01 /* query tod offset */ #define PTFF_QSI 0x02 /* query steering information */ +#define PTFF_QPT 0x03 /* query physical clock */ #define PTFF_QUI 0x04 /* query UTC information */ #define PTFF_ATO 0x40 /* adjust tod offset */ #define PTFF_STO 0x41 /* set tod offset */ @@ -252,6 +253,11 @@ static __always_inline unsigned long tod_to_ns(unsigned long todval) return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } +static __always_inline u128 eitod_to_ns(u128 todval) +{ + return (todval * 125) >> 9; +} + /** * tod_after - compare two 64 bit TOD values * @a: first 64 bit TOD timestamp diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 05eaf6db3ad4..60345dd2cba2 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -469,7 +469,8 @@ struct kvm_s390_vm_cpu_subfunc { __u8 kdsa[16]; /* with MSA9 */ __u8 sortl[32]; /* with STFLE.150 */ __u8 dfltcc[32]; /* with STFLE.151 */ - __u8 reserved[1728]; + __u8 pfcr[16]; /* with STFLE.201 */ + __u8 reserved[1712]; }; #define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST 6 diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 30fd10a72c5d..34a65c141ea0 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -254,6 +254,7 @@ static struct clocksource clocksource_tod = { .shift = 24, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .vdso_clock_mode = VDSO_CLOCKMODE_TOD, + .id = CSID_S390_TOD, }; struct clocksource * __init clocksource_default_clock(void) @@ -467,6 +468,12 @@ static void __init stp_reset(void) } } +bool stp_enabled(void) +{ + return test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online; +} +EXPORT_SYMBOL(stp_enabled); + static void stp_timeout(struct timer_list *unused) { queue_work(time_sync_wq, &stp_work); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index deeb32034ad5..442d4a227c0e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -348,6 +348,16 @@ static inline int plo_test_bit(unsigned char nr) return CC_TRANSFORM(cc) == 0; } +static __always_inline void pfcr_query(u8 (*query)[16]) +{ + asm volatile( + " lghi 0,0\n" + " .insn rsy,0xeb0000000016,0,0,%[query]\n" + : [query] "=QS" (*query) + : + : "cc", "0"); +} + static __always_inline void __sortl_query(u8 (*query)[32]) { asm volatile( @@ -429,6 +439,9 @@ static void __init kvm_s390_cpu_feat_init(void) if (test_facility(151)) /* DFLTCC */ __dfltcc_query(&kvm_s390_available_subfunc.dfltcc); + if (test_facility(201)) /* PFCR */ + pfcr_query(&kvm_s390_available_subfunc.pfcr); + if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); /* @@ -799,6 +812,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) set_kvm_facility(kvm->arch.model.fac_mask, 192); set_kvm_facility(kvm->arch.model.fac_list, 192); } + if (test_facility(198)) { + 
set_kvm_facility(kvm->arch.model.fac_mask, 198); + set_kvm_facility(kvm->arch.model.fac_list, 198); + } + if (test_facility(199)) { + set_kvm_facility(kvm->arch.model.fac_mask, 199); + set_kvm_facility(kvm->arch.model.fac_list, 199); + } r = 0; } else r = -EINVAL; @@ -1543,6 +1564,9 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); + VM_EVENT(kvm, 3, "SET: guest PFCR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pfcr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pfcr)[1]); return 0; } @@ -1757,6 +1781,9 @@ static int kvm_s390_get_processor_subfunc(struct kvm *kvm, ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2], ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]); + VM_EVENT(kvm, 3, "GET: guest PFCR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm->arch.model.subfuncs.pfcr)[0], + ((unsigned long *) &kvm->arch.model.subfuncs.pfcr)[1]); return 0; } @@ -1825,6 +1852,9 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[1], ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[2], ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[3]); + VM_EVENT(kvm, 3, "GET: host PFCR subfunc 0x%16.16lx.%16.16lx", + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0], + ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]); return 0; } @@ -3769,6 +3799,13 @@ static bool kvm_has_pckmo_ecc(struct kvm *kvm) } +static bool kvm_has_pckmo_hmac(struct kvm *kvm) +{ + /* At least one HMAC subfunction must be present */ + return kvm_has_pckmo_subfunc(kvm, 118) || + kvm_has_pckmo_subfunc(kvm, 122); +} + static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) { /* @@ -3781,7 +3818,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); vcpu->arch.sie_block->eca &= ~ECA_APIE; - vcpu->arch.sie_block->ecd &= ~ECD_ECC; + vcpu->arch.sie_block->ecd &= ~(ECD_ECC | ECD_HMAC); if (vcpu->kvm->arch.crypto.apie) vcpu->arch.sie_block->eca |= ECA_APIE; @@ -3789,9 +3826,11 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) /* Set up protected key support */ if (vcpu->kvm->arch.crypto.aes_kw) { vcpu->arch.sie_block->ecb3 |= ECB3_AES; - /* ecc is also wrapped with AES key */ + /* ecc/hmac is also wrapped with AES key */ if (kvm_has_pckmo_ecc(vcpu->kvm)) vcpu->arch.sie_block->ecd |= ECD_ECC; + if (kvm_has_pckmo_hmac(vcpu->kvm)) + vcpu->arch.sie_block->ecd |= ECD_HMAC; } if (vcpu->kvm->arch.crypto.dea_kw) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d3cdde1b18e5..150b9387860a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -335,7 +335,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* we may only allow it if enabled for guest 2 */ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & (ECB3_AES | ECB3_DEA); - ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC; + ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & + (ECD_ECC | ECD_HMAC); if (!ecb3_flags && !ecd_flags) goto end; @@ -661,7 +662,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) struct page *page; page = gfn_to_page(kvm, gpa_to_gfn(gpa)); - if 
(is_error_page(page)) + if (!page) return -EINVAL; *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); return 0; @@ -670,7 +671,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) { - kvm_release_pfn_dirty(hpa >> PAGE_SHIFT); + kvm_release_page_dirty(pfn_to_page(hpa >> PAGE_SHIFT)); /* mark the page always as dirty for migration */ mark_page_dirty(kvm, gpa_to_gfn(gpa)); } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index ded0eff58a19..7c79cf1bc7d7 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -242,88 +242,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size) else return false; } - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - unsigned long addr; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - goto check_asce_limit; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - goto check_asce_limit; - } - - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - addr = hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - if (offset_in_page(addr)) - return addr; - -check_asce_limit: - return check_asce_limit(mm, addr, len); -} diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 96efa061ce01..33f3504be90b 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -17,6 +17,7 @@ #include <linux/random.h> #include <linux/compat.h> #include <linux/security.h> +#include <linux/hugetlb.h> #include <asm/elf.h> static unsigned long stack_maxrandom_size(void) @@ -73,6 +74,8 @@ static inline unsigned long mmap_base(unsigned long rnd, static int get_align_mask(struct file *filp, unsigned long flags) { + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); if (!(current->flags & PF_RANDOMIZE)) return 0; if (filp || (flags & MAP_SHARED)) @@ -106,7 +109,8 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; info.align_mask = get_align_mask(filp, flags); - info.align_offset = pgoff << PAGE_SHIFT; + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if (offset_in_page(addr)) return addr; @@ -144,7 +148,8 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; info.align_mask = get_align_mask(filp, flags); - info.align_offset = pgoff << PAGE_SHIFT; + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 4a0f422cfeb6..8f56a21a077f 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -407,6 +407,18 @@ int set_direct_map_default_noflush(struct page *page) return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long flags; + + if (valid) + flags = SET_MEMORY_DEF; + else + flags = SET_MEMORY_INV; + + return __set_memory((unsigned long)page_to_virt(page), nr, flags); +} + bool kernel_page_present(struct page *page) { unsigned long addr; diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index b7efa96776ea..cbff587dc4e3 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -161,6 +161,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, 
ZPCI_MOD_FC_SET_MEASURE); struct zpci_iommu_ctrs *ctrs; struct zpci_fib fib = {0}; + unsigned long flags; u8 cc, status; if (zdev->fmb || sizeof(*zdev->fmb) < zdev->fmb_length) @@ -172,6 +173,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) WARN_ON((u64) zdev->fmb & 0xf); /* reset software counters */ + spin_lock_irqsave(&zdev->dom_lock, flags); ctrs = zpci_get_iommu_ctrs(zdev); if (ctrs) { atomic64_set(&ctrs->mapped_pages, 0); @@ -180,6 +182,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) atomic64_set(&ctrs->sync_map_rpcits, 0); atomic64_set(&ctrs->sync_rpcits, 0); } + spin_unlock_irqrestore(&zdev->dom_lock, flags); fib.fmb_addr = virt_to_phys(zdev->fmb); diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 1b74a000ff64..d5ace00d10f0 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -53,7 +53,7 @@ static int zpci_bus_prepare_device(struct zpci_dev *zdev) zpci_setup_bus_resources(zdev); for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (zdev->bars[i].res) - pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res, 0); + pci_bus_add_resource(zdev->zbus->bus, zdev->bars[i].res); } } diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 2cb5043a997d..38014206c16b 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -71,17 +71,23 @@ static void pci_fmb_show(struct seq_file *m, char *name[], int length, static void pci_sw_counter_show(struct seq_file *m) { - struct zpci_iommu_ctrs *ctrs = zpci_get_iommu_ctrs(m->private); + struct zpci_dev *zdev = m->private; + struct zpci_iommu_ctrs *ctrs; atomic64_t *counter; + unsigned long flags; int i; + spin_lock_irqsave(&zdev->dom_lock, flags); + ctrs = zpci_get_iommu_ctrs(m->private); if (!ctrs) - return; + goto unlock; counter = &ctrs->mapped_pages; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) seq_printf(m, "%26s:\t%llu\n", pci_sw_names[i], atomic64_read(counter)); +unlock: + spin_unlock_irqrestore(&zdev->dom_lock, flags); } static int pci_perf_show(struct seq_file *m, void *v) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 68580cbea4e6..855f818deb98 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -109,10 +109,12 @@ static struct facility_def facility_defs[] = { 15, /* AP Facilities Test */ 156, /* etoken facility */ 165, /* nnpa facility */ + 170, /* ineffective-nonconstrained-transaction facility */ 193, /* bear enhancement facility */ 194, /* rdp enhancement facility */ 196, /* processor activity instrumentation facility */ 197, /* processor activity instrumentation extension 1 */ + 201, /* concurrent-functions facility */ -1 /* END */ } }, diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig index 0311380160f4..d871623955c5 100644 --- a/arch/sh/configs/landisk_defconfig +++ b/arch/sh/configs/landisk_defconfig @@ -95,7 +95,6 @@ CONFIG_USB_SISUSBVGA=m CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -CONFIG_REISERFS_FS=y CONFIG_ISO9660_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index c1032559ecd4..99bc0e889287 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -220,7 +220,6 @@ CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set -CONFIG_REISERFS_FS=m CONFIG_XFS_FS=m CONFIG_FUSE_FS=m CONFIG_ISO9660_FS=m diff --git a/arch/sh/include/asm/Kbuild 
b/arch/sh/include/asm/Kbuild index fc44d9c88b41..4d3f10ed8275 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -3,3 +3,4 @@ generated-y += syscall_table.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += text-patching.h diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 75028bd568ba..4a92e6e4d627 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -5,21 +5,6 @@ #include <asm/cacheflush.h> #include <asm/page.h> -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h index 907bda4b1619..7cb50e68448f 100644 --- a/arch/sh/include/asm/spinlock_types.h +++ b/arch/sh/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SH_SPINLOCK_TYPES_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif typedef struct { diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 43b0ae4c2c21..17ee8a273aa6 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -4,3 +4,4 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += text-patching.h diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 57084ed2f3c4..113cd9f353e3 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -139,6 +139,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 0x0059 +#define SCM_TS_OPT_ID 0x005a + #if !defined(__KERNEL__) diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 80822f922e76..fb31bc0c5b48 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -23,6 +23,7 @@ #include <linux/utsname.h> #include <linux/smp.h> #include <linux/ipc.h> +#include <linux/hugetlb.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -42,12 +43,16 @@ SYSCALL_DEFINE0(getpagesize) unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; + + if (filp && is_file_hugepages(filp)) + file_hugepage = true; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -62,9 +67,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi info.length = len; info.low_limit = addr; info.high_limit = TASK_SIZE; - info.align_mask = (flags & MAP_SHARED) ? 
- (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + if (!file_hugepage) { + info.align_mask = (flags & MAP_SHARED) ? + (PAGE_MASK & (SHMLBA - 1)) : 0; + info.align_offset = pgoff << PAGE_SHIFT; + } else { + info.align_mask = huge_page_mask_align(filp); + } return vm_unmapped_area(&info); } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index acade309dc2f..c5a284df7b41 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -30,6 +30,7 @@ #include <linux/context_tracking.h> #include <linux/timex.h> #include <linux/uaccess.h> +#include <linux/hugetlb.h> #include <asm/utrap.h> #include <asm/unistd.h> @@ -87,6 +88,16 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, return base + off; } +static unsigned long get_align_mask(struct file *filp, unsigned long flags) +{ + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (filp || (flags & MAP_SHARED)) + return PAGE_MASK & (SHMLBA - 1); + + return 0; +} + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -94,12 +105,16 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi unsigned long task_size = TASK_SIZE; int do_color_align; struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; + + if (filp && is_file_hugepages(filp)) + file_hugepage = true; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -111,7 +126,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi return -ENOMEM; do_color_align = 0; - if (filp || (flags & MAP_SHARED)) + if ((filp || (flags & MAP_SHARED)) && !file_hugepage) do_color_align = 1; if (addr) { @@ -129,8 +144,9 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi info.length = len; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = min(task_size, VA_EXCLUDE_START); - info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!file_hugepage) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { @@ -154,15 +170,19 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; int do_color_align; struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; /* This should only ever run for 32-bit processes. */ BUG_ON(!test_thread_flag(TIF_32BIT)); + if (filp && is_file_hugepages(filp)) + file_hugepage = true; + if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. 
*/ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -172,7 +192,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return -ENOMEM; do_color_align = 0; - if (filp || (flags & MAP_SHARED)) + if ((filp || (flags & MAP_SHARED)) && !file_hugepage) do_color_align = 1; /* requesting a specific address */ @@ -192,8 +212,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!file_hugepage) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index cc91ca7a1e18..eee601a0d2cf 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -19,114 +19,6 @@ #include <asm/cacheflush.h> #include <asm/mmu_context.h> -/* Slightly simplified from the non-hugepage variant because by - * definition we don't have to worry about any page coloring stuff - */ - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - struct hstate *h = hstate_file(filp); - unsigned long task_size = TASK_SIZE; - struct vm_unmapped_area_info info = {}; - - if (test_thread_flag(TIF_32BIT)) - task_size = STACK_TOP32; - - info.length = len; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = min(task_size, VA_EXCLUDE_START); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { - VM_BUG_ON(addr != -ENOMEM); - info.low_limit = VA_EXCLUDE_END; - info.high_limit = task_size; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -static unsigned long -hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - struct hstate *h = hstate_file(filp); - struct mm_struct *mm = current->mm; - unsigned long addr = addr0; - struct vm_unmapped_area_info info = {}; - - /* This should only ever run for 32-bit processes. */ - BUG_ON(!test_thread_flag(TIF_32BIT)); - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = STACK_TOP32; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long task_size = TASK_SIZE; - - if (test_thread_flag(TIF_32BIT)) - task_size = STACK_TOP32; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > task_size) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (task_size - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); -} static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift) { diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index e543cbac8792..9c9c77f1255a 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -61,7 +61,6 @@ CONFIG_UML_NET_DAEMON=y CONFIG_UML_NET_MCAST=y CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m CONFIG_ISO9660_FS=m diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 939cb12318ca..03b10d3f6816 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -59,7 +59,6 @@ CONFIG_UML_NET_DAEMON=y CONFIG_UML_NET_MCAST=y CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m CONFIG_ISO9660_FS=m diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8e594cda6d77..e8e8b54b3037 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -435,24 +435,25 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end) +void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } -void apply_retpolines(s32 *start, s32 *end) +void apply_retpolines(s32 *start, s32 *end, struct module *mod) { } -void apply_returns(s32 *start, s32 *end) +void apply_returns(s32 *start, s32 *end, struct module *mod) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end, + struct module *mod) { } @@ -468,6 +469,11 @@ void *text_poke(void *addr, const void *opcode, size_t len) return memcpy(addr, opcode, len); } +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + return text_poke(addr, opcode, len); +} + void text_poke_sync(void) { } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a3c31b784edc..6c633d93c639 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -83,6 +83,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_EXECMEM_ROX if X86_64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select 
ARCH_HAS_GCOV_PROFILE_ALL @@ -2564,15 +2565,14 @@ config MITIGATION_CALL_DEPTH_TRACKING default y help Compile the kernel with call depth tracking to mitigate the Intel - SKL Return-Speculation-Buffer (RSB) underflow issue. The - mitigation is off by default and needs to be enabled on the - kernel command line via the retbleed=stuff option. For - non-affected systems the overhead of this option is marginal as - the call depth tracking is using run-time generated call thunks - in a compiler generated padding area and call patching. This - increases text size by ~5%. For non affected systems this space - is unused. On affected SKL systems this results in a significant - performance gain over the IBRS mitigation. + SKL Return-Stack-Buffer (RSB) underflow issue. The mitigation is off + by default and needs to be enabled on the kernel command line via the + retbleed=stuff option. For non-affected systems the overhead of this + option is marginal as the call depth tracking is using run-time + generated call thunks in a compiler generated padding area and call + patching. This increases text size by ~5%. For non affected systems + this space is unused. On affected SKL systems this results in a + significant performance gain over the IBRS mitigation. config CALL_THUNKS_DEBUG bool "Enable call thunks and call depth tracking debugging" diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 327c45c5013f..0d9b090b4880 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -78,6 +78,32 @@ static inline void tdcall(u64 fn, struct tdx_module_args *args) panic("TDCALL %lld failed (Buggy TDX module!)\n", fn); } +/* Read TD-scoped metadata */ +static inline u64 tdg_vm_rd(u64 field, u64 *value) +{ + struct tdx_module_args args = { + .rdx = field, + }; + u64 ret; + + ret = __tdcall_ret(TDG_VM_RD, &args); + *value = args.r8; + + return ret; +} + +/* Write TD-scoped metadata */ +static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask) +{ + struct tdx_module_args args = { + .rdx = field, + .r8 = value, + .r9 = mask, + }; + + return __tdcall(TDG_VM_WR, &args); +} + /** * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT * subtype 0) using TDG.MR.REPORT TDCALL. @@ -168,7 +194,87 @@ static void __noreturn tdx_panic(const char *msg) __tdx_hypercall(&args); } -static void tdx_parse_tdinfo(u64 *cc_mask) +/* + * The kernel cannot handle #VEs when accessing normal kernel memory. Ensure + * that no #VE will be delivered for accesses to TD-private memory. + * + * TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM + * controls if the guest will receive such #VE with TD attribute + * ATTR_SEPT_VE_DISABLE. + * + * Newer TDX modules allow the guest to control if it wants to receive SEPT + * violation #VEs. + * + * Check if the feature is available and disable SEPT #VE if possible. + * + * If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE + * attribute is no longer reliable. It reflects the initial state of the + * control for the TD, but it will not be updated if someone (e.g. bootloader) + * changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to + * determine if SEPT #VEs are enabled or disabled. 
+ */ +static void disable_sept_ve(u64 td_attr) +{ + const char *msg = "TD misconfiguration: SEPT #VE has to be disabled"; + bool debug = td_attr & ATTR_DEBUG; + u64 config, controls; + + /* Is this TD allowed to disable SEPT #VE */ + tdg_vm_rd(TDCS_CONFIG_FLAGS, &config); + if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) { + /* No SEPT #VE controls for the guest: check the attribute */ + if (td_attr & ATTR_SEPT_VE_DISABLE) + return; + + /* Relax SEPT_VE_DISABLE check for debug TD for backtraces */ + if (debug) + pr_warn("%s\n", msg); + else + tdx_panic(msg); + return; + } + + /* Check if SEPT #VE has been disabled before us */ + tdg_vm_rd(TDCS_TD_CTLS, &controls); + if (controls & TD_CTLS_PENDING_VE_DISABLE) + return; + + /* Keep #VEs enabled for splats in debugging environments */ + if (debug) + return; + + /* Disable SEPT #VEs */ + tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_PENDING_VE_DISABLE, + TD_CTLS_PENDING_VE_DISABLE); +} + +/* + * TDX 1.0 generates a #VE when accessing topology-related CPUID leafs (0xB and + * 0x1F) and the X2APIC_APICID MSR. The kernel returns all zeros on CPUID #VEs. + * In practice, this means that the kernel can only boot with a plain topology. + * Any complications will cause problems. + * + * The ENUM_TOPOLOGY feature allows the VMM to provide topology information. + * Enabling the feature eliminates topology-related #VEs: the TDX module + * virtualizes accesses to the CPUID leafs and the MSR. + * + * Enable ENUM_TOPOLOGY if it is available. + */ +static void enable_cpu_topology_enumeration(void) +{ + u64 configured; + + /* Has the VMM provided a valid topology configuration? */ + tdg_vm_rd(TDCS_TOPOLOGY_ENUM_CONFIGURED, &configured); + if (!configured) { + pr_err("VMM did not configure X2APIC_IDs properly\n"); + return; + } + + tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY); +} + +static void tdx_setup(u64 *cc_mask) { struct tdx_module_args args = {}; unsigned int gpa_width; @@ -193,21 +299,13 @@ static void tdx_parse_tdinfo(u64 *cc_mask) gpa_width = args.rcx & GENMASK(5, 0); *cc_mask = BIT_ULL(gpa_width - 1); - /* - * The kernel can not handle #VE's when accessing normal kernel - * memory. Ensure that no #VE will be delivered for accesses to - * TD-private memory. Only VMM-shared memory (MMIO) will #VE. - */ td_attr = args.rdx; - if (!(td_attr & ATTR_SEPT_VE_DISABLE)) { - const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set."; - /* Relax SEPT_VE_DISABLE check for debug TD. 
*/ - if (td_attr & ATTR_DEBUG) - pr_warn("%s\n", msg); - else - tdx_panic(msg); - } + /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */ + tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL); + + disable_sept_ve(td_attr); + enable_cpu_topology_enumeration(); } /* @@ -929,10 +1027,6 @@ static void tdx_kexec_finish(void) void __init tdx_early_init(void) { - struct tdx_module_args args = { - .rdx = TDCS_NOTIFY_ENABLES, - .r9 = -1ULL, - }; u64 cc_mask; u32 eax, sig[3]; @@ -947,11 +1041,11 @@ void __init tdx_early_init(void) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); cc_vendor = CC_VENDOR_INTEL; - tdx_parse_tdinfo(&cc_mask); - cc_set_mask(cc_mask); - /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */ - tdcall(TDG_VM_WR, &args); + /* Configure the TD */ + tdx_setup(&cc_mask); + + cc_set_mask(cc_mask); /* * All bits above GPA width are reserved and kernel treats shared bit diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index bfc7cabf4017..39e6efc1a9ca 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -48,7 +48,8 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len)); + image->alt_len), + NULL); return 0; } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ca9ae606aab9..dc03a647776d 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -96,16 +96,16 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; +struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); -extern void apply_retpolines(s32 *start, s32 *end); -extern void apply_returns(s32 *start, s32 *end); -extern void apply_seal_endbr(s32 *start, s32 *end); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, + struct module *mod); +extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); +extern void apply_returns(s32 *start, s32 *end, struct module *mod); +extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi); - -struct module; + s32 *start_cfi, s32 *end_cfi, struct module *mod); struct callthunk_sites { s32 *call_start, *call_end; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ea33439a5d00..17b6590748c0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -317,6 +317,9 @@ #define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_SHA512 (12*32+ 0) /* SHA512 instructions */ +#define X86_FEATURE_SM3 (12*32+ 1) /* SM3 instructions */ +#define X86_FEATURE_SM4 (12*32+ 2) /* SM4 instructions */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index ca4243318aad..239b9ba5c398 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -6,6 +6,8 @@ 
#ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H +#include <linux/types.h> + #include <asm/string.h> struct cpuid_regs { @@ -20,11 +22,11 @@ enum cpuid_regs_idx { }; #ifdef CONFIG_X86_32 -extern int have_cpuid_p(void); +bool have_cpuid_p(void); #else -static inline int have_cpuid_p(void) +static inline bool have_cpuid_p(void) { - return 1; + return true; } #endif static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 99d345b686fa..6e2458088800 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -48,7 +48,9 @@ do { \ static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - if (!user_access_begin(uaddr, sizeof(u32))) + if (can_do_masked_user_access()) + uaddr = masked_user_access_begin(uaddr); + else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; switch (op) { @@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; - if (!user_access_begin(uaddr, sizeof(u32))) + if (can_do_masked_user_access()) + uaddr = masked_user_access_begin(uaddr); + else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; asm volatile("\n" "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index cbbef32517f0..3f1c1d6c0da1 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -12,35 +12,28 @@ #include <linux/stringify.h> #include <linux/types.h> -#define JUMP_TABLE_ENTRY \ +#define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\" \n\t" \ _ASM_ALIGN "\n\t" \ ".long 1b - . \n\t" \ - ".long %l[l_yes] - . \n\t" \ - _ASM_PTR "%c0 + %c1 - .\n\t" \ + ".long " label " - . \n\t" \ + _ASM_PTR " " key " - . \n\t" \ ".popsection \n\t" +/* This macro is also expanded on the Rust side. 
*/ #ifdef CONFIG_HAVE_JUMP_LABEL_HACK - -static __always_inline bool arch_static_branch(struct static_key *key, bool branch) -{ - asm goto("1:" - "jmp %l[l_yes] # objtool NOPs this \n\t" - JUMP_TABLE_ENTRY - : : "i" (key), "i" (2 | branch) : : l_yes); - - return false; -l_yes: - return true; -} - +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + "1: jmp " label " # objtool NOPs this \n\t" \ + JUMP_TABLE_ENTRY(key " + 2", label) #else /* !CONFIG_HAVE_JUMP_LABEL_HACK */ +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + "1: .byte " __stringify(BYTES_NOP5) "\n\t" \ + JUMP_TABLE_ENTRY(key, label) +#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm goto("1:" - ".byte " __stringify(BYTES_NOP5) "\n\t" - JUMP_TABLE_ENTRY + asm goto(ARCH_STATIC_BRANCH_ASM("%c0 + %c1", "%l[l_yes]") : : "i" (key), "i" (branch) : : l_yes); return false; @@ -48,13 +41,11 @@ l_yes: return true; } -#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ - static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm goto("1:" "jmp %l[l_yes]\n\t" - JUMP_TABLE_ENTRY + JUMP_TABLE_ENTRY("%c0 + %c1", "%l[l_yes]") : : "i" (key), "i" (branch) : : l_yes); return false; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 861d080ed4c6..5aff7222e40f 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -34,6 +34,7 @@ KVM_X86_OP(set_msr) KVM_X86_OP(get_segment_base) KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) +KVM_X86_OP(get_cpl_no_cache) KVM_X86_OP(set_segment) KVM_X86_OP(get_cs_db_l_bits) KVM_X86_OP(is_valid_cr0) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6d9f763a7bb9..e159e44a6a1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -26,6 +26,7 @@ #include <linux/irqbypass.h> #include <linux/hyperv.h> #include <linux/kfifo.h> +#include <linux/sched/vhost_task.h> #include <asm/apic.h> #include <asm/pvclock-abi.h> @@ -1306,7 +1307,6 @@ struct kvm_arch { bool pre_fault_allowed; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; - struct list_head zapped_obsolete_pages; /* * A list of kvm_mmu_page structs that, if zapped, could possibly be * replaced by an NX huge page. A shadow page is on this list if its @@ -1443,7 +1443,8 @@ struct kvm_arch { bool sgx_provisioning_allowed; struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; - struct task_struct *nx_huge_page_recovery_thread; + struct vhost_task *nx_huge_page_recovery_thread; + u64 nx_huge_page_last; #ifdef CONFIG_X86_64 /* The number of TDP MMU pages across all roots. 
*/ @@ -1656,6 +1657,7 @@ struct kvm_x86_ops { void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int (*get_cpl)(struct kvm_vcpu *vcpu); + int (*get_cpl_no_cache)(struct kvm_vcpu *vcpu); void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); @@ -1955,8 +1957,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot); +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); @@ -2359,7 +2361,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ - KVM_X86_QUIRK_SLOT_ZAP_ALL) + KVM_X86_QUIRK_SLOT_ZAP_ALL | \ + KVM_X86_QUIRK_STUFF_FEATURE_MSRS) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 1b93ff80b43b..c9fe207916f4 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -35,7 +35,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index f3d257c45225..d63576608ce7 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,7 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; -extern unsigned long physmem_end; +extern unsigned long direct_map_physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index c55a79d5feae..e525cd85f999 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -234,9 +234,10 @@ do { \ */ #define percpu_add_op(size, qual, var, val) \ do { \ - const int pao_ID__ = (__builtin_constant_p(val) && \ - ((val) == 1 || (val) == -1)) ? \ - (int)(val) : 0; \ + const int pao_ID__ = \ + (__builtin_constant_p(val) && \ + ((val) == 1 || \ + (val) == (typeof(val))-1)) ? (int)(val) : 0; \ \ if (0) { \ typeof(var) pao_tmp__; \ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4c2d080d26b4..593f10aabd45 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1775,6 +1775,43 @@ bool arch_is_platform_page(u64 paddr); #define arch_is_platform_page arch_is_platform_page #endif +/* + * Use set_p*_safe(), and elide TLB flushing, when confident that *no* + * TLB flush will be required as a result of the "set". For example, use + * in scenarios where it is known ahead of time that the routine is + * setting non-present entries, or re-setting an existing entry to the + * same value. Otherwise, use the typical "set" helpers and flush the + * TLB. 
+ */ +#define set_pte_safe(ptep, pte) \ +({ \ + WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ + set_pte(ptep, pte); \ +}) + +#define set_pmd_safe(pmdp, pmd) \ +({ \ + WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ + set_pmd(pmdp, pmd); \ +}) + +#define set_pud_safe(pudp, pud) \ +({ \ + WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ + set_pud(pudp, pud); \ +}) + +#define set_p4d_safe(p4dp, p4d) \ +({ \ + WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ + set_p4d(p4dp, p4d); \ +}) + +#define set_pgd_safe(pgdp, pgd) \ +({ \ + WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ + set_pgd(pgdp, pgd); \ +}) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index a98e53491a4e..ec68f8369bdc 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -141,7 +141,7 @@ extern unsigned int ptrs_per_p4d; #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #ifdef CONFIG_RANDOMIZE_MEMORY -# define PHYSMEM_END physmem_end +# define DIRECT_MAP_PHYSMEM_END direct_map_physmem_end #endif /* diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 4b2abce2e3e7..cc62ef70ccc0 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -89,6 +89,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index fdfd41511b02..89f7fcade8ae 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -16,10 +16,21 @@ #define TDG_VP_VEINFO_GET 3 #define TDG_MR_REPORT 4 #define TDG_MEM_PAGE_ACCEPT 6 +#define TDG_VM_RD 7 #define TDG_VM_WR 8 -/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ +/* TDX TD-Scope Metadata. 
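[Editor's aside on the set_p*_safe() helpers added to arch/x86/include/asm/pgtable.h above: the contract is simply "warn if the entry is already present with a different value, then set it", which is what justifies skipping the TLB flush. A toy user-space model of that contract, using invented toy_* names rather than the kernel's page-table types:

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        typedef struct { uint64_t val; } toy_pte_t;

        #define TOY_PRESENT 0x1ULL

        static int toy_pte_present(toy_pte_t p) { return p.val & TOY_PRESENT; }
        static int toy_pte_same(toy_pte_t a, toy_pte_t b) { return a.val == b.val; }

        static void set_toy_pte_safe(toy_pte_t *ptep, toy_pte_t pte)
        {
                /* Only not-present slots or identical rewrites may skip the flush. */
                assert(!(toy_pte_present(*ptep) && !toy_pte_same(*ptep, pte)));
                *ptep = pte;
        }

        int main(void)
        {
                toy_pte_t slot = { 0 };
                toy_pte_t entry = { 0x1000 | TOY_PRESENT };

                set_toy_pte_safe(&slot, entry);  /* not-present -> present: OK */
                set_toy_pte_safe(&slot, entry);  /* same-value rewrite:     OK */
                printf("slot = 0x%llx\n", (unsigned long long)slot.val);
                return 0;
        }

— end of aside]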
To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016 +#define TDCS_TD_CTLS 0x1110000300000017 #define TDCS_NOTIFY_ENABLES 0x9100000000000010 +#define TDCS_TOPOLOGY_ENUM_CONFIGURED 0x9100000000000019 + +/* TDCS_CONFIG_FLAGS bits */ +#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1) + +/* TDCS_TD_CTLS bits */ +#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0) +#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(1) /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 6259f1937fe7..ab9e143ec9fe 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -35,6 +35,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +#define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a8debbf2f702..88585c1de416 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -440,6 +440,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) +#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d17518ca19b8..243843e44e89 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,8 +392,10 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, + struct module *mod) { + u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -403,14 +405,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) } if (a->instrlen != 6 || - instr[0] != CALL_RIP_REL_OPCODE || - instr[1] != CALL_RIP_REL_MODRM) { + wr_instr[0] != CALL_RIP_REL_OPCODE || + wr_instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(instr + 2); + disp = *(s32 *)(wr_instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -448,7 +450,8 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. 
*/ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end) + struct alt_instr *end, + struct module *mod) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -477,6 +480,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -490,7 +494,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); + wr_instr = module_writable_address(mod, instr); + replacement = (u8 *)&a->repl_offset + a->repl_offset; + wr_replacement = module_writable_address(mod, replacement); + BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -501,9 +509,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, instr, a->instrlen); + memcpy(insn_buff, wr_instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(instr, insn_buff, a->instrlen); + text_poke_early(wr_instr, insn_buff, a->instrlen); continue; } @@ -513,11 +521,12 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, replacement, a->replacementlen); + memcpy(insn_buff, wr_replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a); + insn_buff_sz = alt_replace_call(instr, insn_buff, a, + mod); if (insn_buff_sz < 0) continue; } @@ -527,11 +536,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(instr, insn_buff, insn_buff_sz); + text_poke_early(wr_instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -722,18 +731,20 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. 
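[Editor's aside before the retpoline/return hunks continue: the alternative.c changes above thread a struct module pointer through every patching path so writes can go through module_writable_address(), i.e. the text mapping stays read-only/execute and patching happens via a separate writable alias of the same pages. The program below reproduces that split in user space with memfd_create(); it illustrates the concept only and is not the kernel's execmem implementation (assumes x86-64 Linux, glibc >= 2.27).

        #define _GNU_SOURCE
        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                /* mov eax, 42 ; ret */
                static const unsigned char code[] = { 0xb8, 0x2a, 0, 0, 0, 0xc3 };
                size_t len = sysconf(_SC_PAGESIZE);
                int fd = memfd_create("text", 0);

                if (fd < 0 || ftruncate(fd, len))
                        return 1;

                unsigned char *rw = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                         MAP_SHARED, fd, 0);
                unsigned char *rx = mmap(NULL, len, PROT_READ | PROT_EXEC,
                                         MAP_SHARED, fd, 0);
                if (rw == MAP_FAILED || rx == MAP_FAILED)
                        return 1;

                memcpy(rw, code, sizeof(code));          /* "patch" via writable alias */
                int (*fn)(void) = (int (*)(void))rx;     /* execute via the RX mapping */
                printf("patched function returns %d\n", fn());
                return 0;
        }

Roughly, the writable alias exists only while a module is being prepared; the apply_* passes above write through it, and the read-only-execute mapping is what the CPU ultimately runs. — end of aside]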
*/ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -761,9 +772,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } @@ -799,7 +810,8 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { s32 *s; @@ -808,12 +820,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -833,32 +846,35 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } #else -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } #endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr); +static void poison_cfi(void *addr, void *wr_addr); -static void __init_or_module poison_endbr(void *addr, bool warn) +static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) return; if (!is_endbr(endbr)) { @@ -873,7 +889,7 @@ static void __init_or_module poison_endbr(void *addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + text_poke_early(wr_addr, &poison, 4); } /* @@ -882,22 +898,23 @@ static void __init_or_module poison_endbr(void *addr, bool warn) * Seal the functions for indirect calls by 
clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, true); + poison_endbr(addr, wr_addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16); + poison_cfi(addr - 16, wr_addr - 16); } } #else -void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1119,7 +1136,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end) +static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1131,20 +1148,23 @@ static int cfi_disable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); + if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, jmp, 2); + text_poke_early(wr_addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end) +static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. @@ -1154,106 +1174,115 @@ static int cfi_enable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, mov, 2); + text_poke_early(wr_addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end) +static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(addr + 1, &hash, 4); + text_poke_early(wr_addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end) +static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end) +static void 
cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr+16, false); + poison_endbr(addr + 16, wr_addr + 16, false); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end) +static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(addr + 2, &hash, 4); + text_poke_early(wr_addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end) +static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { - text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(addr + fineibt_caller_hash, &hash, 4); + text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ -1262,8 +1291,9 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { + bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1281,7 +1311,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. 
*/ - ret = cfi_disable_callers(start_retpoline, end_retpoline); + ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; @@ -1292,11 +1322,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi); + ret = cfi_rand_preamble(start_cfi, end_cfi, mod); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline); + ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; } @@ -1308,7 +1338,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline); + ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; @@ -1318,17 +1348,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi); + ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi); + cfi_rewrite_endbr(start_cfi, end_cfi, mod); if (builtin) pr_info("Using FineIBT CFI\n"); @@ -1347,7 +1377,7 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr) +static void poison_cfi(void *addr, void *wr_addr) { switch (cfi_mode) { case CFI_FINEIBT: @@ -1359,8 +1389,8 @@ static void poison_cfi(void *addr) * ud2 * 1: nop */ - poison_endbr(addr, false); - poison_hash(addr + fineibt_preamble_hash); + poison_endbr(addr, wr_addr, false); + poison_hash(wr_addr + fineibt_preamble_hash); break; case CFI_KCFI: @@ -1369,7 +1399,7 @@ static void poison_cfi(void *addr) * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(addr + 1); + poison_hash(wr_addr + 1); break; default: @@ -1380,22 +1410,21 @@ static void poison_cfi(void *addr) #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr) { } +static void poison_cfi(void *addr, void *wr_addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, - /* .builtin = */ false); + start_cfi, end_cfi, mod); } #ifdef CONFIG_SMP @@ -1692,16 +1721,16 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, true); + __cfi_sites, __cfi_sites_end, NULL); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ - apply_retpolines(__retpoline_sites, __retpoline_sites_end); - apply_returns(__return_sites, __return_sites_end); + apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); + apply_returns(__return_sites, __return_sites_end, NULL); - apply_alternatives(__alt_instructions, __alt_instructions_end); + apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); /* * Now all calls are established. 
Apply the call thunks if @@ -1712,7 +1741,7 @@ void __init alternative_instructions(void) /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 02637365d1a9..ca327cfa42ae 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -276,21 +276,13 @@ static int __init x86_noinvpcid_setup(char *s) } early_param("noinvpcid", x86_noinvpcid_setup); -#ifdef CONFIG_X86_32 -static int cachesize_override = -1; -static int disable_x86_serial_nr = 1; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); - /* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) +static inline bool flag_is_changeable_p(unsigned long flag) { - u32 f1, f2; + unsigned long f1, f2; + + if (!IS_ENABLED(CONFIG_X86_32)) + return true; /* * Cyrix and IDT cpus allow disabling of CPUID @@ -313,11 +305,22 @@ static inline int flag_is_changeable_p(u32 flag) : "=&r" (f1), "=&r" (f2) : "ir" (flag)); - return ((f1^f2) & flag) != 0; + return (f1 ^ f2) & flag; } +#ifdef CONFIG_X86_32 +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + /* Probe for the CPUID instruction */ -int have_cpuid_p(void) +bool have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -349,10 +352,6 @@ static int __init x86_serial_nr_setup(char *s) } __setup("serialnumber", x86_serial_nr_setup); #else -static inline int flag_is_changeable_p(u32 flag) -{ - return 1; -} static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { } @@ -1088,7 +1087,6 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_32 int i; /* @@ -1109,7 +1107,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) break; } } -#endif } #define NO_SPECULATION BIT(0) @@ -2392,12 +2389,12 @@ void __init arch_cpu_finalize_init(void) alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { - unsigned long USER_PTR_MAX = TASK_SIZE_MAX-1; + unsigned long USER_PTR_MAX = TASK_SIZE_MAX; /* * Enable this when LAM is gated on LASS support if (cpu_feature_enabled(X86_FEATURE_LAM)) - USER_PTR_MAX = (1ul << 63) - PAGE_SIZE - 1; + USER_PTR_MAX = (1ul << 63) - PAGE_SIZE; */ runtime_const_init(ptr, USER_PTR_MAX); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index eb5848d1851a..8ce352fc72ac 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -630,7 +630,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, if (!section->virt_addr) return false; - section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page)); if (!section->pages) { memunmap(section->virt_addr); return false; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 29d1f9104e94..6b6f32f40cbe 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,7 +18,7 @@ #include <linux/bcma/bcma_regs.h> #include 
<linux/platform_data/x86/apple.h> #include <drm/intel/i915_drm.h> -#include <drm/intel/i915_pciids.h> +#include <drm/intel/pciids.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index adb09f78edb2..4dd0ad6c94d6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) + if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - else - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + } else { + mutex_lock(&text_mutex); + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); + } return 0; } @@ -318,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - int ret; + void *ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -349,15 +352,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); - if (WARN_ON(ret < 0)) + ret = text_poke_copy(trampoline, (void *)start_offset, size); + if (WARN_ON(!ret)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - memcpy(ip, retq, sizeof(retq)); + text_poke_copy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -365,8 +368,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); - if (ret < 0) + if (!text_poke_copy(ip, x86_nops[2], 2)) goto fail; } @@ -379,7 +381,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - *ptr = (unsigned long)ops; + text_poke_copy(ptr, &ops, sizeof(unsigned long)); op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -395,7 +397,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -405,9 +407,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. 
*/ dest = ftrace_ops_get_func(ops); - memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE); + text_poke_copy_locked(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE, false); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 837450b6e882..8984abd91c00 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -146,18 +146,21 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - if (memcmp(loc, &zero, size)) { + void *wr_loc = module_writable_address(me, loc); + + if (memcmp(wr_loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(loc, &val, size); + write(wr_loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } + /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -224,7 +227,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, *locks = NULL, + const Elf_Shdr *s, *alt = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -233,8 +236,6 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -265,20 +266,20 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size); + apply_retpolines(rseg, rseg + retpolines->sh_size, me); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size); + apply_returns(rseg, rseg + returns->sh_size, me); } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); + apply_alternatives(aseg, aseg + alt->sh_size, me); } if (calls || alt) { struct callthunk_sites cs = {}; @@ -297,8 +298,28 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); } + + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + + return 0; +} + +int module_post_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *locks = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + } + if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -308,10 
+329,6 @@ int module_finalize(const Elf_Ehdr *hdr, text, text_end); } - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - return 0; } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 87f8c9a71c49..776ae6fa7f2d 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,6 +18,7 @@ #include <linux/random.h> #include <linux/uaccess.h> #include <linux/elf.h> +#include <linux/hugetlb.h> #include <asm/elf.h> #include <asm/ia32.h> @@ -25,8 +26,10 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. */ -static unsigned long get_align_mask(void) +static unsigned long get_align_mask(struct file *filp) { + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) return 0; @@ -49,7 +52,7 @@ static unsigned long get_align_mask(void) */ static unsigned long get_align_bits(void) { - return va_align.bits & get_align_mask(); + return va_align.bits & get_align_mask(NULL); } static int __init control_va_addr_alignment(char *str) @@ -148,12 +151,15 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, info.length = len; info.low_limit = begin; info.high_limit = end; - info.align_offset = pgoff << PAGE_SHIFT; - info.start_gap = stack_guard_placement(vm_flags); + if (!(filp && is_file_hugepages(filp))) { + info.align_offset = pgoff << PAGE_SHIFT; + info.start_gap = stack_guard_placement(vm_flags); + } if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } + return vm_unmapped_area(&info); } @@ -199,7 +205,10 @@ get_unmapped_area: info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); - info.start_gap = stack_guard_placement(vm_flags); + if (!(filp && is_file_hugepages(filp))) { + info.start_gap = stack_guard_placement(vm_flags); + info.align_offset = pgoff << PAGE_SHIFT; + } /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area @@ -211,9 +220,8 @@ get_unmapped_area: if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; - info.align_offset = pgoff << PAGE_SHIFT; if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } addr = vm_unmapped_area(&info); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f09f13c01c6b..ea2c4f21c1ca 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -18,10 +18,10 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION config KVM_X86 - def_tristate KVM if KVM_INTEL || KVM_AMD - depends on X86_LOCAL_APIC + def_tristate KVM if (KVM_INTEL != n || KVM_AMD != n) select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER + select KVM_ELIDE_TLB_FLUSH_IF_YOUNG select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO @@ -29,6 +29,7 @@ config KVM_X86 select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_READONLY_MEM + select VHOST_TASK select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO @@ -49,6 +50,7 @@ config KVM_X86 config KVM tristate "Kernel-based Virtual Machine (KVM) support" + depends on X86_LOCAL_APIC help Support hosting fully virtualized guest machines using hardware virtualization extensions. 
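[Editor's aside on the arch/x86/kernel/sys_x86_64.c hunk above, where get_align_mask() now returns a huge-page mask for hugetlb files: the effect inside vm_unmapped_area() amounts to picking the next candidate address that is congruent to align_offset modulo (align_mask + 1). A minimal sketch of that arithmetic, assuming a 2 MiB huge page size; align_up_to() is an invented helper, not the mm/ code.

        #include <stdio.h>

        static unsigned long align_up_to(unsigned long addr, unsigned long mask,
                                         unsigned long offset)
        {
                /* Smallest address >= addr with (result & mask) == (offset & mask). */
                return addr + ((offset - addr) & mask);
        }

        int main(void)
        {
                unsigned long mask = (2UL << 20) - 1;   /* 2 MiB huge pages */

                /* 0x7f1234567000 is bumped to the next 2 MiB boundary. */
                printf("0x%lx\n", align_up_to(0x7f1234567000UL, mask, 0));
                return 0;
        }

— end of aside]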
You will need a fairly recent diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 41786b834b16..097bdc022d0f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -690,7 +690,9 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST); kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES); - if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && + boot_cpu_has(X86_FEATURE_AMD_IBPB) && + boot_cpu_has(X86_FEATURE_AMD_IBRS)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); @@ -698,14 +700,14 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | - F(FZRM) | F(FSRS) | F(FSRC) | - F(AMX_FP16) | F(AVX_IFMA) | F(LAM) + F(SHA512) | F(SM3) | F(SM4) | F(AVX_VNNI) | F(AVX512_BF16) | + F(CMPCCXADD) | F(FZRM) | F(FSRS) | F(FSRC) | F(AMX_FP16) | + F(AVX_IFMA) | F(LAM) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | - F(AMX_COMPLEX) | F(AVX10) + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(AMX_COMPLEX) | + F(AVX_VNNI_INT16) | F(PREFETCHITI) | F(AVX10) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, @@ -755,7 +757,7 @@ void kvm_set_cpu_caps(void) F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - F(AMD_PSFD) + F(AMD_PSFD) | F(AMD_IBPB_RET) ); /* @@ -763,8 +765,12 @@ void kvm_set_cpu_caps(void) * arch/x86/kernel/cpu/bugs.c is kind enough to * record that in cpufeatures so use them. */ - if (boot_cpu_has(X86_FEATURE_IBPB)) + if (boot_cpu_has(X86_FEATURE_IBPB)) { kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB); + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && + !boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) + kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB_RET); + } if (boot_cpu_has(X86_FEATURE_IBRS)) kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS); if (boot_cpu_has(X86_FEATURE_STIBP)) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 41697cca354e..c8dc66eddefd 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -2,7 +2,6 @@ #ifndef ARCH_X86_KVM_CPUID_H #define ARCH_X86_KVM_CPUID_H -#include "x86.h" #include "reverse_cpuid.h" #include <asm/cpu.h> #include <asm/processor.h> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e72aed25d721..60986f67c35a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -651,9 +651,10 @@ static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) } static inline bool emul_is_noncanonical_address(u64 la, - struct x86_emulate_ctxt *ctxt) + struct x86_emulate_ctxt *ctxt, + unsigned int flags) { - return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt)); + return !ctxt->ops->is_canonical_addr(ctxt, la, flags); } /* @@ -1733,7 +1734,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32), ctxt)) + ((u64)base3 << 32), ctxt, + X86EMUL_F_DT_LOAD)) return emulate_gp(ctxt, err_code); } @@ -2516,8 +2518,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; - if (emul_is_noncanonical_address(rcx, ctxt) || - emul_is_noncanonical_address(rdx, ctxt)) + if (emul_is_noncanonical_address(rcx, ctxt, 0) || + emul_is_noncanonical_address(rdx, ctxt, 0)) return emulate_gp(ctxt, 0); 
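[Editor's aside on the emulate.c hunks above, which route non-canonical checks through a flags-aware ctxt->ops->is_canonical_addr() hook so per-access rules (MSR writes, descriptor-table loads) can be applied: the underlying test is still "are the upper bits a sign extension of bit (vaddr_bits - 1)". A standalone version of that check, for illustration only; is_canonical() here is not KVM's helper.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        static bool is_canonical(uint64_t la, unsigned int vaddr_bits)
        {
                /* Canonical <=> bits [63:vaddr_bits-1] all copy the MSB. */
                return (int64_t)(la << (64 - vaddr_bits)) >> (64 - vaddr_bits)
                       == (int64_t)la;
        }

        int main(void)
        {
                printf("%d\n", is_canonical(0x00007fffffffffffULL, 48)); /* 1 */
                printf("%d\n", is_canonical(0xffff800000000000ULL, 48)); /* 1 */
                printf("%d\n", is_canonical(0x0000800000000000ULL, 48)); /* 0 */
                return 0;
        }

— end of aside]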
break; } @@ -3494,7 +3496,8 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->mode == X86EMUL_MODE_PROT64 && - emul_is_noncanonical_address(desc_ptr.address, ctxt)) + emul_is_noncanonical_address(desc_ptr.address, ctxt, + X86EMUL_F_DT_LOAD)) return emulate_gp(ctxt, 0); if (lgdt) ctxt->ops->set_gdt(ctxt, &desc_ptr); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index b1eb46e26b2e..36a8786db291 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -44,6 +44,18 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif /* + * Using the register cache from interrupt context is generally not allowed, as + * caching a register and marking it available/dirty can't be done atomically, + * i.e. accesses from interrupt context may clobber state or read stale data if + * the vCPU task is in the process of updating the cache. The exception is if + * KVM is handling a PMI IRQ/NMI VM-Exit, as that bound code sequence doesn't + * touch the cache, it runs after the cache is reset (post VM-Exit), and PMIs + * need to access several registers that are cacheable. + */ +#define kvm_assert_register_caching_allowed(vcpu) \ + lockdep_assert_once(in_task() || kvm_arch_pmi_in_guest(vcpu)) + +/* * avail dirty * 0 0 register in VMCS/VMCB * 0 1 *INVALID* @@ -53,24 +65,28 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15) static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } @@ -84,6 +100,7 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 55a18e2f2dcd..10495fffb890 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -94,6 +94,8 @@ struct x86_instruction_info { #define X86EMUL_F_FETCH BIT(1) #define X86EMUL_F_IMPLICIT BIT(2) #define X86EMUL_F_INVLPG BIT(3) +#define X86EMUL_F_MSR BIT(4) +#define X86EMUL_F_DT_LOAD BIT(5) struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); @@ -235,6 +237,9 @@ struct x86_emulate_ops { gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, unsigned int flags); + + bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, + unsigned int flags); }; /* Type, address-of, and value of an instruction's operand. 
*/ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 95c6beb8ce27..3c83951c619e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -382,7 +382,7 @@ enum { DIRTY }; -void kvm_recalculate_apic_map(struct kvm *kvm) +static void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; @@ -2577,7 +2577,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) return (tpr & 0xf0) >> 4; } -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) +static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value) { u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; @@ -2625,6 +2625,31 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } } +int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) +{ + enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); + enum lapic_mode new_mode = kvm_apic_mode(value); + + if (vcpu->arch.apic_base == value) + return 0; + + u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | + (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); + + if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) + return 1; + if (!host_initiated) { + if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) + return 1; + if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) + return 1; + } + + __kvm_apic_set_base(vcpu, value); + kvm_recalculate_apic_map(vcpu->kvm); + return 0; +} + void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -2654,7 +2679,6 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) int kvm_alloc_apic_access_page(struct kvm *kvm) { - struct page *page; void __user *hva; int ret = 0; @@ -2670,17 +2694,6 @@ int kvm_alloc_apic_access_page(struct kvm *kvm) goto out; } - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { - ret = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); @@ -2735,7 +2748,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) msr_val |= MSR_IA32_APICBASE_BSP; - kvm_lapic_set_base(vcpu, msr_val); + + /* + * Use the inner helper to avoid an extra recalcuation of the + * optimized APIC map if some other task has dirtied the map. + * The recalculation needed for this vCPU will be done after + * all APIC state has been initialized (see below). 
+ */ + __kvm_apic_set_base(vcpu, msr_val); } if (!apic) @@ -3076,7 +3096,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_x86_call(apicv_pre_state_restore)(vcpu); - kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1b8ef9856422..24add38beaf0 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -95,8 +95,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); -void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, @@ -117,11 +115,9 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); -enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -271,6 +267,11 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base) return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); } +static inline enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) +{ + return kvm_apic_mode(vcpu->arch.apic_base); +} + static inline u8 kvm_xapic_id(struct kvm_lapic *apic) { return kvm_lapic_get_reg(apic, APIC_ID) >> 24; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 9dc5dd43ae7f..e9322358678b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -4,6 +4,7 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" +#include "x86.h" #include "cpuid.h" extern bool __read_mostly enable_mmio_caching; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8e853a5fc867..22e7ad235123 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator { static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); @@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* - * Update the SPTE (excluding the PFN), but do not track changes in its - * accessed/dirty status. +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. 
+ * + * Returns true if the TLB needs to be flushed */ -static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; @@ -498,7 +498,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return old_spte; + return false; } if (!spte_has_volatile_bits(old_spte)) @@ -506,53 +506,10 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - return old_spte; -} - -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote - * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only - * spte, even though the writable spte might be cached on a CPU's TLB. - * - * Returns true if the TLB needs to be flushed - */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) -{ - bool flush = false; - u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); - - if (!is_shadow_present_pte(old_spte)) - return false; - - /* - * For the spte updated out of mmu-lock is safe, since - * we always atomically update it, see the comments in - * spte_has_volatile_bits(). - */ - if (is_mmu_writable_spte(old_spte) && - !is_writable_pte(new_spte)) - flush = true; - - /* - * Flush TLB when accessed/dirty states are changed in the page tables, - * to guarantee consistency between TLB and page tables. - */ - - if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { - flush = true; - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - } - - if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { - flush = true; - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - } - - return flush; + return leaf_spte_change_needs_tlb_flush(old_spte, new_spte); } /* @@ -563,10 +520,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { - kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; - struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -578,24 +533,6 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) return old_spte; kvm_update_page_stats(kvm, level, -1); - - pfn = spte_to_pfn(old_spte); - - /* - * KVM doesn't hold a reference to any pages mapped into the guest, and - * instead uses the mmu_notifier to ensure that KVM unmaps any pages - * before they are reclaimed. Sanity check that, if the pfn is backed - * by a refcounted page, the refcount is elevated. 
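[Editor's aside on the kvm/mmu/mmu.c hunks above, which fold the old no-track variant into mmu_spte_update() and defer the flush decision to leaf_spte_change_needs_tlb_flush(): the policy being centralized is "flush when write access or accessed/dirty state is being taken away". The toy predicate below restates that rule with invented SPTE_* bit names; it is a schematic, not KVM's SPTE layout or its real helper.

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define SPTE_WRITABLE   (1ULL << 1)
        #define SPTE_ACCESSED   (1ULL << 2)
        #define SPTE_DIRTY      (1ULL << 3)

        static bool spte_change_needs_flush(uint64_t old, uint64_t new)
        {
                /* Dropping write access, or clearing A/D state that other CPUs
                 * may still have cached in their TLBs, requires a flush. */
                if ((old & SPTE_WRITABLE) && !(new & SPTE_WRITABLE))
                        return true;
                if ((old & SPTE_ACCESSED) && !(new & SPTE_ACCESSED))
                        return true;
                if ((old & SPTE_DIRTY) && !(new & SPTE_DIRTY))
                        return true;
                return false;
        }

        int main(void)
        {
                uint64_t rw = SPTE_WRITABLE | SPTE_ACCESSED | SPTE_DIRTY;

                printf("write-protect: %d\n",
                       spte_change_needs_flush(rw, rw & ~SPTE_WRITABLE)); /* 1 */
                printf("no-op update:  %d\n",
                       spte_change_needs_flush(rw, rw));                  /* 0 */
                return 0;
        }

— end of aside]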
- */ - page = kvm_pfn_to_refcounted_page(pfn); - WARN_ON_ONCE(page && !page_count(page)); - - if (is_accessed_spte(old_spte)) - kvm_set_pfn_accessed(pfn); - - if (is_dirty_spte(old_spte)) - kvm_set_pfn_dirty(pfn); - return old_spte; } @@ -1250,16 +1187,6 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool spte_wrprot_for_clear_dirty(u64 *sptep) -{ - bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, - (unsigned long *)sptep); - if (was_writable && !spte_ad_enabled(*sptep)) - kvm_set_pfn_dirty(spte_to_pfn(*sptep)); - - return was_writable; -} - /* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and @@ -1275,7 +1202,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) if (spte_ad_need_write_protect(*sptep)) - flush |= spte_wrprot_for_clear_dirty(sptep); + flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); @@ -1640,15 +1568,12 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm, (unsigned long *)sptep); } else { /* - * Capture the dirty status of the page, so that - * it doesn't get lost when the SPTE is marked - * for access tracking. + * WARN if mmu_spte_update() signals the need + * for a TLB flush, as Access tracking a SPTE + * should never trigger an _immediate_ flush. */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); - spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); + WARN_ON_ONCE(mmu_spte_update(sptep, spte)); } young = true; } @@ -1696,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) #endif } -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values. We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) -{ - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); -} - static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, +1); + kvm->arch.n_used_mmu_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, -1); + kvm->arch.n_used_mmu_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } @@ -2802,7 +2715,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) * be write-protected. */ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch) + gfn_t gfn, bool synchronizing, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; @@ -2817,12 +2730,12 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, /* * The page is not write-tracked, mark existing shadow pages unsync - * unless KVM is synchronizing an unsync SP (can_unsync = false). In - * that case, KVM must complete emulation of the guest TLB flush before - * allowing shadow pages to become unsync (writable by the guest). + * unless KVM is synchronizing an unsync SP. In that case, KVM must + * complete emulation of the guest TLB flush before allowing shadow + * pages to become unsync (writable by the guest). 
*/ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - if (!can_unsync) + if (synchronizing) return -EPERM; if (sp->unsync) @@ -2926,6 +2839,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (is_shadow_present_pte(*sptep)) { + if (prefetch) + return RET_PF_SPURIOUS; + /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -2945,7 +2861,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, - true, host_writable, &spte); + false, host_writable, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; @@ -2971,32 +2887,51 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, return ret; } -static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *start, u64 *end) +static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep, + int nr_pages, unsigned int access) { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; - unsigned int access = sp->role.access; - int i, ret; - gfn_t gfn; + int i; + + if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM)) + return false; - gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) - return -1; + return false; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); - if (ret <= 0) - return -1; + nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages); + if (nr_pages <= 0) + return false; - for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, slot, start, access, gfn, + for (i = 0; i < nr_pages; i++, gfn++, sptep++) { + mmu_set_spte(vcpu, slot, sptep, access, gfn, page_to_pfn(pages[i]), NULL); - put_page(pages[i]); + + /* + * KVM always prefetches writable pages from the primary MMU, + * and KVM can make its SPTE writable in the fast page handler, + * without notifying the primary MMU. Mark pages/folios dirty + * now to ensure file data is written back if it ends up being + * written by the guest. Because KVM's prefetching GUPs + * writable PTEs, the probability of unnecessary writeback is + * extremely low. 
+ */ + kvm_release_page_dirty(pages[i]); } - return 0; + return true; +} + +static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); + unsigned int access = sp->role.access; + + return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access); } static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, @@ -3014,8 +2949,9 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; - if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + if (!direct_pte_prefetch_many(vcpu, sp, start, spte)) return; + start = NULL; } else if (!start) start = spte; @@ -3165,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) + const struct kvm_memory_slot *slot, gfn_t gfn) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -3322,7 +3257,6 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; - fault->hva = KVM_HVA_ERR_BAD; /* * If MMIO caching is disabled, emulate immediately without @@ -3392,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) - return !kvm_ad_enabled(); + return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore @@ -3419,7 +3353,7 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * - * Compare with set_spte where instead shadow_dirty_mask is set. + * Compare with make_spte() where instead shadow_dirty_mask is set. */ if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; @@ -3527,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. 
*/ - if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) - new_spte = restore_acc_track_spte(new_spte); + if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte) | + shadow_accessed_mask; /* * To keep things simple, only SPTEs that are MMU-writable can @@ -4376,8 +4311,15 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, return max_level; } -static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, int r) +{ + kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, + r == RET_PF_RETRY, fault->map_writable); +} + +static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { int max_order, r; @@ -4387,7 +4329,7 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, - &max_order); + &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; @@ -4400,19 +4342,26 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { - bool async; + unsigned int foll = fault->write ? FOLL_WRITE : 0; if (fault->is_private) - return kvm_faultin_pfn_private(vcpu, fault); + return kvm_mmu_faultin_pfn_private(vcpu, fault); - async = false; - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, - &async, fault->write, - &fault->map_writable, &fault->hva); - if (!async) - return RET_PF_CONTINUE; /* *pfn has correct page already */ + foll |= FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); + + /* + * If resolving the page failed because I/O is needed to fault-in the + * page, then either set up an asynchronous #PF to do the I/O, or if + * doing an async #PF isn't possible, retry with I/O allowed. All + * other failures are terminal, i.e. retrying won't help. + */ + if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) + return RET_PF_CONTINUE; if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); @@ -4430,14 +4379,16 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. 
*/ - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, - NULL, fault->write, - &fault->map_writable, &fault->hva); + foll |= FOLL_INTERRUPTIBLE; + foll &= ~FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); + return RET_PF_CONTINUE; } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; int ret; @@ -4520,7 +4471,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; - ret = __kvm_faultin_pfn(vcpu, fault); + ret = __kvm_mmu_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; @@ -4538,7 +4489,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * mmu_lock is acquired. */ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { - kvm_release_pfn_clean(fault->pfn); + kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); return RET_PF_RETRY; } @@ -4597,7 +4548,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; @@ -4614,8 +4565,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = direct_map(vcpu, fault); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } @@ -4688,7 +4639,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; @@ -4701,8 +4652,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); read_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } #endif @@ -5488,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; - role.ad_disabled = !kvm_ad_enabled(); + role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; @@ -6228,7 +6179,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. 
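/*
 * Editorial sketch (not part of the patch): a stand-alone illustration of
 * the canonical-address rule behind the is_noncanonical_*() helpers this
 * series introduces.  An address is canonical for N virtual-address bits
 * when bits [63:N-1] all equal bit N-1, i.e. sign-extending the low N
 * bits reproduces the value; N is 48 without LA57 and 57 with it.  The
 * toy_* names below are hypothetical.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool toy_is_canonical(uint64_t la, unsigned int vaddr_bits)
{
	unsigned int shift = 64 - vaddr_bits;

	/* Sign-extend the low vaddr_bits bits and compare with the original. */
	return (uint64_t)((int64_t)(la << shift) >> shift) == la;
}

int main(void)
{
	assert(toy_is_canonical(0x00007fffffffffffULL, 48));	/* top of lower half */
	assert(toy_is_canonical(0xffff800000000000ULL, 48));	/* bottom of upper half */
	assert(!toy_is_canonical(0x0000800000000000ULL, 48));	/* in the hole: non-canonical */
	assert(toy_is_canonical(0x0000800000000000ULL, 57));	/* canonical once LA57 widens N */
	return 0;
}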
*/ - if (is_noncanonical_address(addr, vcpu)) + if (is_noncanonical_invlpg_address(addr, vcpu)) return; kvm_x86_call(flush_tlb_gva)(vcpu, addr); @@ -6416,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + LIST_HEAD(invalid_list); bool unstable; + lockdep_assert_held(&kvm->slots_lock); + restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { @@ -6449,7 +6403,7 @@ restart: } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped); + &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) @@ -6465,7 +6419,7 @@ restart: * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -6528,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_tdp_mmu_zap_invalidated_roots(kvm); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} - void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); @@ -6771,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm, continue; } - spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + spte = make_small_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } @@ -6954,8 +6902,7 @@ restart: * mapping if the indirect sp has level = 1. */ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - PG_LEVEL_NUM)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) @@ -6983,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, kvm_flush_remote_tlbs_memslot(kvm, slot); } -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -6994,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } @@ -7149,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) } } -static unsigned long mmu_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct kvm *kvm; - int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; - - mutex_lock(&kvm_lock); - - list_for_each_entry(kvm, &vm_list, vm_list) { - int idx; - - /* - * Never scan more than sc->nr_to_scan VM instances. - * Will not hit this condition practically since we do not try - * to shrink more than one VM and it is very unlikely to see - * !n_used_mmu_pages so many times. - */ - if (!nr_to_scan--) - break; - /* - * n_used_mmu_pages is accessed without holding kvm->mmu_lock - * here. We may skip a VM instance errorneosly, but we do not - * want to shrink a VM that only started to populate its MMU - * anyway. 
- */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) - continue; - - idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); - - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - - freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); - -unlock: - write_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - /* - * unfair on small ones - * per-vm shrinkers cry out - * sadness comes quickly - */ - list_move_tail(&kvm->vm_list, &vm_list); - break; - } - - mutex_unlock(&kvm_lock); - return freed; -} - -static unsigned long mmu_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); -} - -static struct shrinker *mmu_shrinker; - static void mmu_destroy_caches(void) { kmem_cache_destroy(pte_list_desc_cache); @@ -7281,7 +7162,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -7341,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void) if (!mmu_page_header_cache) goto out; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto out; - - mmu_shrinker = shrinker_alloc(0, "x86-mmu"); - if (!mmu_shrinker) - goto out_shrinker; - - mmu_shrinker->count_objects = mmu_shrink_count; - mmu_shrinker->scan_objects = mmu_shrink_scan; - mmu_shrinker->seeks = DEFAULT_SEEKS * 10; - - shrinker_register(mmu_shrinker); - return 0; -out_shrinker: - percpu_counter_destroy(&kvm_total_used_mmu_pages); out: mmu_destroy_caches(); return ret; @@ -7374,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - shrinker_free(mmu_shrinker); } /* @@ -7427,7 +7291,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } @@ -7530,62 +7394,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_huge_page_recovery_timeout(u64 start_time) +static void kvm_nx_huge_page_recovery_worker_kill(void *data) { - bool enabled; - uint period; - - enabled = calc_nx_huge_pages_recovery_period(&period); - - return enabled ? 
start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; } -static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) +static bool kvm_nx_huge_page_recovery_worker(void *data) { - u64 start_time; + struct kvm *kvm = data; + bool enabled; + uint period; long remaining_time; - while (true) { - start_time = get_jiffies_64(); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop() && remaining_time > 0) { - schedule_timeout(remaining_time); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); - set_current_state(TASK_INTERRUPTIBLE); - } - - set_current_state(TASK_RUNNING); - - if (kthread_should_stop()) - return 0; + enabled = calc_nx_huge_pages_recovery_period(&period); + if (!enabled) + return false; - kvm_recover_nx_huge_pages(kvm); + remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period) + - get_jiffies_64(); + if (remaining_time > 0) { + schedule_timeout(remaining_time); + /* check for signals and come back */ + return true; } + + __set_current_state(TASK_RUNNING); + kvm_recover_nx_huge_pages(kvm); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + return true; } int kvm_mmu_post_init_vm(struct kvm *kvm) { - int err; - if (nx_hugepage_mitigation_hard_disabled) return 0; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, - "kvm-nx-lpage-recovery", - &kvm->arch.nx_huge_page_recovery_thread); - if (!err) - kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( + kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - return err; + if (!kvm->arch.nx_huge_page_recovery_thread) + return -ENOMEM; + + vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); + return 0; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) - kthread_stop(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread); } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index c98827840e07..b00abbe3f6cf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -164,7 +164,7 @@ static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) } int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch); + gfn_t gfn, bool synchronizing, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); @@ -235,10 +235,10 @@ struct kvm_page_fault { /* The memslot containing gfn. May be NULL. */ struct kvm_memory_slot *slot; - /* Outputs of kvm_faultin_pfn. */ + /* Outputs of kvm_mmu_faultin_pfn(). 
*/ unsigned long mmu_seq; kvm_pfn_t pfn; - hva_t hva; + struct page *refcounted_page; bool map_writable; /* @@ -313,7 +313,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .is_private = err & PFERR_PRIVATE_ACCESS, .pfn = KVM_PFN_ERR_FAULT, - .hva = KVM_HVA_ERR_BAD, }; int r; @@ -347,8 +346,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level); + const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ae7d39ff2d07..f4711674c47b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -533,10 +533,8 @@ static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte) { - struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; - kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; @@ -545,17 +543,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK); - if (!slot) - return false; - - pfn = gfn_to_pfn_memslot_atomic(slot, gfn); - if (is_error_pfn(pfn)) - return false; - - mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); - kvm_release_pfn_clean(pfn); - return true; + return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -813,7 +801,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); + r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -848,8 +836,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = FNAME(fetch)(vcpu, fault, &walker); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } @@ -892,9 +880,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is - * safe because: - * - The spte has a reference to the struct page, so the pfn for a given gfn - * can't change unless all sptes pointing to it are nuked first. + * safe because SPTEs are protected by mmu_notifiers and memslot generations, so + * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are + * nuked first. * * Returns * < 0: failed to sync spte @@ -963,9 +951,14 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int host_writable = spte & shadow_host_writable_mask; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); make_spte(vcpu, sp, slot, pte_access, gfn, - spte_to_pfn(spte), spte, true, false, + spte_to_pfn(spte), spte, true, true, host_writable, &spte); + /* + * There is no need to mark the pfn dirty, as the new protections must + * be a subset of the old protections, i.e. synchronizing a SPTE cannot + * change the SPTE from read-only to writable. 
+ */ return mmu_spte_update(sptep, spte); } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 8f7eb3ad88fc..22551e2f1d00 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); EXPORT_SYMBOL_GPL(enable_mmio_caching); +bool __read_mostly kvm_ad_enabled; + u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; @@ -133,12 +135,6 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) */ bool spte_has_volatile_bits(u64 spte) { - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. - */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; @@ -157,7 +153,7 @@ bool spte_has_volatile_bits(u64 spte) bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte) { int level = sp->role.level; @@ -178,8 +174,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; - if (!prefetch) - spte |= spte_shadow_accessed_mask(spte); + if (!prefetch || synchronizing) + spte |= shadow_accessed_mask; /* * For simplicity, enforce the NX huge page mitigation even if not @@ -223,41 +219,39 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. - * Same reasoning can be applied to dirty page accounting. - */ - if (is_writable_pte(old_spte)) - goto out; - /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. + * + * When overwriting an existing leaf SPTE, and the old SPTE was + * writable, skip trying to unsync shadow pages as any relevant + * shadow pages must already be unsync, i.e. the hash lookup is + * unnecessary (and expensive). Note, this relies on KVM not + * changing PFNs without first zapping the old SPTE, which is + * guaranteed by both the shadow MMU and the TDP MMU. 
*/ - if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { + if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && + mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) wrprot = true; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); - } + else + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | + shadow_dirty_mask; } - if (pte_access & ACC_WRITE_MASK) - spte |= spte_shadow_dirty_mask(spte); - -out: - if (prefetch) + if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + /* + * Mark the memslot dirty *after* modifying it for access tracking. + * Unlike folios, memslots can be safely marked dirty out of mmu_lock, + * i.e. in the fast page fault handler. + */ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ WARN_ON_ONCE(level > PG_LEVEL_4K); @@ -268,15 +262,15 @@ out: return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + KVM_MMU_WARN_ON(set & clear); + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -284,6 +278,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. @@ -291,8 +295,8 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index) +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { u64 child_spte = huge_spte; @@ -320,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. 
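/*
 * Editorial sketch (not part of the patch): the address math behind the
 * "clear the lower address bits" step described above, with assumed
 * x86-style constants (4KiB base pages, 512 entries per level).  The
 * attribute bits below bit 12 are preserved; only the address bits inside
 * the huge-page region are cleared so the entry points at the aligned
 * start of the region.  The real helper also sets the page-size bit and
 * may clear X, which this toy omits.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT	12
#define TOY_PAGE_MASK	(~((1ULL << TOY_PAGE_SHIFT) - 1))

/* Bytes covered by one entry at @level: 4KiB, 2MiB, 1GiB for levels 1..3. */
static uint64_t toy_hpage_size(int level)
{
	return 1ULL << (TOY_PAGE_SHIFT + 9 * (level - 1));
}

static uint64_t toy_make_huge(uint64_t small_pte, int level)
{
	uint64_t hpage_mask = ~(toy_hpage_size(level) - 1);

	/* Keep the low attribute bits, clear the sub-page address bits. */
	return small_pte & (hpage_mask | ~TOY_PAGE_MASK);
}

int main(void)
{
	/* A 4KiB mapping inside a 2MiB region, with attribute bits 0x63. */
	uint64_t small = 0x123456000ULL | 0x63;
	uint64_t huge = toy_make_huge(small, 2);

	assert((huge & TOY_PAGE_MASK) == 0x123400000ULL);	/* 2MiB-aligned address */
	assert((huge & ~TOY_PAGE_MASK) == 0x63);		/* attributes preserved */
	printf("small=%#llx huge=%#llx\n",
	       (unsigned long long)small, (unsigned long long)huge);
	return 0;
}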
+ */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { @@ -352,7 +376,7 @@ u64 mark_spte_for_access_track(u64 spte) spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; - spte &= ~shadow_acc_track_mask; + spte &= ~(shadow_acc_track_mask | shadow_accessed_mask); return spte; } @@ -422,9 +446,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { + kvm_ad_enabled = has_ad_bits; + shadow_user_mask = VMX_EPT_READABLE_MASK; - shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; - shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_accessed_mask = VMX_EPT_ACCESS_BIT; + shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ @@ -455,6 +481,8 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; + kvm_ad_enabled = true; + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 2cb816ea2430..f332b33bc817 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -167,6 +167,15 @@ static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); #define SHADOW_NONPRESENT_VALUE 0ULL #endif + +/* + * True if A/D bits are supported in hardware and are enabled by KVM. When + * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable + * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario + * where KVM is using A/D bits for L1, but not L2. + */ +extern bool __read_mostly kvm_ad_enabled; + extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; @@ -285,17 +294,6 @@ static inline bool is_ept_ve_possible(u64 spte) (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; } -/* - * Returns true if A/D bits are supported in hardware and are enabled by KVM. - * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the - * scenario where KVM is using A/D bits for L1, but not L2. - */ -static inline bool kvm_ad_enabled(void) -{ - return !!shadow_accessed_mask; -} - static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -318,18 +316,6 @@ static inline bool spte_ad_need_write_protect(u64 spte) return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - static inline bool is_access_track_spte(u64 spte) { return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; @@ -357,17 +343,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte) static inline bool is_accessed_spte(u64 spte) { - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? 
spte & accessed_mask - : !is_access_track_spte(spte); -} - -static inline bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; + return spte & shadow_accessed_mask; } static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, @@ -485,6 +461,33 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for + * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, + * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM + * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, + * not whether or not SPTEs were modified, i.e. only the write-tracking case + * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. + * + * Don't flush if the Accessed bit is cleared, as access tracking tolerates + * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a + * mmu_notifier.clear_flush_young() event. + * + * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally + * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and + * when clearing dirty logs, KVM flushes based on whether or not dirty entries + * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. + * + * Note, this logic only applies to shadow-present leaf SPTEs. The caller is + * responsible for checking that the old SPTE is shadow-present, and is also + * responsible for determining whether or not a TLB flush is required when + * modifying a shadow-present non-leaf SPTE. + */ +static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) +{ + return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); +} + static inline u64 get_mmio_spte_generation(u64 spte) { u64 gen; @@ -499,10 +502,11 @@ bool spte_has_volatile_bits(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index); +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 3b996c1fdaab..4508d868f1cd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -511,10 +511,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (is_leaf != was_leaf) kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); - if (was_leaf && is_dirty_spte(old_spte) && - (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - /* * Recursively handle child PTs if the change removed a subtree from * the paging structure. 
Note the WARN on the PFN changing without the @@ -524,10 +520,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_present && !was_leaf && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); - - if (was_leaf && is_accessed_spte(old_spte) && - (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, @@ -591,48 +583,6 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, return 0; } -static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) -{ - int ret; - - lockdep_assert_held_read(&kvm->mmu_lock); - - /* - * Freeze the SPTE by setting it to a special, non-present value. This - * will stop other threads from immediately installing a present entry - * in its place before the TLBs are flushed. - * - * Delay processing of the zapped SPTE until after TLBs are flushed and - * the FROZEN_SPTE is replaced (see below). - */ - ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE); - if (ret) - return ret; - - kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); - - /* - * No other thread can overwrite the frozen SPTE as they must either - * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special frozen SPTE value. Use the raw write helper to - * avoid an unnecessary check on volatile bits. - */ - __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); - - /* - * Process the zapped SPTE after flushing TLBs, and after replacing - * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are - * blocked by the FROZEN_SPTE and reduces contention on the child - * SPTEs. - */ - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - SHADOW_NONPRESENT_VALUE, iter->level, true); - - return 0; -} - - /* * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance @@ -688,6 +638,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) +static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, + struct tdp_iter *iter) +{ + if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) + return false; + + /* Ensure forward progress has been made before yielding. */ + return iter->next_last_level_gfn != iter->yielded_gfn; +} + /* * Yield if the MMU lock is contended or this thread needs to return control * to the scheduler. @@ -706,31 +666,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON_ONCE(iter->yielded); + KVM_MMU_WARN_ON(iter->yielded); - /* Ensure forward progress has been made before yielding. 
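/*
 * Editorial sketch (not part of the patch): the forward-progress rule used
 * by the tdp_mmu_iter_need_resched() helper added above, reduced to a toy
 * cursor.  A walker only offers to yield when somebody is waiting *and* it
 * has advanced past the point where it last yielded, so lock contention
 * cannot stall it on the same position forever.  All names here are
 * hypothetical.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_iter {
	uint64_t next_gfn;	/* next position the walk will visit */
	uint64_t yielded_gfn;	/* position when the walk last yielded */
};

static bool toy_need_resched(bool contended, const struct toy_iter *iter)
{
	if (!contended)
		return false;

	/* Ensure forward progress has been made before yielding. */
	return iter->next_gfn != iter->yielded_gfn;
}

static void toy_yield(struct toy_iter *iter)
{
	iter->yielded_gfn = iter->next_gfn;	/* remember where we yielded */
	/* a real implementation would drop the lock / reschedule here */
}

int main(void)
{
	struct toy_iter iter = { .next_gfn = 0, .yielded_gfn = 0 };

	assert(!toy_need_resched(true, &iter));		/* no progress yet */
	iter.next_gfn = 16;				/* ...walk advances... */
	assert(toy_need_resched(true, &iter));
	toy_yield(&iter);
	assert(!toy_need_resched(true, &iter));		/* must advance again first */
	return 0;
}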
*/ - if (iter->next_last_level_gfn == iter->yielded_gfn) + if (!tdp_mmu_iter_need_resched(kvm, iter)) return false; - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush) - kvm_flush_remote_tlbs(kvm); - - rcu_read_unlock(); + if (flush) + kvm_flush_remote_tlbs(kvm); - if (shared) - cond_resched_rwlock_read(&kvm->mmu_lock); - else - cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_unlock(); - rcu_read_lock(); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); - WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); + rcu_read_lock(); - iter->yielded = true; - } + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); - return iter->yielded; + iter->yielded = true; + return true; } static inline gfn_t tdp_mmu_max_gfn_exclusive(void) @@ -1026,19 +982,23 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; + if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) + return RET_PF_SPURIOUS; + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, - fault->pfn, iter->old_spte, fault->prefetch, true, - fault->map_writable, &new_spte); + fault->pfn, iter->old_spte, fault->prefetch, + false, fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && - !is_last_spte(iter->old_spte, iter->level)) + (!is_last_spte(iter->old_spte, iter->level) || + WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte)))) kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* @@ -1078,7 +1038,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { - u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); + u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled); int ret = 0; if (shared) { @@ -1195,33 +1155,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, return flush; } -typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range); - -static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, - struct kvm_gfn_range *range, - tdp_handler_t handler) -{ - struct kvm_mmu_page *root; - struct tdp_iter iter; - bool ret = false; - - /* - * Don't support rescheduling, none of the MMU notifiers that funnel - * into this helper allow blocking; it'd be dead, wasteful code. - */ - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { - rcu_read_lock(); - - tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) - ret |= handler(kvm, &iter, range); - - rcu_read_unlock(); - } - - return ret; -} - /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. @@ -1230,15 +1163,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. 
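/*
 * Editorial sketch (not part of the patch): the clear-young / test-young
 * behaviour described in the comment above, and the single walk with a
 * test_only flag that the following hunk introduces, reduced to a flat
 * array of toy PTEs with an assumed accessed-bit position.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define TOY_ACCESSED	(1ULL << 5)	/* assumed accessed-bit position */

static bool toy_age_range(uint64_t *ptes, size_t start, size_t end, bool test_only)
{
	bool young = false;

	for (size_t i = start; i < end; i++) {
		if (!(ptes[i] & TOY_ACCESSED))
			continue;

		if (test_only)
			return true;	/* caller only wants a yes/no answer */

		young = true;
		ptes[i] &= ~TOY_ACCESSED;	/* "age" the entry */
	}
	return young;
}

int main(void)
{
	uint64_t ptes[8] = { 0, TOY_ACCESSED, 0, TOY_ACCESSED, 0, 0, 0, 0 };

	assert(toy_age_range(ptes, 0, 8, true));	/* something is young */
	assert(toy_age_range(ptes, 0, 8, false));	/* clear accessed bits */
	assert(!toy_age_range(ptes, 0, 8, true));	/* nothing young anymore */
	return 0;
}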
*/ -static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) { u64 new_spte; - /* If we have a non-accessed entry we don't need to change the pte. */ - if (!is_accessed_spte(iter->old_spte)) - return false; - if (spte_ad_enabled(iter->old_spte)) { iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, iter->old_spte, @@ -1246,13 +1174,6 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, iter->level); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(iter->old_spte)) - kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); - new_spte = mark_spte_for_access_track(iter->old_spte); iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, iter->old_spte, new_spte, @@ -1261,23 +1182,48 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, iter->old_spte, new_spte); - return true; } -bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) { - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; + + /* + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. Note, + * this helper must NOT be used to unmap GFNs, as it processes only + * valid roots! + */ + for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) { + guard(rcu)(); + + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { + if (!is_accessed_spte(iter.old_spte)) + continue; + + if (test_only) + return true; + + ret = true; + kvm_tdp_mmu_age_spte(&iter); + } + } + + return ret; } -static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return is_accessed_spte(iter->old_spte); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, false); } bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, true); } /* @@ -1368,7 +1314,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); + sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1501,16 +1447,15 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) * from level, so it is valid to key off any shadow page to determine if * write protection is needed for an entire tree. */ - return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); + return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled; } -static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) +static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) { const u64 dbit = tdp_mmu_need_write_protect(root) ? 
PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; - bool spte_set = false; rcu_read_lock(); @@ -1531,31 +1476,24 @@ retry: if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; - - spte_set = true; } rcu_read_unlock(); - return spte_set; } /* * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the - * memslot. Returns true if an SPTE has been changed and the TLBs need to be - * flushed. + * memslot. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - return spte_set; + clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); } static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, @@ -1593,7 +1531,6 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, iter.old_spte, iter.old_spte & ~dbit); - kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); } rcu_read_unlock(); @@ -1615,21 +1552,55 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static int tdp_mmu_make_huge_spte(struct kvm *kvm, + struct tdp_iter *parent, + u64 *huge_spte) +{ + struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte); + gfn_t start = parent->gfn; + gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * Use the parent iterator when checking for forward progress so + * that KVM doesn't get stuck continuously trying to yield (i.e. + * returning -EAGAIN here and then failing the forward progress + * check in the caller ad nauseam). + */ + if (tdp_mmu_iter_need_resched(kvm, parent)) + return -EAGAIN; + + *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level); + return 0; + } + + return -ENOENT; +} + +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; + bool flush = false; + u64 huge_spte; + int r; + + if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot))) + return; rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; + } if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || !is_shadow_present_pte(iter.old_spte)) @@ -1653,31 +1624,40 @@ retry: if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, PG_LEVEL_NUM); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); if (max_mapping_level < iter.level) continue; - /* Note, a successful atomic zap also does a remote TLB flush. 
*/ - if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte); + if (r == -EAGAIN) + goto retry; + else if (r) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) goto retry; + + flush = true; } + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, slot); + rcu_read_unlock(); } /* - * Zap non-leaf SPTEs (and free their associated page tables) which could - * be replaced by huge pages, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - zap_collapsible_spte_range(kvm, root, slot); + recover_huge_pages_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1b74e058a81c..f03ca0dd13d9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -34,14 +34,14 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 05490b9d8a43..6f74e2b27c1e 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -19,6 +19,7 @@ #include <asm/mtrr.h> #include "cpuid.h" +#include "x86.h" static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr) { diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 0d17d6b70639..e46220ece83c 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -46,6 +46,7 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) +#define X86_FEATURE_AVX_VNNI_INT16 KVM_X86_FEATURE(CPUID_7_1_EDX, 10) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) #define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cf84103ce38b..b708bdf7eaff 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -926,7 +926,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -1130,7 +1130,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); nested_svm_transition_tlb_flush(vcpu); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 92d4711fd1e4..943bd074a5d3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3458,7 +3458,7 @@ 
void sev_es_unmap_ghcb(struct vcpu_svm *svm) sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); svm->sev_es.ghcb = NULL; } @@ -3839,6 +3839,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); struct kvm_memory_slot *slot; + struct page *page; kvm_pfn_t pfn; slot = gfn_to_memslot(vcpu->kvm, gfn); @@ -3849,7 +3850,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * The new VMSA will be private memory guest memory, so * retrieve the PFN from the gmem backend. */ - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) return -EINVAL; /* @@ -3878,7 +3879,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * changes then care should be taken to ensure * svm->sev_es.vmsa is pinned through some other means. */ - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); } /* @@ -4678,6 +4679,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) struct kvm_memory_slot *slot; struct kvm *kvm = vcpu->kvm; int order, rmp_level, ret; + struct page *page; bool assigned; kvm_pfn_t pfn; gfn_t gfn; @@ -4704,7 +4706,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) return; } - ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); if (ret) { pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", gpa); @@ -4762,7 +4764,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) out: trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); out_no_trace: - put_page(pfn_to_page(pfn)); + kvm_release_page_unused(page); } static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9df3e1e5ae81..dd15cc635655 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1390,7 +1390,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_vcpu_init_msrpm(vcpu, svm->msrpm); svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + vcpu->arch.microcode_version = 0x01000065; svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; svm->nmi_masked = false; @@ -2299,7 +2301,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -4714,7 +4716,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm_copy_vmrun_state(map_save.hva + 0x400, &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); return 0; } @@ -4774,9 +4776,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) svm->nested.nested_run_pending = 1; unmap_save: - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); unmap_map: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -5031,6 +5033,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, + .get_cpl_no_cache = svm_get_cpl, .get_cs_db_l_bits = 
svm_get_cs_db_l_bits, .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index fab6a1ad98dc..fa41d036acd4 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -4,6 +4,7 @@ #include <linux/errno.h> #include <linux/smp.h> +#include "x86.h" #include "../cpuid.h" #include "hyperv.h" #include "nested.h" diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 7668e2fb8043..92d35cc6cd15 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -50,6 +50,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .get_segment = vmx_get_segment, .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, + .get_cpl_no_cache = vmx_get_cpl_no_cache, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .is_valid_cr0 = vmx_is_valid_cr0, .set_cr0 = vmx_set_cr0, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 931a7361c30f..aa78b6f38dfe 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7,6 +7,7 @@ #include <asm/debugreg.h> #include <asm/mmu_context.h> +#include "x86.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" @@ -16,7 +17,6 @@ #include "sgx.h" #include "trace.h" #include "vmx.h" -#include "x86.h" #include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; @@ -231,11 +231,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - if (nested_vmx_is_evmptr12_valid(vmx)) { - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs = NULL; - } - + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); + vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; if (hv_vcpu) { @@ -317,6 +314,16 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vcpu->arch.regs_dirty = 0; } +static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); + vmx->nested.pi_desc = NULL; +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -349,15 +356,8 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* - * Unpin physical memory we referred to in the vmcs02. The APIC access - * page's backing page (yeah, confusing) shouldn't actually be accessed, - * and if it is written, the contents are irrelevant. - */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + + nested_put_vmcs12_pages(vcpu); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -624,7 +624,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; + struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. 
*/ if (!cpu_has_vmx_msr_bitmap() || @@ -647,10 +647,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) + if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; - msr_bitmap_l1 = (unsigned long *)map->hva; + msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -714,7 +714,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -3010,6 +3010,17 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, return 0; } +static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) +{ + /* + * Check that the given linear address is canonical after a VM exit + * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. + */ + u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; + + return !__is_canonical_address(la, l1_address_bits_on_exit); +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -3020,8 +3031,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) + if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || + CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && @@ -3055,12 +3066,12 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) + if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || + CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) return -EINVAL; /* @@ -3178,7 +3189,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || + (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; @@ -5027,11 +5038,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_update_cpu_dirty_logging(vcpu); } - /* Unpin physical memory we referred to in vmcs02 */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - 
vmx->nested.pi_desc = NULL; + nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { vmx->nested.reload_vmcs01_apic_access_page = false; @@ -5167,7 +5174,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * non-canonical form. This is the only check on the memory * destination for long mode! */ - exn = is_noncanonical_address(*ret, vcpu); + exn = is_noncanonical_address(*ret, vcpu, 0); } else { /* * When not in long mode, the virtual/linear address is @@ -5978,7 +5985,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * invalidation. */ if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) + is_noncanonical_invlpg_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 83382a4d1d66..9c9d4a336166 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -365,7 +365,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_DS_AREA: - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; pmu->ds_area = data; diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index a3c3d2a51f47..b352a3ba7354 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -4,12 +4,11 @@ #include <asm/sgx.h> -#include "cpuid.h" +#include "x86.h" #include "kvm_cache_regs.h" #include "nested.h" #include "sgx.h" #include "vmx.h" -#include "x86.h" bool __read_mostly enable_sgx = 1; module_param_named(sgx, enable_sgx, bool, 0444); @@ -38,7 +37,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, fault = true; } else if (likely(is_64_bit_mode(vcpu))) { *gva = vmx_get_untagged_addr(vcpu, *gva, 0); - fault = is_noncanonical_address(*gva, vcpu); + fault = is_noncanonical_address(*gva, vcpu, 0); } else { *gva &= 0xffffffff; fault = (s.unusable) || diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d28618e9277e..893366e53732 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -483,10 +483,9 @@ noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) ext, vpid, gva); } -noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) +noinline void invept_error(unsigned long ext, u64 eptp) { - vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", - ext, eptp, gpa); + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); } static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -2285,7 +2284,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) return 1; - if (is_noncanonical_address(data & PAGE_MASK, vcpu) || + if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; @@ -2450,7 +2449,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; @@ -2458,8 +2457,6 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_PERF_CAPABILITIES: - if (data && !vcpu_to_pmu(vcpu)->version) - return 
1; if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) != (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) @@ -2551,28 +2548,6 @@ static bool cpu_has_sgx(void) return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); } -/* - * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they - * can't be used due to errata where VM Exit may incorrectly clear - * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the - * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. - */ -static bool cpu_has_perf_global_ctrl_bug(void) -{ - switch (boot_cpu_data.x86_vfm) { - case INTEL_NEHALEM_EP: /* AAK155 */ - case INTEL_NEHALEM: /* AAP115 */ - case INTEL_WESTMERE: /* AAT100 */ - case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ - case INTEL_NEHALEM_EX: /* BA97 */ - return true; - default: - break; - } - - return false; -} - static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; @@ -2732,6 +2707,27 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, _vmexit_control &= ~x_ctrl; } + /* + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they + * can't be used due to an errata where VM Exit may incorrectly clear + * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. + */ + switch (boot_cpu_data.x86_vfm) { + case INTEL_NEHALEM_EP: /* AAK155 */ + case INTEL_NEHALEM: /* AAP115 */ + case INTEL_WESTMERE: /* AAT100 */ + case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ + case INTEL_NEHALEM_EX: /* BA97 */ + _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; + } + rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. 
*/ @@ -3570,16 +3566,29 @@ u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -int vmx_get_cpl(struct kvm_vcpu *vcpu) +static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int ar; if (unlikely(vmx->rmode.vm86_active)) return 0; - else { - int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return VMX_AR_DPL(ar); - } + + if (no_cache) + ar = vmcs_read32(GUEST_SS_AR_BYTES); + else + ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return VMX_AR_DPL(ar); +} + +int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + return __vmx_get_cpl(vcpu, false); +} + +int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + return __vmx_get_cpl(vcpu, true); } static u32 vmx_segment_access_rights(struct kvm_segment *var) @@ -4422,9 +4431,6 @@ static u32 vmx_vmentry_ctrl(void) VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_IA32E_MODE); - if (cpu_has_perf_global_ctrl_bug()) - vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - return vmentry_ctrl; } @@ -4442,10 +4448,6 @@ static u32 vmx_vmexit_ctrl(void) if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); - - if (cpu_has_perf_global_ctrl_bug()) - vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); @@ -4561,7 +4563,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, * Update the nested MSR settings so that a nested VMM can/can't set * controls for features that are/aren't exposed to the guest. */ - if (nested) { + if (nested && + kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { /* * All features that can be added or removed to VMX MSRs must * be supported in the first place for nested virtualization. @@ -4851,7 +4854,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcs(vmx); - if (nested) + if (nested && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); vcpu_setup_sgx_lepubkeyhash(vcpu); @@ -4864,7 +4868,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; #endif - vcpu->arch.microcode_version = 0x100000000ULL; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; /* @@ -6792,8 +6797,10 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; + struct page *refcounted_page; unsigned long mmu_seq; kvm_pfn_t pfn; + bool writable; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { @@ -6829,30 +6836,30 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) * controls the APIC-access page memslot, and only deletes the memslot * if APICv is permanently inhibited, i.e. the memslot won't reappear. 
*/ - pfn = gfn_to_pfn_memslot(slot, gfn); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); if (is_error_noslot_pfn(pfn)) return; read_lock(&vcpu->kvm->mmu_lock); - if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { + if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - read_unlock(&vcpu->kvm->mmu_lock); - goto out; - } + else + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - read_unlock(&vcpu->kvm->mmu_lock); + /* + * Do not pin the APIC access page in memory so that it can be freely + * migrated, the MMU notifier will call us again if it is migrated or + * swapped out. KVM backs the memslot with anonymous memory, the pfn + * should always point at a refcounted page (if the pfn is valid). + */ + if (!WARN_ON_ONCE(!refcounted_page)) + kvm_release_page_clean(refcounted_page); /* * No need for a manual TLB flush at this point, KVM has already done a * flush if there were SPTEs pointing at the previous page. */ -out: - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - kvm_release_pfn_clean(pfn); + read_unlock(&vcpu->kvm->mmu_lock); } void vmx_hwapic_isr_update(int max_isr) @@ -8400,10 +8407,6 @@ __init int vmx_hardware_setup(void) if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; - if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2325f773a20b..43f573f6ca46 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -200,8 +200,6 @@ struct nested_vmx { struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; - struct kvm_host_map msr_bitmap_map; - struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; @@ -385,6 +383,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); +int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 93e020dc88f6..633c87e2fd92 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -15,7 +15,7 @@ void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); -void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); +void invept_error(unsigned long ext, u64 eptp); #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* @@ -312,13 +312,13 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva); } -static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) +static inline void __invept(unsigned long ext, u64 eptp) { struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - - vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); + u64 eptp; + u64 reserved_0; + } operand = { eptp, 0 }; + vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp); } 
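/*
 * Illustration only, not part of the patch: the INVEPT operand built by
 * __invept() above is the 128-bit descriptor the instruction expects, with
 * the EPT pointer in the low quadword and a second quadword that is
 * reserved and must be zero; that is why the always-zero "gpa" argument
 * could be dropped.  A hypothetical stand-alone mock of that layout, with
 * compile-time checks (names are mine):
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct invept_desc {			/* hypothetical name, mirrors the operand above */
	uint64_t eptp;			/* EPT pointer for single-context invalidation */
	uint64_t reserved_0;		/* reserved, must be zero */
};

static_assert(sizeof(struct invept_desc) == 16, "INVEPT descriptor is 128 bits");
static_assert(offsetof(struct invept_desc, reserved_0) == 8, "reserved quadword follows eptp");

/* Build a descriptor for a given EPTP; the reserved quadword stays zero. */
static inline struct invept_desc make_invept_desc(uint64_t eptp)
{
	return (struct invept_desc){ .eptp = eptp };
}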
static inline void vpid_sync_vcpu_single(int vpid) @@ -355,13 +355,13 @@ static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) static inline void ept_sync_global(void) { - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); + __invept(VMX_EPT_EXTENT_GLOBAL, 0); } static inline void ept_sync_context(u64 eptp) { if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + __invept(VMX_EPT_EXTENT_CONTEXT, eptp); else ept_sync_global(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 83fe0a78146f..2e713480933a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -451,6 +451,7 @@ static const u32 msr_based_features_all_except_vmx[] = { MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, + MSR_PLATFORM_INFO, }; static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + @@ -667,38 +668,6 @@ static void drop_user_return_notifiers(void) kvm_on_user_return(&msrs->urn); } -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic_base; -} - -enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) -{ - return kvm_apic_mode(kvm_get_apic_base(vcpu)); -} -EXPORT_SYMBOL_GPL(kvm_get_apic_mode); - -int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); - enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | - (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); - - if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) - return 1; - if (!msr_info->host_initiated) { - if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) - return 1; - if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) - return 1; - } - - kvm_lapic_set_base(vcpu, msr_info->data); - kvm_recalculate_apic_map(vcpu->kvm); - return 0; -} - /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. * @@ -1706,6 +1675,9 @@ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, case MSR_IA32_PERF_CAPABILITIES: *data = kvm_caps.supported_perf_cap; break; + case MSR_PLATFORM_INFO: + *data = MSR_PLATFORM_INFO_CPUID_FAULT; + break; case MSR_IA32_UCODE_REV: rdmsrl_safe(index, data); break; @@ -1854,7 +1826,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1871,7 +1843,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. 
*/ - data = __canonical_address(data, vcpu_virt_addr_bits(vcpu)); + data = __canonical_address(data, max_host_virt_addr_bits()); break; case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) @@ -2144,8 +2116,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { xfer_to_guest_mode_prepare(); - return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || - xfer_to_guest_mode_work_pending(); + + return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE || + kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } /* @@ -3793,13 +3766,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.microcode_version = data; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; + if (!msr_info->host_initiated || + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; vcpu->arch.arch_capabilities = data; break; case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; + if (!msr_info->host_initiated || + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; + if (data & ~kvm_caps.supported_perf_cap) return 1; @@ -3890,7 +3866,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: - return kvm_set_apic_base(vcpu, msr_info); + return kvm_apic_set_base(vcpu, data, msr_info->host_initiated); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSC_DEADLINE: @@ -4111,9 +4087,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.osvw.status = data; break; case MSR_PLATFORM_INFO: - if (!msr_info->host_initiated || - (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && - cpuid_fault_enabled(vcpu))) + if (!msr_info->host_initiated) return 1; vcpu->arch.msr_platform_info = data; break; @@ -4252,15 +4226,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.microcode_version; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.arch_capabilities; break; case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.perf_capabilities; break; case MSR_IA32_POWER_CTL: @@ -4314,7 +4286,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: - msr_info->data = kvm_get_apic_base(vcpu); + msr_info->data = vcpu->arch.apic_base; break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); @@ -5094,7 +5066,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int idx; if (vcpu->preempted) { - vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu); + /* + * Assume protected guests are in-kernel. Inefficient yielding + * due to false positives is preferable to never yielding due + * to false negatives. 
+ */ + vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected || + !kvm_x86_call(get_cpl_no_cache)(vcpu); /* * Take the srcu lock as memslots will be accessed to check the gfn @@ -8612,6 +8590,12 @@ static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, addr, flags); } +static bool emulator_is_canonical_addr(struct x86_emulate_ctxt *ctxt, + gva_t addr, unsigned int flags) +{ + return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags); +} + static const struct x86_emulate_ops emulate_ops = { .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, @@ -8658,6 +8642,7 @@ static const struct x86_emulate_ops emulate_ops = { .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, .get_untagged_addr = emulator_get_untagged_addr, + .is_canonical_addr = emulator_is_canonical_addr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -10159,7 +10144,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); + kvm_run->apic_base = vcpu->arch.apic_base; kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || @@ -10576,8 +10561,8 @@ static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but * and hardware doesn't support x2APIC virtualization. E.g. some AMD * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in - * this case so that KVM can the AVIC doorbell to inject interrupts to - * running vCPUs, but KVM must not create SPTEs for the APIC base as + * this case so that KVM can use the AVIC doorbell to inject interrupts + * to running vCPUs, but KVM must not create SPTEs for the APIC base as * the vCPU would incorrectly be able to access the vAPIC page via MMIO * despite being in x2APIC mode. For simplicity, inhibiting the APIC * access page is sticky. @@ -10606,11 +10591,11 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, if (!!old != !!new) { /* * Kick all vCPUs before setting apicv_inhibit_reasons to avoid - * false positives in the sanity check WARN in svm_vcpu_run(). + * false positives in the sanity check WARN in vcpu_enter_guest(). * This task will wait for all vCPUs to ack the kick IRQ before * updating apicv_inhibit_reasons, and all other vCPUs will * block on acquiring apicv_update_lock so that vCPUs can't - * redo svm_vcpu_run() without seeing the new inhibit state. + * redo vcpu_enter_guest() without seeing the new inhibit state. 
* * Note, holding apicv_update_lock and taking it in the read * side (handling the request) also prevents other vCPUs from @@ -11711,7 +11696,7 @@ skip_protected_regs: sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; - sregs->apic_base = kvm_get_apic_base(vcpu); + sregs->apic_base = vcpu->arch.apic_base; } static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -11888,16 +11873,13 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, int *mmu_reset_needed, bool update_pdptrs) { - struct msr_data apic_base_msr; int idx; struct desc_ptr dt; if (!kvm_is_valid_sregs(vcpu, sregs)) return -EINVAL; - apic_base_msr.data = sregs->apic_base; - apic_base_msr.host_initiated = true; - if (kvm_set_apic_base(vcpu, &apic_base_msr)) + if (kvm_apic_set_base(vcpu, sregs->apic_base, true)) return -EINVAL; if (vcpu->arch.guest_state_protected) @@ -12299,7 +12281,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); - vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { + vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + } kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; @@ -12313,8 +12299,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (r) goto free_guest_fpu; - vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); - vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_xen_init_vcpu(vcpu); vcpu_load(vcpu); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); @@ -13104,19 +13088,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (!log_dirty_pages) { /* - * Dirty logging tracks sptes in 4k granularity, meaning that - * large sptes have to be split. If live migration succeeds, - * the guest in the source machine will be destroyed and large - * sptes will be created in the destination. However, if the - * guest continues to run in the source machine (for example if - * live migration fails), small sptes will remain around and - * cause bad performance. + * Recover huge page mappings in the slot now that dirty logging + * is disabled, i.e. now that KVM does not have to track guest + * writes at 4KiB granularity. * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Dirty logging might be disabled by userspace if an ongoing VM + * live migration is cancelled and the VM must continue running + * on the source. 
*/ - kvm_mmu_zap_collapsible_sptes(kvm, new); + kvm_mmu_recover_huge_pages(kvm, new); } else { /* * Initially-all-set does not require write protecting any page, @@ -13207,6 +13187,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { + WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)); + if (vcpu->arch.guest_state_protected) return true; @@ -13215,6 +13197,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) { + WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)); + + if (vcpu->arch.guest_state_protected) + return 0; + return kvm_rip_read(vcpu); } @@ -13730,7 +13717,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) * invalidation. */ if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { + is_noncanonical_invlpg_address(operand.gla, vcpu)) { kvm_inject_gp(vcpu, 0); return 1; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a84c48ef5278..ec623d23d13d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,7 @@ #include <asm/pvclock.h> #include "kvm_cache_regs.h" #include "kvm_emulate.h" +#include "cpuid.h" struct kvm_caps { /* control of guest tsc rate supported? */ @@ -233,9 +234,52 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48; } -static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) +static inline u8 max_host_virt_addr_bits(void) { - return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu)); + return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48; +} + +/* + * x86 MSRs which contain linear addresses, x86 hidden segment bases, and + * IDT/GDT bases have static canonicality checks, the size of which depends + * only on the CPU's support for 5-level paging, rather than on the state of + * CR4.LA57. This applies to both WRMSR and to other instructions that set + * their values, e.g. SGDT. + * + * KVM passes through most of these MSRS and also doesn't intercept the + * instructions that set the hidden segment bases. + * + * Because of this, to be consistent with hardware, even if the guest doesn't + * have LA57 enabled in its CPUID, perform canonicality checks based on *host* + * support for 5 level paging. + * + * Finally, instructions which are related to MMU invalidation of a given + * linear address, also have a similar static canonical check on address. + * This allows for example to invalidate 5-level addresses of a guest from a + * host which uses 4-level paging. 
+ */ +static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu, + unsigned int flags) +{ + if (flags & (X86EMUL_F_INVLPG | X86EMUL_F_MSR | X86EMUL_F_DT_LOAD)) + return !__is_canonical_address(la, max_host_virt_addr_bits()); + else + return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu)); +} + +static inline bool is_noncanonical_msr_address(u64 la, struct kvm_vcpu *vcpu) +{ + return is_noncanonical_address(la, vcpu, X86EMUL_F_MSR); +} + +static inline bool is_noncanonical_base_address(u64 la, struct kvm_vcpu *vcpu) +{ + return is_noncanonical_address(la, vcpu, X86EMUL_F_DT_LOAD); +} + +static inline bool is_noncanonical_invlpg_address(u64 la, struct kvm_vcpu *vcpu) +{ + return is_noncanonical_address(la, vcpu, X86EMUL_F_INVLPG); } static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 807a5859a3c4..58f7f2bd535d 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -19,107 +19,6 @@ #include <asm/tlbflush.h> #include <asm/elf.h> -#ifdef CONFIG_HUGETLB_PAGE -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = get_mmap_base(1); - - /* - * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area - * in the full address space. - */ - info.high_limit = in_32bit_syscall() ? - task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); - - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = get_mmap_base(0); - - /* - * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area - * in the full address space. - */ - if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) - info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; - - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE_LOW; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - - if (len > TASK_SIZE) - return -ENOMEM; - - /* No address checking. 
See comment at mmap_address_hint_valid() */ - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr &= huge_page_mask(h); - if (!mmap_address_hint_valid(addr, len)) - goto get_unmapped_area; - - vma = find_vma(mm, addr); - if (!vma || addr + len <= vm_start_gap(vma)) - return addr; - } - -get_unmapped_area: - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); -} -#endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 bool __init arch_hugetlb_valid_size(unsigned long size) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 101725c149c4..c6d29f283001 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1058,18 +1058,53 @@ unsigned long arch_max_swapfile_size(void) #ifdef CONFIG_EXECMEM static struct execmem_info execmem_info __ro_after_init; +#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) +{ + /* fill memory with INT3 instructions */ + if (writeable) + memset(ptr, INT3_INSN_OPCODE, size); + else + text_poke_set(ptr, INT3_INSN_OPCODE, size); +} +#endif + struct execmem_info __init *execmem_arch_setup(void) { unsigned long start, offset = 0; + enum execmem_range_flags flags; + pgprot_t pgprot; if (kaslr_enabled()) offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; start = MODULES_VADDR + offset; + if (IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) { + pgprot = PAGE_KERNEL_ROX; + flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE; + } else { + pgprot = PAGE_KERNEL; + flags = EXECMEM_KASAN_SHADOW; + } + execmem_info = (struct execmem_info){ .ranges = { - [EXECMEM_DEFAULT] = { + [EXECMEM_MODULE_TEXT] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = pgprot, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_KPROBES ... 
EXECMEM_BPF] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_MODULE_DATA] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ff253648706f..01ea7c6df303 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -961,7 +961,7 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; - if (WARN_ON_ONCE(end > PHYSMEM_END)) + if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END)) return -ERANGE; ret = __add_pages(nid, start_pfn, nr_pages, params); @@ -985,22 +985,32 @@ int arch_add_memory(int nid, u64 start, u64 size, return add_pages(nid, start_pfn, nr_pages, params); } -static void __meminit free_pagetable(struct page *page, int order) +static void free_reserved_pages(struct page *page, unsigned long nr_pages) { - unsigned long magic; - unsigned int nr_pages = 1 << order; + while (nr_pages--) + free_reserved_page(page++); +} +static void __meminit free_pagetable(struct page *page, int order) +{ /* bootmem page has reserved flag */ if (PageReserved(page)) { - magic = page->index; - if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + unsigned long nr_pages = 1 << order; +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE + enum bootmem_type type = bootmem_type(page); + + if (type == SECTION_INFO || type == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); - } else - while (nr_pages--) - free_reserved_page(page++); - } else + } else { + free_reserved_pages(page, nr_pages); + } +#else + free_reserved_pages(page, nr_pages); +#endif + } else { free_pages((unsigned long)page_address(page), order); + } } static void __meminit free_hugepage_table(struct page *page, diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index e17e6e27b7ec..11a93542d198 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { .base = &page_offset_base, - .end = &physmem_end, + .end = &direct_map_physmem_end, }, { .base = &vmalloc_base, @@ -62,8 +62,12 @@ static __initdata struct kaslr_memory_region { }, }; -/* The end of the possible address space for physical memory */ -unsigned long physmem_end __ro_after_init; +/* + * The end of the physical address space that can be mapped directly by the + * kernel. This starts out at (1<<MAX_PHYSMEM_BITS) - 1), but KASLR may reduce + * that in order to increase the available entropy for mapping other regions. + */ +unsigned long direct_map_physmem_end __ro_after_init; /* Get size in bytes used by the memory region */ static inline unsigned long get_padding(struct kaslr_memory_region *region) @@ -94,7 +98,7 @@ void __init kernel_randomize_memory(void) BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); /* Preset the end of the possible address space for physical memory */ - physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); + direct_map_physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); if (!kaslr_memory_enabled()) return; @@ -145,7 +149,7 @@ void __init kernel_randomize_memory(void) vaddr += get_padding(&kaslr_regions[i]); /* * KASLR trims the maximum possible size of the - * direct-map. Update the physmem_end boundary. + * direct-map. Update the direct_map_physmem_end boundary. * No rounding required as the region starts * PUD aligned and size is in units of TB. 
*/ diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 44f7b2ea6a07..069e421c2247 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2444,6 +2444,14 @@ int set_direct_map_default_noflush(struct page *page) return __set_pages_p(page, 1); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + if (valid) + return __set_pages_p(page, nr); + + return __set_pages_np(page, nr); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 55c4b07ec1f6..0c316bae1726 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -250,6 +250,125 @@ void __init pci_acpi_crs_quirks(void) pr_info("Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n"); } +/* + * Check if pdev is part of a PCIe switch that is directly below the + * specified bridge. + */ +static bool pcie_switch_directly_under(struct pci_dev *bridge, + struct pci_dev *pdev) +{ + struct pci_dev *parent = pci_upstream_bridge(pdev); + + /* If the device doesn't have a parent, it's not under anything */ + if (!parent) + return false; + + /* + * If the device has a PCIe type, check if it is below the + * corresponding PCIe switch components (if applicable). Then check + * if its upstream port is directly beneath the specified bridge. + */ + switch (pci_pcie_type(pdev)) { + case PCI_EXP_TYPE_UPSTREAM: + return parent == bridge; + + case PCI_EXP_TYPE_DOWNSTREAM: + if (pci_pcie_type(parent) != PCI_EXP_TYPE_UPSTREAM) + return false; + parent = pci_upstream_bridge(parent); + return parent == bridge; + + case PCI_EXP_TYPE_ENDPOINT: + if (pci_pcie_type(parent) != PCI_EXP_TYPE_DOWNSTREAM) + return false; + parent = pci_upstream_bridge(parent); + if (!parent || pci_pcie_type(parent) != PCI_EXP_TYPE_UPSTREAM) + return false; + parent = pci_upstream_bridge(parent); + return parent == bridge; + } + + return false; +} + +static bool pcie_has_usb4_host_interface(struct pci_dev *pdev) +{ + struct fwnode_handle *fwnode; + + /* + * For USB4, the tunneled PCIe Root or Downstream Ports are marked + * with the "usb4-host-interface" ACPI property, so we look for + * that first. This should cover most cases. + */ + fwnode = fwnode_find_reference(dev_fwnode(&pdev->dev), + "usb4-host-interface", 0); + if (!IS_ERR(fwnode)) { + fwnode_handle_put(fwnode); + return true; + } + + /* + * Any integrated Thunderbolt 3/4 PCIe Root Ports from Intel + * before Alder Lake do not have the "usb4-host-interface" + * property so we use their PCI IDs instead. All these are + * tunneled. This list is not expected to grow. 
+ */ + if (pdev->vendor == PCI_VENDOR_ID_INTEL) { + switch (pdev->device) { + /* Ice Lake Thunderbolt 3 PCIe Root Ports */ + case 0x8a1d: + case 0x8a1f: + case 0x8a21: + case 0x8a23: + /* Tiger Lake-LP Thunderbolt 4 PCIe Root Ports */ + case 0x9a23: + case 0x9a25: + case 0x9a27: + case 0x9a29: + /* Tiger Lake-H Thunderbolt 4 PCIe Root Ports */ + case 0x9a2b: + case 0x9a2d: + case 0x9a2f: + case 0x9a31: + return true; + } + } + + return false; +} + +bool arch_pci_dev_is_removable(struct pci_dev *pdev) +{ + struct pci_dev *parent, *root; + + /* pdev without a parent or Root Port is never tunneled */ + parent = pci_upstream_bridge(pdev); + if (!parent) + return false; + root = pcie_find_root_port(pdev); + if (!root) + return false; + + /* Internal PCIe devices are not tunneled */ + if (!root->external_facing) + return false; + + /* Anything directly behind a "usb4-host-interface" is tunneled */ + if (pcie_has_usb4_host_interface(parent)) + return true; + + /* + * Check if this is a discrete Thunderbolt/USB4 controller that is + * directly behind the non-USB4 PCIe Root Port marked as + * "ExternalFacingPort". Those are not behind a PCIe tunnel. + */ + if (pcie_switch_directly_under(root, pdev)) + return false; + + /* PCIe devices after the discrete chip are tunneled */ + return true; +} + #ifdef CONFIG_PCI_MMCONFIG static int check_segment(u16 seg, struct device *dev, char *estr) { diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 98a9bb92d75c..0681ecfe3430 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -757,7 +757,7 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", res); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); - pci_bus_add_resource(dev->bus, res, 0); + pci_bus_add_resource(dev->bus, res); } base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) | diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index fa07c686cbcc..cc5dba738389 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -8,3 +8,4 @@ generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index efdea5d5bca0..644413792bf3 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -103,26 +103,8 @@ typedef struct page *pgtable_t; #define __pgd(x) ((pgd_t) { (x) } ) #define __pgprot(x) ((pgprot_t) { (x) } ) -/* - * Pure 2^n version of get_order - * Use 'nsau' instructions if supported by the processor or the generic version. - */ - -#if XCHAL_HAVE_NSA - -static inline __attribute_const__ int get_order(unsigned long size) -{ - int lz; - asm ("nsau %0, %1" : "=r" (lz) : "r" ((size - 1) >> PAGE_SHIFT)); - return 32 - lz; -} - -#else - # include <asm-generic/getorder.h> -#endif - struct page; struct vm_area_struct; extern void clear_page(void *page); diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h index 797aed7df3dd..6baaeac6248b 100644 --- a/arch/xtensa/include/asm/spinlock_types.h +++ b/arch/xtensa/include/asm/spinlock_types.h @@ -3,7 +3,7 @@ #define __ASM_SPINLOCK_TYPES_H #if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) -# error "please don't include this file directly" +# error "Please do not include this file directly." 
#endif #include <asm-generic/qspinlock_types.h> diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 1ff0c858544f..99d4ccee7f6e 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -113,6 +113,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0
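The MADV_GUARD_INSTALL/MADV_GUARD_REMOVE values added to the alpha and xtensa uapi mman.h headers above expose madvise() guard regions: installing a guard makes any access to the range deliver a fatal signal without allocating backing pages, and removing it turns the range back into ordinary anonymous memory. A minimal userspace sketch of the intended use, assuming a kernel that supports guard regions (the fallback defines only reuse the numbers from the hunks above):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* value added by the hunks above */
#endif
#ifndef MADV_GUARD_REMOVE
#define MADV_GUARD_REMOVE  103
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Turn the second page into a guard region: any access to it now
	 * raises a fatal signal instead of faulting in a zero page. */
	if (madvise(buf + page, page, MADV_GUARD_INSTALL))
		perror("MADV_GUARD_INSTALL (needs guard-region support)");

	strcpy(buf, "pages outside the guard region stay usable");
	puts(buf);

	/* Make the page ordinary anonymous memory again. */
	if (madvise(buf + page, page, MADV_GUARD_REMOVE))
		perror("MADV_GUARD_REMOVE");

	munmap(buf, 4 * page);
	return 0;
}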
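Several of the KVM hunks above replace bare is_noncanonical_address() calls with the new purpose-specific helpers (is_noncanonical_msr_address(), is_noncanonical_base_address(), is_noncanonical_invlpg_address()), so that MSR writes, segment and descriptor-table bases, and INVLPG-style invalidations are checked against the host's maximum virtual-address width, while ordinary accesses keep using the width implied by the guest's CR4.LA57. The underlying canonicality test is simply that bits N-1 through 63 must all match; a stand-alone sketch with a helper name of my own, logically equivalent to the kernel's __is_canonical_address():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: an address is canonical for N implemented virtual
 * address bits when bits N-1 through 63 are either all zero or all one.
 */
static bool is_canonical(uint64_t la, unsigned int vaddr_bits)
{
	uint64_t upper = ~0ULL << (vaddr_bits - 1);	/* bits N-1..63 */

	return (la & upper) == 0 || (la & upper) == upper;
}

int main(void)
{
	uint64_t la = 1ULL << 47;	/* bit 47 set, bits 48..63 clear */

	/* Rejected by a 48-bit (4-level paging) check... */
	printf("48-bit: %s\n", is_canonical(la, 48) ? "canonical" : "non-canonical");
	/* ...accepted by a 57-bit (5-level paging, LA57) check, which is why
	 * the static checks above key off host LA57 support. */
	printf("57-bit: %s\n", is_canonical(la, 57) ? "canonical" : "non-canonical");
	return 0;
}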
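The execmem_fill_trapping_insns() hunk for arch/x86/mm/init.c above pads read-only-execute allocations with INT3 (0xcc), the single-byte x86 breakpoint instruction, so a stray jump into unused executable memory traps instead of running leftover bytes; the kernel uses memset() while the range is still writable and text_poke_set() once it is mapped ROX. A tiny userspace analogue of the writable case (the function name is mine):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define INT3_OPCODE 0xcc	/* x86 breakpoint instruction */

/* Fill the unused tail of a code buffer with trapping instructions. */
static void fill_trapping_tail(uint8_t *buf, size_t used, size_t size)
{
	memset(buf + used, INT3_OPCODE, size - used);
}

int main(void)
{
	uint8_t buf[32];
	static const uint8_t ret_insn[] = { 0xc3 };	/* x86 "ret" */

	memcpy(buf, ret_insn, sizeof(ret_insn));
	fill_trapping_tail(buf, sizeof(ret_insn), sizeof(buf));

	printf("tail byte: %#x\n", buf[sizeof(buf) - 1]);	/* prints 0xcc */
	return 0;
}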