Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig          |   2
-rw-r--r-- | arch/x86/xen/apic.c           |   2
-rw-r--r-- | arch/x86/xen/debugfs.c        |   2
-rw-r--r-- | arch/x86/xen/debugfs.h        |   7
-rw-r--r-- | arch/x86/xen/enlighten.c      | 108
-rw-r--r-- | arch/x86/xen/enlighten_hvm.c  |  15
-rw-r--r-- | arch/x86/xen/enlighten_pv.c   | 104
-rw-r--r-- | arch/x86/xen/enlighten_pvh.c  | 128
-rw-r--r-- | arch/x86/xen/mmu.c            |   3
-rw-r--r-- | arch/x86/xen/mmu.h            |  28
-rw-r--r-- | arch/x86/xen/mmu_hvm.c        |   2
-rw-r--r-- | arch/x86/xen/mmu_pv.c         | 103
-rw-r--r-- | arch/x86/xen/multicalls.c     | 135
-rw-r--r-- | arch/x86/xen/multicalls.h     |  69
-rw-r--r-- | arch/x86/xen/p2m.c            | 123
-rw-r--r-- | arch/x86/xen/pmu.c            |   1
-rw-r--r-- | arch/x86/xen/pmu.h            |  22
-rw-r--r-- | arch/x86/xen/setup.c          | 207
-rw-r--r-- | arch/x86/xen/smp.c            |   1
-rw-r--r-- | arch/x86/xen/smp.h            |  51
-rw-r--r-- | arch/x86/xen/smp_hvm.c        |   2
-rw-r--r-- | arch/x86/xen/smp_pv.c         |   9
-rw-r--r-- | arch/x86/xen/spinlock.c       |  20
-rw-r--r-- | arch/x86/xen/suspend.c        |   2
-rw-r--r-- | arch/x86/xen/time.c           |   2
-rw-r--r-- | arch/x86/xen/xen-asm.S        |  53
-rw-r--r-- | arch/x86/xen/xen-head.S       | 118
-rw-r--r-- | arch/x86/xen/xen-ops.h        | 163
28 files changed, 1038 insertions, 444 deletions
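
The headline change in the diff below is the removal of the writable hypercall page in favor of vendor-specific hypercall entry points selected through a static call. As orientation before the raw diff, here is a condensed sketch of that dispatch pattern, reduced from the enlighten.c hunks below; the empty stub bodies are placeholders for the real assembly entry points (vmmcall/vmcall/syscall) defined in xen-head.S and xen-asm.S.

#include <linux/static_call.h>
#include <asm/processor.h>

/* Placeholder bodies; the real stubs live in xen-head.S / xen-asm.S. */
static void xen_hypercall_amd(void)   { /* vmmcall-based path */ }
static void xen_hypercall_intel(void) { /* vmcall-based path */ }
static void xen_hypercall_hvm(void)   { /* boot-time vendor probe */ }

/* Default to the vendor-probing fallback until the CPU vendor is known. */
DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm);

void xen_hypercall_setfunc(void)
{
	/* Skip if a vendor-specific function was already patched in. */
	if (static_call_query(xen_hypercall) != xen_hypercall_hvm)
		return;

	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
		static_call_update(xen_hypercall, xen_hypercall_amd);
	else
		static_call_update(xen_hypercall, xen_hypercall_intel);
}

Callers then issue hypercalls via static_call(xen_hypercall)(), which the static-call machinery patches into a direct call to the chosen stub once the vendor has been determined.
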
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 77e788e928cd..98d8a50d2aed 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,7 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) - depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 8b045dd25196..bb0f3f368446 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -10,8 +10,6 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> #include "xen-ops.h" -#include "pmu.h" -#include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 532410998684..b8c9f2a7d9b6 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -3,7 +3,7 @@ #include <linux/debugfs.h> #include <linux/slab.h> -#include "debugfs.h" +#include "xen-ops.h" static struct dentry *d_xen_debug; diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h deleted file mode 100644 index 6b813ad1091c..000000000000 --- a/arch/x86/xen/debugfs.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_DEBUGFS_H -#define _XEN_DEBUGFS_H - -struct dentry * __init xen_init_debugfs(void); - -#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a01ca255b0c6..53282dc7d5ac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,10 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -#include <linux/memblock.h> -#endif #include <linux/console.h> #include <linux/cpu.h> +#include <linux/instrumentation.h> #include <linux/kexec.h> #include <linux/memblock.h> #include <linux/slab.h> @@ -23,10 +21,9 @@ #include <asm/setup.h> #include "xen-ops.h" -#include "smp.h" -#include "pmu.h" -EXPORT_SYMBOL_GPL(hypercall_page); +DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); +EXPORT_STATIC_CALL_TRAMP(xen_hypercall); /* * Pointer to the xen_vcpu_info structure or @@ -73,6 +70,65 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; +/* Number of pages released from the initial allocation. */ +unsigned long xen_released_pages; + +static __ref void xen_get_vendor(void) +{ + init_cpu_devs(); + cpu_detect(&boot_cpu_data); + get_cpu_vendor(&boot_cpu_data); +} + +void xen_hypercall_setfunc(void) +{ + if (static_call_query(xen_hypercall) != xen_hypercall_hvm) + return; + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + static_call_update(xen_hypercall, xen_hypercall_amd); + else + static_call_update(xen_hypercall, xen_hypercall_intel); +} + +/* + * Evaluate processor vendor in order to select the correct hypercall + * function for HVM/PVH guests. + * Might be called very early in boot before vendor has been set by + * early_cpu_init(). + */ +noinstr void *__xen_hypercall_setfunc(void) +{ + void (*func)(void); + + /* + * Note that __xen_hypercall_setfunc() is noinstr only due to a nasty + * dependency chain: it is being called via the xen_hypercall static + * call when running as a PVH or HVM guest. Hypercalls need to be + * noinstr due to PV guests using hypercalls in noinstr code. 
So we + * can safely tag the function body as "instrumentation ok", since + * the PV guest requirement is not of interest here (xen_get_vendor() + * calls noinstr functions, and static_call_update_early() might do + * so, too). + */ + instrumentation_begin(); + + xen_get_vendor(); + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + func = xen_hypercall_amd; + else + func = xen_hypercall_intel; + + static_call_update_early(xen_hypercall, func); + + instrumentation_end(); + + return func; +} + static int xen_cpu_up_online(unsigned int cpu) { xen_init_lock_cpu(cpu); @@ -382,3 +438,43 @@ void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns) memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } + +#ifdef CONFIG_XEN_UNPOPULATED_ALLOC +int __init arch_xen_unpopulated_init(struct resource **res) +{ + unsigned int i; + + if (!xen_domain()) + return -ENODEV; + + /* Must be set strictly before calling xen_free_unpopulated_pages(). */ + *res = &iomem_resource; + + /* + * Initialize with pages from the extra memory regions (see + * arch/x86/xen/setup.c). + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + unsigned int j; + + for (j = 0; j < xen_extra_mem[i].n_pfns; j++) { + struct page *pg = + pfn_to_page(xen_extra_mem[i].start_pfn + j); + + xen_free_unpopulated_pages(1, &pg); + } + + /* + * Account for the region being in the physmap but unpopulated. + * The value in xen_released_pages is used by the balloon + * driver to know how much of the physmap is unpopulated and + * set an accurate initial memory target. + */ + xen_released_pages += xen_extra_mem[i].n_pfns; + /* Zero so region is not also added to the balloon driver. */ + xen_extra_mem[i].n_pfns = 0; + } + + return 0; +} +#endif diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index c001a2296582..fe57ff85d004 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -28,8 +28,6 @@ #include <asm/xen/page.h> #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" static unsigned long shared_info_pfn; @@ -108,15 +106,8 @@ static void __init init_hvm_pv_info(void) /* PVH set up hypercall page in xen_prepare_pvh(). */ if (xen_pvh_domain()) pv_info.name = "Xen PVH"; - else { - u64 pfn; - uint32_t msr; - + else pv_info.name = "Xen HVM"; - msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - } xen_setup_features(); @@ -302,6 +293,10 @@ static uint32_t __init xen_platform_hvm(void) if (xen_pv_domain()) return 0; + /* Set correct hypercall function. 
*/ + if (xen_domain) + xen_hypercall_setfunc(); + if (xen_pvh_domain() && nopv) { /* Guest booting via the Xen-PVH boot entry goes here */ pr_info("\"nopv\" parameter is ignored in PVH guest\n"); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ace2eb054053..846b5737d320 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -49,6 +49,7 @@ #include <xen/hvc-console.h> #include <xen/acpi.h> +#include <asm/cpuid.h> #include <asm/paravirt.h> #include <asm/apic.h> #include <asm/page.h> @@ -72,6 +73,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> +#include <asm/irq_stack.h> #ifdef CONFIG_X86_IOPL_IOPERM #include <asm/io_bitmap.h> #endif @@ -85,10 +87,6 @@ #endif #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" -#include "multicalls.h" -#include "pmu.h" #include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ @@ -97,12 +95,49 @@ void *xen_initial_gdt; static int xen_cpu_up_prepare_pv(unsigned int cpu); static int xen_cpu_dead_pv(unsigned int cpu); +#ifndef CONFIG_PREEMPTION +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +/* + * In case of scheduling the flag must be cleared and restored after + * returning from schedule as the task might move to a different CPU. + */ +static __always_inline bool get_and_clear_inhcall(void) +{ + bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); + + __this_cpu_write(xen_in_preemptible_hcall, false); + return inhcall; +} + +static __always_inline void restore_inhcall(bool inhcall) +{ + __this_cpu_write(xen_in_preemptible_hcall, inhcall); +} + +#else + +static __always_inline bool get_and_clear_inhcall(void) { return false; } +static __always_inline void restore_inhcall(bool inhcall) { } + +#endif + struct tls_descs { struct desc_struct desc[3]; }; DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE; -DEFINE_PER_CPU(unsigned int, xen_lazy_nesting); enum xen_lazy_mode xen_get_lazy_mode(void) { @@ -175,7 +210,7 @@ static void __init xen_set_mtrr_data(void) /* Only overwrite MTRR state if any MTRR could be got from Xen. */ if (reg) - mtrr_overwrite_state(var, reg, MTRR_TYPE_UNCACHABLE); + guest_force_mtrr_state(var, reg, MTRR_TYPE_UNCACHABLE); #endif } @@ -199,7 +234,7 @@ static void __init xen_pv_init_platform(void) if (xen_initial_domain()) xen_set_mtrr_data(); else - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); /* Adjust nr_cpu_ids before "enumeration" happens */ xen_smp_count_cpus(); @@ -219,14 +254,22 @@ static __read_mostly unsigned int cpuid_leaf5_edx_val; static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { - unsigned maskebx = ~0; + unsigned int maskebx = ~0; + unsigned int or_ebx = 0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ switch (*ax) { - case CPUID_MWAIT_LEAF: + case 0x1: + /* Replace initial APIC ID in bits 24-31 of EBX. */ + /* See xen_pv_smp_config() for related topology preparations. 
*/ + maskebx = 0x00ffffff; + or_ebx = smp_processor_id() << 24; + break; + + case CPUID_LEAF_MWAIT: /* Synthesize the values.. */ *ax = 0; *bx = 0; @@ -248,6 +291,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, : "0" (*ax), "2" (*cx)); *bx &= maskebx; + *bx |= or_ebx; } static bool __init xen_check_mwait(void) @@ -295,7 +339,7 @@ static bool __init xen_check_mwait(void) * ecx and edx. The hypercall provides only partial information. */ - ax = CPUID_MWAIT_LEAF; + ax = CPUID_LEAF_MWAIT; bx = 0; cx = 0; dx = 0; @@ -681,6 +725,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check) } #endif +static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + + xen_evtchn_do_upcall(); + + set_irq_regs(old_regs); +} + +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + bool inhcall; + + instrumentation_begin(); + run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); + + inhcall = get_and_clear_inhcall(); + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { + irqentry_exit_cond_resched(); + instrumentation_end(); + restore_inhcall(inhcall); + } else { + instrumentation_end(); + irqentry_exit(regs, state); + } +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -1027,6 +1101,10 @@ static u64 xen_do_read_msr(unsigned int msr, int *err) switch (msr) { case MSR_IA32_APICBASE: val &= ~X2APIC_ENABLE; + if (smp_processor_id() == 0) + val |= MSR_IA32_APICBASE_BSP; + else + val &= ~MSR_IA32_APICBASE_BSP; break; } return val; @@ -1152,8 +1230,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { .write_cr4 = xen_write_cr4, - .wbinvd = pv_native_wbinvd, - .read_msr = xen_read_msr, .write_msr = xen_write_msr, @@ -1332,6 +1408,9 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; + /* Interrupts are guaranteed to be off initially. 
*/ + early_boot_irqs_disabled = true; + static_call_update_early(xen_hypercall, xen_hypercall_pv); xen_setup_features(); @@ -1422,7 +1501,6 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); - early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 27a2a02ef8fb..9d25d9373945 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,14 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> +#include <linux/cpufreq.h> +#include <linux/cpuidle.h> #include <linux/export.h> #include <linux/mm.h> #include <xen/hvc-console.h> +#include <xen/acpi.h> #include <asm/bootparam.h> #include <asm/io_apic.h> #include <asm/hypervisor.h> #include <asm/e820/api.h> +#include <asm/setup.h> #include <xen/xen.h> #include <asm/xen/interface.h> @@ -27,53 +31,27 @@ bool __ro_after_init xen_pvh; EXPORT_SYMBOL_GPL(xen_pvh); -void __init xen_pvh_init(struct boot_params *boot_params) +#ifdef CONFIG_XEN_DOM0 +int xen_pvh_setup_gsi(int gsi, int trigger, int polarity) { - u32 msr; - u64 pfn; - - xen_pvh = 1; - xen_domain_type = XEN_HVM_DOMAIN; - xen_start_flags = pvh_start_info.flags; - - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - if (xen_initial_domain()) - x86_init.oem.arch_setup = xen_add_preferred_consoles; - x86_init.oem.banner = xen_banner; - - xen_efi_init(boot_params); + int ret; + struct physdev_setup_gsi setup_gsi; - if (xen_initial_domain()) { - struct xen_platform_op op = { - .cmd = XENPF_get_dom0_console, - }; - int ret = HYPERVISOR_platform_op(&op); - - if (ret > 0) - xen_init_vga(&op.u.dom0_console, - min(ret * sizeof(char), - sizeof(op.u.dom0_console)), - &boot_params->screen_info); - } -} + setup_gsi.gsi = gsi; + setup_gsi.triggering = (trigger == ACPI_EDGE_SENSITIVE ? 0 : 1); + setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); -void __init mem_map_via_hcall(struct boot_params *boot_params_p) -{ - struct xen_memory_map memmap; - int rc; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); + if (ret == -EEXIST) { + xen_raw_printk("Already setup the GSI :%d\n", gsi); + ret = 0; + } else if (ret) + xen_raw_printk("Fail to setup GSI (%d)!\n", gsi); - memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table); - set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc) { - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); - BUG(); - } - boot_params_p->e820_entries = memmap.nr_entries; + return ret; } +EXPORT_SYMBOL_GPL(xen_pvh_setup_gsi); +#endif /* * Reserve e820 UNUSABLE regions to inflate the memory balloon. @@ -89,8 +67,9 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p) * hypervisor should notify us which memory ranges are suitable for creating * foreign mappings, but that's not yet implemented. 
*/ -void __init xen_reserve_extra_memory(struct boot_params *bootp) +static void __init pvh_reserve_extra_memory(void) { + struct boot_params *bootp = &boot_params; unsigned int i, ram_pages = 0, extra_pages; for (i = 0; i < bootp->e820_entries; i++) { @@ -141,3 +120,66 @@ void __init xen_reserve_extra_memory(struct boot_params *bootp) xen_add_extra_mem(PFN_UP(e->addr), pages); } } + +static void __init pvh_arch_setup(void) +{ + pvh_reserve_extra_memory(); + + if (xen_initial_domain()) { + xen_add_preferred_consoles(); + + /* + * Disable usage of CPU idle and frequency drivers: when + * running as hardware domain the exposed native ACPI tables + * causes idle and/or frequency drivers to attach and + * malfunction. It's Xen the entity that controls the idle and + * frequency states. + * + * For unprivileged domains the exposed ACPI tables are + * fabricated and don't contain such data. + */ + disable_cpuidle(); + disable_cpufreq(); + WARN_ON(xen_set_default_idle()); + } +} + +void __init xen_pvh_init(struct boot_params *boot_params) +{ + xen_pvh = 1; + xen_domain_type = XEN_HVM_DOMAIN; + xen_start_flags = pvh_start_info.flags; + + x86_init.oem.arch_setup = pvh_arch_setup; + x86_init.oem.banner = xen_banner; + + xen_efi_init(boot_params); + + if (xen_initial_domain()) { + struct xen_platform_op op = { + .cmd = XENPF_get_dom0_console, + }; + int ret = HYPERVISOR_platform_op(&op); + + if (ret > 0) + xen_init_vga(&op.u.dom0_console, + min(ret * sizeof(char), + sizeof(op.u.dom0_console)), + &boot_params->screen_info); + } +} + +void __init mem_map_via_hcall(struct boot_params *boot_params_p) +{ + struct xen_memory_map memmap; + int rc; + + memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table); + set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + boot_params_p->e820_entries = memmap.nr_entries; +} diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 60e9c37fd79f..c4c479373249 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -5,8 +5,7 @@ #include <asm/xen/hypercall.h> #include <xen/interface/memory.h> -#include "multicalls.h" -#include "mmu.h" +#include "xen-ops.h" unsigned long arbitrary_virt_to_mfn(void *vaddr) { diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h deleted file mode 100644 index 6e4c6bd62203..000000000000 --- a/arch/x86/xen/mmu.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_MMU_H - -#include <linux/linkage.h> -#include <asm/page.h> - -enum pt_level { - PT_PGD, - PT_P4D, - PT_PUD, - PT_PMD, - PT_PTE -}; - - -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); - -void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); -void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte); - -unsigned long xen_read_cr2_direct(void); - -extern void xen_init_mmu_ops(void); -extern void xen_hvm_init_mmu_ops(void); -#endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 509bdee3ab90..337955652202 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -5,7 +5,7 @@ #include <xen/interface/xen.h> #include <xen/hvm.h> -#include "mmu.h" +#include "xen-ops.h" #ifdef CONFIG_PROC_VMCORE /* diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 
54e0d311dcc9..38971c6dcd4b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -82,9 +82,7 @@ #include <xen/hvc-console.h> #include <xen/swiotlb-xen.h> -#include "multicalls.h" -#include "mmu.h" -#include "debugfs.h" +#include "xen-ops.h" /* * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order @@ -113,6 +111,51 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; */ static DEFINE_SPINLOCK(xen_reservation_lock); +/* Protected by xen_reservation_lock. */ +#define MIN_CONTIG_ORDER 9 /* 2MB */ +static unsigned int discontig_frames_order = MIN_CONTIG_ORDER; +static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata; +static unsigned long *discontig_frames __refdata = discontig_frames_early; +static bool discontig_frames_dyn; + +static int alloc_discontig_frames(unsigned int order) +{ + unsigned long *new_array, *old_array; + unsigned int old_order; + unsigned long flags; + + BUG_ON(order < MIN_CONTIG_ORDER); + BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE); + + new_array = (unsigned long *)__get_free_pages(GFP_KERNEL, + order - MIN_CONTIG_ORDER); + if (!new_array) + return -ENOMEM; + + spin_lock_irqsave(&xen_reservation_lock, flags); + + old_order = discontig_frames_order; + + if (order > discontig_frames_order || !discontig_frames_dyn) { + if (!discontig_frames_dyn) + old_array = NULL; + else + old_array = discontig_frames; + + discontig_frames = new_array; + discontig_frames_order = order; + discontig_frames_dyn = true; + } else { + old_array = new_array; + } + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER); + + return 0; +} + /* * Note about cr3 (pagetable base) values: * @@ -128,7 +171,7 @@ static DEFINE_SPINLOCK(xen_reservation_lock); * looking at another vcpu's cr3 value, it should use this variable. */ DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ static phys_addr_t xen_pt_base, xen_pt_size __initdata; @@ -305,16 +348,17 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) __xen_set_pte(ptep, pteval); } -pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. 
We preserve the bits on commit */ trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); return *ptep; } -void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) +static void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep, pte_t pte) { struct mmu_update u; @@ -666,7 +710,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -782,6 +826,7 @@ void xen_mm_pin_all(void) { struct page *page; + spin_lock(&init_mm.page_table_lock); spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { @@ -792,6 +837,7 @@ void xen_mm_pin_all(void) } spin_unlock(&pgd_lock); + spin_unlock(&init_mm.page_table_lock); } static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page, @@ -813,6 +859,9 @@ static void __init xen_after_bootmem(void) SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); + + if (alloc_discontig_frames(MIN_CONTIG_ORDER)) + BUG(); } static void xen_unpin_page(struct mm_struct *mm, struct page *page, @@ -888,6 +937,7 @@ void xen_mm_unpin_all(void) { struct page *page; + spin_lock(&init_mm.page_table_lock); spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { @@ -899,6 +949,7 @@ void xen_mm_unpin_all(void) } spin_unlock(&pgd_lock); + spin_unlock(&init_mm.page_table_lock); } static void xen_enter_mmap(struct mm_struct *mm) @@ -1554,7 +1605,8 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) && + !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(XEN_LAZY_MMU); @@ -1582,7 +1634,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) if (pinned) { xen_mc_batch(); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS)) __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); __set_pfn_prot(pfn, PAGE_KERNEL); @@ -2019,10 +2071,7 @@ void __init xen_reserve_special_pages(void) void __init xen_pt_check_e820(void) { - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table"); } static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; @@ -2140,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, @@ -2202,10 +2250,6 @@ void __init xen_init_mmu_ops(void) memset(dummy_mapping, 0xff, PAGE_SIZE); } -/* Protected by xen_reservation_lock. 
*/ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; - #define VOID_PTE (mfn_pte(0, __pgprot(0))) static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, unsigned long *in_frames, @@ -2322,18 +2366,25 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle) { - unsigned long *in_frames = discontig_frames, out_frame; + unsigned long *in_frames, out_frame; unsigned long flags; int success; unsigned long vstart = (unsigned long)phys_to_virt(pstart); - if (unlikely(order > MAX_CONTIG_ORDER)) - return -ENOMEM; + if (unlikely(order > discontig_frames_order)) { + if (!discontig_frames_dyn) + return -ENOMEM; + + if (alloc_discontig_frames(order)) + return -ENOMEM; + } memset((void *) vstart, 0, PAGE_SIZE << order); spin_lock_irqsave(&xen_reservation_lock, flags); + in_frames = discontig_frames; + /* 1. Zap current PTEs, remembering MFNs. */ xen_zap_pfn_range(vstart, order, in_frames, NULL); @@ -2357,12 +2408,12 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { - unsigned long *out_frames = discontig_frames, in_frame; + unsigned long *out_frames, in_frame; unsigned long flags; int success; unsigned long vstart; - if (unlikely(order > MAX_CONTIG_ORDER)) + if (unlikely(order > discontig_frames_order)) return; vstart = (unsigned long)phys_to_virt(pstart); @@ -2370,6 +2421,8 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); + out_frames = discontig_frames; + /* 1. Find start MFN of contiguous extent. */ in_frame = virt_to_mfn((void *)vstart); diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 07054572297f..7237d56a9d3f 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -23,26 +23,21 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/debugfs.h> +#include <linux/jump_label.h> +#include <linux/printk.h> #include <asm/xen/hypercall.h> -#include "multicalls.h" -#include "debugfs.h" +#include "xen-ops.h" #define MC_BATCH 32 -#define MC_DEBUG 0 - #define MC_ARGS (MC_BATCH * 16) struct mc_buffer { unsigned mcidx, argidx, cbidx; struct multicall_entry entries[MC_BATCH]; -#if MC_DEBUG - struct multicall_entry debug[MC_BATCH]; - void *caller[MC_BATCH]; -#endif unsigned char args[MC_ARGS]; struct callback { void (*fn)(void *); @@ -50,13 +45,105 @@ struct mc_buffer { } callbacks[MC_BATCH]; }; +struct mc_debug_data { + struct multicall_entry entries[MC_BATCH]; + void *caller[MC_BATCH]; + size_t argsz[MC_BATCH]; + unsigned long *args[MC_BATCH]; +}; + static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +static struct mc_debug_data mc_debug_data_early __initdata; +static struct mc_debug_data __percpu *mc_debug_data_ptr; DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +static struct static_key mc_debug __ro_after_init; +static bool mc_debug_enabled __initdata; + +static struct mc_debug_data * __ref get_mc_debug(void) +{ + if (!mc_debug_data_ptr) + return &mc_debug_data_early; + + return this_cpu_ptr(mc_debug_data_ptr); +} + +static int __init xen_parse_mc_debug(char *arg) +{ + mc_debug_enabled = true; + static_key_slow_inc(&mc_debug); + + return 0; +} +early_param("xen_mc_debug", xen_parse_mc_debug); + +static int __init mc_debug_enable(void) +{ + unsigned long flags; + struct mc_debug_data __percpu *mcdb; + + if (!mc_debug_enabled) + return 0; + 
+ mcdb = alloc_percpu(struct mc_debug_data); + if (!mcdb) { + pr_err("xen_mc_debug inactive\n"); + static_key_slow_dec(&mc_debug); + return -ENOMEM; + } + + /* Be careful when switching to percpu debug data. */ + local_irq_save(flags); + xen_mc_flush(); + mc_debug_data_ptr = mcdb; + local_irq_restore(flags); + + pr_info("xen_mc_debug active\n"); + + return 0; +} +early_initcall(mc_debug_enable); + +/* Number of parameters of hypercalls used via multicalls. */ +static const uint8_t hpcpars[] = { + [__HYPERVISOR_mmu_update] = 4, + [__HYPERVISOR_stack_switch] = 2, + [__HYPERVISOR_fpu_taskswitch] = 1, + [__HYPERVISOR_update_descriptor] = 2, + [__HYPERVISOR_update_va_mapping] = 3, + [__HYPERVISOR_mmuext_op] = 4, +}; + +static void print_debug_data(struct mc_buffer *b, struct mc_debug_data *mcdb, + int idx) +{ + unsigned int arg; + unsigned int opidx = mcdb->entries[idx].op & 0xff; + unsigned int pars = 0; + + pr_err(" call %2d: op=%lu result=%ld caller=%pS ", idx + 1, + mcdb->entries[idx].op, b->entries[idx].result, + mcdb->caller[idx]); + if (opidx < ARRAY_SIZE(hpcpars)) + pars = hpcpars[opidx]; + if (pars) { + pr_cont("pars="); + for (arg = 0; arg < pars; arg++) + pr_cont("%lx ", mcdb->entries[idx].args[arg]); + } + if (mcdb->argsz[idx]) { + pr_cont("args="); + for (arg = 0; arg < mcdb->argsz[idx] / 8; arg++) + pr_cont("%lx ", mcdb->args[idx][arg]); + } + pr_cont("\n"); +} + void xen_mc_flush(void) { struct mc_buffer *b = this_cpu_ptr(&mc_buffer); struct multicall_entry *mc; + struct mc_debug_data *mcdb = NULL; int ret = 0; unsigned long flags; int i; @@ -69,10 +156,11 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); -#if MC_DEBUG - memcpy(b->debug, b->entries, - b->mcidx * sizeof(struct multicall_entry)); -#endif + if (static_key_false(&mc_debug)) { + mcdb = get_mc_debug(); + memcpy(mcdb->entries, b->entries, + b->mcidx * sizeof(struct multicall_entry)); + } switch (b->mcidx) { case 0: @@ -103,21 +191,14 @@ void xen_mc_flush(void) pr_err("%d of %d multicall(s) failed: cpu %d\n", ret, b->mcidx, smp_processor_id()); for (i = 0; i < b->mcidx; i++) { - if (b->entries[i].result < 0) { -#if MC_DEBUG - pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pS\n", - i + 1, - b->debug[i].op, - b->debug[i].args[0], - b->entries[i].result, - b->caller[i]); -#else + if (static_key_false(&mc_debug)) { + print_debug_data(b, mcdb, i); + } else if (b->entries[i].result < 0) { pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n", i + 1, b->entries[i].op, b->entries[i].args[0], b->entries[i].result); -#endif } } } @@ -155,9 +236,13 @@ struct multicall_space __xen_mc_entry(size_t args) } ret.mc = &b->entries[b->mcidx]; -#if MC_DEBUG - b->caller[b->mcidx] = __builtin_return_address(0); -#endif + if (static_key_false(&mc_debug)) { + struct mc_debug_data *mcdb = get_mc_debug(); + + mcdb->caller[b->mcidx] = __builtin_return_address(0); + mcdb->argsz[b->mcidx] = args; + mcdb->args[b->mcidx] = (unsigned long *)(&b->args[argidx]); + } b->mcidx++; ret.args = &b->args[argidx]; b->argidx = argidx + args; diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h deleted file mode 100644 index c3867b585e0d..000000000000 --- a/arch/x86/xen/multicalls.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_MULTICALLS_H -#define _XEN_MULTICALLS_H - -#include <trace/events/xen.h> - -#include "xen-ops.h" - -/* Multicalls */ -struct multicall_space -{ - struct multicall_entry *mc; - void *args; -}; - -/* Allocate room for a multicall and its args */ -struct 
multicall_space __xen_mc_entry(size_t args); - -DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); - -/* Call to start a batch of multiple __xen_mc_entry()s. Must be - paired with xen_mc_issue() */ -static inline void xen_mc_batch(void) -{ - unsigned long flags; - - /* need to disable interrupts until this entry is complete */ - local_irq_save(flags); - trace_xen_mc_batch(xen_get_lazy_mode()); - __this_cpu_write(xen_mc_irq_flags, flags); -} - -static inline struct multicall_space xen_mc_entry(size_t args) -{ - xen_mc_batch(); - return __xen_mc_entry(args); -} - -/* Flush all pending multicalls */ -void xen_mc_flush(void); - -/* Issue a multicall if we're not in a lazy mode */ -static inline void xen_mc_issue(unsigned mode) -{ - trace_xen_mc_issue(mode); - - if ((xen_get_lazy_mode() & mode) == 0) - xen_mc_flush(); - - /* restore flags saved in xen_mc_batch */ - local_irq_restore(this_cpu_read(xen_mc_irq_flags)); -} - -/* Set up a callback to be called when the current batch is flushed */ -void xen_mc_callback(void (*fn)(void *), void *data); - -/* - * Try to extend the arguments of the previous multicall command. The - * previous command's op must match. If it does, then it attempts to - * extend the argument space allocated to the multicall entry by - * arg_size bytes. - * - * The returned multicall_space will return with mc pointing to the - * command on success, or NULL on failure, and args pointing to the - * newly allocated space. - */ -struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); - -#endif /* _XEN_MULTICALLS_H */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 9bdc3b656b2c..56914e21e303 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -70,6 +70,7 @@ #include <linux/memblock.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/acpi.h> #include <asm/cache.h> #include <asm/setup.h> @@ -80,8 +81,8 @@ #include <asm/xen/hypervisor.h> #include <xen/balloon.h> #include <xen/grant_table.h> +#include <xen/hvc-console.h> -#include "multicalls.h" #include "xen-ops.h" #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) @@ -177,13 +178,7 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) static void * __ref alloc_p2m_page(void) { if (unlikely(!slab_is_available())) { - void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - return ptr; + return memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return (void *)__get_free_page(GFP_KERNEL); @@ -555,7 +550,6 @@ int xen_alloc_p2m_entry(unsigned long pfn) /* Separately check the mid mfn level */ unsigned long missing_mfn; unsigned long mid_mfn_mfn; - unsigned long old_mfn; mid_mfn = alloc_p2m_page(); if (!mid_mfn) @@ -565,12 +559,12 @@ int xen_alloc_p2m_entry(unsigned long pfn) missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); - old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); - if (old_mfn != missing_mfn) { - free_p2m_page(mid_mfn); - mid_mfn = mfn_to_virt(old_mfn); - } else { + /* try_cmpxchg() updates missing_mfn on failure. */ + if (try_cmpxchg(top_mfn_p, &missing_mfn, mid_mfn_mfn)) { p2m_top_mfn_p[topidx] = mid_mfn; + } else { + free_p2m_page(mid_mfn); + mid_mfn = mfn_to_virt(missing_mfn); } } } else { @@ -731,7 +725,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, * immediate unmapping. 
*/ map_ops[i].status = GNTST_general_error; - unmap[0].host_addr = map_ops[i].host_addr, + unmap[0].host_addr = map_ops[i].host_addr; unmap[0].handle = map_ops[i].handle; map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) @@ -741,7 +735,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, if (kmap_ops) { kmap_ops[i].status = GNTST_general_error; - unmap[1].host_addr = kmap_ops[i].host_addr, + unmap[1].host_addr = kmap_ops[i].host_addr; unmap[1].handle = kmap_ops[i].handle; kmap_ops[i].handle = INVALID_GRANT_HANDLE; if (kmap_ops[i].flags & GNTMAP_device_map) @@ -794,9 +788,104 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } +/* Remapped non-RAM areas */ +#define NR_NONRAM_REMAP 4 +static struct nonram_remap { + phys_addr_t maddr; + phys_addr_t paddr; + size_t size; +} xen_nonram_remap[NR_NONRAM_REMAP] __ro_after_init; +static unsigned int nr_nonram_remap __ro_after_init; + +/* + * Do the real remapping of non-RAM regions as specified in the + * xen_nonram_remap[] array. + * In case of an error just crash the system. + */ +void __init xen_do_remap_nonram(void) +{ + unsigned int i; + unsigned int remapped = 0; + const struct nonram_remap *remap = xen_nonram_remap; + unsigned long pfn, mfn, end_pfn; + + for (i = 0; i < nr_nonram_remap; i++) { + end_pfn = PFN_UP(remap->paddr + remap->size); + pfn = PFN_DOWN(remap->paddr); + mfn = PFN_DOWN(remap->maddr); + while (pfn < end_pfn) { + if (!set_phys_to_machine(pfn, mfn)) + panic("Failed to set p2m mapping for pfn=%lx mfn=%lx\n", + pfn, mfn); + + pfn++; + mfn++; + remapped++; + } + + remap++; + } + + pr_info("Remapped %u non-RAM page(s)\n", remapped); +} + +#ifdef CONFIG_ACPI +/* + * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM + * regions into account. + * Any attempt to map an area crossing a remap boundary will produce a + * WARN() splat. + * phys is related to remap->maddr on input and will be rebased to remap->paddr. + */ +static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys, + acpi_size size) +{ + unsigned int i; + const struct nonram_remap *remap = xen_nonram_remap; + + for (i = 0; i < nr_nonram_remap; i++) { + if (phys + size > remap->maddr && + phys < remap->maddr + remap->size) { + WARN_ON(phys < remap->maddr || + phys + size > remap->maddr + remap->size); + phys += remap->paddr - remap->maddr; + break; + } + } + + return x86_acpi_os_ioremap(phys, size); +} +#endif /* CONFIG_ACPI */ + +/* + * Add a new non-RAM remap entry. + * In case of no free entry found, just crash the system. + */ +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, + unsigned long size) +{ + BUG_ON((maddr & ~PAGE_MASK) != (paddr & ~PAGE_MASK)); + + if (nr_nonram_remap == NR_NONRAM_REMAP) { + xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n"); + BUG(); + } + +#ifdef CONFIG_ACPI + /* Switch to the Xen acpi_os_ioremap() variant. 
*/ + if (nr_nonram_remap == 0) + acpi_os_ioremap = xen_acpi_os_ioremap; +#endif + + xen_nonram_remap[nr_nonram_remap].maddr = maddr; + xen_nonram_remap[nr_nonram_remap].paddr = paddr; + xen_nonram_remap[nr_nonram_remap].size = size; + + nr_nonram_remap++; +} + #ifdef CONFIG_XEN_DEBUG_FS #include <linux/debugfs.h> -#include "debugfs.h" static int p2m_dump_show(struct seq_file *m, void *v) { static const char * const type_name[] = { diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 246d67dab510..f06987b0efc3 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -10,7 +10,6 @@ #include <xen/interface/xenpmu.h> #include "xen-ops.h" -#include "pmu.h" /* x86_pmu.handle_irq definition */ #include "../events/perf_event.h" diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h deleted file mode 100644 index 65c58894fc79..000000000000 --- a/arch/x86/xen/pmu.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __XEN_PMU_H -#define __XEN_PMU_H - -#include <xen/interface/xenpmu.h> - -extern bool is_xen_pmu; - -irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); -#ifdef CONFIG_XEN_HAVE_VPMU -void xen_pmu_init(int cpu); -void xen_pmu_finish(int cpu); -#else -static inline void xen_pmu_init(int cpu) {} -static inline void xen_pmu_finish(int cpu) {} -#endif -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); -int pmu_apic_update(uint32_t reg); -unsigned long long xen_read_pmc(int counter); - -#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 380591028cb8..3823e52aef52 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -15,12 +15,12 @@ #include <linux/cpuidle.h> #include <linux/cpufreq.h> #include <linux/memory_hotplug.h> +#include <linux/acpi.h> #include <asm/elf.h> #include <asm/vdso.h> #include <asm/e820/api.h> #include <asm/setup.h> -#include <asm/acpi.h> #include <asm/numa.h> #include <asm/idtentry.h> #include <asm/xen/hypervisor.h> @@ -34,19 +34,18 @@ #include <xen/features.h> #include <xen/hvc-console.h> #include "xen-ops.h" -#include "mmu.h" #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) -/* Number of pages released from the initial allocation. */ -unsigned long xen_released_pages; - /* Memory map would allow PCI passthrough. */ bool xen_pv_pci_possible; /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; +/* Number of initially usable memory pages. */ +static unsigned long ini_nr_pages __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -213,7 +212,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages) + unsigned long end_pfn) { unsigned long pfn, end; int ret; @@ -221,7 +220,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN_ON(start_pfn > end_pfn); /* Release pages first. */ - end = min(end_pfn, nr_pages); + end = min(end_pfn, ini_nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -342,15 +341,14 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. 
*/ static unsigned long __init xen_set_identity_and_remap_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn) + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; unsigned long n = end_pfn - start_pfn; if (remap_pfn == 0) - remap_pfn = nr_pages; + remap_pfn = ini_nr_pages; while (i < n) { unsigned long cur_pfn = start_pfn + i; @@ -359,19 +357,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long remap_range_size; /* Do not remap pages beyond the current allocation */ - if (cur_pfn >= nr_pages) { + if (cur_pfn >= ini_nr_pages) { /* Identity map remaining pages */ set_phys_range_identity(cur_pfn, cur_pfn + size); break; } - if (cur_pfn + size > nr_pages) - size = nr_pages - cur_pfn; + if (cur_pfn + size > ini_nr_pages) + size = ini_nr_pages - cur_pfn; remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages); + cur_pfn + left); break; } /* Adjust size to fit in current e820 RAM region */ @@ -398,18 +396,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } static unsigned long __init xen_count_remap_pages( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pages) { - if (start_pfn >= nr_pages) + if (start_pfn >= ini_nr_pages) return remap_pages; - return remap_pages + min(end_pfn, nr_pages) - start_pfn; + return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn; } -static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, +static unsigned long __init xen_foreach_remap_area( unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, - unsigned long nr_pages, unsigned long last_val)) + unsigned long last_val)) { phys_addr_t start = 0; unsigned long ret_val = 0; @@ -437,8 +435,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - ret_val = func(start_pfn, end_pfn, nr_pages, - ret_val); + ret_val = func(start_pfn, end_pfn, ret_val); start = end; } } @@ -495,6 +492,8 @@ void __init xen_remap_memory(void) set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); + + xen_do_remap_nonram(); } static unsigned long __init xen_get_pages_limit(void) @@ -568,7 +567,7 @@ static void __init xen_ignore_unusable(void) } } -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820_entry *entry; unsigned mapcnt; @@ -626,6 +625,111 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) } /* + * Swap a non-RAM E820 map entry with RAM above ini_nr_pages. + * Note that the E820 map is modified accordingly, but the P2M map isn't yet. + * The adaption of the P2M must be deferred until page allocation is possible. 
+ */ +static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t mem_end = PFN_PHYS(ini_nr_pages); + phys_addr_t swap_addr, swap_size, entry_end; + + swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr); + swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size); + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + entry_end = entry->addr + entry->size; + if (entry->type == E820_TYPE_RAM && entry->size >= swap_size && + entry_end - swap_size >= mem_end) { + /* Reduce RAM entry by needed space (whole pages). */ + entry->size -= swap_size; + + /* Add new entry at the end of E820 map. */ + entry = xen_e820_table.entries + + xen_e820_table.nr_entries; + xen_e820_table.nr_entries++; + + /* Fill new entry (keep size and page offset). */ + entry->type = swap_entry->type; + entry->addr = entry_end - swap_size + + swap_addr - swap_entry->addr; + entry->size = swap_entry->size; + + /* Convert old entry to RAM, align to pages. */ + swap_entry->type = E820_TYPE_RAM; + swap_entry->addr = swap_addr; + swap_entry->size = swap_size; + + /* Remember PFN<->MFN relation for P2M update. */ + xen_add_remap_nonram(swap_addr, entry_end - swap_size, + swap_size); + + /* Order E820 table and merge entries. */ + e820__update_table(&xen_e820_table); + + return; + } + + entry++; + } + + xen_raw_console_write("No suitable area found for required E820 entry remapping action\n"); + BUG(); +} + +/* + * Look for non-RAM memory types in a specific guest physical area and move + * those away if possible (ACPI NVS only for now). + */ +static void __init xen_e820_resolve_conflicts(phys_addr_t start, + phys_addr_t size) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t end; + + if (!size) + return; + + end = start + size; + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + if (entry->addr >= end) + return; + + if (entry->addr + entry->size > start && + entry->type == E820_TYPE_NVS) + xen_e820_swap_entry_with_ram(entry); + + entry++; + } +} + +/* + * Check for an area in physical memory to be usable for non-movable purposes. + * An area is considered to usable if the used E820 map lists it to be RAM or + * some other type which can be moved to higher PFNs while keeping the MFNs. + * In case the area is not usable, crash the system with an error message. + */ +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component) +{ + xen_e820_resolve_conflicts(start, size); + + if (!xen_is_e820_reserved(start, size)) + return; + + xen_raw_console_write("Xen hypervisor allocated "); + xen_raw_console_write(component); + xen_raw_console_write(" memory conflicts with E820 map\n"); + BUG(); +} + +/* * Like memcpy, but with physical addresses for dest and src. 
*/ static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, @@ -684,20 +788,20 @@ static void __init xen_reserve_xen_mfnlist(void) **/ char * __init xen_memory_setup(void) { - unsigned long max_pfn, pfn_s, n_pfns; + unsigned long pfn_s, n_pfns; phys_addr_t mem_end, addr, size, chunk_size; u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; + unsigned long maxmem_pages; int i; int op; xen_parse_512gb(); - max_pfn = xen_get_pages_limit(); - max_pfn = min(max_pfn, xen_start_info->nr_pages); - mem_end = PFN_PHYS(max_pfn); + ini_nr_pages = min(xen_get_pages_limit(), xen_start_info->nr_pages); + mem_end = PFN_PHYS(ini_nr_pages); memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); @@ -747,13 +851,35 @@ char * __init xen_memory_setup(void) /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); + /* + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. + */ + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* + * Check for a conflict of the xen_start_info memory with the target + * E820 map. + */ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); + + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ - max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages); + max_pages += xen_foreach_remap_area(xen_count_remap_pages); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; + if (max_pages > ini_nr_pages) + extra_pages += max_pages - ini_nr_pages; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -762,8 +888,8 @@ char * __init xen_memory_setup(void) * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. */ - extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages, max_pages - max_pfn); + maxmem_pages = EXTRA_MEM_RATIO * min(ini_nr_pages, PFN_DOWN(MAXMEM)); + extra_pages = min3(maxmem_pages, extra_pages, max_pages - ini_nr_pages); i = 0; addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; @@ -819,23 +945,6 @@ char * __init xen_memory_setup(void) e820__update_table(e820_table); - /* - * Check whether the kernel itself conflicts with the target E820 map. - * Failing now is better than running into weird problems later due - * to relocating (and even reusing) pages with kernel text or data. - */ - if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(__bss_stop) - __pa_symbol(_text))) { - xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); - BUG(); - } - - /* - * Check for a conflict of the hypervisor supplied page tables with - * the target E820 map. - */ - xen_pt_check_e820(); - xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ @@ -863,7 +972,7 @@ char * __init xen_memory_setup(void) * Set identity map on non-RAM pages and prepare remapping the * underlying RAM. 
*/ - xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk); + xen_foreach_remap_area(xen_set_identity_and_remap_chunk); pr_info("Released %ld page(s)\n", xen_released_pages); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 935771726f9c..05f92c812ac8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -9,7 +9,6 @@ #include <xen/hvc-console.h> #include "xen-ops.h" -#include "smp.h" static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h deleted file mode 100644 index b8efdbc693f7..000000000000 --- a/arch/x86/xen/smp.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_SMP_H - -#ifdef CONFIG_SMP - -void asm_cpu_bringup_and_idle(void); -asmlinkage void cpu_bringup_and_idle(void); - -extern void xen_send_IPI_mask(const struct cpumask *mask, - int vector); -extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, - int vector); -extern void xen_send_IPI_allbutself(int vector); -extern void xen_send_IPI_all(int vector); -extern void xen_send_IPI_self(int vector); - -extern int xen_smp_intr_init(unsigned int cpu); -extern void xen_smp_intr_free(unsigned int cpu); -int xen_smp_intr_init_pv(unsigned int cpu); -void xen_smp_intr_free_pv(unsigned int cpu); - -void xen_smp_count_cpus(void); -void xen_smp_cpus_done(unsigned int max_cpus); - -void xen_smp_send_reschedule(int cpu); -void xen_smp_send_call_function_ipi(const struct cpumask *mask); -void xen_smp_send_call_function_single_ipi(int cpu); - -void __noreturn xen_cpu_bringup_again(unsigned long stack); - -struct xen_common_irq { - int irq; - char *name; -}; -#else /* CONFIG_SMP */ - -static inline int xen_smp_intr_init(unsigned int cpu) -{ - return 0; -} -static inline void xen_smp_intr_free(unsigned int cpu) {} - -static inline int xen_smp_intr_init_pv(unsigned int cpu) -{ - return 0; -} -static inline void xen_smp_intr_free_pv(unsigned int cpu) {} -static inline void xen_smp_count_cpus(void) { } -#endif /* CONFIG_SMP */ - -#endif diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index ac95d1981cc0..485c1d8804f7 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -5,8 +5,6 @@ #include <xen/events.h> #include "xen-ops.h" -#include "smp.h" - static void __init xen_hvm_smp_prepare_boot_cpu(void) { diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 27d1a5b7f571..9bb8ff8bff30 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -46,9 +46,6 @@ #include <xen/hvc-console.h> #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" -#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -73,7 +70,7 @@ static void cpu_bringup(void) xen_enable_syscall(); } cpu = smp_processor_id(); - smp_store_cpu_info(cpu); + identify_secondary_cpu(cpu); set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); @@ -154,9 +151,9 @@ static void __init xen_pv_smp_config(void) u32 apicid = 0; int i; - topology_register_boot_apic(apicid++); + topology_register_boot_apic(apicid); - for (i = 1; i < nr_cpu_ids; i++) + for (i = 0; i < nr_cpu_ids; i++) topology_register_apic(apicid++, CPU_ACPIID_INVALID, true); /* Pretend to be a proper enumerated system */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 5c6fc16e4b92..8e4efe0fb6f9 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -18,7 +18,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static 
DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest); -static bool xen_pvspin = true; static void xen_qlock_kick(int cpu) { @@ -68,7 +67,7 @@ void xen_init_lock_cpu(int cpu) int irq; char *name; - if (!xen_pvspin) + if (nopvspin) return; WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", @@ -95,7 +94,7 @@ void xen_uninit_lock_cpu(int cpu) { int irq; - if (!xen_pvspin) + if (nopvspin) return; kfree(per_cpu(irq_name, cpu)); @@ -125,10 +124,10 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); void __init xen_init_spinlocks(void) { /* Don't need to use pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1 || nopvspin) - xen_pvspin = false; + if (num_possible_cpus() == 1) + nopvspin = true; - if (!xen_pvspin) { + if (nopvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); static_branch_disable(&virt_spin_lock_key); return; @@ -143,12 +142,3 @@ void __init xen_init_spinlocks(void) pv_ops.lock.kick = xen_qlock_kick; pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } - -static __init int xen_parse_nopvspin(char *arg) -{ - pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n"); - xen_pvspin = false; - return 0; -} -early_param("xen_nopvspin", xen_parse_nopvspin); - diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761b..77a6ea1c60e4 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -15,8 +15,6 @@ #include <asm/fixmap.h> #include "xen-ops.h" -#include "mmu.h" -#include "pmu.h" static DEFINE_PER_CPU(u64, spec_ctrl); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 52fa5609b7f6..96521b1874ac 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -30,7 +30,7 @@ #include "xen-ops.h" /* Minimum amount of time until next clock event fires */ -#define TIMER_SLOP 100000 +#define TIMER_SLOP 1 static u64 xen_sched_clock_offset __read_mostly; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 83189cf5cdce..461bb1526502 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -20,14 +20,38 @@ #include <linux/init.h> #include <linux/linkage.h> +#include <linux/objtool.h> #include <../entry/calling.h> .pushsection .noinstr.text, "ax" /* + * PV hypercall interface to the hypervisor. + * + * Called via inline asm(), so better preserve %rcx and %r11. + * + * Input: + * %eax: hypercall number + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %rax + */ +SYM_FUNC_START(xen_hypercall_pv) + ANNOTATE_NOENDBR + push %rcx + push %r11 + UNWIND_HINT_SAVE + syscall + UNWIND_HINT_RESTORE + pop %r11 + pop %rcx + RET +SYM_FUNC_END(xen_hypercall_pv) + +/* * Disabling events is simply a matter of making the event mask * non-zero. */ SYM_FUNC_START(xen_irq_disable_direct) + ENDBR movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) RET SYM_FUNC_END(xen_irq_disable_direct) @@ -67,6 +91,7 @@ SYM_FUNC_END(check_events) * then enter the hypervisor to get them handled. */ SYM_FUNC_START(xen_irq_enable_direct) + ENDBR FRAME_BEGIN /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) @@ -97,6 +122,7 @@ SYM_FUNC_END(xen_irq_enable_direct) * x86 use opposite senses (mask vs enable). 
@@ -97,6 +122,7 @@ SYM_FUNC_END(xen_irq_enable_direct)
  * x86 use opposite senses (mask vs enable).
  */
 SYM_FUNC_START(xen_save_fl_direct)
+	ENDBR
 	testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
 	setz %ah
 	addb %ah, %ah
@@ -104,6 +130,7 @@ SYM_FUNC_START(xen_save_fl_direct)
 SYM_FUNC_END(xen_save_fl_direct)
 
 SYM_FUNC_START(xen_read_cr2)
+	ENDBR
 	FRAME_BEGIN
 	_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
 	_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
@@ -112,6 +139,7 @@ SYM_FUNC_START(xen_read_cr2)
 SYM_FUNC_END(xen_read_cr2);
 
 SYM_FUNC_START(xen_read_cr2_direct)
+	ENDBR
 	FRAME_BEGIN
 	_ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX
 	FRAME_END
@@ -176,7 +204,6 @@ SYM_CODE_START(xen_early_idt_handler_array)
 SYM_CODE_END(xen_early_idt_handler_array)
 	__FINIT
 
-hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
 /*
  * Xen64 iret frame:
  *
@@ -186,17 +213,26 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
  *	ss
  *	rsp
  *	rflags
  *	cs
  *	rip		<-- standard iret frame
  *
- *	flags
+ *	flags		<-- xen_iret must push from here on
  *
- *	rcx		}
- *	r11		}<-- pushed by hypercall page
- * rsp->rax		}
+ *	rcx
+ *	r11
+ * rsp->rax
  */
+.macro xen_hypercall_iret
+	pushq $0	/* Flags */
+	push %rcx
+	push %r11
+	push %rax
+	mov $__HYPERVISOR_iret, %eax
+	syscall		/* Do the IRET. */
+	ud2		/* The SYSCALL should never return. */
+.endm
+
 SYM_CODE_START(xen_iret)
 	UNWIND_HINT_UNDEFINED
 	ANNOTATE_NOENDBR
-	pushq $0
-	jmp hypercall_iret
+	xen_hypercall_iret
 SYM_CODE_END(xen_iret)
 
 /*
@@ -301,8 +337,7 @@ SYM_CODE_START(xen_entry_SYSENTER_compat)
 	ENDBR
 	lea 16(%rsp), %rsp	/* strip %rcx, %r11 */
 	mov $-ENOSYS, %rax
-	pushq $0
-	jmp hypercall_iret
+	xen_hypercall_iret
 SYM_CODE_END(xen_entry_SYSENTER_compat)
 SYM_CODE_END(xen_entry_SYSCALL_compat)
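The stack that the xen_hypercall_iret macro above hands to HYPERVISOR_iret can be pictured as a C struct. A hypothetical type for illustration only, listed from the lowest address (where %rsp points after the pushes) upward; nothing in the tree defines such a struct:

    /* Hypothetical picture of the stack consumed by HYPERVISOR_iret. */
    struct xen_iret_frame {
            unsigned long rax;      /* %rsp points here after the pushes */
            unsigned long r11;
            unsigned long rcx;
            unsigned long flags;    /* xen_hypercall_iret always pushes 0 */
            unsigned long rip;      /* standard iret frame from here up */
            unsigned long cs;
            unsigned long rflags;
            unsigned long rsp;
            unsigned long ss;
    };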
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 04101b984f24..5dad6c51cdc3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -6,9 +6,11 @@
 #include <linux/elfnote.h>
 #include <linux/init.h>
+#include <linux/instrumentation.h>
 
 #include <asm/boot.h>
 #include <asm/asm.h>
+#include <asm/frame.h>
 #include <asm/msr.h>
 #include <asm/page_types.h>
 #include <asm/percpu.h>
@@ -20,28 +22,6 @@
 #include <xen/interface/xen-mca.h>
 #include <asm/xen/interface.h>
 
-.pushsection .noinstr.text, "ax"
-	.balign PAGE_SIZE
-SYM_CODE_START(hypercall_page)
-	.rept (PAGE_SIZE / 32)
-		UNWIND_HINT_FUNC
-		ANNOTATE_NOENDBR
-		ANNOTATE_UNRET_SAFE
-		ret
-		/*
-		 * Xen will write the hypercall page, and sort out ENDBR.
-		 */
-		.skip 31, 0xcc
-	.endr
-
-#define HYPERCALL(n) \
-	.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
-	.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
-#include <asm/xen-hypercalls.h>
-#undef HYPERCALL
-SYM_CODE_END(hypercall_page)
-.popsection
-
 #ifdef CONFIG_XEN_PV
 	__INIT
 SYM_CODE_START(startup_xen)
@@ -49,18 +29,16 @@ SYM_CODE_START(startup_xen)
 	ANNOTATE_NOENDBR
 	cld
 
-	leaq	(__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
+	leaq	__top_init_kernel_stack(%rip), %rsp
 
-	/* Set up %gs.
-	 *
-	 * The base of %gs always points to fixed_percpu_data.  If the
-	 * stack protector canary is enabled, it is located at %gs:40.
+	/*
+	 * Set up GSBASE.
 	 * Note that, on SMP, the boot cpu uses init data section until
 	 * the per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
-	movq	$INIT_PER_CPU_VAR(fixed_percpu_data),%rax
-	cdq
+	xorl	%eax, %eax
+	xorl	%edx, %edx
 	wrmsr
 
 	mov	%rsi, %rdi
@@ -87,6 +65,84 @@ SYM_CODE_END(xen_cpu_bringup_again)
 #endif
 #endif
 
+	.pushsection .noinstr.text, "ax"
+/*
+ * Xen hypercall interface to the hypervisor.
+ *
+ * Input:
+ *	%eax: hypercall number
+ *   32-bit:
+ *	%ebx, %ecx, %edx, %esi, %edi: args 1..5 for the hypercall
+ *   64-bit:
+ *	%rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall
+ * Output: %[er]ax
+ */
+SYM_FUNC_START(xen_hypercall_hvm)
+	ENDBR
+	FRAME_BEGIN
+	/* Save all relevant registers (caller save and arguments). */
+#ifdef CONFIG_X86_32
+	push %eax
+	push %ebx
+	push %ecx
+	push %edx
+	push %esi
+	push %edi
+#else
+	push %rax
+	push %rcx
+	push %rdx
+	push %rdi
+	push %rsi
+	push %r11
+	push %r10
+	push %r9
+	push %r8
+#endif
+	/* Set the vendor specific function. */
+	call __xen_hypercall_setfunc
+	/* Set ZF = 1 if AMD, Restore saved registers. */
+#ifdef CONFIG_X86_32
+	lea xen_hypercall_amd, %ebx
+	cmp %eax, %ebx
+	pop %edi
+	pop %esi
+	pop %edx
+	pop %ecx
+	pop %ebx
+	pop %eax
+#else
+	lea xen_hypercall_amd(%rip), %rcx
+	cmp %rax, %rcx
+	pop %r8
+	pop %r9
+	pop %r10
+	pop %r11
+	pop %rsi
+	pop %rdi
+	pop %rdx
+	pop %rcx
+	pop %rax
+#endif
+	FRAME_END
+	/* Use correct hypercall function. */
+	jz xen_hypercall_amd
+	jmp xen_hypercall_intel
+SYM_FUNC_END(xen_hypercall_hvm)
+
+SYM_FUNC_START(xen_hypercall_amd)
+	ANNOTATE_NOENDBR
+	vmmcall
+	RET
+SYM_FUNC_END(xen_hypercall_amd)
+
+SYM_FUNC_START(xen_hypercall_intel)
+	ANNOTATE_NOENDBR
+	vmcall
+	RET
+SYM_FUNC_END(xen_hypercall_intel)
+	.popsection
+
 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
@@ -94,7 +150,8 @@ SYM_CODE_END(xen_cpu_bringup_again)
 	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
 	/* Map the p2m table to a 512GB-aligned user address. */
 	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
-	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .globl xen_elfnote_entry;
+		xen_elfnote_entry: _ASM_PTR xen_elfnote_entry_value - .)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables")
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
@@ -115,7 +172,6 @@ SYM_CODE_END(xen_cpu_bringup_again)
 #else
 # define FEATURES_DOM0 0
 #endif
-	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
 		.long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0)
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
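In C terms, the first invocation of xen_hypercall_hvm above boils down to the dispatch below. This is a rough, illustrative rendering (the sketch function name is made up); the assembly additionally keeps every argument register alive across the probe, which plain C cannot express:

    /* Rough C rendering of the vendor dispatch in xen_hypercall_hvm. */
    static void xen_hypercall_hvm_sketch(void)
    {
            /* Returns the vendor specific hypercall function for this CPU. */
            void *func = __xen_hypercall_setfunc();

            if (func == (void *)xen_hypercall_amd)
                    xen_hypercall_amd();    /* VMMCALL (AMD/Hygon) */
            else
                    xen_hypercall_intel();  /* VMCALL */
    }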
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 79cf93f2c92f..25e318ef27d6 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -5,8 +5,15 @@
 #include <linux/init.h>
 #include <linux/clocksource.h>
 #include <linux/irqreturn.h>
+#include <linux/linkage.h>
+
+#include <xen/interface/xenpmu.h>
 #include <xen/xen-ops.h>
 
+#include <asm/page.h>
+
+#include <trace/events/xen.h>
+
 /* These are code, but not functions.  Defined in entry.S */
 extern const char xen_failsafe_callback[];
@@ -23,14 +30,11 @@ void xen_copy_trap_info(struct trap_info *traps);
 
 DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
-DECLARE_PER_CPU(unsigned long, xen_current_cr3);
 
 extern struct start_info *xen_start_info;
 extern struct shared_info xen_dummy_shared_info;
 extern struct shared_info *HYPERVISOR_shared_info;
 
-extern bool xen_fifo_events;
-
 void xen_setup_mfn_list_list(void);
 void xen_build_mfn_list_list(void);
 void xen_setup_machphys_mapping(void);
@@ -43,8 +47,12 @@ void xen_mm_unpin_all(void);
 #ifdef CONFIG_X86_64
 void __init xen_relocate_p2m(void);
 #endif
+void __init xen_do_remap_nonram(void);
+void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr,
+				 unsigned long size);
 
-bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
+void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size,
+				   const char *component);
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
 void __init xen_inv_extra_mem(void);
 void __init xen_remap_memory(void);
@@ -177,4 +185,151 @@ static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
 
 void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
 
+struct dentry * __init xen_init_debugfs(void);
+
+enum pt_level {
+	PT_PGD,
+	PT_P4D,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
+
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+unsigned long xen_read_cr2_direct(void);
+void xen_init_mmu_ops(void);
+void xen_hvm_init_mmu_ops(void);
+
+/* Multicalls */
+struct multicall_space
+{
+	struct multicall_entry *mc;
+	void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s.  Must be
+   paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+	unsigned long flags;
+
+	/* need to disable interrupts until this entry is complete */
+	local_irq_save(flags);
+	trace_xen_mc_batch(xen_get_lazy_mode());
+	__this_cpu_write(xen_mc_irq_flags, flags);
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+	xen_mc_batch();
+	return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+	trace_xen_mc_issue(mode);
+
+	if ((xen_get_lazy_mode() & mode) == 0)
+		xen_mc_flush();
+
+	/* restore flags saved in xen_mc_batch */
+	local_irq_restore(this_cpu_read(xen_mc_irq_flags));
+}
+
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
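A hedged usage sketch of the batch/entry/issue API above, modeled on how the PV MMU code drives it. The function name below is made up for illustration; MULTI_update_va_mapping() and XEN_LAZY_MMU come from existing headers, not from this diff:

    /* Illustrative only: queue one hypercall and flush unless lazy. */
    static void sketch_set_pte(unsigned long va, pte_t pteval)
    {
            struct multicall_space mcs = xen_mc_entry(0); /* starts a batch */

            MULTI_update_va_mapping(mcs.mc, va, pteval, 0);

            /* Flushes immediately unless inside a lazy-MMU section. */
            xen_mc_issue(XEN_LAZY_MMU);
    }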
+/*
+ * Try to extend the arguments of the previous multicall command.  The
+ * previous command's op must match.  If it does, then it attempts to
+ * extend the argument space allocated to the multicall entry by
+ * arg_size bytes.
+ *
+ * The returned multicall_space will return with mc pointing to the
+ * command on success, or NULL on failure, and args pointing to the
+ * newly allocated space.
+ */
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
+
+extern bool is_xen_pmu;
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+#ifdef CONFIG_XEN_HAVE_VPMU
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+#else
+static inline void xen_pmu_init(int cpu) {}
+static inline void xen_pmu_finish(int cpu) {}
+#endif
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+int pmu_apic_update(uint32_t reg);
+unsigned long long xen_read_pmc(int counter);
+
+#ifdef CONFIG_SMP
+
+void asm_cpu_bringup_and_idle(void);
+asmlinkage void cpu_bringup_and_idle(void);
+
+extern void xen_send_IPI_mask(const struct cpumask *mask,
+			      int vector);
+extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+					 int vector);
+extern void xen_send_IPI_allbutself(int vector);
+extern void xen_send_IPI_all(int vector);
+extern void xen_send_IPI_self(int vector);
+
+extern int xen_smp_intr_init(unsigned int cpu);
+extern void xen_smp_intr_free(unsigned int cpu);
+int xen_smp_intr_init_pv(unsigned int cpu);
+void xen_smp_intr_free_pv(unsigned int cpu);
+
+void xen_smp_count_cpus(void);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_reschedule(int cpu);
+void xen_smp_send_call_function_ipi(const struct cpumask *mask);
+void xen_smp_send_call_function_single_ipi(int cpu);
+
+void __noreturn xen_cpu_bringup_again(unsigned long stack);
+
+struct xen_common_irq {
+	int irq;
+	char *name;
+};
+#else /* CONFIG_SMP */
+
+static inline int xen_smp_intr_init(unsigned int cpu)
+{
+	return 0;
+}
+static inline void xen_smp_intr_free(unsigned int cpu) {}
+
+static inline int xen_smp_intr_init_pv(unsigned int cpu)
+{
+	return 0;
+}
+static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
+static inline void xen_smp_count_cpus(void) { }
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_XEN_PV
+void xen_hypercall_pv(void);
+#endif
+void xen_hypercall_hvm(void);
+void xen_hypercall_amd(void);
+void xen_hypercall_intel(void);
+void xen_hypercall_setfunc(void);
+void *__xen_hypercall_setfunc(void);
+
 #endif /* XEN_OPS_H */
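Because the !CONFIG_SMP stubs moved into xen-ops.h above either return 0 or do nothing, callers of these helpers need no #ifdef of their own. A hypothetical caller, not taken from the diff:

    /* Hypothetical caller; builds on both SMP and UP configurations. */
    static int sketch_cpu_prepare(unsigned int cpu)
    {
            return xen_smp_intr_init(cpu);  /* the UP stub returns 0 */
    }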