diff options
Diffstat (limited to 'arch/x86')
71 files changed, 976 insertions, 806 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f8958b01b975..140e254fe546 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -123,7 +123,7 @@ config NEED_SG_DMA_LENGTH def_bool y config GENERIC_ISA_DMA - def_bool y + def_bool ISA_DMA_API config GENERIC_IOMAP def_bool y @@ -143,7 +143,7 @@ config GENERIC_GPIO bool config ARCH_MAY_HAVE_PC_FDC - def_bool y + def_bool ISA_DMA_API config RWSEM_GENERIC_SPINLOCK def_bool !X86_XADD @@ -221,10 +221,6 @@ config X86_HT def_bool y depends on SMP -config X86_TRAMPOLINE - def_bool y - depends on SMP || (64BIT && ACPI_SLEEP) - config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -2006,9 +2002,13 @@ source "drivers/pci/pcie/Kconfig" source "drivers/pci/Kconfig" -# x86_64 have no ISA slots, but do have ISA-style DMA. +# x86_64 have no ISA slots, but can have ISA-style DMA. config ISA_DMA_API - def_bool y + bool "ISA-style DMA support" if (X86_64 && EXPERT) + default y + help + Enables ISA-style DMA support for devices requiring such controllers. + If unsure, say Y. if X86_32 @@ -2096,6 +2096,16 @@ source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" +config RAPIDIO + bool "RapidIO support" + depends on PCI + default n + help + If you say Y here, the kernel will include drivers and + infrastructure code to support RapidIO interconnect devices. + +source "drivers/rapidio/Kconfig" + endmenu @@ -2130,6 +2140,11 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config KEYS_COMPAT + bool + depends on COMPAT && KEYS + default y + endmenu diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index e1e60c7d5813..e0e6340c8dad 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -873,22 +873,18 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) crypto_ablkcipher_clear_flags(ctr_tfm, ~0); ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); - if (ret) { - crypto_free_ablkcipher(ctr_tfm); - return ret; - } + if (ret) + goto out_free_ablkcipher; + ret = -ENOMEM; req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); - if (!req) { - crypto_free_ablkcipher(ctr_tfm); - return -EINVAL; - } + if (!req) + goto out_free_ablkcipher; req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); - if (!req_data) { - crypto_free_ablkcipher(ctr_tfm); - return -ENOMEM; - } + if (!req_data) + goto out_free_request; + memset(req_data->iv, 0, sizeof(req_data->iv)); /* Clear the data in the hash sub key container to zero.*/ @@ -913,8 +909,10 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) if (!ret) ret = req_data->result.err; } - ablkcipher_request_free(req); kfree(req_data); +out_free_request: + ablkcipher_request_free(req); +out_free_ablkcipher: crypto_free_ablkcipher(ctr_tfm); return ret; } diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2d93bdbc9ac0..fd843877e841 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); + current->mm->context.ia32_compat = 1; setup_new_exec(bprm); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 430312ba6e3f..849a9d23c71d 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -847,4 +847,5 @@ ia32_sys_call_table: .quad sys_name_to_handle_at .quad compat_sys_open_by_handle_at .quad compat_sys_clock_adjtime + .quad sys_syncfs ia32_syscall_end: diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index b964ec457546..12e0e7dd869c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -29,6 +29,7 @@ #include <asm/processor.h> #include <asm/mmu.h> #include <asm/mpspec.h> +#include <asm/trampoline.h> #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -113,11 +114,11 @@ static inline void acpi_disable_pci(void) acpi_noirq_set(); } -/* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -extern void acpi_restore_state_mem(void); +/* Low-level suspend routine. */ +extern int acpi_suspend_lowlevel(void); -extern unsigned long acpi_wakeup_address; +extern const unsigned char acpi_wakeup_code[]; +#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code))) /* early initialization routine */ extern void acpi_reserve_wakeup_memory(void); diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 903683b07e42..69d58131bc8e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -456,14 +456,12 @@ static inline int fls(int x) #ifdef __KERNEL__ -#include <asm-generic/bitops/ext2-non-atomic.h> +#include <asm-generic/bitops/le.h> #define ext2_set_bit_atomic(lock, nr, addr) \ test_and_set_bit((nr), (unsigned long *)(addr)) #define ext2_clear_bit_atomic(lock, nr, addr) \ test_and_clear_bit((nr), (unsigned long *)(addr)) -#include <asm-generic/bitops/minix.h> - #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */ diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index ca1098a7e580..97b6d8114a43 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -151,6 +151,7 @@ #define DMA_AUTOINIT 0x10 +#ifdef CONFIG_ISA_DMA_API extern spinlock_t dma_spin_lock; static inline unsigned long claim_dma_lock(void) @@ -164,6 +165,7 @@ static inline void release_dma_lock(unsigned long flags) { spin_unlock_irqrestore(&dma_spin_lock, flags); } +#endif /* CONFIG_ISA_DMA_API */ /* enable/disable a specific DMA channel */ static inline void enable_dma(unsigned int dmanr) @@ -303,9 +305,11 @@ static inline int get_dma_residue(unsigned int dmanr) } -/* These are in kernel/dma.c: */ +/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */ +#ifdef CONFIG_ISA_DMA_API extern int request_dma(unsigned int dmanr, const char *device_id); extern void free_dma(unsigned int dmanr); +#endif /* From PCI */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 8e37deb1eb38..0f5213564326 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -142,9 +142,9 @@ struct x86_emulate_ops { int (*pio_out_emulated)(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu); - bool (*get_cached_descriptor)(struct desc_struct *desc, + bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct desc_struct *desc, + void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, int seg, struct kvm_vcpu *vcpu); u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); @@ -239,6 +239,7 @@ struct x86_emulate_ctxt { int interruptibility; bool perm_ok; /* do not check permissions if true */ + bool only_vendor_specific_insn; bool have_exception; struct x86_exception exception; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ffd7f8d29187..c8af0991fdf0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -85,7 +85,7 @@ #define ASYNC_PF_PER_VCPU 64 -extern spinlock_t kvm_lock; +extern raw_spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_vcpu; @@ -255,6 +255,8 @@ struct kvm_mmu { int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte, unsigned long mmu_seq); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -335,12 +337,6 @@ struct kvm_vcpu_arch { u64 *last_pte_updated; gfn_t last_pte_gfn; - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - struct fpu guest_fpu; u64 xcr0; @@ -448,7 +444,7 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; - spinlock_t tsc_write_lock; + raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 80a1dee5bea5..aeff3e89b222 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -13,6 +13,12 @@ typedef struct { int size; struct mutex lock; void *vdso; + +#ifdef CONFIG_X86_64 + /* True if mm supports a task running in 32 bit compatibility mode. */ + unsigned short ia32_compat; +#endif + } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 823d48223400..fd5a1f365c95 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -43,6 +43,7 @@ #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 7e172955ee57..a09e1f052d84 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -451,6 +451,26 @@ do { \ #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #endif /* !CONFIG_M386 */ +#ifdef CONFIG_X86_CMPXCHG64 +#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy = n2; \ + asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ + : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ + : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#endif /* CONFIG_X86_CMPXCHG64 */ + /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. @@ -480,6 +500,34 @@ do { \ #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +/* + * Pretty complex macro to generate cmpxchg16 instruction. The instruction + * is not supported on early AMD64 processors so we must be able to emulate + * it in software. The address used in the cmpxchg16 instruction must be + * aligned to a 16 byte boundary. + */ +#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy; \ + alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ + "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t", \ + X86_FEATURE_CX16, \ + ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ + "S" (&pcp1), "b"(__n1), "c"(__n2), \ + "a"(__o1), "d"(__o2)); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) + #endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 562d4fd31ba8..3250e3d605d9 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -18,7 +18,10 @@ extern struct machine_ops machine_ops; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); -void machine_real_restart(const unsigned char *code, int length); +void machine_real_restart(unsigned int type); +/* These must match dispatch_table in reboot_32.S */ +#define MRR_BIOS 0 +#define MRR_APM 1 typedef void (*nmi_shootdown_cb)(int, struct die_args*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 231f1c1d6607..cd84f7208f76 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -1,14 +1,16 @@ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H +#include <linux/const.h> + /* Constructor for a conventional segment GDT (or LDT) entry */ /* This is a macro so it can be used in initializers */ #define GDT_ENTRY(flags, base, limit) \ - ((((base) & 0xff000000ULL) << (56-24)) | \ - (((flags) & 0x0000f0ffULL) << 40) | \ - (((limit) & 0x000f0000ULL) << (48-16)) | \ - (((base) & 0x00ffffffULL) << 16) | \ - (((limit) & 0x0000ffffULL))) + ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ + (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ + (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ + (((base) & _AC(0x00ffffff,ULL)) << 16) | \ + (((limit) & _AC(0x0000ffff,ULL)))) /* Simple and small GDT entries for booting only */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0b6e5dbc5a0..1f2e61e28981 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -161,8 +161,14 @@ struct thread_info { #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -#define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) +#define alloc_thread_info_node(tsk, node) \ +({ \ + struct page *page = alloc_pages_node(node, THREAD_FLAGS, \ + THREAD_ORDER); \ + struct thread_info *ret = page ? page_address(page) : NULL; \ + \ + ret; \ +}) #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index f4500fb3b485..feca3118a73b 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -3,25 +3,36 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_X86_TRAMPOLINE +#include <linux/types.h> +#include <asm/io.h> + /* - * Trampoline 80x86 program as an array. + * Trampoline 80x86 program as an array. These are in the init rodata + * segment, but that's okay, because we only care about the relative + * addresses of the symbols. */ -extern const unsigned char trampoline_data []; -extern const unsigned char trampoline_end []; -extern unsigned char *trampoline_base; +extern const unsigned char x86_trampoline_start []; +extern const unsigned char x86_trampoline_end []; +extern unsigned char *x86_trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; extern unsigned long initial_gs; -#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) +extern void __init setup_trampolines(void); + +extern const unsigned char trampoline_data[]; +extern const unsigned char trampoline_status[]; + +#define TRAMPOLINE_SYM(x) \ + ((void *)(x86_trampoline_base + \ + ((const unsigned char *)(x) - x86_trampoline_start))) -extern unsigned long setup_trampoline(void); -extern void __init reserve_trampoline_memory(void); -#else -static inline void reserve_trampoline_memory(void) {} -#endif /* CONFIG_X86_TRAMPOLINE */ +/* Address of the SMP trampoline */ +static inline unsigned long trampoline_address(void) +{ + return virt_to_phys(TRAMPOLINE_SYM(trampoline_data)); +} #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index df1da20f4534..8e8c23fef08c 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,22 +1,6 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#define dma_addr_t dma_addr_t - #include <asm-generic/types.h> -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -typedef u64 dma64_addr_t; -#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) -/* DMA addresses come in 32-bit and 64-bit flavours. */ -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif - -#endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index ffaf183c619a..a755ef5e5977 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -349,10 +349,11 @@ #define __NR_name_to_handle_at 341 #define __NR_open_by_handle_at 342 #define __NR_clock_adjtime 343 +#define __NR_syncfs 344 #ifdef __KERNEL__ -#define NR_syscalls 344 +#define NR_syscalls 345 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 5466bea670e7..160fa76bd578 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -675,6 +675,8 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #define __NR_clock_adjtime 305 __SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) +#define __NR_syncfs 306 +__SYSCALL(__NR_syncfs, sys_syncfs) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 62445ba2f8a8..7338ef2218bc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -41,13 +41,13 @@ obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o -obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o +obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-y += trampoline.o trampoline_$(BITS).o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o @@ -55,10 +55,12 @@ obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o +obj-$(CONFIG_ISA_DMA_API) += i8237.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o +obj-$(CONFIG_X86_32) += reboot_32.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o @@ -69,7 +71,6 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 28595d6df47c..ead21b663117 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -6,11 +6,17 @@ #include <asm/page_types.h> #include <asm/pgtable_types.h> #include <asm/processor-flags.h> +#include "wakeup.h" .code16 - .section ".header", "a" + .section ".jump", "ax" + .globl _start +_start: + cli + jmp wakeup_code /* This should match the structure in wakeup.h */ + .section ".header", "a" .globl wakeup_header wakeup_header: video_mode: .short 0 /* Video mode number */ @@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */ wakeup_jmp_off: .word 3f wakeup_jmp_seg: .word 0 wakeup_gdt: .quad 0, 0, 0 -signature: .long 0x51ee1111 +signature: .long WAKEUP_HEADER_SIGNATURE .text - .globl _start .code16 wakeup_code: -_start: - cli cld /* Apparently some dimwit BIOS programmers don't know how to @@ -77,12 +80,12 @@ _start: /* Check header signature... */ movl signature, %eax - cmpl $0x51ee1111, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax jne bogus_real_magic /* Check we really have everything... */ movl end_signature, %eax - cmpl $0x65a22c82, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax jne bogus_real_magic /* Call the C code */ @@ -147,3 +150,7 @@ wakeup_heap: wakeup_stack: .space 2048 wakeup_stack_end: + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h index 69d38d0b2b64..e1828c07e79c 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ b/arch/x86/kernel/acpi/realmode/wakeup.h @@ -35,7 +35,8 @@ struct wakeup_header { extern struct wakeup_header wakeup_header; #endif -#define HEADER_OFFSET 0x3f00 -#define WAKEUP_SIZE 0x4000 +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 060fff8f5c5b..d4f8010a5b1b 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -13,9 +13,19 @@ ENTRY(_start) SECTIONS { . = 0; + .jump : { + *(.jump) + } = 0x90909090 + + . = WAKEUP_HEADER_OFFSET; + .header : { + *(.header) + } + + . = ALIGN(16); .text : { *(.text*) - } + } = 0x90909090 . = ALIGN(16); .rodata : { @@ -33,11 +43,6 @@ SECTIONS *(.data*) } - .signature : { - end_signature = .; - LONG(0x65a22c82) - } - . = ALIGN(16); .bss : { __bss_start = .; @@ -45,20 +50,13 @@ SECTIONS __bss_end = .; } - . = HEADER_OFFSET; - .header : { - *(.header) + .signature : { + *(.signature) } - . = ALIGN(16); _end = .; /DISCARD/ : { *(.note*) } - - /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: - */ - . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 68d1537b8c81..ff93bc1b09c3 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -18,37 +18,28 @@ #include "realmode/wakeup.h" #include "sleep.h" -unsigned long acpi_wakeup_address; unsigned long acpi_realmode_flags; -/* address in low memory of the wakeup routine. */ -static unsigned long acpi_realmode; - #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) static char temp_stack[4096]; #endif /** - * acpi_save_state_mem - save kernel state + * acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. - * - * Note that this is too late to change acpi_wakeup_address. */ -int acpi_save_state_mem(void) +int acpi_suspend_lowlevel(void) { struct wakeup_header *header; + /* address in low memory of the wakeup routine. */ + char *acpi_realmode; - if (!acpi_realmode) { - printk(KERN_ERR "Could not allocate memory during boot, " - "S3 disabled\n"); - return -ENOMEM; - } - memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE); + acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code); - header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); - if (header->signature != 0x51ee1111) { + header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET); + if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); return -EINVAL; } @@ -68,9 +59,7 @@ int acpi_save_state_mem(void) /* GDT[0]: GDT self-pointer */ header->wakeup_gdt[0] = (u64)(sizeof(header->wakeup_gdt) - 1) + - ((u64)(acpi_wakeup_address + - ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) - << 16); + ((u64)__pa(&header->wakeup_gdt) << 16); /* GDT[1]: big real mode-like code segment */ header->wakeup_gdt[1] = GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); @@ -96,7 +85,7 @@ int acpi_save_state_mem(void) header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ - header->trampoline_segment = setup_trampoline() >> 4; + header->trampoline_segment = trampoline_address() >> 4; #ifdef CONFIG_SMP stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = @@ -107,56 +96,10 @@ int acpi_save_state_mem(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + do_suspend_lowlevel(); return 0; } -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - - -/** - * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. - */ -void __init acpi_reserve_wakeup_memory(void) -{ - phys_addr_t mem; - - if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - - if (mem == MEMBLOCK_ERROR) { - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); - return; - } - acpi_realmode = (unsigned long) phys_to_virt(mem); - acpi_wakeup_address = mem; - memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); -} - -int __init acpi_configure_wakeup_memory(void) -{ - if (acpi_realmode) - set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT); - - return 0; -} -arch_initcall(acpi_configure_wakeup_memory); - - static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index adbcbaa6f1df..416d4be13fef 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -4,13 +4,12 @@ #include <asm/trampoline.h> -extern char wakeup_code_start, wakeup_code_end; - extern unsigned long saved_video_mode; extern long saved_magic; extern int wakeup_pmode_return; -extern char swsusp_pg_dir[PAGE_SIZE]; extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S index 6ff3b5730575..63b8ab524f2c 100644 --- a/arch/x86/kernel/acpi/wakeup_rm.S +++ b/arch/x86/kernel/acpi/wakeup_rm.S @@ -2,9 +2,11 @@ * Wrapper script for the realmode binary as a transport object * before copying to low memory. */ - .section ".rodata","a" - .globl wakeup_code_start, wakeup_code_end -wakeup_code_start: +#include <asm/page_types.h> + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .globl acpi_wakeup_code +acpi_wakeup_code: .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" -wakeup_code_end: - .size wakeup_code_start, .-wakeup_code_start + .size acpi_wakeup_code, .-acpi_wakeup_code diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 65634190ffd6..6801959a8b2a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,7 +15,7 @@ static u32 *flush_words; const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f15c6f76071c..180ca240e03c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3983,7 +3983,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); return 1; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a10e516dd78d..0b4be431c620 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -227,6 +227,7 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/acpi.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -975,20 +976,10 @@ recalc: static void apm_power_off(void) { - unsigned char po_bios_call[] = { - 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ - 0x8e, 0xd0, /* movw ax,ss */ - 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ - 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ - 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ - 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ - 0xcd, 0x15 /* int $0x15 */ - }; - /* Some bioses don't like being called from CPU != 0 */ if (apm_info.realmode_power_off) { set_cpus_allowed_ptr(current, cpumask_of(0)); - machine_real_restart(po_bios_call, sizeof(po_bios_call)); + machine_real_restart(MRR_APM); } else { (void)set_system_power_state(APM_STATE_OFF); } @@ -2331,12 +2322,11 @@ static int __init apm_init(void) apm_info.disabled = 1; return -ENODEV; } - if (pm_flags & PM_ACPI) { + if (!acpi_disabled) { printk(KERN_NOTICE "apm: overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } - pm_flags |= PM_APM; /* * Set up the long jump entry point to the APM BIOS, which is called @@ -2428,7 +2418,6 @@ static void __exit apm_exit(void) kthread_stop(kapmd_task); kapmd_task = NULL; } - pm_flags &= ~PM_APM; } module_init(apm_init); diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 4a5a42b842ad..755a31e0f5b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -315,8 +315,6 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) input.count = 4; input.pointer = in_params; - input.count = 4; - input.pointer = in_params; in_params[0].type = ACPI_TYPE_BUFFER; in_params[0].buffer.length = 16; in_params[0].buffer.pointer = OSC_UUID; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index b41f7da4555b..2368e38327b3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -630,8 +630,7 @@ static void print_basics(struct powernow_k8_data *data) data->powernow_table[j].frequency/1000); } else { printk(KERN_INFO PFX - " %d : fid 0x%x (%d MHz), vid 0x%x\n", - j, + "fid 0x%x (%d MHz), vid 0x%x\n", data->powernow_table[j].index & 0xff, data->powernow_table[j].frequency/1000, data->powernow_table[j].index >> 8); diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a5..83930deec3c6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 30612764cd3b..87eab4a27dfc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -178,8 +178,6 @@ struct cpu_hw_events { */ #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) -#define PEBS_EVENT_CONSTRAINT(c, n) \ - INTEL_UEVENT_CONSTRAINT(c, n) #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index b95c66ae4a2a..bab491b8ee25 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -362,87 +362,69 @@ static int intel_pmu_drain_bts_buffer(void) * PEBS */ static struct event_constraint intel_core2_pebs_event_constraints[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_atom_pebs_event_constraints[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_nehalem_pebs_event_constraints[] = { - INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_westmere_pebs_event_constraints[] = { - INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - PEBS_EVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - - INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - PEBS_EVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_snb_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - PEBS_EVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - PEBS_EVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ - PEBS_EVENT_CONSTRAINT(0x01c4, 0xf), /* BR_INST_RETIRED.CONDITIONAL */ - PEBS_EVENT_CONSTRAINT(0x02c4, 0xf), /* BR_INST_RETIRED.NEAR_CALL */ - PEBS_EVENT_CONSTRAINT(0x04c4, 0xf), /* BR_INST_RETIRED.ALL_BRANCHES */ - PEBS_EVENT_CONSTRAINT(0x08c4, 0xf), /* BR_INST_RETIRED.NEAR_RETURN */ - PEBS_EVENT_CONSTRAINT(0x10c4, 0xf), /* BR_INST_RETIRED.NOT_TAKEN */ - PEBS_EVENT_CONSTRAINT(0x20c4, 0xf), /* BR_INST_RETIRED.NEAR_TAKEN */ - PEBS_EVENT_CONSTRAINT(0x40c4, 0xf), /* BR_INST_RETIRED.FAR_BRANCH */ - PEBS_EVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */ - PEBS_EVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - PEBS_EVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */ - PEBS_EVENT_CONSTRAINT(0x10c5, 0xf), /* BR_MISP_RETIRED.NOT_TAKEN */ - PEBS_EVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.TAKEN */ - PEBS_EVENT_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - PEBS_EVENT_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORE */ - PEBS_EVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - PEBS_EVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - PEBS_EVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - PEBS_EVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - PEBS_EVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - PEBS_EVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - PEBS_EVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - PEBS_EVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ - PEBS_EVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */ - PEBS_EVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */ - PEBS_EVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.LLC_HIT */ - PEBS_EVENT_CONSTRAINT(0x40d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */ - PEBS_EVENT_CONSTRAINT(0x01d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */ - PEBS_EVENT_CONSTRAINT(0x02d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */ - PEBS_EVENT_CONSTRAINT(0x04d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM */ - PEBS_EVENT_CONSTRAINT(0x08d2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_NONE */ - PEBS_EVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ + INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ + INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ + INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ EVENT_CONSTRAINT_END }; diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index d5cd13945d5a..642f75a68cd5 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -14,9 +14,6 @@ static void *kdump_buf_page; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 994828899e09..afa64adb75ee 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,9 +10,6 @@ #include <linux/uaccess.h> #include <linux/io.h> -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 999e2793590b..81ac6c78c01c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -322,16 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - static int __init kstack_setup(char *s) { if (!s) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cdf5bfd9d4d5..3e2ef8425316 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> +#include <linux/crash_dump.h> #include <linux/bootmem.h> #include <linux/pfn.h> #include <linux/suspend.h> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index fa41f7298c84..5c1a91974918 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1414,7 +1414,7 @@ ENTRY(async_page_fault) pushl_cfi $do_async_page_fault jmp error_code CFI_ENDPROC -END(apf_page_fault) +END(async_page_fault) #endif /* diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 7f138b3c3c52..d6d6bb361931 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,15 +34,6 @@ void __init i386_start_kernel(void) { memblock_init(); -#ifdef CONFIG_X86_TRAMPOLINE - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); -#endif - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 239046bd447f..e11e39478a49 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -136,10 +136,9 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) -#ifdef CONFIG_X86_TRAMPOLINE + /* Fixup trampoline */ addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) -#endif /* Due to ENTRY(), sometimes the empty space gets filled with * zeros. Better take a jmp than relying on empty space being diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8dc44662394b..33c07b0b122e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); } -static void kvm_guest_cpu_online(void *dummy) +static void __cpuinit kvm_guest_cpu_online(void *dummy) { kvm_guest_cpu_init(); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b4..6c9dd922ac0d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -501,6 +501,10 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that @@ -516,6 +520,10 @@ void set_personality_ia32(void) set_thread_flag(TIF_IA32); current->personality |= force_personality32; + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 715037caeb43..d3ce37edb54d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -303,68 +303,16 @@ static int __init reboot_init(void) } core_initcall(reboot_init); -/* The following code and data reboots the machine by switching to real - mode and jumping to the BIOS reset entry point, as if the CPU has - really been reset. The previous version asked the keyboard - controller to pulse the CPU reset line, which is more thorough, but - doesn't work with at least one type of 486 motherboard. It is easy - to stop this code working; hence the copious comments. */ -static const unsigned long long -real_mode_gdt_entries [3] = -{ - 0x0000000000000000ULL, /* Null descriptor */ - 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ - 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ -}; +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; -static const struct desc_ptr -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, 0 }; - -/* This is 16-bit protected mode code to disable paging and the cache, - switch to real mode and jump to the BIOS reset code. - - The instruction that switches to real mode by writing to CR0 must be - followed immediately by a far jump instruction, which set CS to a - valid value for real mode, and flushes the prefetch queue to avoid - running instructions that have already been decoded in protected - mode. - - Clears all the flags except ET, especially PG (paging), PE - (protected-mode enable) and TS (task switch for coprocessor state - save). Flushes the TLB after paging has been disabled. Sets CD and - NW, to disable the cache on a 486, and invalidates the cache. This - is more like the state of a 486 after reset. I don't know if - something else should be done for other chips. - - More could be done here to set up the registers as if a CPU reset had - occurred; hopefully real BIOSs don't assume much. */ -static const unsigned char real_mode_switch [] = -{ - 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ - 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ - 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ - 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ - 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */ - 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */ - 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */ - 0x74, 0x02, /* jz f */ - 0x0f, 0x09, /* wbinvd */ - 0x24, 0x10, /* f: andb $0x10,al */ - 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ -}; -static const unsigned char jump_to_bios [] = +void machine_real_restart(unsigned int type) { - 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ -}; + void *restart_va; + unsigned long restart_pa; + void (*restart_lowmem)(unsigned int); + u64 *lowmem_gdt; -/* - * Switch to real mode and then execute the code - * specified by the code and length parameters. - * We assume that length will aways be less that 100! - */ -void machine_real_restart(const unsigned char *code, int length) -{ local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -392,41 +340,23 @@ void machine_real_restart(const unsigned char *code, int length) too. */ *((unsigned short *)0x472) = reboot_mode; - /* For the switch to real mode, copy some code to low memory. It has - to be in the first 64k because it is running in 16-bit mode, and it - has to have the same physical and virtual address, because it turns - off paging. Copy it near the end of the first page, out of the way - of BIOS variables. */ - memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), - real_mode_switch, sizeof (real_mode_switch)); - memcpy((void *)(0x1000 - 100), code, length); - - /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); - - /* Set up a GDT from which we can load segment descriptors for real - mode. The GDT is not used in real mode; it is just needed here to - prepare the descriptors. */ - load_gdt(&real_mode_gdt); - - /* Load the data segment registers, and thus the descriptors ready for - real mode. The base address of each segment is 0x100, 16 times the - selector value being loaded here. This is so that the segment - registers don't have to be reloaded after switching to real mode: - the values are consistent for real mode operation already. */ - __asm__ __volatile__ ("movl $0x0010,%%eax\n" - "\tmovl %%eax,%%ds\n" - "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" - "\tmovl %%eax,%%ss" : : : "eax"); - - /* Jump to the 16-bit code that we copied earlier. It disables paging - and the cache, switches to real mode, and jumps to the BIOS reset - entry point. */ - __asm__ __volatile__ ("ljmp $0x0008,%0" - : - : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100))); + /* Patch the GDT in the low memory trampoline */ + lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + + restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); + restart_pa = virt_to_phys(restart_va); + restart_lowmem = (void (*)(unsigned int))restart_pa; + + /* GDT[0]: GDT self-pointer */ + lowmem_gdt[0] = + (u64)(sizeof(machine_real_restart_gdt) - 1) + + ((u64)virt_to_phys(lowmem_gdt) << 16); + /* GDT[1]: 64K real mode code segment */ + lowmem_gdt[1] = + GDT_ENTRY(0x009b, restart_pa, 0xffff); + + /* Jump to the identity-mapped low memory code */ + restart_lowmem(type); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); @@ -581,7 +511,7 @@ static void native_machine_emergency_restart(void) #ifdef CONFIG_X86_32 case BOOT_BIOS: - machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); + machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S new file mode 100644 index 000000000000..29092b38d816 --- /dev/null +++ b/arch/x86/kernel/reboot_32.S @@ -0,0 +1,135 @@ +#include <linux/linkage.h> +#include <linux/init.h> +#include <asm/segment.h> +#include <asm/page_types.h> + +/* + * The following code and data reboots the machine by switching to real + * mode and jumping to the BIOS reset entry point, as if the CPU has + * really been reset. The previous version asked the keyboard + * controller to pulse the CPU reset line, which is more thorough, but + * doesn't work with at least one type of 486 motherboard. It is easy + * to stop this code working; hence the copious comments. + * + * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. + */ + .section ".x86_trampoline","a" + .balign 16 + .code32 +ENTRY(machine_real_restart_asm) +r_base = . + /* Get our own relocated address */ + call 1f +1: popl %ebx + subl $1b, %ebx + + /* Compute the equivalent real-mode segment */ + movl %ebx, %ecx + shrl $4, %ecx + + /* Patch post-real-mode segment jump */ + movw dispatch_table(%ebx,%eax,2),%ax + movw %ax, 101f(%ebx) + movw %cx, 102f(%ebx) + + /* Set up the IDT for real mode. */ + lidtl machine_real_restart_idt(%ebx) + + /* + * Set up a GDT from which we can load segment descriptors for real + * mode. The GDT is not used in real mode; it is just needed here to + * prepare the descriptors. + */ + lgdtl machine_real_restart_gdt(%ebx) + + /* + * Load the data segment registers with 16-bit compatible values + */ + movl $16, %ecx + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + ljmpl $8, $1f - r_base + +/* + * This is 16-bit protected mode code to disable paging and the cache, + * switch to real mode and jump to the BIOS reset code. + * + * The instruction that switches to real mode by writing to CR0 must be + * followed immediately by a far jump instruction, which set CS to a + * valid value for real mode, and flushes the prefetch queue to avoid + * running instructions that have already been decoded in protected + * mode. + * + * Clears all the flags except ET, especially PG (paging), PE + * (protected-mode enable) and TS (task switch for coprocessor state + * save). Flushes the TLB after paging has been disabled. Sets CD and + * NW, to disable the cache on a 486, and invalidates the cache. This + * is more like the state of a 486 after reset. I don't know if + * something else should be done for other chips. + * + * More could be done here to set up the registers as if a CPU reset had + * occurred; hopefully real BIOSs don't assume much. This is not the + * actual BIOS entry point, anyway (that is at 0xfffffff0). + * + * Most of this work is probably excessive, but it is what is tested. + */ + .code16 +1: + xorl %ecx, %ecx + movl %cr0, %eax + andl $0x00000011, %eax + orl $0x60000000, %eax + movl %eax, %cr0 + movl %ecx, %cr3 + movl %cr0, %edx + andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + jz 2f + wbinvd +2: + andb $0x10, %al + movl %eax, %cr0 + .byte 0xea /* ljmpw */ +101: .word 0 /* Offset */ +102: .word 0 /* Segment */ + +bios: + ljmpw $0xf000, $0xfff0 + +apm: + movw $0x1000, %ax + movw %ax, %ss + movw $0xf000, %sp + movw $0x5307, %ax + movw $0x0001, %bx + movw $0x0003, %cx + int $0x15 + +END(machine_real_restart_asm) + + .balign 16 + /* These must match <asm/reboot.h */ +dispatch_table: + .word bios - r_base + .word apm - r_base +END(dispatch_table) + + .balign 16 +machine_real_restart_idt: + .word 0xffff /* Length - real mode default value */ + .long 0 /* Base - real mode default value */ +END(machine_real_restart_idt) + + .balign 16 +ENTRY(machine_real_restart_gdt) + .quad 0 /* Self-pointer, filled in by PM code */ + .quad 0 /* 16-bit code segment, filled in by PM code */ + /* + * 16-bit data segment with the selector value 16 = 0x10 and + * base value 0x100; since this is consistent with real mode + * semantics we don't have to reload the segments once CR0.PE = 0. + */ + .quad GDT_ENTRY(0x0093, 0x100, 0xffff) +END(machine_real_restart_gdt) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4a52a5f9afcb..5a0484a95ad6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -619,28 +619,6 @@ void __init reserve_standard_io_resources(void) } -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - static __init void reserve_ibft_region(void) { unsigned long addr, size = 0; @@ -944,15 +922,8 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); - reserve_trampoline_memory(); + setup_trampolines(); -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - * even before init_memory_mapping - */ - acpi_reserve_wakeup_memory(); -#endif init_gbpages(); /* max_pfn_mapped is updated here */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e9efdfd51c8d..c2871d3c71b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -711,7 +711,7 @@ do_rest: stack_start = c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ - start_ip = setup_trampoline(); + start_ip = trampoline_address(); /* So we see what's up */ announce_cpu(cpu, apicid); @@ -721,6 +721,8 @@ do_rest: * the targeted processor. */ + printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); + atomic_set(&init_deasserted, 0); if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { @@ -774,8 +776,8 @@ do_rest: pr_debug("CPU%d: has booted.\n", cpu); else { boot_error = 1; - if (*((volatile unsigned char *)trampoline_base) - == 0xA5) + if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) + == 0xA5A5A5A5) /* trampoline started but...? */ pr_err("CPU%d: Stuck ??\n", cpu); else @@ -801,7 +803,7 @@ do_rest: } /* mark "stuck" area as not stuck */ - *((volatile unsigned long *)trampoline_base) = 0; + *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { /* diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 5f181742e8f9..abce34d5c79d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -343,3 +343,4 @@ ENTRY(sys_call_table) .long sys_name_to_handle_at .long sys_open_by_handle_at .long sys_clock_adjtime + .long sys_syncfs diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a375616d77f7..a91ae7709b49 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -2,39 +2,41 @@ #include <linux/memblock.h> #include <asm/trampoline.h> +#include <asm/cacheflush.h> #include <asm/pgtable.h> -#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) -#define __trampinit -#define __trampinitdata -#else -#define __trampinit __cpuinit -#define __trampinitdata __cpuinitdata -#endif +unsigned char *x86_trampoline_base; -/* ready for x86_64 and x86 */ -unsigned char *__trampinitdata trampoline_base; - -void __init reserve_trampoline_memory(void) +void __init setup_trampolines(void) { phys_addr_t mem; + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); if (mem == MEMBLOCK_ERROR) panic("Cannot allocate trampoline\n"); - trampoline_base = __va(mem); - memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); + x86_trampoline_base = __va(mem); + memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); + + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + x86_trampoline_base, (unsigned long long)mem, size); + + memcpy(x86_trampoline_base, x86_trampoline_start, size); } /* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. + * setup_trampolines() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead. */ -unsigned long __trampinit setup_trampoline(void) +static int __init configure_trampolines(void) { - memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); - return virt_to_phys(trampoline_base); + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); + + set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT); + return 0; } +arch_initcall(configure_trampolines); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 8508237e8e43..451c0a7ef7fd 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -32,9 +32,11 @@ #include <asm/segment.h> #include <asm/page_types.h> -/* We can free up trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -.code16 +#ifdef CONFIG_SMP + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -44,7 +46,7 @@ r_base = . cli # We should be safe anyway - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running /* GDT tables in non default location kernel can be beyond 16MB and @@ -72,5 +74,10 @@ boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L +ENTRY(trampoline_status) + .long 0 + .globl trampoline_end trampoline_end: + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 075d130efcf9..09ff51799e96 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -32,13 +32,9 @@ #include <asm/segment.h> #include <asm/processor-flags.h> -#ifdef CONFIG_ACPI_SLEEP -.section .rodata, "a", @progbits -#else -/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -#endif -.code16 + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -50,7 +46,7 @@ r_base = . mov %ax, %ss - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running # Setup stack @@ -64,10 +60,13 @@ r_base = . movzx %ax, %esi # Find the 32bit trampoline location shll $4, %esi - # Fixup the vectors - addl %esi, startup_32_vector - r_base - addl %esi, startup_64_vector - r_base - addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer + # Fixup the absolute vectors + leal (startup_32 - r_base)(%esi), %eax + movl %eax, startup_32_vector - r_base + leal (startup_64 - r_base)(%esi), %eax + movl %eax, startup_64_vector - r_base + leal (tgdt - r_base)(%esi), %eax + movl %eax, (tgdt + 2 - r_base) /* * GDT tables in non default location kernel can be beyond 16MB and @@ -129,6 +128,7 @@ no_longmode: jmp no_longmode #include "verify_cpu.S" + .balign 4 # Careful these need to be in the same 64K segment as the above; tidt: .word 0 # idt limit = 0 @@ -156,6 +156,10 @@ startup_64_vector: .long startup_64 - r_base .word __KERNEL_CS, 0 + .balign 4 +ENTRY(trampoline_status) + .long 0 + trampoline_stack: .org 0x1000 trampoline_stack_end: diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0381e1f3baed..624a2016198e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -231,7 +231,7 @@ SECTIONS * output PHDR, so the next output section - .init.text - should * start another segment - init. */ - PERCPU_VADDR(0, :percpu) + PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) #endif INIT_TEXT_SECTION(PAGE_SIZE) @@ -241,6 +241,18 @@ SECTIONS INIT_DATA_SECTION(16) + /* + * Code and data for a variety of lowlevel trampolines, to be + * copied into base memory (< 1 MiB) during initialization. + * Since it is copied early, the main copy can be discarded + * afterwards. + */ + .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) { + x86_trampoline_start = .; + *(.x86_trampoline) + x86_trampoline_end = .; + } + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) @@ -292,6 +304,7 @@ SECTIONS *(.iommu_table) __iommu_table_end = .; } + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with @@ -306,7 +319,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(PAGE_SIZE) + PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index caf966781d25..0ad47b819a8b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ /* Misc flags */ +#define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, if (selector & 1 << 2) { struct desc_struct desc; memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, + ctxt->vcpu)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ @@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } +/* Does not support long mode */ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, int seg) @@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } load: ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *ss) { memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->regs[VCPU_REGS_RCX] = c->eip; @@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); @@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->eip = c->regs[VCPU_REGS_RDX]; @@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { struct desc_struct tr_seg; + u32 base3; int r; - u16 io_bitmap_ptr; - u8 perm, bit_idx = port & 0x7; + u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; + unsigned long base; - ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, - ctxt->vcpu, NULL); + base = get_desc_base(&tr_seg); +#ifdef CONFIG_X86_64 + base |= ((u64)base3) << 32; +#endif + r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, - &perm, 1, ctxt->vcpu, NULL); + r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, + NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); + ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); if (has_error_code) { @@ -2365,7 +2372,8 @@ static struct group_dual group7 = { { D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv | NoAccess), }, { - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | Priv | VendorSpecific), N, + N, D(SrcNone | ModRM | Priv | VendorSpecific), D(SrcNone | ModRM | DstMem | Mov), N, D(SrcMem16 | ModRM | Mov | Priv), N, } }; @@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ N, GD(0, &group7), N, N, - N, D(ImplicitOps), D(ImplicitOps | Priv), N, + N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ @@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = { /* 0x30 - 0x3F */ D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), D(ImplicitOps | Priv), N, - D(ImplicitOps), D(ImplicitOps | Priv), N, N, + D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), + N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ X16(D(DstReg | SrcMem | ModRM | Mov)), @@ -2741,6 +2750,9 @@ done_prefixes: if (c->d == 0 || (c->d & Undefined)) return -1; + if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + return -1; + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) c->op_bytes = 8; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3cece05e4ac4..19fe855e7953 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s) } if (!found) - found = s->kvm->bsp_vcpu; - - if (!found) return; kvm_make_request(KVM_REQ_EVENT, found); @@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s) static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); - s->isr_ack |= (1 << irq); if (s != &s->pics_state->pics[0]) irq += 8; /* @@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) pic_lock(s->pics_state); } -void kvm_pic_clear_isr_ack(struct kvm *kvm) -{ - struct kvm_pic *s = pic_irqchip(kvm); - - pic_lock(s); - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; - pic_unlock(s); -} - /* * set irq level. If an edge is detected, then the IRR is set to 1 */ @@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->irr = 0; s->imr = 0; s->isr = 0; - s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); - int irq = pic_get_irq(&s->pics[0]); - s->output = level; - if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { - s->pics[0].isr_ack &= ~(1 << irq); + if (!s->output) s->wakeup_needed = true; - } + s->output = level; } static const struct kvm_io_device_ops picdev_ops = { @@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93cf9d0d3653..2b2255b1f04b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { result = 1; - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - printk(KERN_DEBUG - "INIT on a runnable vcpu %d\n", - vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); - if (vcpu->arch.apic->regs_page) - __free_page(vcpu->arch.apic->regs_page); + if (vcpu->arch.apic->regs) + free_page((unsigned long)vcpu->arch.apic->regs); kfree(vcpu->arch.apic); } @@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (apic->regs_page == NULL) { + apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } - apic->regs = page_address(apic->regs_page); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f5fe32c5edad..52c9e6b9e725 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,7 +13,6 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; - struct page *regs_page; void *regs; gpa_t vapic_addr; struct page *vapic_page; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f02b8edc3d44..22fae7593ee7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_LEVEL_SHIFT(level) \ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - #define PT64_INDEX(address, level)\ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) @@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644); #define PT32_LEVEL_SHIFT(level) \ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) #define PT32_LVL_OFFSET_MASK(level) \ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) @@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, int min) { - struct page *page; + void *page; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); + page = (void *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; - cache->objects[cache->nobjs++] = page_address(page); + cache->objects[cache->nobjs++] = page; } return 0; } @@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static struct kvm_memory_slot * +gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); - if (slot && slot->dirty_bitmap) - return true; - return false; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); - __free_page(virt_to_page(sp->spt)); + free_page((unsigned long)sp->spt); if (!sp->role.direct) - __free_page(virt_to_page(sp->gfns)); + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); kvm_mod_used_mmu_pages(kvm, -1); } @@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +static void nonpaging_update_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + const void *pte, unsigned long mmu_seq) +{ + WARN_ON(1); +} + #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static struct kvm_memory_slot * -pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) - slot = NULL; - - return slot; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; unsigned long hva; - slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) { get_page(bad_page); return page_to_pfn(bad_page); @@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) return -1; ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); @@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->prefetch_page = paging64_prefetch_page; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; + context->update_pte = paging64_update_pte; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; + context->update_pte = paging32_update_pte; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; @@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static int init_kvm_mmu(struct kvm_vcpu *vcpu) { - vcpu->arch.update_pte.pfn = bad_pfn; - if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *new) + const void *new, unsigned long mmu_seq) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - if (!sp->role.cr4_pae) - paging32_update_pte(vcpu, sp, spte, new); - else - paging64_update_pte(vcpu, sp, spte, new); + vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); } static bool need_remote_flush(u64 old, u64 new) @@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) return !!(spte && (*spte & shadow_accessed_mask)); } -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - u64 gpte) -{ - gfn_t gfn; - pfn_t pfn; - - if (!is_present_gpte(gpte)) - return; - gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - - vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - vcpu->arch.update_pte.gfn = gfn; - vcpu->arch.update_pte.pfn = pfn; -} - static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) { u64 *spte = vcpu->arch.last_pte_updated; @@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); - u64 entry, gentry; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned quadrant; - int level; - int flooded = 0; - int npte; - int r; - int invlpg_counter; + unsigned long mmu_seq; + u64 entry, gentry, *spte; + unsigned pte_size, page_offset, misaligned, quadrant, offset; + int level, npte, invlpg_counter, r, flooded = 0; bool remote_flush, local_flush, zap_page; zap_page = remote_flush = local_flush = false; + offset = offset_in_page(gpa); pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, /* * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). + * as the current vcpu paging mode since we update the sptes only + * when they have the same mode. */ if ((is_pae(vcpu) && bytes == 4) || !new) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ @@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - mmu_guess_page_from_pte_write(vcpu, gpa, gentry); + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; - kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { + kvm_mmu_access_page(vcpu, gfn); if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { ++vcpu->arch.last_pt_write_count; @@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, + mmu_seq); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; ++spte; @@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); - if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { - kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); - vcpu->arch.update_pte.pfn = bad_pfn; - } } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) @@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (!test_bit(slot, sp->slot_bitmap)) continue; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_shadow_present_pte(pt[i]) || + !is_last_spte(pt[i], sp->role.level)) + continue; + + if (is_large_pte(pt[i])) { + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); + --kvm->stat.lpages; + continue; + } + /* avoid RMW */ if (is_writable_pte(pt[i])) update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); + } } kvm_flush_remote_tlbs(kvm); } @@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) if (nr_to_scan == 0) goto out; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx, freed_pages; @@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); out: return percpu_counter_read_positive(&kvm_total_used_mmu_pages); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a51517d9eb51..c6397795d865 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -31,7 +31,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 @@ -48,7 +47,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 #define CMPXCHG cmpxchg @@ -327,7 +325,7 @@ no_present: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) + u64 *spte, const void *pte, unsigned long mmu_seq) { pt_element_t gpte; unsigned pte_access; @@ -339,14 +337,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) + pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); return; - pfn = vcpu->arch.update_pte.pfn; - if (is_error_pfn(pfn)) - return; - if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) + } + if (mmu_notifier_retry(vcpu, mmu_seq)) return; - kvm_get_pfn(pfn); + /* * we call mmu_set_spte() with host_writable = true because that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). @@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef PT_LEVEL_MASK #undef PT_LVL_ADDR_MASK #undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 63fec1531e89..6bb15d583e47 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -135,6 +135,8 @@ struct vcpu_svm { u32 *msrpm; + ulong nmi_iret_rip; + struct nested_state nested; bool nmi_singlestep; @@ -1153,8 +1155,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); load_gs_index(svm->host.gs); #else +#ifdef CONFIG_X86_32_LAZY_GS loadsegment(gs, svm->host.gs); #endif +#endif for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } @@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.nmi_window_exits; clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; + svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); return 1; } @@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) { + /* + * If we've made progress since setting HF_IRET_MASK, we've + * executed an IRET and can allow NMI injection. + */ + if ((svm->vcpu.arch.hflags & HF_IRET_MASK) + && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } @@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif #endif reload_tss(vcpu); local_irq_disable(); - stgi(); - vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_before_handle_nmi(&svm->vcpu); + + stgi(); + + /* Any pending NMI will happen here */ + + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_after_handle_nmi(&svm->vcpu); + sync_cr8_to_lapic(vcpu); svm->next_rip = 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf89ec2cfb82..5b4cdcbd154c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO); * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually small than 41 cycles. + * According to test, this time is usually smaller than 128 cycles. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. Tests indicate that most spinlocks are held for * less than 2^12 cycles * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); @@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void) rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); if (msr & FEATURE_CONTROL_LOCKED) { + /* launched w/ TXT and VMX disabled */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && tboot_enabled()) return 1; + /* launched w/o TXT and VMX only enabled w/ TXT */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && !tboot_enabled()) { printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - " activate TXT before enabling KVM\n"); + "activate TXT before enabling KVM\n"); return 1; } + /* launched w/o TXT and VMX disabled */ + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; } return 0; - /* locked but not enabled */ } static void kvm_cpu_vmxon(u64 addr) @@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); @@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + /* + * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * vcpu. Call it here with phys address pointing 16M below 4G. + */ + if (!vcpu->kvm->arch.tss_addr) { + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + "called before entering vcpu\n"); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + } + + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) continue_rmode: kvm_mmu_reset_context(vcpu); - init_rmode(vcpu->kvm); } static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); } -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - return vmcs_readl(sf->base); -} - static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_save_segment *save; u32 ar; + if (vmx->rmode.vm86_active + && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES + || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS + || seg == VCPU_SREG_GS) + && !emulate_invalid_guest_state) { + switch (seg) { + case VCPU_SREG_TR: save = &vmx->rmode.tr; break; + case VCPU_SREG_ES: save = &vmx->rmode.es; break; + case VCPU_SREG_DS: save = &vmx->rmode.ds; break; + case VCPU_SREG_FS: save = &vmx->rmode.fs; break; + case VCPU_SREG_GS: save = &vmx->rmode.gs; break; + default: BUG(); + } + var->selector = save->selector; + var->base = save->base; + var->limit = save->limit; + ar = save->ar; + if (seg == VCPU_SREG_TR + || var->selector == vmcs_read16(sf->selector)) + goto use_saved_rmode_seg; + } var->base = vmcs_readl(sf->base); var->limit = vmcs_read32(sf->limit); var->selector = vmcs_read16(sf->selector); ar = vmcs_read32(sf->ar_bytes); +use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; @@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment s; + + if (to_vmx(vcpu)->rmode.vm86_active) { + vmx_get_segment(vcpu, &s, seg); + return s.base; + } + return vmcs_readl(sf->base); +} + static int vmx_get_cpl(struct kvm_vcpu *vcpu) { if (!is_protmode(vcpu)) @@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, u32 ar; if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; vmx->rmode.tr.base = var->base; vmx->rmode.tr.limit = var->limit; @@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) static int init_rmode_tss(struct kvm *kvm) { - gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + gfn_t fn; u16 data = 0; - int ret = 0; - int r; + int r, idx, ret = 0; + idx = srcu_read_lock(&kvm->srcu); + fn = rmode_tss_base(kvm) >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm) ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, r, ret; + int i, idx, r, ret; pfn_t identity_map_pfn; u32 tmp; @@ -2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm) return 1; ret = 0; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm->arch.ept_identity_pagetable_done = true; ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } @@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static int init_rmode(struct kvm *kvm) -{ - int idx, ret = 0; - - idx = srcu_read_lock(&kvm->srcu); - if (!init_rmode_tss(kvm)) - goto exit; - if (!init_rmode_identity_map(kvm)) - goto exit; - - ret = 1; -exit: - srcu_read_unlock(&kvm->srcu, idx); - return ret; -} - static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - if (!init_rmode(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } vmx->rmode.vm86_active = 0; @@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); if (vm_need_tpr_shadow(vmx->vcpu.kvm)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.arch.apic->regs_page)); + __pa(vmx->vcpu.arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } @@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; + if (!init_rmode_tss(kvm)) + return -ENOMEM; + return 0; } @@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" /* placeholder for guest rcx */ "push %%"R"cx \n\t" "cmp %%"R"sp, %c[host_rsp](%0) \n\t" "je 1f \n\t" @@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ - "xchg %0, (%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%"R"sp) \n\t" + "pop %0 \n\t" "mov %%"R"ax, %c[rax](%0) \n\t" "mov %%"R"bx, %c[rbx](%0) \n\t" - "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "pop"Q" %c[rcx](%0) \n\t" "mov %%"R"dx, %c[rdx](%0) \n\t" "mov %%"R"si, %c[rsi](%0) \n\t" "mov %%"R"di, %c[rdi](%0) \n\t" @@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%cr2, %%"R"ax \n\t" "mov %%"R"ax, %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), @@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), + [wordsize]"i"(sizeof(ulong)) : "cc", "memory" , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 @@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + err = -ENOMEM; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + if (!init_rmode_identity_map(kvm)) + goto free_vmcs; } return &vmx->vcpu; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3a9e4b17d66..58f517b59645 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -81,9 +81,10 @@ * - enable LME and LMA per default on 64 bit KVM */ #ifdef CONFIG_X86_64 -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +static +u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); #else -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM @@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_NMI, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - if ((cr0 ^ old_cr0) & X86_CR0_PG) + if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + } if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); @@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) unsigned long flags; s64 sdiff; - spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; kvm_x86_ops->write_tsc_offset(vcpu, offset); - spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; @@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } +static void kvmclock_reset(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); vcpu->arch.time = data; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } else return set_msr_hyperv(vcpu, msr, data); break; + case MSR_IA32_BBL_CR_CTL3: + /* Drop writes to this legacy MSR -- see rdmsr + * counterpart for further detail. + */ + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) } else return get_msr_hyperv(vcpu, msr, pdata); break; + case MSR_IA32_BBL_CR_CTL3: + /* This legacy MSR exists but isn't fully documented in current + * silicon. It is however accessed by winxp in very narrow + * scenarios where it sets bit #19, itself documented as + * a "reserved" bit. Best effort attempt to source coherent + * read data here should the balance of the register be + * interpreted by the guest: + * + * L2 cache control register 3: 64GB range, 256KB size, + * enabled, latency 0x1, configured + */ + data = 0xbe702111; + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (check_tsc_unstable()) { kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); vcpu->arch.tsc_catchup = 1; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - printk(KERN_DEBUG "kvm: set_mce: " - "injects mce exception while " - "previous one is in progress!\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } @@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; - if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) kvm_x86_ops->set_interrupt_shadow(vcpu, events->interrupt.shadow); @@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg, return get_segment_base(vcpu, seg); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, var.limit >>= 12; set_desc_limit(desc, var.limit); set_desc_base(desc, (unsigned long)var.base); +#ifdef CONFIG_X86_64 + if (base3) + *base3 = var.base >> 32; +#endif desc->type = var.type; desc->s = var.s; desc->dpl = var.dpl; @@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, return true; } -static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, kvm_get_segment(vcpu, &var, seg); var.base = get_desc_base(desc); +#ifdef CONFIG_X86_64 + var.base |= ((u64)base3) << 32; +#endif var.limit = get_desc_limit(desc); if (desc->g) var.limit = (var.limit << 12) | 0xfff; @@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.have_exception = false; vcpu->arch.emulate_ctxt.perm_ok = false; + vcpu->arch.emulate_ctxt.only_vendor_specific_insn + = emulation_type & EMULTYPE_TRAP_UD; + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); - if (r == X86EMUL_PROPAGATE_FAULT) - goto done; trace_kvm_emulate_insn_start(vcpu); - - /* Only allow emulation of specific instructions on #UD - * (namely VMMCALL, sysenter, sysexit, syscall)*/ - if (emulation_type & EMULTYPE_TRAP_UD) { - if (!c->twobyte) - return EMULATE_FAIL; - switch (c->b) { - case 0x01: /* VMMCALL */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - return EMULATE_FAIL; - break; - case 0x34: /* sysenter */ - case 0x35: /* sysexit */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - case 0x05: /* syscall */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - default: - return EMULATE_FAIL; - } - - if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) - return EMULATE_FAIL; - } - ++vcpu->stat.insn_emulation; if (r) { + if (emulation_type & EMULTYPE_TRAP_UD) + return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) @@ -4452,7 +4456,6 @@ restart: return handle_emulation_failure(vcpu); } -done: if (vcpu->arch.emulate_ctxt.have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; @@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -5185,6 +5188,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + vcpu->arch.nmi_pending = true; } r = kvm_mmu_reload(vcpu); @@ -5213,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_fpu(vcpu); kvm_load_guest_xcr0(vcpu); - atomic_set(&vcpu->guest_mode, 1); - smp_wmb(); + vcpu->mode = IN_GUEST_MODE; + + /* We should set ->mode before check ->requests, + * see the comment in make_all_cpus_request. + */ + smp_mb(); local_irq_disable(); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); @@ -5256,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -5574,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int pending_vec, max_bits; + int pending_vec, max_bits, idx; struct desc_ptr dt; dt.size = sregs->idt.limit; @@ -5603,10 +5612,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (sregs->cr4 & X86_CR4_OSXSAVE) update_cpuid(vcpu); + + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); @@ -5617,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -5814,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); @@ -5878,6 +5885,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + kvmclock_reset(vcpu); + kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; @@ -6005,7 +6014,7 @@ int kvm_arch_init_vm(struct kvm *kvm) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - spin_lock_init(&kvm->arch.tsc_write_lock); + raw_spin_lock_init(&kvm->arch.tsc_write_lock); return 0; } @@ -6103,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; if (!user_alloc && !old.user_alloc && old.rmap && !npages) { int ret; @@ -6118,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, "failed to munmap memory\n"); } + if (!kvm->arch.n_requested_mmu_pages) + nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + spin_lock(&kvm->mmu_lock); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } - kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); } @@ -6157,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (atomic_xchg(&vcpu->guest_mode, 0)) + if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) smp_send_reschedule(cpu); put_cpu(); } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index e10cf070ede0..f2479f19ddde 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -42,4 +42,5 @@ else lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o + lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S new file mode 100644 index 000000000000..3e8b08a6de2b --- /dev/null +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -0,0 +1,59 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +.text + +/* + * Inputs: + * %rsi : memory location to compare + * %rax : low 64 bits of old value + * %rdx : high 64 bits of old value + * %rbx : low 64 bits of new value + * %rcx : high 64 bits of new value + * %al : Operation successful + */ +ENTRY(this_cpu_cmpxchg16b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not +# via the ZF. Caller will access %al to get result. +# +# Note that this is only useful for a cpuops operation. Meaning that we +# do *not* have a fully atomic operation but just an operation that is +# *atomic* on a single cpu (as provided by the this_cpu_xx class of +# macros). +# +this_cpu_cmpxchg16b_emu: + pushf + cli + + cmpq %gs:(%rsi), %rax + jne not_same + cmpq %gs:8(%rsi), %rdx + jne not_same + + movq %rbx, %gs:(%rsi) + movq %rcx, %gs:8(%rsi) + + popf + mov $1, %al + ret + + not_same: + popf + xor %al,%al + ret + +CFI_ENDPROC + +ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7026505a33ba..794233587287 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -51,6 +51,7 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> +#include <asm/uv/uv.h> #include <asm/setup.h> static int __init parse_direct_gbpages_off(char *arg) @@ -861,18 +862,18 @@ static struct vm_area_struct gate_vma = { .vm_flags = VM_READ | VM_EXEC }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) + if (!mm || mm->context.ia32_compat) return NULL; #endif return &gate_vma; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task); + struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; @@ -881,11 +882,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr) } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. */ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } @@ -899,6 +900,19 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } +#ifdef CONFIG_X86_UV +#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) + +unsigned long memory_block_size_bytes(void) +{ + if (is_uv_system()) { + printk(KERN_INFO "UV: memory block size 2GB\n"); + return 2UL * 1024 * 1024 * 1024; + } + return MIN_MEMORY_BLOCK_SIZE; +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 9fadec074142..98ab13058f89 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -50,7 +50,7 @@ static inline void setup_num_counters(void) #endif } -static int inline addr_increment(void) +static inline int addr_increment(void) { #ifdef CONFIG_SMP return smp_num_siblings == 2 ? 2 : 1; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 87e6c8323117..8201165bae28 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -597,21 +597,18 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { - r->name = "PIIX/ICH"; - r->get = pirq_piix_get; - r->set = pirq_piix_set; - return 1; - } return 0; } diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 8c4085a95ef1..e37b407a0ee8 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -50,7 +50,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, name = "ioapic-level"; } - irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); + irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); @@ -237,6 +237,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) { int rc; int share = 1; + int pirq; u8 gsi; rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); @@ -246,13 +247,21 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) return rc; } + rc = xen_allocate_pirq_gsi(gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n", + gsi, rc); + return rc; + } + pirq = rc; + if (gsi < NR_IRQS_LEGACY) share = 0; - rc = xen_allocate_pirq(gsi, share, "pcifront"); + rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n", - gsi, rc); + dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", + gsi, pirq, rc); return rc; } @@ -309,7 +318,7 @@ int __init pci_xen_hvm_init(void) #ifdef CONFIG_XEN_DOM0 static int xen_register_pirq(u32 gsi, int triggering) { - int rc, irq; + int rc, pirq, irq = -1; struct physdev_map_pirq map_irq; int shareable = 0; char *name; @@ -325,17 +334,20 @@ static int xen_register_pirq(u32 gsi, int triggering) name = "ioapic-level"; } - irq = xen_allocate_pirq(gsi, shareable, name); - - printk(KERN_DEBUG "xen: --> irq=%d\n", irq); + pirq = xen_allocate_pirq_gsi(gsi); + if (pirq < 0) + goto out; + irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); if (irq < 0) goto out; + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq); + map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; map_irq.index = gsi; - map_irq.pirq = irq; + map_irq.pirq = pirq; rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { @@ -422,13 +434,18 @@ static int __init pci_xen_initial_domain(void) void __init xen_setup_pirqs(void) { - int irq; + int pirq, irq; pci_xen_initial_domain(); if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) - xen_allocate_pirq(irq, 0, "xt-pic"); + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + pirq = xen_allocate_pirq_gsi(irq); + if (WARN(pirq < 0, + "Could not allocate PIRQ for legacy interrupt\n")) + break; + irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); + } return; } diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index 127775696d6c..99513642a0e6 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/platform_device.h> #include <linux/pm.h> +#include <linux/mfd/core.h> #include <asm/io.h> #include <asm/olpc.h> @@ -56,25 +57,24 @@ static void xo1_power_off(void) static int __devinit olpc_xo1_probe(struct platform_device *pdev) { struct resource *res; + int err; /* don't run on non-XOs */ if (!machine_is_olpc()) return -ENODEV; + err = mfd_cell_enable(pdev); + if (err) + return err; + res = platform_get_resource(pdev, IORESOURCE_IO, 0); if (!res) { dev_err(&pdev->dev, "can't fetch device resource info\n"); return -EIO; } - - if (!request_region(res->start, resource_size(res), DRV_NAME)) { - dev_err(&pdev->dev, "can't request region\n"); - return -EIO; - } - - if (strcmp(pdev->name, "cs5535-pms") == 0) + if (strcmp(pdev->name, "olpc-xo1-pms") == 0) pms_base = res->start; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-ac-acpi") == 0) acpi_base = res->start; /* If we have both addresses, we can override the poweroff hook */ @@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev) static int __devexit olpc_xo1_remove(struct platform_device *pdev) { - struct resource *r; - - r = platform_get_resource(pdev, IORESOURCE_IO, 0); - release_region(r->start, resource_size(r)); + mfd_cell_disable(pdev); - if (strcmp(pdev->name, "cs5535-pms") == 0) + if (strcmp(pdev->name, "olpc-xo1-pms") == 0) pms_base = 0; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0) acpi_base = 0; pm_power_off = NULL; @@ -104,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev) static struct platform_driver cs5535_pms_drv = { .driver = { - .name = "cs5535-pms", + .name = "olpc-xo1-pms", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = { static struct platform_driver cs5535_acpi_drv = { .driver = { - .name = "cs5535-acpi", + .name = "olpc-xo1-acpi", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -124,26 +121,27 @@ static int __init olpc_xo1_init(void) { int r; - r = platform_driver_register(&cs5535_pms_drv); + r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms"); if (r) return r; - r = platform_driver_register(&cs5535_acpi_drv); + r = mfd_shared_platform_driver_register(&cs5535_acpi_drv, + "cs5535-acpi"); if (r) - platform_driver_unregister(&cs5535_pms_drv); + mfd_shared_platform_driver_unregister(&cs5535_pms_drv); return r; } static void __exit olpc_xo1_exit(void) { - platform_driver_unregister(&cs5535_acpi_drv); - platform_driver_unregister(&cs5535_pms_drv); + mfd_shared_platform_driver_unregister(&cs5535_acpi_drv); + mfd_shared_platform_driver_unregister(&cs5535_pms_drv); } MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:olpc-xo1"); +MODULE_ALIAS("platform:cs5535-pms"); module_init(olpc_xo1_init); module_exit(olpc_xo1_exit); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 36df991985b2..468d591dde31 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -417,24 +417,25 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - struct mm_struct *mm = tsk->mm; - - /* Check to see if this task was created in compat vdso mode */ + /* + * Check to see if the corresponding task was created in compat vdso + * mode. + */ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) return &gate_vma; return NULL; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(task); + const struct vm_area_struct *vma = get_gate_vma(mm); return vma && addr >= vma->vm_start && addr < vma->vm_end; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e4343fe488ed..1c7121ba18ff 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -38,7 +38,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on XEN && PM + depends on XEN default y config XEN_DEBUG_FS diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 21058ad1e5e3..c82df6c9c0f0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -79,8 +79,7 @@ /* * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. + * Also protects non-atomic updates of current_pages and balloon lists. */ DEFINE_SPINLOCK(xen_reservation_lock); |