diff options
Diffstat (limited to 'arch/x86')
140 files changed, 2270 insertions, 2448 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51c59015b280..72ace9515a07 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_COUNTERS if (!M386 && !M486) + select HAVE_PERF_EVENTS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -86,10 +86,6 @@ config STACKTRACE_SUPPORT config HAVE_LATENCYTOP_SUPPORT def_bool y -config FAST_CMPXCHG_LOCAL - bool - default y - config MMU def_bool y @@ -432,6 +428,17 @@ config X86_NUMAQ of Flat Logical. You will need a new lynxer.elf file to flash your firmware with - send email to <Martin.Bligh@us.ibm.com>. +config X86_SUPPORTS_MEMORY_FAILURE + bool + # MCE code calls memory_failure(): + depends on X86_MCE + # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags: + depends on !X86_NUMAQ + # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH: + depends on X86_64 || !SPARSEMEM + select ARCH_SUPPORTS_MEMORY_FAILURE + default y + config X86_VISWS bool "SGI 320/540 (Visual Workstation)" depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT @@ -484,7 +491,7 @@ if PARAVIRT_GUEST source "arch/x86/xen/Kconfig" config VMI - bool "VMI Guest support" + bool "VMI Guest support (DEPRECATED)" select PARAVIRT depends on X86_32 ---help--- @@ -493,6 +500,15 @@ config VMI at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. + As of September 2009, VMware has started a phased retirement + of this feature from VMware's products. Please see + feature-removal-schedule.txt for details. If you are + planning to enable this option, please note that you cannot + live migrate a VMI enabled VM to a future VMware product, + which doesn't support VMI. So if you expect your kernel to + seamlessly migrate to newer VMware products, keep this + disabled. + config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT @@ -1204,6 +1220,10 @@ config ARCH_DISCONTIGMEM_DEFAULT def_bool y depends on NUMA && X86_32 +config ARCH_PROC_KCORE_TEXT + def_bool y + depends on X86_64 && PROC_KCORE + config ARCH_SPARSEMEM_DEFAULT def_bool y depends on X86_64 @@ -1423,12 +1443,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. -config CC_STACKPROTECTOR_ALL - bool - config CC_STACKPROTECTOR bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" - select CC_STACKPROTECTOR_ALL ---help--- This option turns on the -fstack-protector GCC feature. This feature puts, at the beginning of functions, a canary value on @@ -1662,6 +1678,8 @@ source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" +source "drivers/sfi/Kconfig" + config X86_APM_BOOT bool default y @@ -1857,7 +1875,7 @@ config PCI_DIRECT config PCI_MMCONFIG def_bool y - depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) + depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY) config PCI_OLPC def_bool y @@ -1895,7 +1913,7 @@ config DMAR_DEFAULT_ON config DMAR_BROKEN_GFX_WA def_bool n prompt "Workaround broken graphics drivers (going away soon)" - depends on DMAR + depends on DMAR && BROKEN ---help--- Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 527519b8a9f9..2649840d888f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -400,7 +400,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 + depends on !M386 && !M486 # this should be set for all -march=.. options where the compiler # generates cmov. @@ -412,6 +412,7 @@ config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 default "6" if X86_32 && X86_P6_NOP + default "5" if X86_32 && X86_CMPXCHG64 default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7983c420eaf2..d2d24c9ee64d 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -76,7 +76,6 @@ ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) stackp-y := -fstack-protector - stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all KBUILD_CFLAGS += $(stackp-y) else $(warning stack protector enabled but no compiler support) @@ -179,8 +178,8 @@ archclean: define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' echo ' install - Install kernel using' - echo ' (your) ~/bin/installkernel or' - echo ' (distribution) /sbin/installkernel or' + echo ' (your) ~/bin/$(INSTALLKERNEL) or' + echo ' (distribution) /sbin/$(INSTALLKERNEL) or' echo ' install to $$(INSTALL_PATH) and run lilo' echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 75e4f001e706..f543b70ffae2 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -23,13 +23,14 @@ */ .text +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page_types.h> #include <asm/boot.h> #include <asm/asm-offsets.h> - .section ".text.head","ax",@progbits + __HEAD ENTRY(startup_32) cld /* diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index f62c284db9eb..077e1b69198e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -24,6 +24,7 @@ .code32 .text +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/pgtable_types.h> @@ -33,7 +34,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> - .section ".text.head" + __HEAD .code32 ENTRY(startup_32) cld diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index cc353e1b3ffd..f4193bb48782 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,3 +1,5 @@ +#include <asm-generic/vmlinux.lds.h> + OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #undef i386 @@ -18,9 +20,9 @@ SECTIONS * address 0. */ . = 0; - .text.head : { + .head.text : { _head = . ; - *(.text.head) + HEAD_TEXT _ehead = . ; } .rodata.compressed : { diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh index 8d60ee15dfd9..d13ec1c38640 100644 --- a/arch/x86/boot/install.sh +++ b/arch/x86/boot/install.sh @@ -33,8 +33,8 @@ verify "$3" # User may have a custom install script -if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi -if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi +if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi # Default install - same as make zlilo diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 0f6ec455a2b1..03c0683636b6 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -53,6 +53,9 @@ SECTIONS /DISCARD/ : { *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= 0x8000, "Setup too big!"); . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); /* Necessary for the very-old-loader check to work... */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 585edebe12cf..49c552c060e9 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -82,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, return -EINVAL; } - if (irq_fpu_usable()) + if (!irq_fpu_usable()) err = crypto_aes_expand_key(ctx, in_key, key_len); else { kernel_fpu_begin(); @@ -103,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (irq_fpu_usable()) + if (!irq_fpu_usable()) crypto_aes_encrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -116,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (irq_fpu_usable()) + if (!irq_fpu_usable()) crypto_aes_decrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -342,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { struct ablkcipher_request *cryptd_req = ablkcipher_request_ctx(req); memcpy(cryptd_req, req, sizeof(*req)); @@ -363,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - if (irq_fpu_usable()) { + if (!irq_fpu_usable()) { struct ablkcipher_request *cryptd_req = ablkcipher_request_ctx(req); memcpy(cryptd_req, req, sizeof(*req)); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ba331bfd1112..581b0568fe19 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -21,8 +21,8 @@ #define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit int_ret_from_sys_call -#define sysretl_audit int_ret_from_sys_call +#define sysexit_audit ia32_ret_from_sys_call +#define sysretl_audit ia32_ret_from_sys_call #endif #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) @@ -39,12 +39,12 @@ .endm /* clobbers %eax */ - .macro CLEAR_RREGS _r9=rax + .macro CLEAR_RREGS offset=0, _r9=rax xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) + movq %rax,\offset+R11(%rsp) + movq %rax,\offset+R10(%rsp) + movq %\_r9,\offset+R9(%rsp) + movq %rax,\offset+R8(%rsp) .endm /* @@ -172,6 +172,10 @@ sysexit_from_sys_call: movl RIP-R11(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx RESTORE_ARGS 1,24,1,1,1,1 + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 popfq CFI_ADJUST_CFA_OFFSET -8 /*CFI_RESTORE rflags*/ @@ -200,9 +204,9 @@ sysexit_from_sys_call: movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ .endm - .macro auditsys_exit exit,ebpsave=RBP + .macro auditsys_exit exit testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) - jnz int_ret_from_sys_call + jnz ia32_ret_from_sys_call TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ @@ -213,13 +217,13 @@ sysexit_from_sys_call: call audit_syscall_exit GET_THREAD_INFO(%r10) movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ - movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF testl %edi,TI_flags(%r10) - jnz int_with_check - jmp \exit + jz \exit + CLEAR_RREGS -ARGOFFSET + jmp int_with_check .endm sysenter_auditsys: @@ -329,6 +333,9 @@ sysretl_from_sys_call: CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ + xorq %r10,%r10 + xorq %r9,%r9 + xorq %r8,%r8 TRACE_IRQS_ON movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp @@ -343,7 +350,7 @@ cstar_auditsys: jmp cstar_dispatch sysretl_audit: - auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */ + auditsys_exit sysretl_from_sys_call #endif cstar_tracesys: @@ -353,7 +360,7 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_REST - CLEAR_RREGS r9 + CLEAR_RREGS 0, r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter @@ -425,6 +432,8 @@ ia32_do_call: call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: movq %rax,RAX-ARGOFFSET(%rsp) +ia32_ret_from_sys_call: + CLEAR_RREGS -ARGOFFSET jmp int_ret_from_sys_call ia32_tracesys: @@ -442,8 +451,8 @@ END(ia32_syscall) ia32_badsys: movq $0,ORIG_RAX-ARGOFFSET(%rsp) - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp int_ret_from_sys_call + movq $-ENOSYS,%rax + jmp ia32_sysret quiet_ni_syscall: movq $-ENOSYS,%rax @@ -831,5 +840,5 @@ ia32_sys_call_table: .quad compat_sys_preadv .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_counter_open + .quad sys_perf_event_open ia32_syscall_end: diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 20d1465a2ab0..4518dc500903 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) #else /* !CONFIG_ACPI */ -#define acpi_disabled 1 #define acpi_lapic 0 #define acpi_ioapic 0 static inline void acpi_noirq_set(void) { } diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index ac95995b7bad..5af2982133b5 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -23,18 +23,13 @@ #include <linux/irqreturn.h> #ifdef CONFIG_AMD_IOMMU -extern int amd_iommu_init(void); -extern int amd_iommu_init_dma_ops(void); -extern int amd_iommu_init_passthrough(void); + extern void amd_iommu_detect(void); -extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_flush_all_domains(void); -extern void amd_iommu_flush_all_devices(void); -extern void amd_iommu_shutdown(void); + #else -static inline int amd_iommu_init(void) { return -ENODEV; } + static inline void amd_iommu_detect(void) { } -static inline void amd_iommu_shutdown(void) { } + #endif #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h new file mode 100644 index 000000000000..84786fb9a23b --- /dev/null +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_X86_AMD_IOMMU_PROTO_H +#define _ASM_X86_AMD_IOMMU_PROTO_H + +struct amd_iommu; + +extern int amd_iommu_init_dma_ops(void); +extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_flush_all_domains(void); +extern void amd_iommu_flush_all_devices(void); +extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); + +#ifndef CONFIG_AMD_IOMMU_STATS + +static inline void amd_iommu_stats_init(void) { } + +#endif /* !CONFIG_AMD_IOMMU_STATS */ + +#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 2a2cc7a78a81..ba19ad4c47d0 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -25,6 +25,11 @@ #include <linux/spinlock.h> /* + * Maximum number of IOMMUs supported + */ +#define MAX_IOMMUS 32 + +/* * some size calculation constants */ #define DEV_TABLE_ENTRY_SIZE 32 @@ -206,6 +211,9 @@ extern bool amd_iommu_dump; printk(KERN_INFO "AMD-Vi: " format, ## arg); \ } while(0); +/* global flag if IOMMUs cache non-present entries */ +extern bool amd_iommu_np_cache; + /* * Make iterating over all IOMMUs easier */ @@ -226,6 +234,8 @@ extern bool amd_iommu_dump; * independent of their use. */ struct protection_domain { + struct list_head list; /* for list of all protection domains */ + struct list_head dev_list; /* List of all devices in this domain */ spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ @@ -233,7 +243,20 @@ struct protection_domain { unsigned long flags; /* flags to find out type of domain */ bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ + unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ void *priv; /* private data */ + +}; + +/* + * This struct contains device specific data for the IOMMU + */ +struct iommu_dev_data { + struct list_head list; /* For domain->dev_list */ + struct device *dev; /* Device this data belong to */ + struct device *alias; /* The Alias Device */ + struct protection_domain *domain; /* Domain the device is bound to */ + atomic_t bind; /* Domain attach reverent count */ }; /* @@ -291,6 +314,9 @@ struct dma_ops_domain { struct amd_iommu { struct list_head list; + /* Index within the IOMMU array */ + int index; + /* locks the accesses to the hardware */ spinlock_t lock; @@ -357,6 +383,21 @@ struct amd_iommu { extern struct list_head amd_iommu_list; /* + * Array with pointers to each IOMMU struct + * The indices are referenced in the protection domains + */ +extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; + +/* Number of IOMMUs present in the system */ +extern int amd_iommus_present; + +/* + * Declarations for the global list of all protection domains + */ +extern spinlock_t amd_iommu_pd_lock; +extern struct list_head amd_iommu_pd_list; + +/* * Structure defining one entry in the device table */ struct dev_table_entry { @@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order; /* largest PCI device id we expect translation requests for */ extern u16 amd_iommu_last_bdf; -/* data structures for protection domain handling */ -extern struct protection_domain **amd_iommu_pd_table; - /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* will be 1 if device isolation is enabled */ -extern bool amd_iommu_isolate; - /* * If true, the addresses will be flushed on unmap time, not when * they are reused @@ -462,11 +497,6 @@ struct __iommu_counter { #define ADD_STATS_COUNTER(name, x) #define SUB_STATS_COUNTER(name, x) -static inline void amd_iommu_stats_init(void) { } - #endif /* CONFIG_AMD_IOMMU_STATS */ -/* some function prototypes */ -extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); - #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index d9cf1cd156d2..f654d1bb17fb 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -22,14 +22,14 @@ do { \ ".popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (sizeof(struct bug_entry))); \ - for (;;) ; \ + unreachable(); \ } while (0) #else #define BUG() \ do { \ asm volatile("ud2"); \ - for (;;) ; \ + unreachable(); \ } while (0) #endif diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 5d367caa0e36..549860d3be8f 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_CACHE_H #define _ASM_X86_CACHE_H +#include <linux/linkage.h> + /* L1 cache line size */ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) @@ -13,7 +15,7 @@ #ifdef CONFIG_SMP #define __cacheline_aligned_in_smp \ __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ - __attribute__((__section__(".data.page_aligned"))) + __page_aligned_data #endif #endif diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index b03bedb62aa7..0918654305af 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -62,10 +62,8 @@ struct cal_chipset_ops { extern int use_calgary; #ifdef CONFIG_CALGARY_IOMMU -extern int calgary_iommu_init(void); extern void detect_calgary(void); #else -static inline int calgary_iommu_init(void) { return 1; } static inline void detect_calgary(void) { return; } #endif diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 7c5ef8b14d92..46fc474fd819 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -161,7 +161,8 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, "adcl $0, %0 ;\n" : "=&r" (sum) : "r" (saddr), "r" (daddr), - "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)); + "r" (htonl(len)), "r" (htonl(proto)), "0" (sum) + : "memory"); return csum_fold(sum); } diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 82ceb788a981..ee1931be6593 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -312,19 +312,23 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); -#define cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 4)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - __ret; \ -}) +#define cmpxchg64(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (o); \ + __typeof__(*(ptr)) __new = (n); \ + alternative_io("call cmpxchg8b_emu", \ + "lock; cmpxchg8b (%%esi)" , \ + X86_FEATURE_CX8, \ + "=A" (__ret), \ + "S" ((ptr)), "0" (__old), \ + "b" ((unsigned int)__new), \ + "c" ((unsigned int)(__new>>32)) \ + : "memory"); \ + __ret; }) + + + #define cmpxchg64_local(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e8de2f6f5ca5..617bd56b3070 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -288,7 +288,7 @@ static inline void load_LDT(mm_context_t *pc) static inline unsigned long get_desc_base(const struct desc_struct *desc) { - return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); + return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); } static inline void set_desc_base(struct desc_struct *desc, unsigned long base) diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index cee34e9ca45b..029f230ab637 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -8,7 +8,7 @@ struct dev_archdata { #ifdef CONFIG_X86_64 struct dma_map_ops *dma_ops; #endif -#ifdef CONFIG_DMAR +#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ #endif }; diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 0ee770d23d0e..0f6c02f3b7d4 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -14,7 +14,14 @@ #include <asm/swiotlb.h> #include <asm-generic/dma-coherent.h> -extern dma_addr_t bad_dma_address; +#ifdef CONFIG_ISA +# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) +#else +# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) +#endif + +#define DMA_ERROR_CODE 0 + extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; @@ -42,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); - return (dma_addr == bad_dma_address); + return (dma_addr == DMA_ERROR_CODE); } #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) @@ -124,10 +131,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; - if (!dev) { + if (!dev) dev = &x86_dma_fallback_dev; - gfp |= GFP_DMA; - } if (!is_device_dma_capable(dev)) return NULL; diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 5e3f2044f0d3..f5693c81a1db 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 6cfdafa409d8..4ac5b0f33fc1 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed; extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); +extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); extern void gart_iommu_hole_init(void); @@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void); static inline void early_gart_iommu_check(void) { } -static inline void gart_iommu_init(void) -{ -} -static inline void gart_iommu_shutdown(void) -{ -} static inline void gart_parse_options(char *options) { } diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index fd6d21bbee6c..345c99cef152 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H -extern void pci_iommu_shutdown(void); -extern void no_iommu_init(void); extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3be000435fad..d83892226f73 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b608a64c5814..f1363b72364f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -133,6 +133,8 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif +extern void (*x86_mce_decode_callback)(struct mce *m); + void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index f923203dc39a..4a2d4e0c18d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (likely(prev != next)) { /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); #ifdef CONFIG_SMP percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); #endif - cpu_set(cpu, next->cpu_vm_mask); + cpumask_set_cpu(cpu, mm_cpumask(next)); /* Re-load page tables */ load_cr3(next->pgd); @@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, percpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); - if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index e63cf7d441e1..139d4c1a33a7 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog; #define NMI_INVALID 3 struct ctl_table; -struct file; -extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, +extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 8aebcc41041d..efb38994859c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -840,42 +840,22 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) static inline unsigned long __raw_local_save_flags(void) { - unsigned long f; - - asm volatile(paravirt_alt(PARAVIRT_CALL) - : "=a"(f) - : paravirt_type(pv_irq_ops.save_fl), - paravirt_clobber(CLBR_EAX) - : "memory", "cc"); - return f; + return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); } static inline void raw_local_irq_restore(unsigned long f) { - asm volatile(paravirt_alt(PARAVIRT_CALL) - : "=a"(f) - : PV_FLAGS_ARG(f), - paravirt_type(pv_irq_ops.restore_fl), - paravirt_clobber(CLBR_EAX) - : "memory", "cc"); + PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); } static inline void raw_local_irq_disable(void) { - asm volatile(paravirt_alt(PARAVIRT_CALL) - : - : paravirt_type(pv_irq_ops.irq_disable), - paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc"); + PVOP_VCALLEE0(pv_irq_ops.irq_disable); } static inline void raw_local_irq_enable(void) { - asm volatile(paravirt_alt(PARAVIRT_CALL) - : - : paravirt_type(pv_irq_ops.irq_enable), - paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc"); + PVOP_VCALLEE0(pv_irq_ops.irq_enable); } static inline unsigned long __raw_local_irq_save(void) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index dd0f5b32489d..9357473c8da0 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -494,10 +494,11 @@ int paravirt_disable_iospace(void); #define EXTRA_CLOBBERS #define VEXTRA_CLOBBERS #else /* CONFIG_X86_64 */ +/* [re]ax isn't an arg, but the return val */ #define PVOP_VCALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + __edx = __edx, __ecx = __ecx, __eax = __eax +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) @@ -509,6 +510,7 @@ int paravirt_disable_iospace(void); "=c" (__ecx) #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) +/* void functions are still allowed [re]ax for scratch */ #define PVOP_VCALLEE_CLOBBERS "=a" (__eax) #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS @@ -583,8 +585,8 @@ int paravirt_disable_iospace(void); VEXTRA_CLOBBERS, \ pre, post, ##__VA_ARGS__) -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ +#define __PVOP_VCALLEESAVE(op, pre, post, ...) \ + ____PVOP_VCALL(op.func, CLBR_RET_REG, \ PVOP_VCALLEE_CLOBBERS, , \ pre, post, ##__VA_ARGS__) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index f76a162c082c..ada8c201d513 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -143,7 +143,11 @@ static inline int __pcibus_to_node(const struct pci_bus *bus) static inline const struct cpumask * cpumask_of_pcibus(const struct pci_bus *bus) { - return cpumask_of_node(__pcibus_to_node(bus)); + int node; + + node = __pcibus_to_node(bus); + return (node == -1) ? cpu_online_mask : + cpumask_of_node(node); } #endif diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_event.h index e7b7c938ae27..ad7ce3fd5065 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_event.h @@ -1,8 +1,8 @@ -#ifndef _ASM_X86_PERF_COUNTER_H -#define _ASM_X86_PERF_COUNTER_H +#ifndef _ASM_X86_PERF_EVENT_H +#define _ASM_X86_PERF_EVENT_H /* - * Performance counter hw details: + * Performance event hw details: */ #define X86_PMC_MAX_GENERIC 8 @@ -43,7 +43,7 @@ union cpuid10_eax { struct { unsigned int version_id:8; - unsigned int num_counters:8; + unsigned int num_events:8; unsigned int bit_width:8; unsigned int mask_length:8; } split; @@ -52,7 +52,7 @@ union cpuid10_eax { union cpuid10_edx { struct { - unsigned int num_counters_fixed:4; + unsigned int num_events_fixed:4; unsigned int reserved:28; } split; unsigned int full; @@ -60,7 +60,7 @@ union cpuid10_edx { /* - * Fixed-purpose performance counters: + * Fixed-purpose performance events: */ /* @@ -87,22 +87,22 @@ union cpuid10_edx { /* * We model BTS tracing as another fixed-mode PMC. * - * We choose a value in the middle of the fixed counter range, since lower - * values are used by actual fixed counters and higher values are used + * We choose a value in the middle of the fixed event range, since lower + * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(void); +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +extern void perf_events_lapic_init(void); -#define PERF_COUNTER_INDEX_OFFSET 0 +#define PERF_EVENT_INDEX_OFFSET 0 #else -static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(void) { } +static inline void init_hw_perf_events(void) { } +static inline void perf_events_lapic_init(void) { } #endif -#endif /* _ASM_X86_PERF_COUNTER_H */ +#endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7b467bf3c680..d1f4a760be23 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -277,6 +277,7 @@ static inline pteval_t pte_flags(pte_t pte) typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; +extern void set_nx(void); extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c3429e8b2424..c9786480f0fe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1000,7 +1000,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ +extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 6a84ed166aec..1e796782cd7b 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu) smp_ops.send_call_func_single_ipi(cpu); } -#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) { smp_ops.send_call_func_ipi(mask); diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index b9e4e20174fb..87ffcb12a1b8 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -3,17 +3,14 @@ #include <linux/swiotlb.h> -/* SWIOTLB interface */ - -extern int swiotlb_force; - #ifdef CONFIG_SWIOTLB extern int swiotlb; -extern void pci_swiotlb_init(void); +extern int pci_swiotlb_init(void); #else #define swiotlb 0 -static inline void pci_swiotlb_init(void) +static inline int pci_swiotlb_init(void) { + return 0; } #endif diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d82f39bb7905..8d33bc5462d1 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -1,7 +1,7 @@ /* * Access to user system call parameters and results * - * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -16,13 +16,13 @@ #include <linux/sched.h> #include <linux/err.h> -static inline long syscall_get_nr(struct task_struct *task, - struct pt_regs *regs) +/* + * Only the low 32 bits of orig_ax are meaningful, so we return int. + * This importantly ignores the high bits on 64-bit, so comparisons + * sign-extend the low 32 bits. + */ +static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - /* - * We always sign-extend a -1 value being set here, - * so this is always either -1L or a syscall number. - */ return regs->orig_ax; } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6f0695d744bf..40e37b10c6c0 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,6 +143,7 @@ extern unsigned long node_remap_size[]; | 1*SD_BALANCE_FORK \ | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ + | 0*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ @@ -165,21 +166,11 @@ static inline int numa_node_id(void) return 0; } -static inline int cpu_to_node(int cpu) -{ - return 0; -} - static inline int early_cpu_to_node(int cpu) { return 0; } -static inline const struct cpumask *cpumask_of_node(int node) -{ - return cpu_online_mask; -} - static inline void setup_node_to_cpumask_map(void) { } #endif diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 8deaada61bc8..6fb3c209a7e3 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -341,7 +341,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index b9f3c60de5f7..8d3ad0adbc68 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv) __SYSCALL(__NR_pwritev, sys_pwritev) #define __NR_rt_tgsigqueueinfo 297 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 298 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 298 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 04eb6c958b9d..d1414af98559 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -19,6 +19,8 @@ #include <asm/types.h> #include <asm/percpu.h> #include <asm/uv/uv_mmrs.h> +#include <asm/irq_vectors.h> +#include <asm/io_apic.h> /* @@ -114,7 +116,7 @@ /* * The largest possible NASID of a C or M brick (+ 2) */ -#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) +#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2) struct uv_scir_s { struct timer_list timer; @@ -230,6 +232,20 @@ static inline unsigned long uv_gpa(void *v) return uv_soc_phys_ram_to_gpa(__pa(v)); } +/* gnode -> pnode */ +static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) +{ + return gpa >> uv_hub_info->m_val; +} + +/* gpa -> pnode */ +static inline int uv_gpa_to_pnode(unsigned long gpa) +{ + unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1; + + return uv_gpa_to_gnode(gpa) & n_mask; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { @@ -421,9 +437,14 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) { unsigned long val; + unsigned long dmode = dest_Fixed; + + if (vector == NMI_VECTOR) + dmode = dest_NMI; val = (1UL << UVH_IPI_INT_SEND_SHFT) | ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | (vector << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2c756fd4ab0e..d8e71459f025 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -91,6 +91,14 @@ struct x86_init_timers { }; /** + * struct x86_init_iommu - platform specific iommu setup + * @iommu_init: platform specific iommu setup + */ +struct x86_init_iommu { + int (*iommu_init)(void); +}; + +/** * struct x86_init_ops - functions for platform specific setup * */ @@ -101,6 +109,7 @@ struct x86_init_ops { struct x86_init_oem oem; struct x86_init_paging paging; struct x86_init_timers timers; + struct x86_init_iommu iommu; }; /** @@ -121,6 +130,7 @@ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); + void (*iommu_shutdown)(void); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4ba419b668a5..d8e5d0cdd678 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8c44c232efcb..59cdfa4686b2 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d296f4a195c9..d85d1b2432ba 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL) + if (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR) init_intel_pdc(pr, c); return; diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 7da00b799cda..060fff8f5c5b 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -57,5 +57,8 @@ SECTIONS *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 98f230f6a28d..32fb09102a13 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -28,6 +28,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -56,20 +57,115 @@ struct iommu_cmd { u32 data[4]; }; -static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, - struct unity_map_entry *e); -static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, int end_lvl, - u64 **pte_page, gfp_t gfp); -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); +/**************************************************************************** + * + * Helper functions + * + ****************************************************************************/ + +static inline u16 get_device_id(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return calc_devid(pdev->bus->number, pdev->devfn); +} + +static struct iommu_dev_data *get_dev_data(struct device *dev) +{ + return dev->archdata.iommu; +} + +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid || + entry->target_dev == alias) { + ret = entry; + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + u16 devid; + + if (!dev || !dev->dma_mask) + return false; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + devid = get_device_id(dev); + + /* Out of our scope? */ + if (devid > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[devid] == NULL) + return false; + + return true; +} + +static int iommu_init_device(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct pci_dev *pdev; + u16 devid, alias; + + if (dev->archdata.iommu) + return 0; + + dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + dev_data->dev = dev; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); + if (pdev) + dev_data->alias = &pdev->dev; + + atomic_set(&dev_data->bind, 0); + + dev->archdata.iommu = dev_data; + + + return 0; +} + +static void iommu_uninit_device(struct device *dev) +{ + kfree(dev->archdata.iommu); +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -90,7 +186,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); static struct dentry *stats_dir; -static struct dentry *de_isolate; static struct dentry *de_fflush; static void amd_iommu_stats_add(struct __iommu_counter *cnt) @@ -108,9 +203,6 @@ static void amd_iommu_stats_init(void) if (stats_dir == NULL) return; - de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, - (u32 *)&amd_iommu_isolate); - de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, (u32 *)&amd_iommu_unmap_flush); @@ -130,12 +222,6 @@ static void amd_iommu_stats_init(void) #endif -/* returns !0 if the IOMMU is caching non-present entries in its TLB */ -static int iommu_has_npcache(struct amd_iommu *iommu) -{ - return iommu->cap & (1UL << IOMMU_CAP_NPCACHE); -} - /**************************************************************************** * * Interrupt handling functions @@ -199,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); dump_command(address); break; @@ -321,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) { - spin_unlock(&iommu->lock); - reset_iommu_command_buffer(iommu); - spin_lock(&iommu->lock); - } + if (unlikely(i == EXIT_LOOP_COUNT)) + iommu->reset_in_progress = true; } /* @@ -372,26 +456,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu) out: spin_unlock_irqrestore(&iommu->lock, flags); + if (iommu->reset_in_progress) + reset_iommu_command_buffer(iommu); + return 0; } +static void iommu_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); + } +} + /* * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) +static int iommu_flush_device(struct device *dev) { + struct amd_iommu *iommu; struct iommu_cmd cmd; - int ret; + u16 devid; - BUG_ON(iommu == NULL); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + /* Build command */ memset(&cmd, 0, sizeof(cmd)); CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; - ret = iommu_queue_command(iommu, &cmd); - - return ret; + return iommu_queue_command(iommu, &cmd); } static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, @@ -430,11 +534,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, - u64 address, size_t size) +static void __iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0; - unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); + int s = 0, i; + unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -447,142 +551,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, s = 1; } - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); - return 0; + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need a TLB flush + */ + iommu_queue_inv_iommu_pages(amd_iommus[i], address, + domain->id, pde, s); + } + + return; } -/* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); + __iommu_flush_pages(domain, address, size, 0); +} - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); +/* Flush the whole IO/TLB for a given protection domain */ +static void iommu_flush_tlb(struct protection_domain *domain) +{ + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb_pde(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } + /* - * This function flushes one domain on one IOMMU + * This function flushes the DTEs for all devices in domain */ -static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_domain_devices(struct protection_domain *domain) { - struct iommu_cmd cmd; + struct iommu_dev_data *dev_data; unsigned long flags; - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); + spin_lock_irqsave(&domain->lock, flags); - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) + iommu_flush_device(dev_data->dev); + + spin_unlock_irqrestore(&domain->lock, flags); } -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +static void iommu_flush_all_domain_devices(void) { - int i; + struct protection_domain *domain; + unsigned long flags; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) - continue; - flush_domain_on_iommu(iommu, i); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + iommu_flush_domain_devices(domain); + iommu_flush_complete(domain); } + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +void amd_iommu_flush_all_devices(void) +{ + iommu_flush_all_domain_devices(); } /* - * This function is used to flush the IO/TLB for a given protection domain - * on every IOMMU in the system + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. */ -static void iommu_flush_domain(u16 domid) +void amd_iommu_flush_all_domains(void) { - struct amd_iommu *iommu; + struct protection_domain *domain; + unsigned long flags; - INC_STATS_COUNTER(domain_flush_all); + spin_lock_irqsave(&amd_iommu_pd_lock, flags); - for_each_iommu(iommu) - flush_domain_on_iommu(iommu, domid); + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + spin_lock(&domain->lock); + iommu_flush_tlb_pde(domain); + iommu_flush_complete(domain); + spin_unlock(&domain->lock); + } + + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -void amd_iommu_flush_all_domains(void) +static void reset_iommu_command_buffer(struct amd_iommu *iommu) { - struct amd_iommu *iommu; + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - for_each_iommu(iommu) - flush_all_domains_on_iommu(iommu); + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + amd_iommu_reset_cmd_buffer(iommu); + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); + + iommu->reset_in_progress = false; } -static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. + */ +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) { - int i; + u64 *pte; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (iommu != amd_iommu_rlookup_table[i]) - continue; + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; } -static void flush_devices_by_domain(struct protection_domain *domain) +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { - struct amd_iommu *iommu; - int i; + u64 *pte, *page; + int level; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || - (amd_iommu_pd_table[i] != domain)) - continue; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - iommu = amd_iommu_rlookup_table[i]; - if (!iommu) - continue; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page && level == end_lvl) + *pte_page = pte; + + pte = &pte[PM_LEVEL_INDEX(level, address)]; } + + return pte; } -static void reset_iommu_command_buffer(struct amd_iommu *iommu) +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + int level; + u64 *pte; - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - iommu->reset_in_progress = true; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - amd_iommu_reset_cmd_buffer(iommu); - flush_all_devices_for_iommu(iommu); - flush_all_domains_on_iommu(iommu); + level -= 1; - iommu->reset_in_progress = false; -} + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; -void amd_iommu_flush_all_devices(void) -{ - flush_devices_by_domain(NULL); -} + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } -/**************************************************************************** - * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ + return pte; +} /* * Generic mapping functions. It maps a physical address into a DMA @@ -654,28 +828,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, } /* - * Init the unity mappings for a specific IOMMU in the system - * - * Basically iterates over all unity mapping entries and applies them to - * the default domain DMA of that IOMMU if necessary. - */ -static int iommu_init_unity_mappings(struct amd_iommu *iommu) -{ - struct unity_map_entry *entry; - int ret; - - list_for_each_entry(entry, &amd_iommu_unity_map, list) { - if (!iommu_for_unity_map(iommu, entry)) - continue; - ret = dma_ops_unity_map(iommu->default_dom, entry); - if (ret) - return ret; - } - - return 0; -} - -/* * This function actually applies the mapping to the page table of the * dma_ops domain. */ @@ -704,6 +856,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, } /* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; +} + +/* * Inits the unity mappings required for a specific device */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, @@ -740,34 +914,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, */ /* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size) +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) { - int level; - u64 *pte; - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > map_size) { - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - level -= 1; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; + if (start_page + pages > last_page) + pages = last_page - start_page; - if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { - pte = NULL; - break; - } + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); } - - return pte; } /* @@ -775,11 +938,11 @@ static u64 *fetch_pte(struct protection_domain *domain, * aperture in case of dma_ops domain allocation or address allocation * failure. */ -static int alloc_new_range(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + struct amd_iommu *iommu; int i; #ifdef CONFIG_IOMMU_STRESS @@ -819,14 +982,17 @@ static int alloc_new_range(struct amd_iommu *iommu, dma_dom->aperture_size += APERTURE_RANGE_SIZE; /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start >= dma_dom->aperture[index]->offset && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); + for_each_iommu(iommu) { + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset + && iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + startpage = iommu->exclusion_start >> PAGE_SHIFT; + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } } /* @@ -928,7 +1094,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, } if (unlikely(address == -1)) - address = bad_dma_address; + address = DMA_ERROR_CODE; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -973,6 +1139,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, * ****************************************************************************/ +/* + * This function adds a protection domain to the global protection domain list + */ +static void add_domain_to_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_add(&domain->list, &amd_iommu_pd_list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +/* + * This function removes a protection domain to the global + * protection domain list + */ +static void del_domain_from_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_del(&domain->list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + static u16 domain_id_alloc(void) { unsigned long flags; @@ -1000,26 +1191,6 @@ static void domain_id_free(int id) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -/* - * Used to reserve address ranges in the aperture (e.g. for exclusion - * ranges. - */ -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages) -{ - unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - - if (start_page + pages > last_page) - pages = last_page - start_page; - - for (i = start_page; i < start_page + pages; ++i) { - int index = i / APERTURE_RANGE_PAGES; - int page = i % APERTURE_RANGE_PAGES; - __set_bit(page, dom->aperture[index]->bitmap); - } -} - static void free_pagetable(struct protection_domain *domain) { int i, j; @@ -1061,6 +1232,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; + del_domain_from_list(&dom->domain); + free_pagetable(&dom->domain); for (i = 0; i < APERTURE_MAX_RANGES; ++i) { @@ -1078,7 +1251,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) +static struct dma_ops_domain *dma_ops_domain_alloc(void) { struct dma_ops_domain *dma_dom; @@ -1091,6 +1264,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; + INIT_LIST_HEAD(&dma_dom->domain.dev_list); dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; @@ -1101,7 +1275,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + add_domain_to_list(&dma_dom->domain); + + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -1129,22 +1305,6 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -/* - * Find out the protection domain structure for a given PCI device. This - * will give us the pointer to the page table root for example. - */ -static struct protection_domain *domain_for_device(u16 devid) -{ - struct protection_domain *dom; - unsigned long flags; - - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return dom; -} - static void set_dte_entry(u16 devid, struct protection_domain *domain) { u64 pte_root = virt_to_phys(domain->pt_root); @@ -1156,42 +1316,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); +} + +static void clear_dte_entry(u16 devid) +{ + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; - amd_iommu_pd_table[devid] = domain; + amd_iommu_apply_erratum_63(devid); +} + +static void do_attach(struct device *dev, struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + set_dte_entry(devid, domain); + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the DTE entry */ + iommu_flush_device(dev); +} + +static void do_detach(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* decrease reference counters */ + dev_data->domain->dev_iommu[iommu->index] -= 1; + dev_data->domain->dev_cnt -= 1; + + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + clear_dte_entry(devid); + + /* Flush the DTE entry */ + iommu_flush_device(dev); } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void __attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int __attach_device(struct device *dev, + struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *alias_data; + + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + + if (!alias_data) + return -EINVAL; + /* lock domain */ spin_lock(&domain->lock); - /* update DTE entry */ - set_dte_entry(devid, domain); + /* Some sanity checks */ + if (alias_data->domain != NULL && + alias_data->domain != domain) + return -EBUSY; + + if (dev_data->domain != NULL && + dev_data->domain != domain) + return -EBUSY; - domain->dev_cnt += 1; + /* Do real assignment */ + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (alias_data->domain == NULL) + do_attach(dev_data->alias, domain); + + atomic_inc(&alias_data->bind); + } + + if (dev_data->domain == NULL) + do_attach(dev, domain); + + atomic_inc(&dev_data->bind); /* ready */ spin_unlock(&domain->lock); + + return 0; } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int attach_device(struct device *dev, + struct protection_domain *domain) { unsigned long flags; + int ret; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __attach_device(iommu, domain, devid); + ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* @@ -1199,96 +1440,125 @@ static void attach_device(struct amd_iommu *iommu, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_queue_inv_dev_entry(iommu, devid); - iommu_flush_tlb_pde(iommu, domain->id); + iommu_flush_tlb_pde(domain); + + return ret; } /* * Removes a device from a protection domain (unlocked) */ -static void __detach_device(struct protection_domain *domain, u16 devid) +static void __detach_device(struct device *dev) { + struct iommu_dev_data *dev_data = get_dev_data(dev); + struct iommu_dev_data *alias_data; + unsigned long flags; - /* lock domain */ - spin_lock(&domain->lock); + BUG_ON(!dev_data->domain); - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; + spin_lock_irqsave(&dev_data->domain->lock, flags); - /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (atomic_dec_and_test(&alias_data->bind)) + do_detach(dev_data->alias); + } - /* decrease reference counter */ - domain->dev_cnt -= 1; + if (atomic_dec_and_test(&dev_data->bind)) + do_detach(dev); - /* ready */ - spin_unlock(&domain->lock); + spin_unlock_irqrestore(&dev_data->domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - __attach_device(iommu, pt_domain, devid); - } + if (iommu_pass_through && dev_data->domain == NULL) + __attach_device(dev, pt_domain); } /* * Removes a device from a protection domain (with devtable_lock held) */ -static void detach_device(struct protection_domain *domain, u16 devid) +static void detach_device(struct device *dev) { unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(domain, devid); + __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ +static struct protection_domain *domain_for_device(struct device *dev) +{ + struct protection_domain *dom; + struct iommu_dev_data *dev_data, *alias_data; + unsigned long flags; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return NULL; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = dev_data->domain; + if (dom == NULL && + alias_data->domain != NULL) { + __attach_device(dev, alias_data->domain); + dom = alias_data->domain; + } + + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; +} + static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct pci_dev *pdev = to_pci_dev(dev); - u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + u16 devid; struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; unsigned long flags; - if (devid > amd_iommu_last_bdf) - goto out; - - devid = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu == NULL) - goto out; - - domain = domain_for_device(devid); + if (!check_device(dev)) + return 0; - if (domain && !dma_ops_domain(domain)) - WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " - "to a non-dma-ops domain\n", dev_name(dev)); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: + + domain = domain_for_device(dev); + if (!domain) goto out; if (iommu_pass_through) break; - detach_device(domain, devid); + detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: + + iommu_init_device(dev); + + domain = domain_for_device(dev); + /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu); + dma_domain = dma_ops_domain_alloc(); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1298,11 +1568,15 @@ static int device_change_notifier(struct notifier_block *nb, spin_unlock_irqrestore(&iommu_pd_list_lock, flags); break; + case BUS_NOTIFY_DEL_DEVICE: + + iommu_uninit_device(dev); + default: goto out; } - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); out: @@ -1320,106 +1594,46 @@ static struct notifier_block device_nb = { *****************************************************************************/ /* - * This function checks if the driver got a valid device from the caller to - * avoid dereferencing invalid pointers. - */ -static bool check_device(struct device *dev) -{ - if (!dev || !dev->dma_mask) - return false; - - return true; -} - -/* - * In this function the list of preallocated protection domains is traversed to - * find the domain for a specific device - */ -static struct dma_ops_domain *find_protection_domain(u16 devid) -{ - struct dma_ops_domain *entry, *ret = NULL; - unsigned long flags; - - if (list_empty(&iommu_pd_list)) - return NULL; - - spin_lock_irqsave(&iommu_pd_list_lock, flags); - - list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid) { - ret = entry; - break; - } - } - - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - return ret; -} - -/* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the * requestor id for a given device. * If the device is not yet associated with a domain this is also done * in this function. */ -static int get_device_resources(struct device *dev, - struct amd_iommu **iommu, - struct protection_domain **domain, - u16 *bdf) +static struct protection_domain *get_domain(struct device *dev) { + struct protection_domain *domain; struct dma_ops_domain *dma_dom; - struct pci_dev *pcidev; - u16 _bdf; - - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; - - if (dev->bus != &pci_bus_type) - return 0; + u16 devid = get_device_id(dev); - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* device not translated by any IOMMU in the system? */ - if (_bdf > amd_iommu_last_bdf) - return 0; + if (!check_device(dev)) + return ERR_PTR(-EINVAL); - *bdf = amd_iommu_alias_table[_bdf]; + domain = domain_for_device(dev); + if (domain != NULL && !dma_ops_domain(domain)) + return ERR_PTR(-EBUSY); - *iommu = amd_iommu_rlookup_table[*bdf]; - if (*iommu == NULL) - return 0; - *domain = domain_for_device(*bdf); - if (*domain == NULL) { - dma_dom = find_protection_domain(*bdf); - if (!dma_dom) - dma_dom = (*iommu)->default_dom; - *domain = &dma_dom->domain; - attach_device(*iommu, *domain, *bdf); - DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); - } + if (domain != NULL) + return domain; - if (domain_for_device(_bdf) == NULL) - attach_device(*iommu, *domain, _bdf); + /* Device not bount yet - bind it */ + dma_dom = find_protection_domain(devid); + if (!dma_dom) + dma_dom = amd_iommu_rlookup_table[devid]->default_dom; + attach_device(dev, &dma_dom->domain); + DUMP_printk("Using protection domain %d for device %s\n", + dma_dom->domain.id, dev_name(dev)); - return 1; + return &dma_dom->domain; } static void update_device_table(struct protection_domain *domain) { - unsigned long flags; - int i; + struct iommu_dev_data *dev_data; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] != domain) - continue; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - set_dte_entry(i, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) { + u16 devid = get_device_id(dev_data->dev); + set_dte_entry(devid, domain); } } @@ -1429,76 +1643,13 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - flush_devices_by_domain(domain); - iommu_flush_domain(domain->id); + iommu_flush_domain_devices(domain); + iommu_flush_tlb_pde(domain); domain->updated = false; } /* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct protection_domain *domain, - gfp_t gfp) -{ - u64 *pte; - - if (domain->mode == PAGE_MODE_6_LEVEL) - /* address space already 64 bit large */ - return false; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - return false; - - *pte = PM_LEVEL_PDE(domain->mode, - virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; - domain->updated = true; - - return true; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - int end_lvl, - u64 **pte_page, - gfp_t gfp) -{ - u64 *pte, *page; - int level; - - while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > end_lvl) { - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); - } - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* * This function fetches the PTE for a given address in the aperture */ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, @@ -1528,8 +1679,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ -static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, unsigned long address, phys_addr_t paddr, int direction) @@ -1542,7 +1692,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, pte = dma_ops_get_pte(dom, address); if (!pte) - return bad_dma_address; + return DMA_ERROR_CODE; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1563,8 +1713,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, /* * The generic unmapping function for on page in the DMA address space. */ -static void dma_ops_domain_unmap(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static void dma_ops_domain_unmap(struct dma_ops_domain *dom, unsigned long address) { struct aperture_range *aperture; @@ -1595,7 +1744,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, * Must be called with the domain lock held. */ static dma_addr_t __map_single(struct device *dev, - struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, @@ -1623,7 +1771,7 @@ static dma_addr_t __map_single(struct device *dev, retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) { + if (unlikely(address == DMA_ERROR_CODE)) { /* * setting next_address here will let the address * allocator only scan the new allocated range in the @@ -1631,7 +1779,7 @@ retry: */ dma_dom->next_address = dma_dom->aperture_size; - if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) goto out; /* @@ -1643,8 +1791,8 @@ retry: start = address; for (i = 0; i < pages; ++i) { - ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); - if (ret == bad_dma_address) + ret = dma_ops_domain_map(dma_dom, start, paddr, dir); + if (ret == DMA_ERROR_CODE) goto out_unmap; paddr += PAGE_SIZE; @@ -1655,10 +1803,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(iommu, dma_dom->domain.id); + iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; - } else if (unlikely(iommu_has_npcache(iommu))) - iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + } else if (unlikely(amd_iommu_np_cache)) + iommu_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1667,20 +1815,19 @@ out_unmap: for (--i; i >= 0; --i) { start -= PAGE_SIZE; - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); } dma_ops_free_addresses(dma_dom, address, pages); - return bad_dma_address; + return DMA_ERROR_CODE; } /* * Does the reverse of the __map_single function. Must be called with * the domain lock held too */ -static void __unmap_single(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static void __unmap_single(struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, size_t size, int dir) @@ -1688,7 +1835,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_addr_t i, start; unsigned int pages; - if ((dma_addr == bad_dma_address) || + if ((dma_addr == DMA_ERROR_CODE) || (dma_addr + size > dma_dom->aperture_size)) return; @@ -1697,7 +1844,7 @@ static void __unmap_single(struct amd_iommu *iommu, start = dma_addr; for (i = 0; i < pages; ++i) { - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); start += PAGE_SIZE; } @@ -1706,7 +1853,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, dma_addr, size); dma_dom->need_flush = false; } } @@ -1720,36 +1867,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; dma_addr_t addr; u64 dma_mask; phys_addr_t paddr = page_to_phys(page) + offset; INC_STATS_COUNTER(cnt_map_single); - if (!check_device(dev)) - return bad_dma_address; - - dma_mask = *dev->dma_mask; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (iommu == NULL || domain == NULL) - /* device not handled by any AMD IOMMU */ + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return (dma_addr_t)paddr; + else if (IS_ERR(domain)) + return DMA_ERROR_CODE; - if (!dma_ops_domain(domain)) - return bad_dma_address; + dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); - if (addr == bad_dma_address) + if (addr == DMA_ERROR_CODE) goto out; - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1764,25 +1904,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - /* device not handled by any AMD IOMMU */ - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, dir); + __unmap_single(domain->priv, dma_addr, size, dir); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1814,9 +1948,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; int i; struct scatterlist *s; phys_addr_t paddr; @@ -1825,25 +1957,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) + return map_sg_no_iommu(dev, sglist, nelems, dir); + else if (IS_ERR(domain)) return 0; dma_mask = *dev->dma_mask; - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - return map_sg_no_iommu(dev, sglist, nelems, dir); - - if (!dma_ops_domain(domain)) - return 0; - spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { paddr = sg_phys(s); - s->dma_address = __map_single(dev, iommu, domain->priv, + s->dma_address = __map_single(dev, domain->priv, paddr, s->length, dir, false, dma_mask); @@ -1854,7 +1981,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1863,7 +1990,7 @@ out: unmap: for_each_sg(sglist, s, mapped_elems, i) { if (s->dma_address) - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1882,30 +2009,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; struct scatterlist *s; - u16 devid; int i; INC_STATS_COUNTER(cnt_unmap_sg); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1918,49 +2040,44 @@ static void *alloc_coherent(struct device *dev, size_t size, { unsigned long flags; void *virt_addr; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; INC_STATS_COUNTER(cnt_alloc_coherent); - if (!check_device(dev)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) { + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + *dma_addr = __pa(virt_addr); + return virt_addr; + } else if (IS_ERR(domain)) return NULL; - if (!get_device_resources(dev, &iommu, &domain, &devid)) - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + dma_mask = dev->coherent_dma_mask; + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; - flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return NULL; paddr = virt_to_phys(virt_addr); - if (!iommu || !domain) { - *dma_addr = (dma_addr_t)paddr; - return virt_addr; - } - - if (!dma_ops_domain(domain)) - goto out_free; - if (!dma_mask) dma_mask = *dev->dma_mask; spin_lock_irqsave(&domain->lock, flags); - *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + *dma_addr = __map_single(dev, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { + if (*dma_addr == DMA_ERROR_CODE) { spin_unlock_irqrestore(&domain->lock, flags); goto out_free; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -1980,28 +2097,19 @@ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!check_device(dev)) - return; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - goto free_mem; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) goto free_mem; spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2015,22 +2123,7 @@ free_mem: */ static int amd_iommu_dma_supported(struct device *dev, u64 mask) { - u16 bdf; - struct pci_dev *pcidev; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return 0; - - return 1; + return check_device(dev); } /* @@ -2044,25 +2137,30 @@ static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) - continue; - devid = amd_iommu_alias_table[devid]; - if (domain_for_device(devid)) + + /* Do we handle this device? */ + if (!check_device(&dev->dev)) continue; - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) + + iommu_init_device(&dev->dev); + + /* Is there already any domain for it? */ + if (domain_for_device(&dev->dev)) continue; - dma_dom = dma_ops_domain_alloc(iommu); + + devid = get_device_id(&dev->dev); + + dma_dom = dma_ops_domain_alloc(); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(&dev->dev, &dma_dom->domain); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -2091,7 +2189,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ for_each_iommu(iommu) { - iommu->default_dom = dma_ops_domain_alloc(iommu); + iommu->default_dom = dma_ops_domain_alloc(); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -2101,15 +2199,12 @@ int __init amd_iommu_init_dma_ops(void) } /* - * If device isolation is enabled, pre-allocate the protection - * domains for each device. + * Pre-allocate the protection domains for each device. */ - if (amd_iommu_isolate) - prealloc_protection_domains(); + prealloc_protection_domains(); iommu_detected = 1; - force_iommu = 1; - bad_dma_address = 0; + swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; @@ -2148,14 +2243,17 @@ free_domains: static void cleanup_domain(struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *next; unsigned long flags; - u16 devid; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) - if (amd_iommu_pd_table[devid] == domain) - __detach_device(domain, devid); + list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { + struct device *dev = dev_data->dev; + + do_detach(dev); + atomic_set(&dev_data->bind, 0); + } write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } @@ -2165,6 +2263,8 @@ static void protection_domain_free(struct protection_domain *domain) if (!domain) return; + del_domain_from_list(domain); + if (domain->id) domain_id_free(domain->id); @@ -2183,6 +2283,9 @@ static struct protection_domain *protection_domain_alloc(void) domain->id = domain_id_alloc(); if (!domain->id) goto out_err; + INIT_LIST_HEAD(&domain->dev_list); + + add_domain_to_list(domain); return domain; @@ -2239,26 +2342,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { - struct protection_domain *domain = dom->priv; + struct iommu_dev_data *dev_data = dev->archdata.iommu; struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); + devid = get_device_id(dev); - if (devid > 0) - detach_device(domain, devid); + if (dev_data->domain != NULL) + detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return; - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); } @@ -2266,35 +2366,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { struct protection_domain *domain = dom->priv; - struct protection_domain *old_domain; + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; + int ret; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return -EINVAL; - pdev = to_pci_dev(dev); + dev_data = dev->archdata.iommu; - devid = calc_devid(pdev->bus->number, pdev->devfn); - - if (devid >= amd_iommu_last_bdf || - devid != amd_iommu_alias_table[devid]) - return -EINVAL; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return -EINVAL; - old_domain = domain_for_device(devid); - if (old_domain) - detach_device(old_domain, devid); + if (dev_data->domain) + detach_device(dev); - attach_device(iommu, domain, devid); + ret = attach_device(dev, domain); iommu_completion_wait(iommu); - return 0; + return ret; } static int amd_iommu_map_range(struct iommu_domain *dom, @@ -2340,7 +2435,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova += PAGE_SIZE; } - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, @@ -2391,8 +2486,9 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { + struct amd_iommu *iommu; struct pci_dev *dev = NULL; - u16 devid, devid2; + u16 devid; /* allocate passthroug domain */ pt_domain = protection_domain_alloc(); @@ -2402,20 +2498,17 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - struct amd_iommu *iommu; - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + if (!check_device(&dev->dev)) continue; - devid2 = amd_iommu_alias_table[devid]; + devid = get_device_id(&dev->dev); - iommu = amd_iommu_rlookup_table[devid2]; + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - __attach_device(iommu, pt_domain, devid); - __attach_device(iommu, pt_domain, devid2); + attach_device(&dev->dev, pt_domain); } pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b4b61d462dcc..7ffc39965233 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -25,10 +25,12 @@ #include <linux/interrupt.h> #include <linux/msi.h> #include <asm/pci-direct.h> +#include <asm/amd_iommu_proto.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/x86_init.h> /* * definitions for the ACPI scanning code @@ -123,18 +125,24 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -#ifdef CONFIG_IOMMU_STRESS -bool amd_iommu_isolate = false; -#else -bool amd_iommu_isolate = true; /* if true, device isolation is - enabled */ -#endif - bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +/* Array to assign indices to IOMMUs*/ +struct amd_iommu *amd_iommus[MAX_IOMMUS]; +int amd_iommus_present; + +/* IOMMUs have a non-present cache? */ +bool amd_iommu_np_cache __read_mostly; + +/* + * List of protection domains - used during resume + */ +LIST_HEAD(amd_iommu_pd_list); +spinlock_t amd_iommu_pd_lock; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -157,12 +165,6 @@ u16 *amd_iommu_alias_table; struct amd_iommu **amd_iommu_rlookup_table; /* - * The pd table (protection domain table) is used to find the protection domain - * data structure a device belongs to. Indexed with the PCI device id too. - */ -struct protection_domain **amd_iommu_pd_table; - -/* * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap * to know which ones are already in use. */ @@ -240,7 +242,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -519,6 +521,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +static int get_dev_entry_bit(u16 devid, u8 bit) +{ + int i = (bit >> 5) & 0x07; + int _bit = bit & 0x1f; + + return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; +} + + +void amd_iommu_apply_erratum_63(u16 devid) +{ + int sysmgt; + + sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) | + (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1); + + if (sysmgt == 0x01) + set_dev_entry_bit(devid, DEV_ENTRY_IW); +} + /* Writes the specific IOMMU for a device into the rlookup table */ static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) { @@ -547,6 +569,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); + amd_iommu_apply_erratum_63(devid); + set_iommu_for_device(iommu, devid); } @@ -816,7 +840,18 @@ static void __init free_iommu_all(void) static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); + + /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); + iommu->index = amd_iommus_present++; + + if (unlikely(iommu->index >= MAX_IOMMUS)) { + WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); + return -ENOSYS; + } + + /* Index is fine - add IOMMU to the array */ + amd_iommus[iommu->index] = iommu; /* * Copy data from ACPI table entry to the iommu struct @@ -846,6 +881,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) + amd_iommu_np_cache = true; + return pci_enable_device(iommu->dev); } @@ -903,7 +941,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msi(struct amd_iommu *iommu) +static int iommu_setup_msi(struct amd_iommu *iommu) { int r; @@ -1154,19 +1192,10 @@ static struct sys_device device_amd_iommu = { * functions. Finally it prints some information about AMD IOMMUs and * the driver state and enables the hardware. */ -int __init amd_iommu_init(void) +static int __init amd_iommu_init(void) { int i, ret = 0; - - if (no_iommu) { - printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); - return 0; - } - - if (!amd_iommu_detected) - return -ENODEV; - /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. Upon this information the shared data @@ -1203,15 +1232,6 @@ int __init amd_iommu_init(void) if (amd_iommu_rlookup_table == NULL) goto free; - /* - * Protection Domain table - maps devices to protection domains - * This table has the same size as the rlookup_table - */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(rlookup_table_size)); - if (amd_iommu_pd_table == NULL) - goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); @@ -1233,6 +1253,8 @@ int __init amd_iommu_init(void) */ amd_iommu_pd_alloc_bitmap[0] = 1; + spin_lock_init(&amd_iommu_pd_lock); + /* * now the data structures are allocated and basically initialized * start the real acpi table scan @@ -1264,17 +1286,12 @@ int __init amd_iommu_init(void) if (iommu_pass_through) goto out; - printk(KERN_INFO "AMD-Vi: device isolation "); - if (amd_iommu_isolate) - printk("enabled\n"); - else - printk("disabled\n"); - if (amd_iommu_unmap_flush) printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + x86_platform.iommu_shutdown = disable_iommus; out: return ret; @@ -1282,9 +1299,6 @@ free: free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); - free_pages((unsigned long)amd_iommu_rlookup_table, get_order(rlookup_table_size)); @@ -1301,11 +1315,6 @@ free: goto out; } -void amd_iommu_shutdown(void) -{ - disable_iommus(); -} - /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA @@ -1320,16 +1329,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif + x86_init.iommu.iommu_init = amd_iommu_init; } } @@ -1350,10 +1356,6 @@ static int __init parse_amd_iommu_dump(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = true; - if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 128111d8ffe0..e0dfb6856aa2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -28,6 +28,7 @@ #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) iommu_detected = 1; gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; @@ -456,7 +458,7 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { + } else if (!valid_agp) { /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a58ef98be155..894aa97f0717 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> @@ -35,7 +35,7 @@ #include <linux/smp.h> #include <linux/mm.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/atomic.h> @@ -1189,7 +1189,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(); + perf_events_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 64970b9885f2..dc69f28489f5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -227,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { - if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { kfree(cfg); cfg = NULL; - } else if (!alloc_cpumask_var_node(&cfg->old_domain, + } else if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) { free_cpumask_var(cfg->domain); kfree(cfg); cfg = NULL; - } else { - cpumask_clear(cfg->domain); - cpumask_clear(cfg->old_domain); } } diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index cb66a22d98ad..7ff61d6a188a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) /* * proc handler for /proc/sys/kernel/nmi */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, +int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (!!old_state == !!nmi_watchdog_enabled) return 0; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5f5886a6b53..326c25477d3d 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -352,14 +352,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { alias.v = uv_read_local_mmr(redir_addrs[i].alias); - if (alias.s.base == 0) { + if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } - BUG(); + *base = *size = 0; } enum map_type {map_wb, map_uc}; @@ -619,12 +619,12 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = m_val; + uv_cpu_hub_info(cpu)->n_val = n_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 8dd30638fe44..68537e957a9b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f32fa71ccf97..c910a716a71c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) * approved Athlon */ WARN_ONCE(1, "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); + " processors is not suitable for SMP.\n"); if (!test_taint(TAINT_UNSAFE_SMP)) add_taint(TAINT_UNSAFE_SMP); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2fea97eccf77..cc25c2b4a567 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,7 +13,7 @@ #include <linux/io.h> #include <asm/stackprotector.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/hypervisor.h> #include <asm/processor.h> @@ -869,7 +869,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7d5c3b0ea8da..8b581d3905cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -526,15 +526,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) { - /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf * AL30: A Machine Check Exception (MCE) Occurring during an * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up when HT is enabled*/ + * Both Processor Cores to Lock Up. */ if (c->x86_vendor == X86_VENDOR_INTEL) { if ((c->x86 == 15) && (c->x86_model == 6) && - (c->x86_mask == 8) && smt_capable()) + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); return -ENODEV; + } } return 0; } @@ -549,13 +555,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data(policy->cpu); struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif dprintk("acpi_cpufreq_cpu_init\n"); #ifdef CONFIG_SMP - result = acpi_cpufreq_blacklist(c); - if (result) - return result; + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; #endif data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index ce2ed3e4aad9..cabd2fa3fc93 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); break; case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V1; + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 6394aa5c7985..3f12dabeab52 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data) * set it to 1 to avoid problems in the future. * For all others it's a BIOS bug. */ - if (!boot_cpu_data.x86 == 0x11) + if (boot_cpu_data.x86 != 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 6911e91fb4f6..3ae5a7a3a500 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -struct get_freq_data { - unsigned int speed; - unsigned int processor; -}; - -static void get_freq_data(void *_data) +static void get_freq_data(void *_speed) { - struct get_freq_data *data = _data; + unsigned int *speed = _speed; - data->speed = speedstep_get_frequency(data->processor); + *speed = speedstep_get_frequency(speedstep_processor); } static unsigned int speedstep_get(unsigned int cpu) { - struct get_freq_data data = { .processor = cpu }; + unsigned int speed; /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", data.speed); - return data.speed; + dprintk("detected %u kHz as current frequency\n", speed); + return speed; } /** diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7029f0e2acad..472763d92098 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -98,8 +98,9 @@ static struct notifier_block mce_raise_nb = { }; /* Inject mce on current CPU */ -static int raise_local(struct mce *m) +static int raise_local(void) { + struct mce *m = &__get_cpu_var(injectm); int context = MCJ_CTX(m->inject_flags); int ret = 0; int cpu = m->extcpu; @@ -167,12 +168,12 @@ static void raise_mce(struct mce *m) } cpu_relax(); } - raise_local(m); + raise_local(); put_cpu(); put_online_cpus(); } else #endif - raise_local(m); + raise_local(); } /* Error injection interface */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2f5aab26320e..721a77ca8115 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -85,6 +85,18 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +static void default_decode_mce(struct mce *m) +{ + pr_emerg("No human readable MCE decoding support on this CPU type.\n"); + pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); +} + +/* + * CPU/chipset specific EDAC code can register a callback here to print + * MCE errors in a human-readable form: + */ +void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; +EXPORT_SYMBOL(x86_mce_decode_callback); /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -165,49 +177,46 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } -void __weak decode_mce(struct mce *m) -{ - return; -} - static void print_mce(struct mce *m) { - printk(KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + pr_emerg("RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } - printk(KERN_EMERG "TSC %llx ", m->tsc); + + pr_emerg("TSC %llx ", m->tsc); if (m->addr) - printk(KERN_CONT "ADDR %llx ", m->addr); + pr_cont("ADDR %llx ", m->addr); if (m->misc) - printk(KERN_CONT "MISC %llx ", m->misc); - printk(KERN_CONT "\n"); - printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, - m->apicid); + pr_cont("MISC %llx ", m->misc); - decode_mce(m); + pr_cont("\n"); + pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + /* + * Print out human-readable details about the MCE error, + * (if the CPU has an implementation for that): + */ + x86_mce_decode_callback(m); } static void print_mce_head(void) { - printk(KERN_EMERG "\nHARDWARE ERROR\n"); + pr_emerg("\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { - printk(KERN_EMERG "This is not a software problem!\n" -#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD)) - "Run through mcelog --ascii to decode and contact your hardware vendor\n" -#endif - ); + pr_emerg("This is not a software problem!\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -221,6 +230,7 @@ static atomic_t mce_fake_paniced; static void wait_for_panic(void) { long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); local_irq_enable(); while (timeout-- > 0) @@ -288,6 +298,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) return offsetof(struct mce, ip); if (msr == MSR_IA32_MCx_STATUS(bank)) @@ -305,13 +316,25 @@ static int msr_to_offset(u32 msr) static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset < 0) return 0; return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); } - rdmsrl(msr, v); + + if (rdmsrl_safe(msr, &v)) { + WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + /* + * Return zero in case the access faulted. This should + * not happen normally but can happen if the CPU does + * something weird, or if the code is buggy. + */ + v = 0; + } + return v; } @@ -319,6 +342,7 @@ static void mce_wrmsrl(u32 msr, u64 v) { if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset >= 0) *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; return; @@ -415,7 +439,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) m->ip = mce_rdmsrl(rip_msr); } -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_LOCAL_APIC /* * Called after interrupts have been reenabled again * when a MCE happened during an interrupts off region @@ -1172,6 +1196,7 @@ static int mce_banks_init(void) return -ENOMEM; for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; b->init = 1; } @@ -1189,7 +1214,8 @@ static int __cpuinit mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (!banks) + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { printk(KERN_WARNING @@ -1203,6 +1229,7 @@ static int __cpuinit mce_cap_init(void) banks = b; if (!mce_banks) { int err = mce_banks_init(); + if (err) return err; } @@ -1237,6 +1264,7 @@ static void mce_init(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); @@ -1626,6 +1654,7 @@ static int mce_disable(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), 0); } @@ -1911,6 +1940,7 @@ static void mce_disable_cpu(void *h) cmci_clear(); for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), 0); } @@ -1928,6 +1958,7 @@ static void mce_reenable_cpu(void *h) cmci_reenable(); for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 889f665fe93d..7c785634af2b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> +#include <linux/sched.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 63a56d147e4a..b3a1dba75330 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,31 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; -static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -static DEFINE_PER_CPU(bool, thermal_throttle_active); +/* + * Current thermal throttling state: + */ +struct thermal_state { + bool is_throttled; + + u64 next_check; + unsigned long throttle_count; + unsigned long last_throttle_count; +}; + +static DEFINE_PER_CPU(struct thermal_state, thermal_state); -static atomic_t therm_throt_en = ATOMIC_INIT(0); +static atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) #define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ + \ +static ssize_t therm_throt_sysdev_show_##name( \ + struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ { \ unsigned int cpu = dev->id; \ ssize_t ret; \ @@ -55,7 +66,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ + per_cpu(thermal_state, cpu).name); \ else \ ret = 0; \ preempt_enable(); \ @@ -63,11 +74,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ return ret; \ } -define_therm_throt_sysdev_show_func(count); -define_therm_throt_sysdev_one_ro(count); +define_therm_throt_sysdev_show_func(throttle_count); +define_therm_throt_sysdev_one_ro(throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_count.attr, + &attr_throttle_count.attr, NULL }; @@ -93,33 +104,39 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(int curr) +static int therm_throt_process(bool is_throttled) { - unsigned int cpu = smp_processor_id(); - __u64 tmp_jiffs = get_jiffies_64(); - bool was_throttled = __get_cpu_var(thermal_throttle_active); - bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; + struct thermal_state *state; + unsigned int this_cpu; + bool was_throttled; + u64 now; + + this_cpu = smp_processor_id(); + now = get_jiffies_64(); + state = &per_cpu(thermal_state, this_cpu); + + was_throttled = state->is_throttled; + state->is_throttled = is_throttled; if (is_throttled) - __get_cpu_var(thermal_throttle_count)++; + state->throttle_count++; - if (!(was_throttled ^ is_throttled) && - time_before64(tmp_jiffs, __get_cpu_var(next_check))) + if (time_before64(now, state->next_check) && + state->throttle_count != state->last_throttle_count) return 0; - __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + state->next_check = now + CHECK_INTERVAL; + state->last_throttle_count = state->throttle_count; /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", - cpu, __get_cpu_var(thermal_throttle_count)); + printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); return 1; } @@ -213,7 +230,7 @@ static void intel_thermal_interrupt(void) __u64 msr_val; rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) mce_log_therm_throt_event(msr_val); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 315738c74aad..73c86db5acbe 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -846,7 +846,7 @@ int __init mtrr_cleanup(unsigned address_bits) sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM coverred: %ldM\n", + printk(KERN_INFO "total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index f04e72527604..3c1b12d461d1 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -96,17 +96,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) unsigned long long base, size; char *ptr; char line[LINE_SIZE]; + int length; size_t linelen; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!len) - return -EINVAL; memset(line, 0, LINE_SIZE); - if (len > LINE_SIZE) - len = LINE_SIZE; - if (copy_from_user(line, buf, len - 1)) + + length = len; + length--; + + if (length > LINE_SIZE - 1) + length = LINE_SIZE - 1; + + if (length < 0) + return -EINVAL; + + if (copy_from_user(line, buf, length)) return -EFAULT; linelen = strlen(line); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c index a6c8b27553cd..b5801c311846 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1,5 +1,5 @@ /* - * Performance counter x86 architecture code + * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar @@ -11,7 +11,7 @@ * For licencing details see kernel-base/COPYING */ -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> @@ -27,10 +27,10 @@ #include <asm/stacktrace.h> #include <asm/nmi.h> -static u64 perf_counter_mask __read_mostly; +static u64 perf_event_mask __read_mostly; -/* The maximal number of PEBS counters: */ -#define MAX_PEBS_COUNTERS 4 +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -65,11 +65,11 @@ struct debug_store { u64 pebs_index; u64 pebs_absolute_maximum; u64 pebs_interrupt_threshold; - u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; -struct cpu_hw_counters { - struct perf_counter *counters[X86_PMC_IDX_MAX]; +struct cpu_hw_events { + struct perf_event *events[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; @@ -86,17 +86,17 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_counter *, int); - void (*disable)(struct hw_perf_counter *, int); + void (*enable)(struct hw_perf_event *, int); + void (*disable)(struct hw_perf_event *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; - int num_counters; - int num_counters_fixed; - int counter_bits; - u64 counter_mask; + int num_events; + int num_events_fixed; + int event_bits; + u64 event_mask; int apic; u64 max_period; u64 intel_ctrl; @@ -106,7 +106,7 @@ struct x86_pmu { static struct x86_pmu x86_pmu __read_mostly; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; @@ -124,35 +124,35 @@ static const u64 p6_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, }; -static u64 p6_pmu_event_map(int event) +static u64 p6_pmu_event_map(int hw_event) { - return p6_perfmon_event_map[event]; + return p6_perfmon_event_map[hw_event]; } /* - * Counter setting that is specified not to count anything. + * Event setting that is specified not to count anything. * We use this to effectively disable a counter. * * L2_RQSTS with 0 MESI unit mask. */ -#define P6_NOP_COUNTER 0x0000002EULL +#define P6_NOP_EVENT 0x0000002EULL -static u64 p6_pmu_raw_event(u64 event) +static u64 p6_pmu_raw_event(u64 hw_event) { #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL #define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL #define P6_EVNTSEL_MASK \ (P6_EVNTSEL_EVENT_MASK | \ P6_EVNTSEL_UNIT_MASK | \ P6_EVNTSEL_EDGE_MASK | \ P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_COUNTER_MASK) + P6_EVNTSEL_REG_MASK) - return event & P6_EVNTSEL_MASK; + return hw_event & P6_EVNTSEL_MASK; } @@ -170,16 +170,16 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static u64 intel_pmu_event_map(int event) +static u64 intel_pmu_event_map(int hw_event) { - return intel_perfmon_event_map[event]; + return intel_perfmon_event_map[hw_event]; } /* - * Generalized hw caching related event table, filled + * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'event makes no sense on - * this CPU', any other value means the raw event + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event * ID. */ @@ -463,22 +463,22 @@ static const u64 atom_hw_cache_event_ids }, }; -static u64 intel_pmu_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 hw_event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ CORE_EVNTSEL_EDGE_MASK | \ CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_COUNTER_MASK) + CORE_EVNTSEL_REG_MASK) - return event & CORE_EVNTSEL_MASK; + return hw_event & CORE_EVNTSEL_MASK; } static const u64 amd_hw_cache_event_ids @@ -585,39 +585,39 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; -static u64 amd_pmu_event_map(int event) +static u64 amd_pmu_event_map(int hw_event) { - return amd_perfmon_event_map[event]; + return amd_perfmon_event_map[hw_event]; } -static u64 amd_pmu_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 hw_event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL #define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ K7_EVNTSEL_EDGE_MASK | \ K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_COUNTER_MASK) + K7_EVNTSEL_REG_MASK) - return event & K7_EVNTSEL_MASK; + return hw_event & K7_EVNTSEL_MASK; } /* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ static u64 -x86_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { - int shift = 64 - x86_pmu.counter_bits; + int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; s64 delta; @@ -625,15 +625,15 @@ x86_perf_counter_update(struct perf_counter *counter, return 0; /* - * Careful: an NMI might modify the previous counter value. + * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta - * count to the generic counter atomically: + * count to the generic event atomically: */ again: prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->counter_base + idx, new_raw_count); + rdmsrl(hwc->event_base + idx, new_raw_count); if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -642,7 +642,7 @@ again: /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta - * (counter-)time and add that to the generic counter. + * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. @@ -650,13 +650,13 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &counter->count); + atomic64_add(delta, &event->count); atomic64_sub(delta, &hwc->period_left); return new_raw_count; } -static atomic_t active_counters; +static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -667,12 +667,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -685,7 +685,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = x86_pmu.num_counters; + i = x86_pmu.num_events; perfctr_fail: for (i--; i >= 0; i--) @@ -703,7 +703,7 @@ static void release_pmc_hardware(void) #ifdef CONFIG_X86_LOCAL_APIC int i; - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < x86_pmu.num_events; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -720,7 +720,7 @@ static inline bool bts_available(void) static inline void init_debug_store_on_cpu(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; if (!ds) return; @@ -732,7 +732,7 @@ static inline void init_debug_store_on_cpu(int cpu) static inline void fini_debug_store_on_cpu(int cpu) { - if (!per_cpu(cpu_hw_counters, cpu).ds) + if (!per_cpu(cpu_hw_events, cpu).ds) return; wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); @@ -751,12 +751,12 @@ static void release_bts_hardware(void) fini_debug_store_on_cpu(cpu); for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; if (!ds) continue; - per_cpu(cpu_hw_counters, cpu).ds = NULL; + per_cpu(cpu_hw_events, cpu).ds = NULL; kfree((void *)(unsigned long)ds->bts_buffer_base); kfree(ds); @@ -796,7 +796,7 @@ static int reserve_bts_hardware(void) ds->bts_interrupt_threshold = ds->bts_absolute_maximum - BTS_OVFL_TH; - per_cpu(cpu_hw_counters, cpu).ds = ds; + per_cpu(cpu_hw_events, cpu).ds = ds; err = 0; } @@ -812,9 +812,9 @@ static int reserve_bts_hardware(void) return err; } -static void hw_perf_counter_destroy(struct perf_counter *counter) +static void hw_perf_event_destroy(struct perf_event *event) { - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { release_pmc_hardware(); release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); @@ -827,7 +827,7 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) { unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -880,7 +880,7 @@ static void intel_pmu_enable_bts(u64 config) static void intel_pmu_disable_bts(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long debugctlmsr; if (!cpuc->ds) @@ -898,10 +898,10 @@ static void intel_pmu_disable_bts(void) /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_event_init(struct perf_event *event) { - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; u64 config; int err; @@ -909,22 +909,22 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (!atomic_inc_not_zero(&active_counters)) { + if (!atomic_inc_not_zero(&active_events)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0) { + if (atomic_read(&active_events) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; else err = reserve_bts_hardware(); } if (!err) - atomic_inc(&active_counters); + atomic_inc(&active_events); mutex_unlock(&pmc_reserve_mutex); } if (err) return err; - counter->destroy = hw_perf_counter_destroy; + event->destroy = hw_perf_event_destroy; /* * Generate PMC IRQs: @@ -948,15 +948,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware - * counters (user-space has to fall back and - * sample via a hrtimer based software counter): + * events (user-space has to fall back and + * sample via a hrtimer based software event): */ if (!x86_pmu.apic) return -EOPNOTSUPP; } /* - * Raw event type provide the config in the event structure + * Raw hw_event type provide the config in the hw_event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); @@ -1001,7 +1001,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) static void p6_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; if (!cpuc->enabled) @@ -1018,7 +1018,7 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (!cpuc->enabled) return; @@ -1034,7 +1034,7 @@ static void intel_pmu_disable_all(void) static void amd_pmu_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (!cpuc->enabled) @@ -1043,12 +1043,12 @@ static void amd_pmu_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that amd_pmu_enable_counter() does the + * events proper, so that amd_pmu_enable_event() does the * right thing. */ barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -1070,7 +1070,7 @@ void hw_perf_disable(void) static void p6_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long val; if (cpuc->enabled) @@ -1087,7 +1087,7 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) return; @@ -1098,19 +1098,19 @@ static void intel_pmu_enable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_counter *counter = - cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; - if (WARN_ON_ONCE(!counter)) + if (WARN_ON_ONCE(!event)) return; - intel_pmu_enable_bts(counter->hw.config); + intel_pmu_enable_bts(event->hw.config); } } static void amd_pmu_enable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; if (cpuc->enabled) @@ -1119,14 +1119,14 @@ static void amd_pmu_enable_all(void) cpuc->enabled = 1; barrier(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; + for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; - val = counter->hw.config; + val = event->hw.config; val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } @@ -1153,19 +1153,19 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); } static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -1178,10 +1178,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) } static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val = P6_NOP_COUNTER; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -1190,7 +1190,7 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) } static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) { if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); @@ -1202,24 +1202,24 @@ intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) return; } - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) { - x86_pmu_disable_counter(hwc, idx); + x86_pmu_disable_event(hwc, idx); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the counter disabled in hw: + * To be called with the event disabled in hw: */ static int -x86_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; @@ -1245,7 +1245,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, ret = 1; } /* - * Quirk: certain CPUs dont like it if just 1 event is left: + * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; @@ -1256,21 +1256,21 @@ x86_perf_counter_set_period(struct perf_counter *counter, per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* - * The hw counter starts counting from this counter offset, + * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ atomic64_set(&hwc->prev_count, (u64)-left); - err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & x86_pmu.counter_mask); + err = checking_wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.event_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return ret; } static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -1295,9 +1295,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); u64 val; val = hwc->config; @@ -1308,10 +1308,10 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) } -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) { if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_counters).enabled) + if (!__get_cpu_var(cpu_hw_events).enabled) return; intel_pmu_enable_bts(hwc->config); @@ -1323,134 +1323,134 @@ static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) return; } - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); + x86_pmu_enable_event(hwc, idx); } static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) { - unsigned int event; + unsigned int hw_event; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely((event == + if (unlikely((hw_event == x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && (hwc->sample_period == 1))) return X86_PMC_IDX_FIXED_BTS; - if (!x86_pmu.num_counters_fixed) + if (!x86_pmu.num_events_fixed) return -1; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; } /* - * Find a PMC slot for the freshly enabled / scheduled in counter: + * Find a PMC slot for the freshly enabled / scheduled in event: */ -static int x86_pmu_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; int idx; - idx = fixed_mode_idx(counter, hwc); + idx = fixed_mode_idx(event, hwc); if (idx == X86_PMC_IDX_FIXED_BTS) { /* BTS is already occupied. */ if (test_and_set_bit(idx, cpuc->used_mask)) return -EAGAIN; hwc->config_base = 0; - hwc->counter_base = 0; + hwc->event_base = 0; hwc->idx = idx; } else if (idx >= 0) { /* - * Try to get the fixed counter, if that is already taken - * then try to get a generic counter: + * Try to get the fixed event, if that is already taken + * then try to get a generic event: */ if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; /* - * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * We set it so that event_base + idx in wrmsr/rdmsr maps to * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: */ - hwc->counter_base = + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; } else { idx = hwc->idx; - /* Try to get the previous generic counter again */ + /* Try to get the previous generic event again */ if (test_and_set_bit(idx, cpuc->used_mask)) { try_generic: idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_counters); - if (idx == x86_pmu.num_counters) + x86_pmu.num_events); + if (idx == x86_pmu.num_events) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } hwc->config_base = x86_pmu.eventsel; - hwc->counter_base = x86_pmu.perfctr; + hwc->event_base = x86_pmu.perfctr; } - perf_counters_lapic_init(); + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); - cpuc->counters[idx] = counter; + cpuc->events[idx] = event; set_bit(idx, cpuc->active_mask); - x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_set_period(event, hwc, idx); x86_pmu.enable(hwc, idx); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); return 0; } -static void x86_pmu_unthrottle(struct perf_counter *counter) +static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->counters[hwc->idx] != counter)) + cpuc->events[hwc->idx] != event)) return; x86_pmu.enable(hwc, hwc->idx); } -void perf_counter_print_debug(void) +void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + cpuc = &per_cpu(cpu_hw_events, cpu); if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); @@ -1466,7 +1466,7 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); @@ -1479,7 +1479,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1488,7 +1488,7 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; struct bts_record { @@ -1496,14 +1496,14 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) u64 to; u64 flags; }; - struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; struct perf_output_handle handle; struct perf_event_header header; struct perf_sample_data data; struct pt_regs regs; - if (!counter) + if (!event) return; if (!ds) @@ -1518,7 +1518,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) ds->bts_index = ds->bts_buffer_base; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; data.addr = 0; regs.ip = 0; @@ -1527,9 +1527,9 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) * We will overwrite the from and to address before we output * the sample. */ - perf_prepare_sample(&header, &data, counter, ®s); + perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, counter, + if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) return; @@ -1537,20 +1537,20 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) data.ip = at->from; data.addr = at->to; - perf_output_sample(&handle, &header, &data, counter); + perf_output_sample(&handle, &header, &data, event); } perf_output_end(&handle); /* There's new data available. */ - counter->hw.interrupts++; - counter->pending_kill = POLL_IN; + event->hw.interrupts++; + event->pending_kill = POLL_IN; } -static void x86_pmu_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_event *event) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; /* @@ -1562,63 +1562,63 @@ static void x86_pmu_disable(struct perf_counter *counter) /* * Make sure the cleared pointer becomes visible before we - * (potentially) free the counter: + * (potentially) free the event: */ barrier(); /* - * Drain the remaining delta count out of a counter + * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_counter_update(counter, hwc, idx); + x86_perf_event_update(event, hwc, idx); /* Drain the remaining BTS records. */ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) intel_pmu_drain_bts_buffer(cpuc); - cpuc->counters[idx] = NULL; + cpuc->events[idx] = NULL; clear_bit(idx, cpuc->used_mask); - perf_counter_update_userpage(counter); + perf_event_update_userpage(event); } /* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) +static int intel_pmu_save_and_restart(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; int ret; - x86_perf_counter_update(counter, hwc, idx); - ret = x86_perf_counter_set_period(counter, hwc, idx); + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - intel_pmu_enable_counter(hwc, idx); + if (event->state == PERF_EVENT_STATE_ACTIVE) + intel_pmu_enable_event(hwc, idx); return ret; } static void intel_pmu_reset(void) { - struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; unsigned long flags; int idx; - if (!x86_pmu.num_counters) + if (!x86_pmu.num_events) return; local_irq_save(flags); printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } if (ds) @@ -1630,38 +1630,38 @@ static void intel_pmu_reset(void) static int p6_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data, regs)) - p6_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + p6_pmu_disable_event(hwc, idx); } if (handled) @@ -1677,13 +1677,13 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) static int intel_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; + struct cpu_hw_events *cpuc; int bit, loops; u64 ack, status; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); perf_disable(); intel_pmu_drain_bts_buffer(cpuc); @@ -1696,8 +1696,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: if (++loops > 100) { - WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); - perf_counter_print_debug(); + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); intel_pmu_reset(); perf_enable(); return 1; @@ -1706,19 +1706,19 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); if (!test_bit(bit, cpuc->active_mask)) continue; - if (!intel_pmu_save_and_restart(counter)) + if (!intel_pmu_save_and_restart(event)) continue; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (perf_counter_overflow(counter, 1, &data, regs)) - intel_pmu_disable_counter(&counter->hw, bit); + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); } intel_pmu_ack_status(ack); @@ -1738,38 +1738,38 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; int idx, handled = 0; u64 val; data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_counters); + cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < x86_pmu.num_events; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; - counter = cpuc->counters[idx]; - hwc = &counter->hw; + event = cpuc->events[idx]; + hwc = &event->hw; - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; /* - * counter overflow + * event overflow */ handled = 1; - data.period = counter->hw.last_period; + data.period = event->hw.last_period; - if (!x86_perf_counter_set_period(counter, hwc, idx)) + if (!x86_perf_event_set_period(event, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data, regs)) - amd_pmu_disable_counter(hwc, idx); + if (perf_event_overflow(event, 1, &data, regs)) + amd_pmu_disable_event(hwc, idx); } if (handled) @@ -1783,18 +1783,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) irq_enter(); ack_APIC_irq(); inc_irq_stat(apic_pending_irqs); - perf_counter_do_pending(); + perf_event_do_pending(); irq_exit(); } -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { #ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + apic->send_IPI_self(LOCAL_PENDING_VECTOR); #endif } -void perf_counters_lapic_init(void) +void perf_events_lapic_init(void) { #ifdef CONFIG_X86_LOCAL_APIC if (!x86_pmu.apic || !x86_pmu_initialized()) @@ -1808,13 +1811,13 @@ void perf_counters_lapic_init(void) } static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, +perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; struct pt_regs *regs; - if (!atomic_read(&active_counters)) + if (!atomic_read(&active_events)) return NOTIFY_DONE; switch (cmd) { @@ -1833,7 +1836,7 @@ perf_counter_nmi_handler(struct notifier_block *self, #endif /* * Can't rely on the handled return value to say it was our NMI, two - * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * events could trigger 'simultaneously' raising two back-to-back NMIs. * * If the first NMI handles both, the latter will be empty and daze * the CPU. @@ -1843,8 +1846,8 @@ perf_counter_nmi_handler(struct notifier_block *self, return NOTIFY_STOP; } -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, .next = NULL, .priority = 1 }; @@ -1854,8 +1857,8 @@ static struct x86_pmu p6_pmu = { .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_counter, - .disable = p6_pmu_disable_counter, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, .eventsel = MSR_P6_EVNTSEL0, .perfctr = MSR_P6_PERFCTR0, .event_map = p6_pmu_event_map, @@ -1864,16 +1867,16 @@ static struct x86_pmu p6_pmu = { .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, - .num_counters = 2, + .num_events = 2, /* - * Counters have 40 bits implemented. However they are designed such + * Events have 40 bits implemented. However they are designed such * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a counter for P6-like PMU is 32 bits only. + * effective width of a event for P6-like PMU is 32 bits only. * * See IA-32 Intel Architecture Software developer manual Vol 3B */ - .counter_bits = 32, - .counter_mask = (1ULL << 32) - 1, + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, }; static struct x86_pmu intel_pmu = { @@ -1881,8 +1884,8 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_counter, - .disable = intel_pmu_disable_counter, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = intel_pmu_event_map, @@ -1892,7 +1895,7 @@ static struct x86_pmu intel_pmu = { /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of - * the generic counter period: + * the generic event period: */ .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, @@ -1904,16 +1907,16 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_counter, - .disable = amd_pmu_disable_counter, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, - .counter_bits = 48, - .counter_mask = (1ULL << 48) - 1, + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, @@ -1970,7 +1973,7 @@ static int intel_pmu_init(void) /* * Check whether the Architectural PerfMon supports - * Branch Misses Retired Event or not. + * Branch Misses Retired hw_event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) @@ -1982,15 +1985,15 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; /* - * Quirk: v2 perfmon does not report fixed-purpose counters, so - * assume at least 3 counters: + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); /* * Install the hw-cache-events table: @@ -2037,11 +2040,11 @@ static int amd_pmu_init(void) return 0; } -void __init init_hw_perf_counters(void) +void __init init_hw_perf_events(void) { int err; - pr_info("Performance Counters: "); + pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: @@ -2054,45 +2057,45 @@ void __init init_hw_perf_counters(void) return; } if (err != 0) { - pr_cont("no PMU driver, software counters only.\n"); + pr_cont("no PMU driver, software events only.\n"); return; } pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; + if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_events, X86_PMC_MAX_GENERIC); + x86_pmu.num_events = X86_PMC_MAX_GENERIC; } - perf_counter_mask = (1 << x86_pmu.num_counters) - 1; - perf_max_counters = x86_pmu.num_counters; + perf_event_mask = (1 << x86_pmu.num_events) - 1; + perf_max_events = x86_pmu.num_events; - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; } - perf_counter_mask |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_counter_mask; + perf_event_mask |= + ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_event_mask; - perf_counters_lapic_init(); - register_die_notifier(&perf_counter_nmi_notifier); + perf_events_lapic_init(); + register_die_notifier(&perf_event_nmi_notifier); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); - pr_info("... generic counters: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); - pr_info("... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); } -static inline void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); + x86_perf_event_update(event, &event->hw, event->hw.idx); } static const struct pmu pmu = { @@ -2102,14 +2105,14 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; - err = __hw_perf_counter_init(counter); + err = __hw_perf_event_init(event); if (err) { - if (counter->destroy) - counter->destroy(counter); + if (event->destroy) + event->destroy(event); return ERR_PTR(err); } @@ -2292,7 +2295,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void hw_perf_counter_setup_online(int cpu) +void hw_perf_event_setup_online(int cpu) { init_debug_store_on_cpu(cpu); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 392bea43b890..fab786f60ed6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include <linux/kprobes.h> #include <asm/apic.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 5e409dc298a4..a4849c10a77e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,8 +27,7 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> -#include <asm/iommu.h> - +#include <asm/x86_init.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif crash_save_cpu(regs, safe_smp_processor_id()); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index f7cdb3b457aa..cd97ce18c29d 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -16,6 +16,22 @@ static void *kdump_buf_page; /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). + */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); if (!userbuf) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba91c9e..f7dd2a7c3bf4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a3276766..a071e6be177e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -5,7 +5,6 @@ #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a3210ce1eccd..d17d482a04f4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; @@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos) if (mb < 16) return 1024*1024; - /* To 32MB for anything above that */ - return 32*1024*1024; + /* To 64MB for anything above that */ + return 64*1024*1024; } #define MAX_RESOURCE_SIZE ((resource_size_t)-1) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index b11cab3c323a..b9c830c12b4a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -160,721 +160,6 @@ static struct console early_serial_console = { .index = -1, }; -#ifdef CONFIG_EARLY_PRINTK_DBGP - -static struct ehci_caps __iomem *ehci_caps; -static struct ehci_regs __iomem *ehci_regs; -static struct ehci_dbg_port __iomem *ehci_debug; -static unsigned int dbgp_endpoint_out; - -struct ehci_dev { - u32 bus; - u32 slot; - u32 func; -}; - -static struct ehci_dev ehci_dev; - -#define USB_DEBUG_DEVNUM 127 - -#define DBGP_DATA_TOGGLE 0x8800 - -static inline u32 dbgp_pid_update(u32 x, u32 tok) -{ - return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); -} - -static inline u32 dbgp_len_update(u32 x, u32 len) -{ - return (x & ~0x0f) | (len & 0x0f); -} - -/* - * USB Packet IDs (PIDs) - */ - -/* token */ -#define USB_PID_OUT 0xe1 -#define USB_PID_IN 0x69 -#define USB_PID_SOF 0xa5 -#define USB_PID_SETUP 0x2d -/* handshake */ -#define USB_PID_ACK 0xd2 -#define USB_PID_NAK 0x5a -#define USB_PID_STALL 0x1e -#define USB_PID_NYET 0x96 -/* data */ -#define USB_PID_DATA0 0xc3 -#define USB_PID_DATA1 0x4b -#define USB_PID_DATA2 0x87 -#define USB_PID_MDATA 0x0f -/* Special */ -#define USB_PID_PREAMBLE 0x3c -#define USB_PID_ERR 0x3c -#define USB_PID_SPLIT 0x78 -#define USB_PID_PING 0xb4 -#define USB_PID_UNDEF_0 0xf0 - -#define USB_PID_DATA_TOGGLE 0x88 -#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) - -#define PCI_CAP_ID_EHCI_DEBUG 0xa - -#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ -#define HUB_SHORT_RESET_TIME 10 -#define HUB_LONG_RESET_TIME 200 -#define HUB_RESET_TIMEOUT 500 - -#define DBGP_MAX_PACKET 8 - -static int dbgp_wait_until_complete(void) -{ - u32 ctrl; - int loop = 0x100000; - - do { - ctrl = readl(&ehci_debug->control); - /* Stop when the transaction is finished */ - if (ctrl & DBGP_DONE) - break; - } while (--loop > 0); - - if (!loop) - return -1; - - /* - * Now that we have observed the completed transaction, - * clear the done bit. - */ - writel(ctrl | DBGP_DONE, &ehci_debug->control); - return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); -} - -static void __init dbgp_mdelay(int ms) -{ - int i; - - while (ms--) { - for (i = 0; i < 1000; i++) - outb(0x1, 0x80); - } -} - -static void dbgp_breath(void) -{ - /* Sleep to give the debug port a chance to breathe */ -} - -static int dbgp_wait_until_done(unsigned ctrl) -{ - u32 pids, lpid; - int ret; - int loop = 3; - -retry: - writel(ctrl | DBGP_GO, &ehci_debug->control); - ret = dbgp_wait_until_complete(); - pids = readl(&ehci_debug->pids); - lpid = DBGP_PID_GET(pids); - - if (ret < 0) - return ret; - - /* - * If the port is getting full or it has dropped data - * start pacing ourselves, not necessary but it's friendly. - */ - if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) - dbgp_breath(); - - /* If I get a NACK reissue the transmission */ - if (lpid == USB_PID_NAK) { - if (--loop > 0) - goto retry; - } - - return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ - const unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = hi = 0; - for (i = 0; i < 4 && i < size; i++) - lo |= bytes[i] << (8*i); - for (; i < 8 && i < size; i++) - hi |= bytes[i] << (8*(i - 4)); - writel(lo, &ehci_debug->data03); - writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ - unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = readl(&ehci_debug->data03); - hi = readl(&ehci_debug->data47); - for (i = 0; i < 4 && i < size; i++) - bytes[i] = (lo >> (8*i)) & 0xff; - for (; i < 8 && i < size; i++) - bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, - const char *bytes, int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_OUT); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - dbgp_set_data(bytes, size); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, - int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_IN); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl &= ~DBGP_OUT; - ctrl |= DBGP_GO; - - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - if (size > ret) - size = ret; - dbgp_get_data(data, size); - return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, - int request, int value, int index, void *data, int size) -{ - u32 pids, addr, ctrl; - struct usb_ctrlrequest req; - int read; - int ret; - - read = (requesttype & USB_DIR_IN) != 0; - if (size > (read ? DBGP_MAX_PACKET:0)) - return -1; - - /* Compute the control message */ - req.bRequestType = requesttype; - req.bRequest = request; - req.wValue = cpu_to_le16(value); - req.wIndex = cpu_to_le16(index); - req.wLength = cpu_to_le16(size); - - pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); - addr = DBGP_EPADDR(devnum, 0); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, sizeof(req)); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - /* Send the setup message */ - dbgp_set_data(&req, sizeof(req)); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - /* Read the result */ - return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ - u8 pos; - int bytes; - - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - - pos = read_pci_config_byte(num, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ - u32 class; - - class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); - if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) - return 0; - - return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ - u32 bus, slot, func; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - unsigned cap; - - cap = __find_dbgp(bus, slot, func); - - if (!cap) - continue; - if (ehci_num-- != 0) - continue; - *rbus = bus; - *rslot = slot; - *rfunc = func; - return cap; - } - } - } - return 0; -} - -static int __init ehci_reset_port(int port) -{ - u32 portsc; - u32 delay_time, delay; - int loop; - - /* Reset the usb debug port */ - portsc = readl(&ehci_regs->port_status[port - 1]); - portsc &= ~PORT_PE; - portsc |= PORT_RESET; - writel(portsc, &ehci_regs->port_status[port - 1]); - - delay = HUB_ROOT_RESET_TIME; - for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; - delay_time += delay) { - dbgp_mdelay(delay); - - portsc = readl(&ehci_regs->port_status[port - 1]); - if (portsc & PORT_RESET) { - /* force reset to complete */ - loop = 2; - writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), - &ehci_regs->port_status[port - 1]); - do { - portsc = readl(&ehci_regs->port_status[port-1]); - } while ((portsc & PORT_RESET) && (--loop > 0)); - } - - /* Device went away? */ - if (!(portsc & PORT_CONNECT)) - return -ENOTCONN; - - /* bomb out completely if something weird happend */ - if ((portsc & PORT_CSC)) - return -EINVAL; - - /* If we've finished resetting, then break out of the loop */ - if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) - return 0; - } - return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ - u32 status; - int ret, reps; - - for (reps = 0; reps < 3; reps++) { - dbgp_mdelay(100); - status = readl(&ehci_regs->status); - if (status & STS_PCD) { - ret = ehci_reset_port(port); - if (ret == 0) - return 0; - } - } - return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) { } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ - u32 dword; - dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x74); - dword &= ~(0x0f<<12); - dword |= ((port & 0x0f)<<12); - write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, - dword); - dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ - u32 vendorid; - - vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x00); - - if ((vendorid & 0xffff) == 0x10de) { - dbgp_printk("using nvidia set_debug_port\n"); - set_debug_port = nvidia_set_debug_port; - } -} - -static int __init ehci_setup(void) -{ - struct usb_debug_descriptor dbgp_desc; - u32 cmd, ctrl, status, portsc, hcs_params; - u32 debug_port, new_debug_port = 0, n_ports; - u32 devnum; - int ret, i; - int loop; - int port_map_tried; - int playtimes = 3; - -try_next_time: - port_map_tried = 0; - -try_next_port: - - hcs_params = readl(&ehci_caps->hcs_params); - debug_port = HCS_DEBUG_PORT(hcs_params); - n_ports = HCS_N_PORTS(hcs_params); - - dbgp_printk("debug_port: %d\n", debug_port); - dbgp_printk("n_ports: %d\n", n_ports); - - for (i = 1; i <= n_ports; i++) { - portsc = readl(&ehci_regs->port_status[i-1]); - dbgp_printk("portstatus%d: %08x\n", i, portsc); - } - - if (port_map_tried && (new_debug_port != debug_port)) { - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - return -1; - } - - loop = 100000; - /* Reset the EHCI controller */ - cmd = readl(&ehci_regs->command); - cmd |= CMD_RESET; - writel(cmd, &ehci_regs->command); - do { - cmd = readl(&ehci_regs->command); - } while ((cmd & CMD_RESET) && (--loop > 0)); - - if (!loop) { - dbgp_printk("can not reset ehci\n"); - return -1; - } - dbgp_printk("ehci reset done\n"); - - /* Claim ownership, but do not enable yet */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_OWNER; - ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); - writel(ctrl, &ehci_debug->control); - - /* Start the ehci running */ - cmd = readl(&ehci_regs->command); - cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); - cmd |= CMD_RUN; - writel(cmd, &ehci_regs->command); - - /* Ensure everything is routed to the EHCI */ - writel(FLAG_CF, &ehci_regs->configured_flag); - - /* Wait until the controller is no longer halted */ - loop = 10; - do { - status = readl(&ehci_regs->status); - } while ((status & STS_HALT) && (--loop > 0)); - - if (!loop) { - dbgp_printk("ehci can be started\n"); - return -1; - } - dbgp_printk("ehci started\n"); - - /* Wait for a device to show up in the debug port */ - ret = ehci_wait_for_port(debug_port); - if (ret < 0) { - dbgp_printk("No device found in debug port\n"); - goto next_debug_port; - } - dbgp_printk("ehci wait for port done\n"); - - /* Enable the debug port */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_CLAIM; - writel(ctrl, &ehci_debug->control); - ctrl = readl(&ehci_debug->control); - if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { - dbgp_printk("No device in debug port\n"); - writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); - goto err; - } - dbgp_printk("debug ported enabled\n"); - - /* Completely transfer the debug device to the debug controller */ - portsc = readl(&ehci_regs->port_status[debug_port - 1]); - portsc &= ~PORT_PE; - writel(portsc, &ehci_regs->port_status[debug_port - 1]); - - dbgp_mdelay(100); - - /* Find the debug device and make it device number 127 */ - for (devnum = 0; devnum <= 127; devnum++) { - ret = dbgp_control_msg(devnum, - USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, - &dbgp_desc, sizeof(dbgp_desc)); - if (ret > 0) - break; - } - if (devnum > 127) { - dbgp_printk("Could not find attached debug device\n"); - goto err; - } - if (ret < 0) { - dbgp_printk("Attached device is not a debug device\n"); - goto err; - } - dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - - /* Move the device to 127 if it isn't already there */ - if (devnum != USB_DEBUG_DEVNUM) { - ret = dbgp_control_msg(devnum, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); - if (ret < 0) { - dbgp_printk("Could not move attached device to %d\n", - USB_DEBUG_DEVNUM); - goto err; - } - devnum = USB_DEBUG_DEVNUM; - dbgp_printk("debug device renamed to 127\n"); - } - - /* Enable the debug interface */ - ret = dbgp_control_msg(USB_DEBUG_DEVNUM, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); - if (ret < 0) { - dbgp_printk(" Could not enable the debug device\n"); - goto err; - } - dbgp_printk("debug interface enabled\n"); - - /* Perform a small write to get the even/odd data state in sync - */ - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); - if (ret < 0) { - dbgp_printk("dbgp_bulk_write failed: %d\n", ret); - goto err; - } - dbgp_printk("small write doned\n"); - - return 0; -err: - /* Things didn't work so remove my claim */ - ctrl = readl(&ehci_debug->control); - ctrl &= ~(DBGP_CLAIM | DBGP_OUT); - writel(ctrl, &ehci_debug->control); - return -1; - -next_debug_port: - port_map_tried |= (1<<(debug_port - 1)); - new_debug_port = ((debug_port-1+1)%n_ports) + 1; - if (port_map_tried != ((1<<n_ports) - 1)) { - set_debug_port(new_debug_port); - goto try_next_port; - } - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - - return -1; -} - -static int __init early_dbgp_init(char *s) -{ - u32 debug_port, bar, offset; - u32 bus, slot, func, cap; - void __iomem *ehci_bar; - u32 dbgp_num; - u32 bar_val; - char *e; - int ret; - u8 byte; - - if (!early_pci_allowed()) - return -1; - - dbgp_num = 0; - if (*s) - dbgp_num = simple_strtoul(s, &e, 10); - dbgp_printk("dbgp_num: %d\n", dbgp_num); - - cap = find_dbgp(dbgp_num, &bus, &slot, &func); - if (!cap) - return -1; - - dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, - func); - - debug_port = read_pci_config(bus, slot, func, cap); - bar = (debug_port >> 29) & 0x7; - bar = (bar * 4) + 0xc; - offset = (debug_port >> 16) & 0xfff; - dbgp_printk("bar: %02x offset: %03x\n", bar, offset); - if (bar != PCI_BASE_ADDRESS_0) { - dbgp_printk("only debug ports on bar 1 handled.\n"); - - return -1; - } - - bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); - dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); - if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { - dbgp_printk("only simple 32bit mmio bars supported\n"); - - return -1; - } - - /* double check if the mem space is enabled */ - byte = read_pci_config_byte(bus, slot, func, 0x04); - if (!(byte & 0x2)) { - byte |= 0x02; - write_pci_config_byte(bus, slot, func, 0x04, byte); - dbgp_printk("mmio for ehci enabled\n"); - } - - /* - * FIXME I don't have the bar size so just guess PAGE_SIZE is more - * than enough. 1K is the biggest I have seen. - */ - set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); - ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); - ehci_bar += bar_val & ~PAGE_MASK; - dbgp_printk("ehci_bar: %p\n", ehci_bar); - - ehci_caps = ehci_bar; - ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); - ehci_debug = ehci_bar + offset; - ehci_dev.bus = bus; - ehci_dev.slot = slot; - ehci_dev.func = func; - - detect_set_debug_port(); - - ret = ehci_setup(); - if (ret < 0) { - dbgp_printk("ehci_setup failed\n"); - ehci_debug = NULL; - - return -1; - } - - return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ - int chunk, ret; - - if (!ehci_debug) - return; - while (n > 0) { - chunk = n; - if (chunk > DBGP_MAX_PACKET) - chunk = DBGP_MAX_PACKET; - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, - dbgp_endpoint_out, str, chunk); - str += chunk; - n -= chunk; - } -} - -static struct console early_dbgp_console = { - .name = "earlydbg", - .write = early_dbgp_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -#endif - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} static int __init setup_early_printk(char *buf) { - int keep_early; + int keep; if (!buf) return 0; @@ -903,42 +202,37 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - keep_early = (strstr(buf, "keep") != NULL); - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3) - && boot_params.screen_info.orig_video_isVGA == 1) { - max_xpos = boot_params.screen_info.orig_video_cols; - max_ypos = boot_params.screen_info.orig_video_lines; - current_ypos = boot_params.screen_info.orig_y; - early_console = &early_vga_console; + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } #ifdef CONFIG_EARLY_PRINTK_DBGP - } else if (!strncmp(buf, "dbgp", 4)) { - if (early_dbgp_init(buf+4) < 0) - return 0; - early_console = &early_dbgp_console; - /* - * usb subsys will reset ehci controller, so don't keep - * that early console - */ - keep_early = 0; + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); #endif #ifdef CONFIG_HVC_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = &xenboot_console; + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); #endif + buf++; } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); return 0; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index ad5bd988fb79..cdcfb122f256 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -454,8 +454,10 @@ void __init efi_init(void) if (add_efi_memmap) do_add_efi_memmap(); +#ifdef CONFIG_X86_32 x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; +#endif /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d59fe323807e..b5c061f8f358 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -536,20 +536,13 @@ sysret_signal: bt $TIF_SYSCALL_AUDIT,%edx jc sysret_audit #endif - /* edx: work flags (arg3) */ - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call do_notify_resume - RESTORE_TOP_OF_STACK %r11 - RESTORE_REST - movl $_TIF_WORK_MASK,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -654,6 +647,7 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: SAVE_REST /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx @@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b766e8c7252d..050c278481b1 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -.section .text.head,"ax",@progbits +__HEAD ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -608,7 +608,7 @@ ENTRY(initial_code) /* * BSS section */ -.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE swapper_pg_pmd: @@ -626,7 +626,7 @@ ENTRY(empty_zero_page) * This starts the data section. */ #ifdef CONFIG_X86_PAE -.section ".data.page_aligned","wa" +__PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a05..780cd928fcd5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text - .section .text.head + __HEAD .code64 .globl startup_64 startup_64: @@ -418,7 +418,7 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 - .section .bss.page_aligned, "aw", @nobits + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 43cec6bdda63..9c3bd4a2050e 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -10,6 +10,16 @@ EXPORT_SYMBOL(mcount); #endif +/* + * Note, this is a prototype to get at the symbol for + * the export, but dont use it from C code, it is used + * by assembly code and is not using C calling convention! + */ +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +EXPORT_SYMBOL(cmpxchg8b_emu); +#endif + /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc11..3a54dcb9cd0e 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 74656d1d4e30..04bbd5278568 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,10 +63,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 300883112e3d..40f30773fb29 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -208,7 +208,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 71f1d99a635d..ec6ef60cbd17 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #ifdef CONFIG_SMP preempt_disable(); load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, - cpumask_of_cpu(smp_processor_id()))) + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 366baa179913..f4c538b681ca 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -317,6 +317,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; } + if (*(u32 *)firmware->data != UCODE_MAGIC) { + printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + *(u32 *)firmware->data); + return UCODE_ERROR; + } + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 0db7969b0dde..378e9a8f1bf8 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > num_physpages) { - pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); return ret; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 971a3bec47a8..c563e4c8ff39 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -46,6 +46,7 @@ #include <asm/dma.h> #include <asm/rio.h> #include <asm/bios_ebda.h> +#include <asm/x86_init.h> #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return bad_dma_address; + return DMA_ERROR_CODE; } } @@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = bad_dma_address; + dma_addr_t ret; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == bad_dma_address)) - goto error; + if (unlikely(entry == DMA_ERROR_CODE)) { + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return DMA_ERROR_CODE; + } /* set the return dma address */ ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); @@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* put the TCEs in the HW table */ tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - return ret; - -error: - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); - return bad_dma_address; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, @@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); @@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == bad_dma_address) { + if (entry == DMA_ERROR_CODE) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, NULL); for_each_sg(sg, s, nelems, i) { - sg->dma_address = bad_dma_address; + sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; } return 0; @@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) + if (mapping == DMA_ERROR_CODE) goto free; *dma_handle = mapping; return ret; @@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); + iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ @@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void) return; } +static int __init calgary_iommu_init(void) +{ + int ret; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + return ret; + } + + return 0; +} + void __init detect_calgary(void) { int bus; @@ -1357,7 +1374,7 @@ void __init detect_calgary(void) * if the user specified iommu=off or iommu=soft or we found * another HW IOMMU already, bail out. */ - if (swiotlb || no_iommu || iommu_detected) + if (no_iommu || iommu_detected) return; if (!use_calgary) @@ -1442,9 +1459,7 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", specified_table_size); - /* swiotlb for devices that aren't behind the Calgary. */ - if (max_pfn > MAX_DMA32_PFN) - swiotlb = 1; + x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1457,35 +1472,6 @@ cleanup: } } -int __init calgary_iommu_init(void) -{ - int ret; - - if (no_iommu || (swiotlb && !calgary_detected)) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - force_iommu = 1; - bad_dma_address = 0x0; - /* dma_ops is set to swiotlb or nommu */ - if (!dma_ops) - dma_ops = &nommu_dma_ops; - - return 0; -} - static int __init calgary_parse_options(char *p) { unsigned int bridge; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 64b838eac18c..afcc58b69c7c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,10 +11,11 @@ #include <asm/gart.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> +#include <asm/x86_init.h> static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops; +struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -35,22 +36,17 @@ int iommu_detected __read_mostly = 0; /* * This variable becomes 1 if iommu=pt is passed on the kernel command line. - * If this variable is 1, IOMMU implementations do no DMA ranslation for + * If this variable is 1, IOMMU implementations do no DMA translation for * devices and allow every device to access to whole physical memory. This is * useful if a user want to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. */ int iommu_pass_through __read_mostly; -dma_addr_t bad_dma_address __read_mostly = 0; -EXPORT_SYMBOL(bad_dma_address); - -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to older i386. */ +/* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_BIT_MASK(32), + .coherent_dma_mask = ISA_DMA_BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); @@ -128,20 +124,17 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + if (pci_swiotlb_init()) + return; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, @@ -216,7 +209,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; @@ -291,27 +284,19 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); - calgary_iommu_init(); - - intel_iommu_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); - amd_iommu_init(); - - gart_iommu_init(); - - no_iommu_init(); return 0; } - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); - - amd_iommu_shutdown(); -} /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 98a827ee9ed7..e6a0d402f171 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -16,6 +16,7 @@ #include <linux/agp_backend.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/sched.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> @@ -38,6 +39,7 @@ #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/k8.h> +#include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -45,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ +static dma_addr_t bad_dma_addr; + /* * If this is disabled the IOMMU will use an optimized flushing strategy * of only flushing when an mapping is reused. With it true the GART is @@ -91,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); @@ -215,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); - return bad_dma_address; + return bad_dma_addr; } for (i = 0; i < npages; i++) { @@ -293,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int i; #ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); + pr_debug("dma_map_sg overflow\n"); #endif for_each_sg(sg, s, nents, i) { @@ -301,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_address) { + if (addr == bad_dma_addr) { if (i > 0) gart_unmap_sg(dev, sg, i, dir, NULL); nents = 0; @@ -388,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!dev) dev = &x86_dma_fallback_dev; - out = 0; - start = 0; - start_sg = sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -416,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, sgmap, pages, need) < 0) goto error; out++; - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; } } @@ -454,7 +461,7 @@ error: iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_address; + s->dma_address = bad_dma_addr; return 0; } @@ -478,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, DMA_BIDIRECTIONAL, align_mask); flush_gart(); - if (paddr != bad_dma_address) { + if (paddr != bad_dma_addr) { *dma_addr = paddr; return page_address(page); } @@ -498,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -514,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - printk(KERN_WARNING + pr_warning( "PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", iommu_size >> 20); @@ -569,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static int gart_resume(struct sys_device *dev) +static void gart_fixup_northbridges(struct sys_device *dev) { - printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + int i; - if (fix_up_north_bridges) { - int i; + if (!fix_up_north_bridges) + return; - printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, - aperture_order << 1); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, - aperture_alloc >> 25); - } + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } +} + +static int gart_resume(struct sys_device *dev) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(dev); enable_gart_translations(); @@ -603,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, - .resume = gart_resume, + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, }; static struct sys_device device_gart = { - .id = 0, - .cls = &gart_sysdev_class, + .cls = &gart_sysdev_class, }; /* @@ -626,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; int i, error; - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + pr_info("PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { @@ -644,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) } if (!aper_base) goto nommu; + info->aper_base = aper_base; info->aper_size = aper_size >> 20; @@ -666,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) flush_gart(); - printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); return 0; nommu: /* Should not happen anymore */ - printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" "falling back to iommu=soft.\n"); return -1; } @@ -685,14 +702,15 @@ static struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc_coherent = gart_alloc_coherent, .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, }; -void gart_iommu_shutdown(void) +static void gart_iommu_shutdown(void) { struct pci_dev *dev; int i; - if (no_agp && (dma_ops != &gart_dma_ops)) + if (no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { @@ -707,7 +725,7 @@ void gart_iommu_shutdown(void) } } -void __init gart_iommu_init(void) +int __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; @@ -717,7 +735,7 @@ void __init gart_iommu_init(void) long i; if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) - return; + return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; @@ -729,35 +747,28 @@ void __init gart_iommu_init(void) (agp_copy_info(agp_bridge, &info) < 0); #endif - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? */ - if (iommu_detected && !gart_iommu_aperture) - return; - if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n"); - printk(KERN_WARNING "falling back to iommu=soft.\n"); + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); } - return; + return 0; } /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { start_pfn = (aper_base>>PAGE_SHIFT); init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); } - printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); + pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; @@ -772,8 +783,7 @@ void __init gart_iommu_init(void) ret = dma_debug_resize_entries(iommu_pages); if (ret) - printk(KERN_DEBUG - "PCI-DMA: Cannot trace all the entries\n"); + pr_debug("PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -783,15 +793,14 @@ void __init gart_iommu_init(void) */ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* * Unmap the IOMMU part of the GART. The alias of the page is @@ -813,7 +822,7 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); - + /* * Now all caches are flushed and we can safely enable * GART hardware. Doing it early leaves the possibility @@ -837,6 +846,10 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; } void __init gart_parse_options(char *p) @@ -855,7 +868,7 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a3933d4330cd..22be12b60a8f 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return bad_dma_address; + return DMA_ERROR_CODE; flush_write_buffers(); return bus; } @@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e8a35016115f..e3c0a66b9e77 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,19 +42,28 @@ static struct dma_map_ops swiotlb_dma_ops = { .dma_supported = NULL, }; -void __init pci_swiotlb_init(void) +/* + * pci_swiotlb_init - initialize swiotlb if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_init(void) { + int use_swiotlb = swiotlb | swiotlb_force; + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || - iommu_pass_through) - swiotlb = 1; + if (!no_iommu && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; + if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); + swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } + + return use_swiotlb; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 847ab4160315..5284cd2b5776 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -555,10 +555,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) void __init init_c1e_mask(void) { /* If we're using c1e_idle, we need to allocate c1e_mask. */ - if (pm_idle == c1e_idle) { - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); - } + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..eb62cbcaa490 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -664,3 +664,8 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } +unsigned long KSTK_ESP(struct task_struct *task) +{ + return (test_tsk_thread_flag(task, TIF_IA32)) ? + (task_pt_regs(task)->sp) : ((task)->thread.usersp); +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be3..7b058a2dc66a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -325,16 +325,6 @@ static int putreg(struct task_struct *child, return set_flags(child, value); #ifdef CONFIG_X86_64 - /* - * Orig_ax is really just a flag with small positive and - * negative values, so make sure to always sign-extend it - * from 32 bits so that it works correctly regardless of - * whether we come from a 32-bit environment or not. - */ - case offsetof(struct user_regs_struct, orig_ax): - value = (long) (s32) value; - break; - case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_OF(child)) return -EIO; @@ -1126,10 +1116,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) case offsetof(struct user32, regs.orig_eax): /* - * Sign-extend the value so that orig_eax = -1 - * causes (long)orig_ax < 0 tests to fire correctly. + * A 32-bit debugger setting orig_eax means to restore + * the state of the task restarting a 32-bit syscall. + * Make sure we interpret the -ERESTART* codes correctly + * in case the task is not actually still sitting at the + * exit from a 32-bit syscall with TS_COMPAT still set. */ - regs->orig_ax = (long) (s32) value; + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) + task_thread_info(child)->status |= TS_COMPAT; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 27349f92a6d7..2b97fc5b124e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -4,6 +4,7 @@ #include <linux/pm.h> #include <linux/efi.h> #include <linux/dmi.h> +#include <linux/sched.h> #include <linux/tboot.h> #include <acpi/reboot.h> #include <asm/io.h> @@ -22,7 +23,7 @@ # include <linux/ctype.h> # include <linux/mc146818rtc.h> #else -# include <asm/iommu.h> +# include <asm/x86_init.h> #endif /* @@ -435,6 +436,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, { } }; @@ -613,7 +622,7 @@ void native_machine_shutdown(void) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f327bccf5089..2a34f9c5be21 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@ #include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/acpi.h> +#include <linux/sfi.h> #include <linux/apm_bios.h> #include <linux/initrd.h> #include <linux/bootmem.h> @@ -659,6 +660,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { }, }, { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix/MSC BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), + }, + }, + { /* * AMI BIOS with low memory corruption was found on Intel DG45ID board. * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will @@ -985,6 +993,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); + sfi_init(); + /* * get boot-time SMP configuration: */ diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 000000000000..34e099382651 --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,122 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/io.h> + +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_base); + gsi_base += io_apic_get_redir_entries(i); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 09c5e077dff7..565ebc65920e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1059,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) #endif current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { - alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); - cpumask_clear(per_cpu(cpu_core_map, i)); - cpumask_clear(per_cpu(cpu_sibling_map, i)); - cpumask_clear(cpu_data(i).llc_shared_map); + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); } set_cpu_sibling_map(0); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafda..0157cd26d7cc 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index e293ac56c723..be2573448ed9 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs) #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = (unsigned long *)regs->sp; + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); /* * Return address is either directly at stack pointer * or above a saved flags. Eflags has bits 22-31 zero, @@ -93,7 +94,6 @@ static struct irqaction irq0 = { void __init setup_default_timer_irq(void) { - irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 503c1f2e8835..1740c85e24bb 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,8 +23,6 @@ static struct bau_control **uv_bau_table_bases __read_mostly; static int uv_bau_retry_limit __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; /* base pnode in this partition */ static int uv_partition_base_pnode __read_mostly; @@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode) BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ - n = pa >> uv_nshift; + n = uv_gpa_to_pnode(pa); m = pa & uv_mmask; uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, @@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) * need the pnode of where the memory was really allocated */ pa = uv_gpa(pqp); - pn = pa >> uv_nshift; + pn = uv_gpa_to_pnode(pa); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | @@ -843,8 +841,7 @@ static int __init uv_bau_init(void) GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; - uv_nshift = uv_hub_info->n_val; - uv_mmask = (1UL << uv_hub_info->n_val) - 1; + uv_mmask = (1UL << uv_hub_info->m_val) - 1; nblades = uv_num_possible_blades(); uv_bau_table_bases = (struct bau_control **) diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 699f7eeb896a..cd022121cab6 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -3,8 +3,16 @@ #include <asm/trampoline.h> #include <asm/e820.h> +#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) +#define __trampinit +#define __trampinitdata +#else +#define __trampinit __cpuinit +#define __trampinitdata __cpuinitdata +#endif + /* ready for x86_64 and x86 */ -unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); void __init reserve_trampoline_memory(void) { @@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void) * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. */ -unsigned long __cpuinit setup_trampoline(void) +unsigned long __trampinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 596d54c660a5..3af2dff58b21 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -32,8 +32,12 @@ #include <asm/segment.h> #include <asm/processor-flags.h> +#ifdef CONFIG_ACPI_SLEEP +.section .rodata, "a", @progbits +#else /* We can free up the trampoline after bootup if cpu hotplug is not supported. */ __CPUINITRODATA +#endif .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9346e102338d..7e37dcee0cc3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -14,7 +14,6 @@ #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> -#include <linux/utsname.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/module.h> @@ -73,11 +72,9 @@ char ignore_fpu_irq; /* * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. + * F0 0F bug workaround. */ -gate_desc idt_table[NR_VECTORS] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 027b5b498993..f37930954d15 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - pr_info("Skipping synchronization checks as TSC is reliable.\n"); + printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); return; } diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 31e6f6cfe53e..d430e4c30193 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -648,7 +648,7 @@ static inline int __init activate_vmi(void) pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi"; + pv_info.name = "vmi [deprecated]"; pv_init_ops.patch = vmi_patch; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a46acccec38a..3c68fe2d46cf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,17 +65,11 @@ SECTIONS #endif /* Text and read-only data */ - - /* bootstrapping code */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* The rest of the text */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* bootstrapping code */ + HEAD_TEXT #ifdef CONFIG_X86_32 - /* not really needed, already page aligned */ . = ALIGN(PAGE_SIZE); *(.text.page_aligned) #endif @@ -94,13 +88,7 @@ SECTIONS NOTES :text :note - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + EXCEPTION_TABLE(16) :text = 0x9090 RO_DATA(PAGE_SIZE) @@ -118,7 +106,6 @@ SECTIONS #endif PAGE_ALIGNED_DATA(PAGE_SIZE) - *(.data.idt) CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) @@ -135,24 +122,21 @@ SECTIONS #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + . = ALIGN(4096); + __vsyscall_0 = .; + . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) @@ -192,11 +176,9 @@ SECTIONS *(.vsyscall_3) } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET @@ -219,36 +201,12 @@ SECTIONS PERCPU_VADDR(0, :percpu) #endif - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } + INIT_TEXT_SECTION(PAGE_SIZE) #ifdef CONFIG_X86_64 :init #endif - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; @@ -256,8 +214,6 @@ SECTIONS __x86_cpu_dev_end = .; } - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; @@ -288,15 +244,6 @@ SECTIONS EXIT_DATA } -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif @@ -358,6 +305,9 @@ SECTIONS #ifdef CONFIG_X86_32 +/* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); #else diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index cf53a78e2dcf..8cb4974ff599 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -228,19 +228,11 @@ static long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -static int -vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .proc_handler = proc_dointvec }, {} }; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4449a4a2c2ed..d11c5ff7c65e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,10 +14,13 @@ #include <asm/time.h> #include <asm/irq.h> #include <asm/tsc.h> +#include <asm/iommu.h> void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } +int __init iommu_init_noop(void) { return 0; } +void iommu_shutdown_noop(void) { } /* * The platform setup functions are preset with the default functions @@ -62,6 +65,10 @@ struct x86_init_ops x86_init __initdata = { .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { @@ -72,4 +79,5 @@ struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, + .iommu_shutdown = iommu_shutdown_noop, }; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 82ad523b4901..144e7f60b5e2 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -116,7 +116,7 @@ static s64 __kpit_elapsed(struct kvm *kvm) * itself with the initial count and continues counting * from there. */ - remaining = hrtimer_expires_remaining(&ps->pit_timer.timer); + remaining = hrtimer_get_remaining(&ps->pit_timer.timer); elapsed = ps->pit_timer.period - ktime_to_ns(remaining); elapsed = mod_64(elapsed, ps->pit_timer.period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1ae5ceba7eb2..23c217692ea9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -521,7 +521,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) if (apic_get_reg(apic, APIC_TMICT) == 0) return 0; - remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer); + remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); @@ -664,7 +664,7 @@ static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) * + apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; atomic_set(&apic->lapic_timer.pending, 0); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eca41ae9f453..818b92ad82cf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -156,6 +156,8 @@ module_param(oos_shadow, bool, 0644); #define CREATE_TRACE_POINTS #include "mmutrace.h" +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { @@ -634,9 +636,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writeble_pte(*spte)) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); + kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); @@ -748,7 +748,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) { u64 *spte; int need_tlb_flush = 0; @@ -763,8 +764,47 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) return need_tlb_flush; } +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + int need_flush = 0; + u64 *spte, new_spte; + pte_t *ptep = (pte_t *)data; + pfn_t new_pfn; + + WARN_ON(pte_huge(*ptep)); + new_pfn = pte_pfn(*ptep); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!is_shadow_present_pte(*spte)); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); + need_flush = 1; + if (pte_write(*ptep)) { + rmap_remove(kvm, spte); + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = rmap_next(kvm, rmapp, NULL); + } else { + new_spte = *spte &~ (PT64_BASE_ADDR_MASK); + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + if (is_writeble_pte(*spte)) + kvm_set_pfn_dirty(spte_to_pfn(*spte)); + __set_spte(spte, new_spte); + spte = rmap_next(kvm, rmapp, spte); + } + } + if (need_flush) + kvm_flush_remote_tlbs(kvm); + + return 0; +} + static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp)) + unsigned long data, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long data)) { int i, j; int retval = 0; @@ -786,13 +826,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset]); + retval |= handler(kvm, &memslot->rmap[gfn_offset], + data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); retval |= handler(kvm, - &memslot->lpage_info[j][idx].rmap_pde); + &memslot->lpage_info[j][idx].rmap_pde, + data); } } } @@ -802,10 +844,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) { u64 *spte; int young = 0; @@ -841,13 +889,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - kvm_unmap_rmapp(vcpu->kvm, rmapp); + kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); kvm_flush_remote_tlbs(vcpu->kvm); } int kvm_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_age_rmapp); + return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } #ifdef MMU_DEBUG @@ -1756,7 +1804,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool can_unsync) + bool can_unsync, bool reset_host_protection) { u64 spte; int ret = 0; @@ -1783,6 +1831,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); + if (reset_host_protection) + spte |= SPTE_HOST_WRITEABLE; + spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) @@ -1828,7 +1879,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, int *ptwrite, int level, gfn_t gfn, - pfn_t pfn, bool speculative) + pfn_t pfn, bool speculative, + bool reset_host_protection) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*sptep); @@ -1860,7 +1912,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - dirty, level, gfn, pfn, speculative, true)) { + dirty, level, gfn, pfn, speculative, true, + reset_host_protection)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1877,8 +1930,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - if (!is_rmap_spte(*sptep)) - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { @@ -1909,7 +1961,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - level, gfn, pfn, false); + level, gfn, pfn, false, true); ++vcpu->stat.pf_fixed; break; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d2fec9c12d22..72558f8ff3f5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -273,9 +273,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) return; kvm_get_pfn(pfn); + /* + * we call mmu_set_spte() with reset_host_protection = true beacuse that + * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). + */ mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, - gpte_to_gfn(gpte), pfn, true); + gpte_to_gfn(gpte), pfn, true, true); } /* @@ -308,7 +312,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, user_fault, write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, ptwrite, level, - gw->gfn, pfn, false); + gw->gfn, pfn, false, true); break; } @@ -558,6 +562,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i, offset, nr_present; + bool reset_host_protection; offset = nr_present = 0; @@ -595,9 +600,16 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { + pte_access &= ~ACC_WRITE_MASK; + reset_host_protection = 0; + } else { + reset_host_protection = 1; + } set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, - spte_to_pfn(sp->spt[i]), true, false); + spte_to_pfn(sp->spt[i]), true, false, + reset_host_protection); } return !nr_present; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 944cc9c04b3c..c17404add91f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -767,6 +767,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdtscll(tsc_this); delta = vcpu->arch.host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -2057,10 +2059,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) switch (ecx) { case MSR_IA32_TSC: { - u64 tsc; + u64 tsc_offset; + + if (is_nested(svm)) + tsc_offset = svm->nested.hsave->control.tsc_offset; + else + tsc_offset = svm->vmcb->control.tsc_offset; - rdtscll(tsc); - *data = svm->vmcb->control.tsc_offset + tsc; + *data = tsc_offset + native_read_tsc(); break; } case MSR_K6_STAR: @@ -2146,10 +2152,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) switch (ecx) { case MSR_IA32_TSC: { - u64 tsc; + u64 tsc_offset = data - native_read_tsc(); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = tsc_offset; + } + + svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - rdtscll(tsc); - svm->vmcb->control.tsc_offset = data - tsc; break; } case MSR_K6_STAR: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f3812014bd0b..ed53b42caba1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -709,7 +709,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); kvm_migrate_timers(vcpu); - vpid_sync_vcpu_all(vmx); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be451ee44249..ae07d261527c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1591,6 +1591,8 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, if (cpuid->nent < 1) goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; r = -ENOMEM; cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); if (!cpuid_entries) @@ -1690,7 +1692,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, unsigned bank_num = mcg_cap & 0xff, bank; r = -EINVAL; - if (!bank_num) + if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) goto out; @@ -4049,7 +4051,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } -static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, +static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { u32 base_addr = get_desc_base(seg_desc); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4cb7d5d18b8e..7e59dc1d3fc2 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1135,11 +1135,6 @@ static struct notifier_block paniced = { /* Setting up memory is fairly easy. */ static __init char *lguest_memory_setup(void) { - /* We do this here and not earlier because lockcheck used to barf if we - * did it before start_kernel(). I think we fixed that, so it'd be - * nice to move it back to lguest_init. Patch welcome... */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* *The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. @@ -1364,10 +1359,13 @@ __init void lguest_init(void) /* * If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. + * atomic_notifier_chain_register, then paravirt_disable_iospace. */ lockdep_init(); + /* Hook in our special panic hypercall code. */ + atomic_notifier_chain_register(&panic_notifier_list, &paniced); + /* * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 9e609206fac9..85f5db95c60f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,9 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o - +ifneq ($(CONFIG_X86_CMPXCHG64),y) + lib-y += cmpxchg8b_emu.o +endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else obj-y += io_64.o iomap_copy_64.o diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S new file mode 100644 index 000000000000..828cb710dec2 --- /dev/null +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -0,0 +1,57 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + + +.text + +/* + * Inputs: + * %esi : memory location to compare + * %eax : low 32 bits of old value + * %edx : high 32 bits of old value + * %ebx : low 32 bits of new value + * %ecx : high 32 bits of new value + */ +ENTRY(cmpxchg8b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg8b (%esi)' on UP except we don't +# set the whole ZF thing (caller will just compare +# eax:edx with the expected value) +# +cmpxchg8b_emu: + pushfl + cli + + cmpl (%esi), %eax + jne not_same + cmpl 4(%esi), %edx + jne half_same + + movl %ebx, (%esi) + movl %ecx, 4(%esi) + + popfl + ret + + not_same: + movl (%esi), %eax + half_same: + movl 4(%esi), %edx + + popfl + ret + +CFI_ENDPROC +ENDPROC(cmpxchg8b_emu) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 9b5a9f59a478..06630d26e56d 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,9 +1,10 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_physaddr.o := $(nostackp) +CFLAGS_setup_nx.o := $(nostackp) obj-$(CONFIG_SMP) += tlb.o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 775a020990a5..f4cee9028cf0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,7 +10,7 @@ #include <linux/bootmem.h> /* max_low_pfn */ #include <linux/kprobes.h> /* __kprobes, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ -#include <linux/perf_counter.h> /* perf_swcounter_event */ +#include <linux/perf_event.h> /* perf_sw_event */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; + info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; force_sig_info(si_signo, &info, tsk); } @@ -790,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code, } static void -do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + unsigned int fault) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + int code = BUS_ADRERR; up_read(&mm->mmap_sem); @@ -809,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) tsk->thread.error_code = error_code; tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +#ifdef CONFIG_MEMORY_FAILURE + if (fault & VM_FAULT_HWPOISON) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + tsk->comm, tsk->pid, address); + code = BUS_MCEERR_AR; + } +#endif + force_sig_info_fault(SIGBUS, code, address, tsk); } static noinline void @@ -819,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + do_sigbus(regs, error_code, address, fault); else BUG(); } @@ -1017,7 +1028,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1114,11 +1125,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0607119cef94..73ffd5536f62 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -28,69 +28,6 @@ int direct_gbpages #endif ; -int nx_enabled; - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -static int disable_nx __cpuinitdata; - -/* - * noexec = on|off - * - * Control non-executable mappings for processes. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", noexec_setup); -#endif - -#ifdef CONFIG_X86_PAE -static void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#else -static inline void set_nx(void) -{ -} -#endif - -#ifdef CONFIG_X86_64 -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} -#endif - static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3cd7711bb949..30938c1d8d5d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); @@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) #endif if (!page_table) page_table = - (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + (pte_t *)alloc_bootmem_pages(PAGE_SIZE); } else page_table = (pte_t *)alloc_low_page(); @@ -857,8 +857,6 @@ static void __init test_wp_bit(void) } } -static struct kcore_list kcore_mem, kcore_vmalloc; - void __init mem_init(void) { int codesize, reservedpages, datasize, initsize; @@ -886,13 +884,9 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " "%dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ea56b8cbb6a6..5a4398a6006b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -647,8 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, - kcore_modules, kcore_vsyscall; +static struct kcore_list kcore_vsyscall; void __init mem_init(void) { @@ -677,17 +676,12 @@ void __init mem_init(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); + VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_pfn << (PAGE_SHIFT-10), codesize >> 10, absent_pages << (PAGE_SHIFT-10), diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 334e63ca7b2b..2feb9bdedaaf 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -170,8 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), prot_val, new_prot_val); - free_memtype(phys_addr, phys_addr + size); - return NULL; + goto err_free_memtype; } prot_val = new_prot_val; } @@ -197,26 +196,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, */ area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) - return NULL; + goto err_free_memtype; area->phys_addr = phys_addr; vaddr = (unsigned long) area->addr; - if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { - free_memtype(phys_addr, phys_addr + size); - free_vm_area(area); - return NULL; - } + if (kernel_map_sync_memtype(phys_addr, size, prot_val)) + goto err_free_area; - if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { - free_memtype(phys_addr, phys_addr + size); - free_vm_area(area); - return NULL; - } + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) + goto err_free_area; ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); return ret_addr; +err_free_area: + free_vm_area(area); +err_free_memtype: + free_memtype(phys_addr, phys_addr + size); + return NULL; } /** diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 528bf954eb74..8cc183344140 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - if (data->balance == 0) - return; - if (unlikely(data->balance != 1)) { kmemcheck_show_all(); kmemcheck_error_save_bug(regs); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 24952fdc7e40..dd38bfbefd1f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -144,6 +144,7 @@ void clflush_cache_range(void *vaddr, unsigned int size) mb(); } +EXPORT_SYMBOL_GPL(clflush_cache_range); static void __cpa_flush_all(void *arg) { diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 7257cf3decf9..e78cd0ec2bcf 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -81,6 +81,7 @@ enum { void pat_init(void) { u64 pat; + bool boot_cpu = !boot_pat_state; if (!pat_enabled) return; @@ -122,8 +123,10 @@ void pat_init(void) rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); wrmsrl(MSR_IA32_CR_PAT, pat); - printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", - smp_processor_id(), boot_pat_state, pat); + + if (boot_cpu) + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); } #undef PAT diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c new file mode 100644 index 000000000000..513d8ed5d2ec --- /dev/null +++ b/arch/x86/mm/setup_nx.c @@ -0,0 +1,69 @@ +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/init.h> + +#include <asm/pgtable.h> + +int nx_enabled; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", noexec_setup); +#endif + +#ifdef CONFIG_X86_PAE +void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} +#else +void set_nx(void) +{ +} +#endif + +#ifdef CONFIG_X86_64 +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; +} +#endif + diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c814e144a3f0..36fe08eeb5c3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -59,7 +59,8 @@ void leave_mm(int cpu) { if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); - cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); + cpumask_clear_cpu(cpu, + mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -234,8 +235,8 @@ void flush_tlb_current_task(void) preempt_disable(); local_flush_tlb(); - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm) else leave_mm(smp_processor_id()); } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) leave_mm(smp_processor_id()); } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, va); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, va); preempt_enable(); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4899215999de..8eb05878554c 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void) if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { eax.split.version_id = 2; - eax.split.num_counters = 2; + eax.split.num_events = 2; eax.split.bit_width = 40; } - num_counters = eax.split.num_counters; + num_counters = eax.split.num_events; op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index b83776180c7f..7b8e75d16081 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -13,7 +13,7 @@ #define OP_X86_MODEL_H #include <asm/types.h> -#include <asm/perf_counter.h> +#include <asm/perf_event.h> struct op_msr { unsigned long addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 5db96d4304de..1331fcf26143 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -646,7 +646,7 @@ int get_mp_bus_to_node(int busnum) #else /* CONFIG_X86_32 */ -static unsigned char mp_bus_to_node[BUS_NR] = { +static int mp_bus_to_node[BUS_NR] = { [0 ... BUS_NR - 1] = -1 }; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 52e62e57fedd..b22d13b0c71d 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -266,7 +266,7 @@ void pcibios_set_master(struct pci_dev *dev) pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); } -static struct vm_operations_struct pci_mmap_ops = { +static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 712443ec6d43..602c172d3bd5 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -13,10 +13,14 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/acpi.h> +#include <linux/sfi_acpi.h> #include <linux/bitmap.h> #include <linux/sort.h> #include <asm/e820.h> #include <asm/pci_x86.h> +#include <asm/acpi.h> + +#define PREFIX "PCI: " /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) @@ -491,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early) (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (!early) + if (!early && !acpi_disabled) valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); if (valid) @@ -606,7 +610,7 @@ static void __init __pci_mmcfg_init(int early) } if (!known_bridge) - acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); + acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); pci_mmcfg_reject_broken(early); diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 8b2d561046a3..f10a7e94a84c 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -11,9 +11,9 @@ #include <linux/pci.h> #include <linux/init.h> -#include <linux/acpi.h> #include <asm/e820.h> #include <asm/pci_x86.h> +#include <acpi/acpi.h> /* Assume systems with more busses have correct MCFG */ #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 88112b49f02c..6b4ffedb93c9 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) GCOV_PROFILE := n # diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index b53225d2cac3..e133ce25e290 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -100,7 +100,7 @@ static int xen_array_release(struct inode *inode, struct file *file) return 0; } -static struct file_operations u32_array_fops = { +static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release= xen_array_release, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 544eb7496531..dfbf70e65860 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -178,6 +178,7 @@ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { + unsigned maskebx = ~0; unsigned maskecx = ~0; unsigned maskedx = ~0; @@ -185,9 +186,16 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ - if (*ax == 1) { + switch (*ax) { + case 1: maskecx = cpuid_leaf1_ecx_mask; maskedx = cpuid_leaf1_edx_mask; + break; + + case 0xb: + /* Suppress extended topology stuff */ + maskebx = 0; + break; } asm(XEN_EMULATE_PREFIX "cpuid" @@ -197,6 +205,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, "=d" (*dx) : "0" (*ax), "2" (*cx)); + *bx &= maskebx; *cx &= maskecx; *dx &= maskedx; } @@ -1075,6 +1084,8 @@ asmlinkage void __init xen_start_kernel(void) * Set up some pagetable state before starting to set any ptes. */ + xen_init_mmu_ops(); + /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; if (!xen_initial_domain()) @@ -1082,6 +1093,11 @@ asmlinkage void __init xen_start_kernel(void) __supported_pte_mask |= _PAGE_IOMAP; +#ifdef CONFIG_X86_64 + /* Work out if we support NX */ + check_efer(); +#endif + xen_setup_features(); /* Get mfn list */ @@ -1094,7 +1110,6 @@ asmlinkage void __init xen_start_kernel(void) */ xen_setup_stackprotector(); - xen_init_mmu_ops(); xen_init_irq_ops(); xen_init_cpuid_mask(); @@ -1123,11 +1138,6 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; -#ifdef CONFIG_X86_64 - /* Work out if we support NX */ - check_efer(); -#endif - /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 093dd59b5385..3bf7b1d250ce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm) /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); } return; } - cpumask_copy(mask, &mm->cpu_vm_mask); + cpumask_copy(mask, mm_cpumask(mm)); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed |