diff options
Diffstat (limited to 'arch/x86')
43 files changed, 1008 insertions, 327 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore index 028079065af6..7cab8c08e6d1 100644 --- a/arch/x86/.gitignore +++ b/arch/x86/.gitignore @@ -1,3 +1,4 @@ boot/compressed/vmlinux tools/test_get_len +tools/insn_sanity diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6c14ecd851d0..5bed94e189fa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -125,16 +125,6 @@ config HAVE_LATENCYTOP_SUPPORT config MMU def_bool y -config ZONE_DMA - bool "DMA memory allocation support" if EXPERT - default y - help - DMA memory allocation support allows devices with less than 32-bit - addressing to allocate within the first 16MB of address space. - Disable if no such devices will be used. - - If unsure, say Y. - config SBUS bool @@ -255,6 +245,16 @@ source "kernel/Kconfig.freezer" menu "Processor type and features" +config ZONE_DMA + bool "DMA memory allocation support" if EXPERT + default y + help + DMA memory allocation support allows devices with less than 32-bit + addressing to allocate within the first 16MB of address space. + Disable if no such devices will be used. + + If unsure, say Y. + source "kernel/time/Kconfig" config SMP @@ -360,7 +360,6 @@ config X86_NUMACHIP depends on NUMA depends on SMP depends on X86_X2APIC - depends on !EDAC_AMD64 ---help--- Adds support for Numascale NumaChip large-SMP systems. Needed to enable more than ~168 cores. diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 3a19d04cebeb..7116dcba0c9e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -321,6 +321,8 @@ static void parse_elf(void *output) default: /* Ignore other PT_* */ break; } } + + free(phdrs); } asmlinkage void decompress_kernel(void *rmode, memptr heap, diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 1106261856c8..e3e734005e19 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include <asm/segment.h> #include <asm/irqflags.h> #include <linux/linkage.h> +#include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -189,7 +190,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -206,12 +207,13 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + call __audit_syscall_exit + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 0c9fa2745f13..b3b733262909 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -145,13 +145,13 @@ extern void __add_wrong_size(void) #ifdef __HAVE_ARCH_CMPXCHG #define cmpxchg(ptr, old, new) \ - __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + __cmpxchg(ptr, old, new, sizeof(*(ptr))) #define sync_cmpxchg(ptr, old, new) \ - __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + __sync_cmpxchg(ptr, old, new, sizeof(*(ptr))) #define cmpxchg_local(ptr, old, new) \ - __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) + __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) #endif /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 17c5d4bdee5e..8d67d428b0f9 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -159,6 +159,7 @@ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ #define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ #define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ #define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 6919e936345b..247904945d3f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,10 +29,11 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern asmlinkage void math_state_restore(void); -extern void __math_state_restore(void); +extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -212,19 +213,11 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif /* CONFIG_X86_64 */ -/* We need a safe address that is cheap to find and that is already - in L1 during context switch. The best choices are unfortunately - different for UP and SMP */ -#ifdef CONFIG_SMP -#define safe_address (__per_cpu_offset[0]) -#else -#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) -#endif - /* - * These must be called with preempt disabled + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact. */ -static inline void fpu_save_init(struct fpu *fpu) +static inline int fpu_save_init(struct fpu *fpu) { if (use_xsave()) { fpu_xsave(fpu); @@ -233,33 +226,33 @@ static inline void fpu_save_init(struct fpu *fpu) * xsave header may indicate the init state of the FP. */ if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return; + return 1; } else if (use_fxsr()) { fpu_fxsave(fpu); } else { asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state->fsave)); - return; + return 0; } - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? + */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { asm volatile("fnclex"); - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); + return 0; + } + return 1; } -static inline void __save_init_fpu(struct task_struct *tsk) +static inline int __save_init_fpu(struct task_struct *tsk) { - fpu_save_init(&tsk->thread.fpu); - task_thread_info(tsk)->status &= ~TS_USEDFPU; + return fpu_save_init(&tsk->thread.fpu); } static inline int fpu_fxrstor_checking(struct fpu *fpu) @@ -277,44 +270,212 @@ static inline int fpu_restore_checking(struct fpu *fpu) static inline int restore_fpu_checking(struct task_struct *tsk) { + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. "m" is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (tsk->thread.fpu.has_fpu)); + return fpu_restore_checking(&tsk->thread.fpu); } /* - * Signal frame handlers... + * Software FPU state helpers. Careful: these need to + * be preemption protection *and* they need to be + * properly paired with the CR0.TS changes! */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); +static inline int __thread_has_fpu(struct task_struct *tsk) +{ + return tsk->thread.fpu.has_fpu; +} -static inline void __unlazy_fpu(struct task_struct *tsk) +/* Must be paired with an 'stts' after! */ +static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_USEDFPU) { - __save_init_fpu(tsk); - stts(); - } else - tsk->fpu_counter = 0; + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); +} + +/* Must be paired with a 'clts' before! */ +static inline void __thread_set_has_fpu(struct task_struct *tsk) +{ + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void __thread_fpu_end(struct task_struct *tsk) +{ + __thread_clear_has_fpu(tsk); + stts(); +} + +static inline void __thread_fpu_begin(struct task_struct *tsk) +{ + clts(); + __thread_set_has_fpu(tsk); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +/* + * FIXME! We could do a totally lazy restore, but we need to + * add a per-cpu "this was the task that last touched the FPU + * on this CPU" variable, and the task needs to have a "I last + * touched the FPU on this CPU" and check them. + * + * We don't do that yet, so "fpu_lazy_restore()" always returns + * false, but some day.. + */ +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} + +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new->fpu_counter++; + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else + stts(); + } else { + old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; + if (fpu.preload) { + new->fpu_counter++; + if (fpu_lazy_restore(new, cpu)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + __thread_fpu_begin(new); + } + } + return fpu; +} + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) +{ + if (fpu.preload) { + if (unlikely(restore_fpu_checking(new))) + __thread_fpu_end(new); + } } +/* + * Signal frame handlers... + */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + static inline void __clear_fpu(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_USEDFPU) { + if (__thread_has_fpu(tsk)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(tsk); } } +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + return !__thread_has_fpu(current) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +static inline bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} + static inline void kernel_fpu_begin(void) { - struct thread_info *me = current_thread_info(); + struct task_struct *me = current; + + WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); - if (me->status & TS_USEDFPU) - __save_init_fpu(me->task); - else + if (__thread_has_fpu(me)) { + __save_init_fpu(me); + __thread_clear_has_fpu(me); + /* We do 'stts()' in kernel_fpu_end() */ + } else { + percpu_write(fpu_owner_task, NULL); clts(); + } } static inline void kernel_fpu_end(void) @@ -323,14 +484,6 @@ static inline void kernel_fpu_end(void) preempt_enable(); } -static inline bool irq_fpu_usable(void) -{ - struct pt_regs *regs; - - return !in_interrupt() || !(regs = get_irq_regs()) || \ - user_mode(regs) || (read_cr0() & X86_CR0_TS); -} - /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions @@ -363,20 +516,64 @@ static inline void irq_ts_restore(int TS_state) } /* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. + * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + * + * The actual user_fpu_begin/end() functions + * need to be preemption-safe, though. + * + * NOTE! user_fpu_end() must be used only after you + * have saved the FP state, and user_fpu_begin() must + * be used only immediately before restoring it. + * These functions do not do any save/restore on + * their own. + */ +static inline int user_has_fpu(void) +{ + return __thread_has_fpu(current); +} + +static inline void user_fpu_end(void) +{ + preempt_disable(); + __thread_fpu_end(current); + preempt_enable(); +} + +static inline void user_fpu_begin(void) +{ + preempt_disable(); + if (!user_has_fpu()) + __thread_fpu_begin(current); + preempt_enable(); +} + +/* * These disable preemption on their own and are safe */ static inline void save_init_fpu(struct task_struct *tsk) { + WARN_ON_ONCE(!__thread_has_fpu(tsk)); preempt_disable(); __save_init_fpu(tsk); - stts(); + __thread_fpu_end(tsk); preempt_enable(); } static inline void unlazy_fpu(struct task_struct *tsk) { preempt_disable(); - __unlazy_fpu(tsk); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; preempt_enable(); } diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ab4092e3214e..7b9cfc4878af 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -190,6 +190,9 @@ struct x86_emulate_ops { int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); + + bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -298,6 +301,19 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ X86EMUL_MODE_PROT64) +/* CPUID vendors */ +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 + +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 + +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e +#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 + enum x86_intercept_stage { X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c26931..58545c97d071 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,6 +374,8 @@ union thread_xstate { }; struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; union thread_xstate *state; }; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index bc817cd8b443..cfd8144d5527 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -247,8 +247,6 @@ static inline struct thread_info *current_thread_info(void) * ever touches our thread-synchronous status, so we don't * have to worry about atomic accesses. */ -#define TS_USEDFPU 0x0001 /* FPU was used by this task - this quantum (SMP) */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #define TS_POLLING 0x0004 /* idle task polling need_resched, skip sending interrupt */ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index b4a3db7ce140..21f77b89e47a 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -7,6 +7,7 @@ # include <asm/unistd_32.h> # define __ARCH_WANT_IPC_PARSE_VERSION # define __ARCH_WANT_STAT64 +# define __ARCH_WANT_SYS_IPC # define __ARCH_WANT_SYS_OLD_MMAP # define __ARCH_WANT_SYS_OLD_SELECT diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 8e862aaf0d90..becf47b81735 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -65,7 +65,7 @@ * UV2: Bit 19 selects between * (0): 10 microsecond timebase and * (1): 80 microseconds - * we're using 655us, similar to UV1: 65 units of 10us + * we're using 560us, similar to UV1: 65 units of 10us */ #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) #define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) @@ -167,6 +167,7 @@ #define FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 +#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed @@ -235,10 +236,10 @@ struct bau_msg_payload { /* - * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see table 4.2.3.0.1 in broacast_assist spec. */ -struct bau_msg_header { +struct uv1_bau_msg_header { unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ unsigned int base_dest_nasid:15; /* nasid of the first bit */ @@ -318,19 +319,87 @@ struct bau_msg_header { }; /* + * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * see figure 9-2 of harp_sys.pdf + */ +struct uv2_bau_msg_header { + unsigned int base_dest_nasid:15; /* nasid of the first bit */ + /* bits 14:0 */ /* in uvhub map */ + unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */ + /* bits 19:15 */ + unsigned int rsvd_1:1; /* must be zero */ + /* bit 20 */ + /* Address bits 59:21 */ + /* bits 25:2 of address (44:21) are payload */ + /* these next 24 bits become bytes 12-14 of msg */ + /* bits 28:21 land in byte 12 */ + unsigned int replied_to:1; /* sent as 0 by the source to + byte 12 */ + /* bit 21 */ + unsigned int msg_type:3; /* software type of the + message */ + /* bits 24:22 */ + unsigned int canceled:1; /* message canceled, resource + is to be freed*/ + /* bit 25 */ + unsigned int payload_1:3; /* not currently used */ + /* bits 28:26 */ + + /* bits 36:29 land in byte 13 */ + unsigned int payload_2a:3; /* not currently used */ + unsigned int payload_2b:5; /* not currently used */ + /* bits 36:29 */ + + /* bits 44:37 land in byte 14 */ + unsigned int payload_3:8; /* not currently used */ + /* bits 44:37 */ + + unsigned int rsvd_2:7; /* reserved */ + /* bits 51:45 */ + unsigned int swack_flag:1; /* software acknowledge flag */ + /* bit 52 */ + unsigned int rsvd_3a:3; /* must be zero */ + unsigned int rsvd_3b:8; /* must be zero */ + unsigned int rsvd_3c:8; /* must be zero */ + unsigned int rsvd_3d:3; /* must be zero */ + /* bits 74:53 */ + unsigned int fairness:3; /* usually zero */ + /* bits 77:75 */ + + unsigned int sequence:16; /* message sequence number */ + /* bits 93:78 Suppl_A */ + unsigned int chaining:1; /* next descriptor is part of + this activation*/ + /* bit 94 */ + unsigned int multilevel:1; /* multi-level multicast + format */ + /* bit 95 */ + unsigned int rsvd_4:24; /* ordered / source node / + source subnode / aging + must be zero */ + /* bits 119:96 */ + unsigned int command:8; /* message type */ + /* bits 127:120 */ +}; + +/* * The activation descriptor: * The format of the message to send, plus all accompanying control * Should be 64 bytes */ struct bau_desc { - struct pnmask distribution; + struct pnmask distribution; /* * message template, consisting of header and payload: */ - struct bau_msg_header header; - struct bau_msg_payload payload; + union bau_msg_header { + struct uv1_bau_msg_header uv1_hdr; + struct uv2_bau_msg_header uv2_hdr; + } header; + + struct bau_msg_payload payload; }; -/* +/* UV1: * -payload-- ---------header------ * bytes 0-11 bits 41-56 bits 58-81 * A B (2) C (3) @@ -340,6 +409,16 @@ struct bau_desc { * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) * ------------payload queue----------- */ +/* UV2: + * -payload-- ---------header------ + * bytes 0-11 bits 70-78 bits 21-44 + * A B (2) C (3) + * + * A/B/C are moved to: + * A C B + * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) + * ------------payload queue----------- + */ /* * The payload queue on the destination side is an array of these. @@ -385,7 +464,6 @@ struct bau_pq_entry { struct msg_desc { struct bau_pq_entry *msg; int msg_slot; - int swack_slot; struct bau_pq_entry *queue_first; struct bau_pq_entry *queue_last; }; @@ -405,6 +483,7 @@ struct ptc_stats { requests */ unsigned long s_stimeout; /* source side timeouts */ unsigned long s_dtimeout; /* destination side timeouts */ + unsigned long s_strongnacks; /* number of strong nack's */ unsigned long s_time; /* time spent in sending side */ unsigned long s_retriesok; /* successful retries */ unsigned long s_ntargcpu; /* total number of cpu's @@ -439,6 +518,9 @@ struct ptc_stats { unsigned long s_retry_messages; /* retry broadcasts */ unsigned long s_bau_reenabled; /* for bau enable/disable */ unsigned long s_bau_disabled; /* for bau enable/disable */ + unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ + unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ + unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -511,9 +593,12 @@ struct bau_control { short osnode; short uvhub_cpu; short uvhub; + short uvhub_version; short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; + short using_desc; /* an index, like uvhub_cpu */ + unsigned int inuse_map; unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; @@ -531,6 +616,7 @@ struct bau_control { int cong_response_us; int cong_reps; int cong_period; + unsigned long clocks_per_100_usec; cycles_t period_time; long period_requests; struct hub_and_pnode *thp; @@ -591,6 +677,11 @@ static inline void write_mmr_sw_ack(unsigned long mr) uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); } +static inline void write_gmmr_sw_ack(int pnode, unsigned long mr) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + static inline unsigned long read_mmr_sw_ack(void) { return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 54a13aaebc40..21f7385badb8 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -318,13 +318,13 @@ uv_gpa_in_mmr_space(unsigned long gpa) /* UV global physical address --> socket phys RAM */ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) { - unsigned long paddr = gpa & uv_hub_info->gpa_mask; + unsigned long paddr; unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); - gpa = gpa & uv_hub_info->gpa_mask; + paddr = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..c0f7d68d318f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1044,6 +1044,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); + /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware @@ -1111,6 +1114,8 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 73da6b64f5b7..d6bd49faa40c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -439,7 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event) hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; cpuc->pebs_enabled |= 1ULL << hwc->idx; - WARN_ON_ONCE(cpuc->enabled); if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) intel_pmu_lbr_enable(event); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 3fab3de3ce96..47a7e63bfe54 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -72,8 +72,6 @@ void intel_pmu_lbr_enable(struct perf_event *event) if (!x86_pmu.lbr_nr) return; - WARN_ON_ONCE(cpuc->enabled); - /* * Reset the LBR stack if we changed task context to * avoid data leaks. diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775fc..4025fe4f928f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) unsigned short ss; unsigned long sp; #endif - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); + printk(KERN_DEFAULT + "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6d728d9284bd..17107bd6e1f0 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { if (regs) stack = (unsigned long *)regs->sp; - else if (task && task != current) + else if (task != current) stack = (unsigned long *)task->thread.sp; else stack = &dummy; @@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + printk(KERN_DEFAULT "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_EMERG); + 0, KERN_DEFAULT); - printk(KERN_EMERG "Code: "); + printk(KERN_DEFAULT "Code: "); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 174d938d576b..62d61e9976eb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -703,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_ACPI /** * Mark ACPI NVS memory region, so that we can save/restore it during * hibernation and the subsequent resume. @@ -716,7 +716,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - suspend_nvs_register(ei->addr, ei->size); + acpi_nvs_register(ei->addr, ei->size); } return 0; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4af9fd2450a5..79d97e68f042 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include <linux/linkage.h> +#include <linux/err.h> #include <asm/thread_info.h> #include <asm/irqflags.h> #include <asm/errno.h> @@ -453,7 +454,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -464,11 +465,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 940ba711fc28..3fe8239fd8fb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include <asm/paravirt.h> #include <asm/ftrace.h> #include <asm/percpu.h> +#include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -548,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -558,22 +559,21 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index fe86493f3ed1..ac0417be9131 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -311,13 +311,33 @@ out: return state; } +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Starting at family 15h they are in family specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. + */ static enum ucode_state request_microcode_amd(int cpu, struct device *device) { - const char *fw_name = "amd-ucode/microcode_amd.bin"; + char fw_name[36] = "amd-ucode/microcode_amd.bin"; const struct firmware *fw; enum ucode_state ret = UCODE_NFOUND; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); - if (request_firmware(&fw, fw_name, device)) { + if (request_firmware(&fw, (const char *)fw_name, device)) { pr_err("failed to load file %s\n", fw_name); goto out; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 485204f58cda..c08d1ff12b7c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -214,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; @@ -299,22 +300,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); - bool preload_fpu; + fpu_switch_t fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - /* - * If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - */ - preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - - __unlazy_fpu(prev_p); - - /* we're going to use this soon, after a few expensive things */ - if (preload_fpu) - prefetch(next->fpu.state); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0. @@ -354,11 +344,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); - /* If we're going to preload the fpu context, make sure clts - is run while we're batching the cpu state updates. */ - if (preload_fpu) - clts(); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -368,15 +353,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - if (preload_fpu) - __math_state_restore(); - /* * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) lazy_load_gs(next->gs); + switch_fpu_finish(next_p, fpu); + percpu_write(current_task, next_p); return prev_p; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9b9fe4a85c87..cfa5c90c01db 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -286,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, set_tsk_thread_flag(p, TIF_FORK); + p->fpu_counter = 0; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); @@ -386,18 +387,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; - bool preload_fpu; + fpu_switch_t fpu; - /* - * If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - */ - preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - - /* we're going to use this soon, after a few expensive things */ - if (preload_fpu) - prefetch(next->fpu.state); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0, LDT and the page table pointer: @@ -427,13 +419,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); - /* Must be after DS reload */ - __unlazy_fpu(prev_p); - - /* Make sure cpu is ready for new context */ - if (preload_fpu) - clts(); - /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -474,6 +459,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; + switch_fpu_finish(next_p, fpu); + /* * Switch the PDA and FPU contexts. */ @@ -492,13 +479,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* - * Preload the FPU context, now that we've determined that the - * task is likely to be using it. - */ - if (preload_fpu) - __math_state_restore(); - return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb6..50267386b766 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } @@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 37a458b521a6..d840e69a853c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,6 +39,14 @@ static int reboot_mode; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; +/* This variable is used privately to keep track of whether or not + * reboot_type is still set to its default value (i.e., reboot= hasn't + * been set on the command line). This is needed so that we can + * suppress DMI scanning for reboot quirks. Without it, it's + * impossible to override a faulty reboot quirk without recompiling. + */ +static int reboot_default = 1; + #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) static int reboot_cpu = -1; #endif @@ -67,6 +75,12 @@ bool port_cf9_safe = false; static int __init reboot_setup(char *str) { for (;;) { + /* Having anything passed on the command line via + * reboot= will cause us to disable DMI checking + * below. + */ + reboot_default = 0; + switch (*str) { case 'w': reboot_mode = 0x1234; @@ -295,14 +309,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, - { /* Handle problems with rebooting on VersaLogic Menlow boards */ - .callback = set_bios_reboot, - .ident = "VersaLogic Menlow based board", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"), - DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), - }, - }, { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, .ident = "Acer Aspire One A110", @@ -316,7 +322,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { static int __init reboot_init(void) { - dmi_check_system(reboot_dmi_table); + /* Only do the DMI check if reboot_type hasn't been overridden + * on the command line + */ + if (reboot_default) { + dmi_check_system(reboot_dmi_table); + } return 0; } core_initcall(reboot_init); @@ -465,7 +476,12 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { static int __init pci_reboot_init(void) { - dmi_check_system(pci_reboot_dmi_table); + /* Only do the DMI check if reboot_type hasn't been overridden + * on the command line + */ + if (reboot_default) { + dmi_check_system(pci_reboot_dmi_table); + } return 0; } core_initcall(pci_reboot_init); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 482ec3af2067..4bbe04d96744 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -571,41 +571,18 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) } /* - * __math_state_restore assumes that cr0.TS is already clear and the - * fpu state is all ready for use. Used during context switch. - */ -void __math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; -} - -/* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * * Careful.. There are problems with IBM-designed IRQ13 behaviour. * Don't touch unless you *really* know how it works. * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). + * Must be called with kernel preemption disabled (eg with local + * local interrupts as in the case of do_device_not_available). */ -asmlinkage void math_state_restore(void) +void math_state_restore(void) { - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; if (!tsk_used_math(tsk)) { local_irq_enable(); @@ -622,9 +599,17 @@ asmlinkage void math_state_restore(void) local_irq_disable(); } - clts(); /* Allow maths ops (or we recurse) */ + __thread_fpu_begin(tsk); + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + __thread_fpu_end(tsk); + force_sig(SIGSEGV, tsk); + return; + } - __math_state_restore(); + tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c0dd5b603749..a62c201c97ec 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val) static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; - u64 tsc = 0; + u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; + prev_tsc = tsc; tsc = get_cycles(); } - *deltap = get_cycles() - tsc; + *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* @@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend - * more than 25ms on it. + * more than 50ms on it. */ -#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) @@ -383,15 +384,12 @@ success: * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are - * reliable (within the error). We also adjust the - * delta to the middle of the error bars, just - * because it looks nicer. + * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ - delta += (long)(d2 - d1)/2; delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); printk("Fast TSC calibration using PIT\n"); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0a..b466cab5ba15 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); +#endif __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a3911343976b..711091114119 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!fx) return; - BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + BUG_ON(__thread_has_fpu(tsk)); xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; @@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf) if (!used_math()) return 0; - if (task_thread_info(tsk)->status & TS_USEDFPU) { + if (user_has_fpu()) { if (use_xsave()) err = xsave_user(buf); else @@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf) if (err) return err; - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + user_fpu_end(); } else { sanitize_i387_state(tsk); if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, @@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf) return err; } - if (!(task_thread_info(current)->status & TS_USEDFPU)) { - clts(); - task_thread_info(current)->status |= TS_USEDFPU; - } + user_fpu_begin(); if (use_xsave()) err = restore_user_xstate(buf); else diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 05a562b85025..0982507b962a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1891,6 +1891,51 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->p = 1; } +static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) +{ + struct x86_emulate_ops *ops = ctxt->ops; + u32 eax, ebx, ecx, edx; + + /* + * syscall should always be enabled in longmode - so only become + * vendor specific (cpuid) if other modes are active... + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return true; + + eax = 0x00000000; + ecx = 0x00000000; + if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. + */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; + + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; + } + + /* default: (not Intel, not AMD), apply Intel's stricter rules... */ + return false; +} + static int em_syscall(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -1904,9 +1949,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); + if (!(em_syscall_is_enabled(ctxt))) + return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_EFER, &efer); setup_syscalls_segments(ctxt, &cs, &ss); + if (!(efer & EFER_SCE)) + return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d29216c462b3..3b4c8d8ad906 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (current_thread_info()->status & TS_USEDFPU) + if (__thread_has_fpu(current)) clts(); load_gdt(&__get_cpu_var(host_gdt)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 14d6cadc4ba6..9cbfc0698118 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1495,6 +1495,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + bool pr = false; + switch (msr) { case MSR_EFER: return set_efer(vcpu, data); @@ -1635,6 +1637,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + pr = true; + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr, data); + + if (pr || data != 0) + pr_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; case MSR_K7_CLK_CTL: /* * Ignore all writes to this no longer documented MSR. @@ -1835,6 +1849,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_get_msr(vcpu, msr, pdata); + data = 0; + break; case MSR_IA32_UCODE_REV: data = 0x100000000ULL; break; @@ -4180,6 +4202,28 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } +static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + struct kvm_cpuid_entry2 *cpuid = NULL; + + if (eax && ecx) + cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), + *eax, *ecx); + + if (cpuid) { + *eax = cpuid->eax; + *ecx = cpuid->ecx; + if (ebx) + *ebx = cpuid->ebx; + if (edx) + *edx = cpuid->edx; + return true; + } + + return false; +} + static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -4211,6 +4255,7 @@ static struct x86_emulate_ops emulate_ops = { .get_fpu = emulator_get_fpu, .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, + .get_cpuid = emulator_get_cpuid, }; static void cache_all_regs(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 5b83c51c12e0..819137904428 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -219,7 +219,9 @@ ab: STOS/W/D/Q Yv,rAX ac: LODS/B AL,Xb ad: LODS/W/D/Q rAX,Xv ae: SCAS/B AL,Yb -af: SCAS/W/D/Q rAX,Xv +# Note: The May 2011 Intel manual shows Xv for the second parameter of the +# next instruction but Yv is correct +af: SCAS/W/D/Q rAX,Yv # 0xb0 - 0xbf b0: MOV AL/R8L,Ib b1: MOV CL/R9L,Ib @@ -729,8 +731,8 @@ de: VAESDEC Vdq,Hdq,Wdq (66),(v1) df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) -f3: ANDN Gy,By,Ey (v) -f4: Grp17 (1A) +f2: ANDN Gy,By,Ey (v) +f3: Grp17 (1A) f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) f6: MULX By,Gy,rDX,Ey (F2),(v) f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d74824a708d..f0b4caf85c1a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -673,7 +673,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, stackend = end_of_stack(tsk); if (tsk != &init_task && *stackend != STACK_END_MAGIC) - printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -684,7 +684,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, sig = 0; /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); + printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index fd61b3fb7341..1c1c4f46a7c1 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -109,6 +109,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain_lo; + if (acpi_srat_revision >= 2) + pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); @@ -160,6 +162,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; + if (acpi_srat_revision <= 1) + pxm &= 0xff; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7b65f752c5f8..7c1b765ecc59 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -151,17 +151,18 @@ void bpf_jit_compile(struct sk_filter *fp) cleanup_addr = proglen; /* epilogue address */ for (pass = 0; pass < 10; pass++) { + u8 seen_or_pass0 = (pass == 0) ? (SEEN_XREG | SEEN_DATAREF | SEEN_MEM) : seen; /* no prologue/epilogue for trivial filters (RET something) */ proglen = 0; prog = temp; - if (seen) { + if (seen_or_pass0) { EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ /* note : must save %rbx in case bpf_error is hit */ - if (seen & (SEEN_XREG | SEEN_DATAREF)) + if (seen_or_pass0 & (SEEN_XREG | SEEN_DATAREF)) EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ - if (seen & SEEN_XREG) + if (seen_or_pass0 & SEEN_XREG) CLEAR_X(); /* make sure we dont leek kernel memory */ /* @@ -170,7 +171,7 @@ void bpf_jit_compile(struct sk_filter *fp) * r9 = skb->len - skb->data_len * r8 = skb->data */ - if (seen & SEEN_DATAREF) { + if (seen_or_pass0 & SEEN_DATAREF) { if (offsetof(struct sk_buff, len) <= 127) /* mov off8(%rdi),%r9d */ EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); @@ -260,9 +261,14 @@ void bpf_jit_compile(struct sk_filter *fp) case BPF_S_ALU_DIV_X: /* A /= X; */ seen |= SEEN_XREG; EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ - if (pc_ret0 != -1) - EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4)); - else { + if (pc_ret0 > 0) { + /* addrs[pc_ret0 - 1] is start address of target + * (addrs[i] - 4) is the address following this jmp + * ("xor %edx,%edx; div %ebx" being 4 bytes long) + */ + EMIT_COND_JMP(X86_JE, addrs[pc_ret0 - 1] - + (addrs[i] - 4)); + } else { EMIT_COND_JMP(X86_JNE, 2 + 5); CLEAR_A(); EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */ @@ -335,12 +341,12 @@ void bpf_jit_compile(struct sk_filter *fp) } /* fallinto */ case BPF_S_RET_A: - if (seen) { + if (seen_or_pass0) { if (i != flen - 1) { EMIT_JMP(cleanup_addr - addrs[i]); break; } - if (seen & SEEN_XREG) + if (seen_or_pass0 & SEEN_XREG) EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ EMIT1(0xc9); /* leaveq */ } @@ -483,8 +489,9 @@ common_load: seen |= SEEN_DATAREF; goto common_load; case BPF_S_LDX_B_MSH: if ((int)K < 0) { - if (pc_ret0 != -1) { - EMIT_JMP(addrs[pc_ret0] - addrs[i]); + if (pc_ret0 > 0) { + /* addrs[pc_ret0 - 1] is the start address */ + EMIT_JMP(addrs[pc_ret0 - 1] - addrs[i]); break; } CLEAR_A(); @@ -599,13 +606,14 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; * use it to give the cleanup instruction(s) addr */ cleanup_addr = proglen - 1; /* ret */ - if (seen) + if (seen_or_pass0) cleanup_addr -= 1; /* leaveq */ - if (seen & SEEN_XREG) + if (seen_or_pass0 & SEEN_XREG) cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ if (image) { - WARN_ON(proglen != oldproglen); + if (proglen != oldproglen) + pr_err("bpb_jit_compile proglen=%u != oldproglen=%u\n", proglen, oldproglen); break; } if (proglen == oldproglen) { diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 492ade8c978e..d99346ea8fdb 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -374,7 +374,7 @@ int __init pci_xen_init(void) int __init pci_xen_hvm_init(void) { - if (!xen_feature(XENFEAT_hvm_pirqs)) + if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) return 0; #ifdef CONFIG_ACPI diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 5b552198f774..3ae0e61abd23 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub) * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { unsigned long dw; struct bau_pq_entry *msg; msg = mdp->msg; - if (!msg->canceled) { + if (!msg->canceled && do_acknowledge) { dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; write_mmr_sw_ack(dw); } @@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { unsigned long mr; /* - * is the resource timed out? - * make everyone ignore the cancelled message. + * Is the resource timed out? + * Make everyone ignore the cancelled message. */ msg2->canceled = 1; stat->d_canceled++; @@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp, * Do all the things a cpu should do for a TLB shootdown message. * Other cpu's may come here at the same time for this message. */ -static void bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) +static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { short socket_ack_count = 0; short *sp; @@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp, if (msg_ack_count == bcp->cpus_in_uvhub) { /* * All cpus in uvhub saw it; reply + * (unless we are in the UV2 workaround) */ - reply_to_message(mdp, bcp); + reply_to_message(mdp, bcp, do_acknowledge); } } @@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, /* * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. */ -static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu) +static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; unsigned long descriptor_status2; descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL; + descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; descriptor_status = (descriptor_status << 1) | descriptor_status2; return descriptor_status; } +/* + * Return whether the status of the descriptor that is normally used for this + * cpu (the one indexed by its hub-relative cpu number) is busy. + * The status of the original 32 descriptors is always reflected in the 64 + * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. + * The bit provided by the activation_status_2 register is irrelevant to + * the status if it is only being tested for busy or not busy. + */ +int normal_busy(struct bau_control *bcp) +{ + int cpu = bcp->uvhub_cpu; + int mmr_offset; + int right_shift; + + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + return (((((read_lmmr(mmr_offset) >> right_shift) & + UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); +} + +/* + * Entered when a bau descriptor has gone into a permanent busy wait because + * of a hardware bug. + * Workaround the bug. + */ +int handle_uv2_busy(struct bau_control *bcp) +{ + int busy_one = bcp->using_desc; + int normal = bcp->uvhub_cpu; + int selected = -1; + int i; + unsigned long descriptor_status; + unsigned long status; + int mmr_offset; + struct bau_desc *bau_desc_old; + struct bau_desc *bau_desc_new; + struct bau_control *hmaster = bcp->uvhub_master; + struct ptc_stats *stat = bcp->statp; + cycles_t ttm; + + stat->s_uv2_wars++; + spin_lock(&hmaster->uvhub_lock); + /* try for the original first */ + if (busy_one != normal) { + if (!normal_busy(bcp)) + selected = normal; + } + if (selected < 0) { + /* can't use the normal, select an alternate */ + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + descriptor_status = read_lmmr(mmr_offset); + + /* scan available descriptors 32-63 */ + for (i = 0; i < UV_CPUS_PER_AS; i++) { + if ((hmaster->inuse_map & (1 << i)) == 0) { + status = ((descriptor_status >> + (i * UV_ACT_STATUS_SIZE)) & + UV_ACT_STATUS_MASK) << 1; + if (status != UV2H_DESC_BUSY) { + selected = i + UV_CPUS_PER_AS; + break; + } + } + } + } + + if (busy_one != normal) + /* mark the busy alternate as not in-use */ + hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); + + if (selected >= 0) { + /* switch to the selected descriptor */ + if (selected != normal) { + /* set the selected alternate as in-use */ + hmaster->inuse_map |= + (1 << (selected - UV_CPUS_PER_AS)); + if (selected > stat->s_uv2_wars_hw) + stat->s_uv2_wars_hw = selected; + } + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * busy_one); + bcp->using_desc = selected; + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * selected); + *bau_desc_new = *bau_desc_old; + } else { + /* + * All are busy. Wait for the normal one for this cpu to + * free up. + */ + stat->s_uv2_war_waits++; + spin_unlock(&hmaster->uvhub_lock); + ttm = get_cycles(); + do { + cpu_relax(); + } while (normal_busy(bcp)); + spin_lock(&hmaster->uvhub_lock); + /* switch to the original descriptor */ + bcp->using_desc = normal; + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); + bcp->using_desc = (ITEMS_PER_DESC * normal); + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * normal); + *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ + } + spin_unlock(&hmaster->uvhub_lock); + return FLUSH_RETRY_BUSYBUG; +} + static int uv2_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, struct bau_control *bcp, long try) { unsigned long descriptor_stat; cycles_t ttm; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; + long busy_reps = 0; struct ptc_stats *stat = bcp->statp; - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { @@ -522,32 +635,35 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, * our message and its state will stay IDLE. */ if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || - (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) || (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { stat->s_stimeout++; return FLUSH_GIVEUP; + } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { + stat->s_strongnacks++; + bcp->conseccompletes = 0; + return FLUSH_GIVEUP; } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { stat->s_dtimeout++; - ttm = get_cycles(); - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. - */ - if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - } bcp->conseccompletes = 0; return FLUSH_RETRY_TIMEOUT; } else { + busy_reps++; + if (busy_reps > 1000000) { + /* not to hammer on the clock */ + busy_reps = 0; + ttm = get_cycles(); + if ((ttm - bcp->send_message) > + (bcp->clocks_per_100_usec)) { + return handle_uv2_busy(bcp); + } + } /* * descriptor_stat is still BUSY */ cpu_relax(); } - descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, + desc); } bcp->conseccompletes++; return FLUSH_COMPLETE; @@ -563,17 +679,17 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int cpu = bcp->uvhub_cpu; + int desc = bcp->using_desc; - if (cpu < UV_CPUS_PER_AS) { + if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; + right_shift = desc * UV_ACT_STATUS_SIZE; } else { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); + right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } - if (is_uv1_hub()) + if (bcp->uvhub_version == 1) return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try); else @@ -752,19 +868,22 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) { int seq_number = 0; int completion_stat = 0; + int uv1 = 0; long try = 0; unsigned long index; cycles_t time1; cycles_t time2; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster = bcp->uvhub_master; + struct uv1_bau_msg_header *uv1_hdr = NULL; + struct uv2_bau_msg_header *uv2_hdr = NULL; + struct bau_desc *bau_desc; - if (is_uv1_hub()) + if (bcp->uvhub_version == 1) uv1_throttle(hmaster, stat); while (hmaster->uvhub_quiesce) @@ -772,22 +891,39 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, time1 = get_cycles(); do { - if (try == 0) { - bau_desc->header.msg_type = MSG_REGULAR; + bau_desc = bcp->descriptor_base; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + if (bcp->uvhub_version == 1) { + uv1 = 1; + uv1_hdr = &bau_desc->header.uv1_hdr; + } else + uv2_hdr = &bau_desc->header.uv2_hdr; + if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { + if (uv1) + uv1_hdr->msg_type = MSG_REGULAR; + else + uv2_hdr->msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - bau_desc->header.msg_type = MSG_RETRY; + if (uv1) + uv1_hdr->msg_type = MSG_RETRY; + else + uv2_hdr->msg_type = MSG_RETRY; stat->s_retry_messages++; } - bau_desc->header.sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; + if (uv1) + uv1_hdr->sequence = seq_number; + else + uv2_hdr->sequence = seq_number; + index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); + /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); @@ -798,6 +934,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || + (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -812,6 +949,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, record_send_stats(time1, time2, bcp, stat, completion_stat, try); if (completion_stat == FLUSH_GIVEUP) + /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ return 1; return 0; } @@ -967,7 +1105,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -980,13 +1118,86 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp)) return NULL; else return cpumask; } /* + * Search the message queue for any 'other' message with the same software + * acknowledge resource bit vector. + */ +struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp, unsigned char swack_vec) +{ + struct bau_pq_entry *msg_next = msg + 1; + + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + while ((msg_next->swack_vec != 0) && (msg_next != msg)) { + if (msg_next->swack_vec == swack_vec) + return msg_next; + msg_next++; + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + } + return NULL; +} + +/* + * UV2 needs to work around a bug in which an arriving message has not + * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. + * Such a message must be ignored. + */ +void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +{ + unsigned long mmr_image; + unsigned char swack_vec; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *other_msg; + + mmr_image = read_mmr_sw_ack(); + swack_vec = msg->swack_vec; + + if ((swack_vec & mmr_image) == 0) { + /* + * This message was assigned a swack resource, but no + * reserved acknowlegment is pending. + * The bug has prevented this message from setting the MMR. + * And no other message has used the same sw_ack resource. + * Do the requested shootdown but do not reply to the msg. + * (the 0 means make no acknowledge) + */ + bau_process_message(mdp, bcp, 0); + return; + } + + /* + * Some message has set the MMR 'pending' bit; it might have been + * another message. Look for that message. + */ + other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); + if (other_msg) { + /* There is another. Do not ack the current one. */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that message acknowledge + * it. Don't get the processing of sw_ack's out of order. + */ + return; + } + + /* + * There is no other message using this sw_ack, so it is safe to + * acknowledge it. + */ + bau_process_message(mdp, bcp, 1); + + return; +} + +/* * The BAU message interrupt comes here. (registered by set_intr_gate) * See entry_64.S * @@ -1009,6 +1220,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) struct ptc_stats *stat; struct msg_desc msgdesc; + ack_APIC_irq(); time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); @@ -1022,9 +1234,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs) count++; msgdesc.msg_slot = msg - msgdesc.queue_first; - msgdesc.swack_slot = ffs(msg->swack_vec) - 1; msgdesc.msg = msg; - bau_process_message(&msgdesc, bcp); + if (bcp->uvhub_version == 2) + process_uv2_message(&msgdesc, bcp); + else + bau_process_message(&msgdesc, bcp, 1); msg++; if (msg > msgdesc.queue_last) @@ -1036,8 +1250,6 @@ void uv_bau_message_interrupt(struct pt_regs *regs) stat->d_nomsg++; else if (count > 1) stat->d_multmsg++; - - ack_APIC_irq(); } /* @@ -1083,7 +1295,7 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image |= (1L << UV2_LEG_SHFT); + mmr_image &= ~(1L << UV2_LEG_SHFT); mmr_image |= (1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); @@ -1136,13 +1348,13 @@ static int ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); seq_printf(file, "resetp resett giveup sto bz throt swack recv rtime "); seq_printf(file, "all one mult none retry canc nocan reset rcan "); seq_printf(file, - "disable enable\n"); + "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -1154,10 +1366,10 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_ntargremotes, stat->s_ntargcpu, stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, stat->s_ntarguvhub, stat->s_ntarguvhub16); - seq_printf(file, "%ld %ld %ld %ld %ld ", + seq_printf(file, "%ld %ld %ld %ld %ld %ld ", stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_dtimeout); + stat->s_dtimeout, stat->s_strongnacks); seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, @@ -1173,8 +1385,10 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); + seq_printf(file, "%ld %ld %ld %ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits); } return 0; } @@ -1432,12 +1646,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; + int uv1 = 0; unsigned long gpa; unsigned long m; unsigned long n; size_t dsize; struct bau_desc *bau_desc; struct bau_desc *bd2; + struct uv1_bau_msg_header *uv1_hdr; + struct uv2_bau_msg_header *uv2_hdr; struct bau_control *bcp; /* @@ -1451,6 +1668,8 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) gpa = uv_gpa(bau_desc); n = uv_gpa_to_gnode(gpa); m = uv_gpa_to_offset(gpa); + if (is_uv1_hub()) + uv1 = 1; /* the 14-bit pnode */ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); @@ -1461,21 +1680,33 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) */ for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { memset(bd2, 0, sizeof(struct bau_desc)); - bd2->header.swack_flag = 1; - /* - * The base_dest_nasid set in the message header is the nasid - * of the first uvhub in the partition. The bit map will - * indicate destination pnode numbers relative to that base. - * They may not be consecutive if nasid striding is being used. - */ - bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); - bd2->header.dest_subnodeid = UV_LB_SUBNODEID; - bd2->header.command = UV_NET_ENDPOINT_INTD; - bd2->header.int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ + if (uv1) { + uv1_hdr = &bd2->header.uv1_hdr; + uv1_hdr->swack_flag = 1; + /* + * The base_dest_nasid set in the message header + * is the nasid of the first uvhub in the partition. + * The bit map will indicate destination pnode numbers + * relative to that base. They may not be consecutive + * if nasid striding is being used. + */ + uv1_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv1_hdr->command = UV_NET_ENDPOINT_INTD; + uv1_hdr->int_both = 1; + /* + * all others need to be set to zero: + * fairness chaining multilevel count replied_to + */ + } else { + uv2_hdr = &bd2->header.uv2_hdr; + uv2_hdr->swack_flag = 1; + uv2_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_hdr->command = UV_NET_ENDPOINT_INTD; + } } for_each_present_cpu(cpu) { if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) @@ -1531,6 +1762,7 @@ static void pq_init(int node, int pnode) write_mmr_payload_first(pnode, pn_first); write_mmr_payload_tail(pnode, first); write_mmr_payload_last(pnode, last); + write_gmmr_sw_ack(pnode, 0xffffUL); /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); @@ -1584,14 +1816,14 @@ static int calculate_destination_timeout(void) ts_ns = base * mult1 * mult2; ret = ts_ns / 1000; } else { - /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */ - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ + mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) - mult1 = 80; + base = 80; else - mult1 = 10; - base = mmr_image & UV2_ACK_MASK; + base = 10; + mult1 = mmr_image & UV2_ACK_MASK; ret = mult1 * base; } return ret; @@ -1618,6 +1850,9 @@ static void __init init_per_cpu_tunables(void) bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; bcp->cong_period = congested_period; + bcp->clocks_per_100_usec = usec_2_cycles(100); + spin_lock_init(&bcp->queue_lock); + spin_lock_init(&bcp->uvhub_lock); } } @@ -1728,8 +1963,17 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = *smasterp; bcp->uvhub = bdp->uvhub; + if (is_uv1_hub()) + bcp->uvhub_version = 1; + else if (is_uv2_hub()) + bcp->uvhub_version = 2; + else { + printk(KERN_EMERG "uvhub version not 1 or 2\n"); + return 1; + } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); @@ -1845,6 +2089,8 @@ static int __init uv_bau_init(void) uv_base_pnode = uv_blade_to_pnode(uvhub); } + enable_timeouts(); + if (init_per_cpu(nuvhubs, uv_base_pnode)) { nobau = 1; return 0; @@ -1855,7 +2101,6 @@ static int __init uv_bau_init(void) if (uv_blade_nr_possible_cpus(uvhub)) init_uvhub(uvhub, vector, uv_base_pnode); - enable_timeouts(); alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { @@ -1867,7 +2112,8 @@ static int __init uv_bau_init(void) val = 1L << 63; write_gmmr_activation(pnode, val); mmr = 1; /* should be 1 to broadcast to both sockets */ - write_mmr_data_broadcast(pnode, mmr); + if (!is_uv1_hub()) + write_mmr_data_broadcast(pnode, mmr); } } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 374a05d8ad22..f25c2765a5c9 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -25,7 +25,7 @@ struct uv_irq_2_mmr_pnode{ int irq; }; -static spinlock_t uv_irq_lock; +static DEFINE_SPINLOCK(uv_irq_lock); static struct rb_root uv_irq_root; static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b1621747f..2bbe1ec2d96a 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -1,5 +1,15 @@ +#ifndef __SYSDEP_X86_PTRACE_H +#define __SYSDEP_X86_PTRACE_H + #ifdef __i386__ #include "ptrace_32.h" #else #include "ptrace_64.h" #endif + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_SYSCALL_RET(regs); +} + +#endif /* __SYSDEP_X86_PTRACE_H */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 041d4fe9dfe4..501d4e0244ba 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -409,6 +409,13 @@ static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */ play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); cpu_bringup(); + /* + * Balance out the preempt calls - as we are running in cpu_idle + * loop which has been called at bootup from cpu_bringup_and_idle. + * The cpucpu_bringup_and_idle called cpu_bringup which made a + * preempt_disable() So this preempt_enable will balance it out. + */ + preempt_enable(); } #else /* !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index cc9b1e182fcf..d69cc6c3f808 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -116,9 +116,26 @@ static inline void spin_time_accum_blocked(u64 start) } #endif /* CONFIG_XEN_DEBUG_FS */ +/* + * Size struct xen_spinlock so it's the same as arch_spinlock_t. + */ +#if NR_CPUS < 256 +typedef u8 xen_spinners_t; +# define inc_spinners(xl) \ + asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory"); +# define dec_spinners(xl) \ + asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory"); +#else +typedef u16 xen_spinners_t; +# define inc_spinners(xl) \ + asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory"); +# define dec_spinners(xl) \ + asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); +#endif + struct xen_spinlock { unsigned char lock; /* 0 -> free; 1 -> locked */ - unsigned short spinners; /* count of waiting cpus */ + xen_spinners_t spinners; /* count of waiting cpus */ }; static int xen_spin_is_locked(struct arch_spinlock *lock) @@ -164,8 +181,7 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) wmb(); /* set lock of interest before count */ - asm(LOCK_PREFIX " incw %0" - : "+m" (xl->spinners) : : "memory"); + inc_spinners(xl); return prev; } @@ -176,8 +192,7 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) */ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) { - asm(LOCK_PREFIX " decw %0" - : "+m" (xl->spinners) : : "memory"); + dec_spinners(xl); wmb(); /* decrement count before restoring lock */ __this_cpu_write(lock_spinners, prev); } @@ -373,6 +388,8 @@ void xen_uninit_lock_cpu(int cpu) void __init xen_init_spinlocks(void) { + BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); + pv_lock_ops.spin_is_locked = xen_spin_is_locked; pv_lock_ops.spin_is_contended = xen_spin_is_contended; pv_lock_ops.spin_lock = xen_spin_lock; |