From bcb71abe7d4c5a0d0368c67da0a7def4fc73497a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Oct 2011 15:56:24 -0400 Subject: iommu: Add option to group multi-function devices The option iommu=group_mf indicates the that the iommu driver should expose all functions of a multi-function PCI device as the same iommu_device_group. This is useful for disallowing individual functions being exposed as independent devices to userspace as there are often hidden dependencies. Virtual functions are not affected by this option. Signed-off-by: Alex Williamson Signed-off-by: Joerg Roedel --- arch/x86/kernel/pci-dma.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 80dc793b3f63..1c4d769e21ea 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; +/* + * Group multi-function PCI devices into a single device-group for the + * iommu_device_group interface. This tells the iommu driver to pretend + * it cannot distinguish between functions of a device, exposing only one + * group for the device. Useful for disallowing use of individual PCI + * functions from userspace drivers. + */ +int iommu_group_mf __read_mostly; + extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; /* Dummy device used for NULL arguments (normally ISA). */ @@ -169,6 +178,8 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; + if (!strncmp(p, "group_mf", 8)) + iommu_group_mf = 1; gart_parse_options(p); -- cgit v1.2.3 From b82e324b3c46a554595c12b45465d1943a57326c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 10 Nov 2011 13:18:09 +0000 Subject: serial, mfd: don't hardcode the console Add support to specify which HSU port to use as an early console. This can be selected by passing "earlyprintk=hsu" on the kernel command line. By default port 0 is still used. Signed-off-by: Mika Westerberg Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a350f7f9..9d42a52d2331 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -247,7 +247,7 @@ static int __init setup_early_printk(char *buf) } if (!strncmp(buf, "hsu", 3)) { - hsu_early_console_init(); + hsu_early_console_init(buf + 3); early_console_register(&early_hsu_console, keep); } #endif -- cgit v1.2.3 From b7641d2c83aa10031bf45afd82619bfaaedcbc6f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 15:43:02 -0800 Subject: x86-64, syscall: Adjust comment spacing and remove typo Adjust spacing for comment so that it matches the multiline comment style used in the rest of the kernel, and remove word duplication. It is not really clear what version of gcc this refers to, but the extra & doesn't cause any harm, so there is no reason to remove it. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/syscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d6008295..0edfafa1b269 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -21,9 +21,9 @@ extern void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - *Smells like a like a compiler bug -- it doesn't work - *when the & below is removed. - */ + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ [0 ... __NR_syscall_max] = &sys_ni_syscall, #include }; -- cgit v1.2.3 From 303395ac3bf3e2cb488435537d416bc840438fcb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 16:07:41 -0800 Subject: x86: Generate system call tables and unistd_*.h from tables Generate system call tables and unistd_*.h automatically from the tables in arch/x86/syscalls. All other information, like NR_syscalls, is auto-generated, some of which is in asm-offsets_*.c. This allows us to keep all the system call information in one place, and allows for kernel space and user space to see different information; this is currently used for the ia32 system call numbers when building the 64-bit kernel, but will be used by the x32 ABI in the near future. This also removes some gratuitious differences between i386, x86-64 and ia32; in particular, now all system call tables are generated with the same mechanism. Cc: H. J. Lu Cc: Sam Ravnborg Cc: Michal Marek Signed-off-by: H. Peter Anvin --- arch/x86/kernel/Makefile | 3 +- arch/x86/kernel/asm-offsets_32.c | 8 + arch/x86/kernel/asm-offsets_64.c | 19 +- arch/x86/kernel/entry_32.S | 37 ++-- arch/x86/kernel/syscall_32.c | 25 +++ arch/x86/kernel/syscall_64.c | 14 +- arch/x86/kernel/syscall_table_32.S | 350 ------------------------------------- 7 files changed, 66 insertions(+), 390 deletions(-) create mode 100644 arch/x86/kernel/syscall_32.c delete mode 100644 arch/x86/kernel/syscall_table_32.S (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c4871c..8c473d9d0b83 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-y += syscall_$(BITS).o +obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 395a10e68067..85d98ab15cdc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,6 +3,11 @@ #include #include "../../../drivers/lguest/lg.h" +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include +}; + /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -76,4 +81,7 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72a1194af22..834e897b1e25 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,11 +1,12 @@ #include -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_UNISTD_64_H -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls_64[] = { +#include +}; +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls_ia32[] = { +#include }; int main(void) @@ -72,7 +73,11 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); + DEFINE(NR_syscalls, sizeof(syscalls_64)); + + DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f5344001..1ffcda22c2f6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -81,8 +81,6 @@ * enough to patch inline, increasing performance. */ -#define nr_syscalls ((syscall_table_size)/4) - #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -423,7 +421,7 @@ sysenter_past_esp: testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) @@ -504,7 +502,7 @@ ENTRY(system_call) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) @@ -650,7 +648,7 @@ syscall_trace_entry: movl %esp, %eax call syscall_trace_enter /* What it returned is what we'll actually use. */ - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jnae syscall_call jmp syscall_exit END(syscall_trace_entry) @@ -690,29 +688,28 @@ END(syscall_badsys) * System calls that need a pt_regs pointer. */ #define PTREGSCALL0(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL1(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL2(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%ecx; \ movl (PT_ECX+4)(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL3(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ CFI_STARTPROC; \ leal 4(%esp),%eax; \ pushl_cfi %eax; \ @@ -737,8 +734,7 @@ PTREGSCALL2(vm86) PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ - ALIGN; -ptregs_clone: +ENTRY(ptregs_clone) CFI_STARTPROC leal 4(%esp),%eax pushl_cfi %eax @@ -1209,11 +1205,6 @@ return_to_handler: jmp *%ecx #endif -.section .rodata,"a" -#include "syscall_table_32.S" - -syscall_table_size=(.-sys_call_table) - /* * Some functions should be protected against kprobes */ diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c new file mode 100644 index 000000000000..b37a57336609 --- /dev/null +++ b/arch/x86/kernel/syscall_32.c @@ -0,0 +1,25 @@ +/* System call table for i386. */ + +#include +#include +#include +#include + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 0edfafa1b269..7ac7943be02c 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,15 +5,11 @@ #include #include -#define __NO_STUBS +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_UNISTD_64_H +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, typedef void (*sys_call_ptr_t)(void); @@ -25,5 +21,5 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { * when the & below is removed. */ [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include +#include }; diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S deleted file mode 100644 index 9a0e31293920..000000000000 --- a/arch/x86/kernel/syscall_table_32.S +++ /dev/null @@ -1,350 +0,0 @@ -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long ptregs_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long ptregs_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long sys_old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long sys_old_readdir - .long sys_old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long ptregs_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long ptregs_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long ptregs_sigreturn - .long ptregs_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long ptregs_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long sys_ni_syscall /* Old nfsservctl */ - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long ptregs_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long ptregs_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long ptregs_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap_pgoff - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive /* 280 */ - .long sys_mq_notify - .long sys_mq_getsetattr - .long sys_kexec_load - .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ - .long sys_add_key - .long sys_request_key - .long sys_keyctl - .long sys_ioprio_set - .long sys_ioprio_get /* 290 */ - .long sys_inotify_init - .long sys_inotify_add_watch - .long sys_inotify_rm_watch - .long sys_migrate_pages - .long sys_openat /* 295 */ - .long sys_mkdirat - .long sys_mknodat - .long sys_fchownat - .long sys_futimesat - .long sys_fstatat64 /* 300 */ - .long sys_unlinkat - .long sys_renameat - .long sys_linkat - .long sys_symlinkat - .long sys_readlinkat /* 305 */ - .long sys_fchmodat - .long sys_faccessat - .long sys_pselect6 - .long sys_ppoll - .long sys_unshare /* 310 */ - .long sys_set_robust_list - .long sys_get_robust_list - .long sys_splice - .long sys_sync_file_range - .long sys_tee /* 315 */ - .long sys_vmsplice - .long sys_move_pages - .long sys_getcpu - .long sys_epoll_pwait - .long sys_utimensat /* 320 */ - .long sys_signalfd - .long sys_timerfd_create - .long sys_eventfd - .long sys_fallocate - .long sys_timerfd_settime /* 325 */ - .long sys_timerfd_gettime - .long sys_signalfd4 - .long sys_eventfd2 - .long sys_epoll_create1 - .long sys_dup3 /* 330 */ - .long sys_pipe2 - .long sys_inotify_init1 - .long sys_preadv - .long sys_pwritev - .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_event_open - .long sys_recvmmsg - .long sys_fanotify_init - .long sys_fanotify_mark - .long sys_prlimit64 /* 340 */ - .long sys_name_to_handle_at - .long sys_open_by_handle_at - .long sys_clock_adjtime - .long sys_syncfs - .long sys_sendmmsg /* 345 */ - .long sys_setns - .long sys_process_vm_readv - .long sys_process_vm_writev -- cgit v1.2.3 From 61f1e7e20874e8f11dab69b6a4bf7616badd4fe8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Nov 2011 16:25:07 -0800 Subject: x86, syscall: Re-fix typo in comment Fix the same typo as was fixed in: b7641d2c x86-64, syscall: Adjust comment spacing and remove typo ... for the new versions of this file (32-bit and IA32 compat). Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1321569446-20433-4-git-send-email-hpa@linux.intel.com --- arch/x86/kernel/syscall_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index b37a57336609..147fcd4941c4 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -17,7 +17,7 @@ extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - * Smells like a like a compiler bug -- it doesn't work + * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ [0 ... __NR_syscall_max] = &sys_ni_syscall, -- cgit v1.2.3 From 37fe6a42b3433b79a159ceb06a94cd1ef00e279d Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:29 +0900 Subject: x86: Check stack overflow in detail Currently, only kernel stack is checked for the overflow, which is not sufficient for systems that need a high reliability. To enhance it, it is required to check the IRQ and exception stacks, as well. This patch checks all the stack types and will cause messages of stacks in detail when free stack space drops below a certain limit except user stack. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Link: http://lkml.kernel.org/r/20111129060829.11076.51733.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/kernel/irq_64.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 69bca468c47a..928a7e909619 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -36,18 +36,35 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW + struct orig_ist *oist; + u64 irq_stack_top, irq_stack_bottom; + u64 estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); if (user_mode_vm(regs)) return; - WARN_ONCE(regs->sp >= curbase && - regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128, + if (regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128) + return; + + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); + irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); + if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) + return; + + oist = &__get_cpu_var(orig_ist); + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ; + estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + if (regs->sp >= estack_top && regs->sp <= estack_bottom) + return; - "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + current->comm, curbase, regs->sp, + irq_stack_top, irq_stack_bottom, + estack_top, estack_bottom); #endif } -- cgit v1.2.3 From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:36 +0900 Subject: x86: Panic on detection of stack overflow Currently, messages are just output on the detection of stack overflow, which is not sufficient for systems that need a high reliability. This is because in general the overflow may corrupt data, and the additional corruption may occur due to reading them unless systems stop. This patch adds the sysctl parameter kernel.panic_on_stackoverflow and causes a panic when detecting the overflows of kernel, IRQ and exception stacks except user stack according to the parameter. It is disabled by default. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 2 ++ arch/x86/kernel/irq_64.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 72090705a656..e16e99ebd7ad 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -43,6 +43,8 @@ static void print_stack_overflow(void) { printk(KERN_WARNING "low stack detected by irq handler\n"); dump_stack(); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); } #else diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 928a7e909619..42552b0dce6a 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); +int sysctl_panic_on_stackoverflow; + /* * Probabilistic stack overflow check: * @@ -65,6 +67,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, estack_top, estack_bottom); + + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); #endif } -- cgit v1.2.3 From 467e6b7a7c0eb792ebaf322ddb7363742b4ead40 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:45 +0900 Subject: x86: Clean up the range of stack overflow checking The overflow checking of kernel stack checks if the stack pointer points to the available kernel stack range, which is derived from the original overflow checking. It is clear that curbase address is always less than low boundary of available kernel stack. So, this patch removes the first condition that checks if the pointer is higher than curbase. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Link: http://lkml.kernel.org/r/20111129060845.11076.40916.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/kernel/irq_64.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 42552b0dce6a..54e2b2b2e250 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -46,10 +46,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode_vm(regs)) return; - if (regs->sp >= curbase && - regs->sp <= curbase + THREAD_SIZE && - regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128) + if (regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128 && + regs->sp <= curbase + THREAD_SIZE) return; irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); -- cgit v1.2.3 From 3603a2512f9e69dc87914ba922eb4a0812b21cd6 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:25 -0400 Subject: x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus A recent discussion started talking about the locking on the pstore fs and how it relates to the kmsg infrastructure. We noticed it was possible for userspace to r/w to the pstore fs (grabbing the locks in the process) and block the panic path from r/w to the same fs. The reason was the cpu with the lock could be doing work while the crashing cpu is panic'ing. Busting those spinlocks might cause those cpus to step on each other's data. Fine, fair enough. It was suggested it would be nice to serialize the panic path (ie stop the other cpus) and have only one cpu running. This would allow us to bust the spinlocks and not worry about another cpu stepping on the data. Of course, smp_send_stop() does this in the panic case. kmsg_dump() would have to be moved to be called after it. Easy enough. The only problem is on x86 the smp_send_stop() function calls the REBOOT_VECTOR. Any cpu with irqs disabled (which pstore and its backend ERST would do), block this IPI and thus do not stop. This makes it difficult to reliably log data to the pstore fs. The patch below switches from the REBOOT_VECTOR to NMI (and mimics what kdump does). Switching to NMI allows us to deliver the IPI when irqs are disabled, increasing the reliability of this function. However, Andi carefully noted that on some machines this approach does not work because of broken BIOSes or whatever. To help accomodate this, the next couple of patches will run a selftest and provide a knob to disable. V2: uses atomic ops to serialize the cpu that shuts everyone down V3: comment cleanup Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 16204dc15484..e72b1754a2d7 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } +static atomic_t stopping_cpu = ATOMIC_INIT(-1); + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + +static void native_nmi_stop_other_cpus(int wait) +{ + unsigned long flags; + unsigned long timeout; + + if (reboot_force) + return; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + */ + if (num_online_cpus() > 1) { + /* did someone beat us here? */ + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1)) + return; + + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + return; + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_allbutself(NMI_VECTOR); + + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); +} + /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_stop_other_cpus(int wait) +static void native_irq_stop_other_cpus(int wait) { unsigned long flags; unsigned long timeout; @@ -230,7 +285,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .stop_other_cpus = native_stop_other_cpus, + .stop_other_cpus = native_nmi_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, -- cgit v1.2.3 From 99e8b9ca90d688c3ac7d3a141b701c9694a93925 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:26 -0400 Subject: x86, NMI: Add NMI IPI selftest The previous patch modified the stop cpus path to use NMI instead of IRQ as the way to communicate to the other cpus to shutdown. There were some concerns that various machines may have problems with using an NMI IPI. This patch creates a selftest to check if NMI is working at boot. The idea is to help catch any issues before the machine panics and we learn the hard way. Loosely based on the locking-selftest.c file, this separate file runs a couple of simple tests and reports the results. The output looks like: ... Brought up 4 CPUs ---------------- | NMI testsuite: -------------------- remote IPI: ok | local IPI: ok | -------------------- Good, all 2 testcases passed! | --------------------------------- Total of 4 processors activated (21330.61 BogoMIPS). ... Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/nmi_selftest.c | 179 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 1 + 3 files changed, 181 insertions(+) create mode 100644 arch/x86/kernel/nmi_selftest.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c4871c..02b2f05b371e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o +obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 000000000000..572adb622251 --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,179 @@ +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + * Copyright (C) 2011 Red Hat, Inc., Don Zickus + */ + +#include +#include +#include + +#include +#include + +#define SUCCESS 0 +#define FAILURE 1 +#define TIMEOUT 2 + +static int nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; + +static int testcase_total; +static int testcase_successes; +static int expected_testcase_failures; +static int unexpected_testcase_failures; +static int unexpected_testcase_unknowns; + +static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ + unexpected_testcase_unknowns++; + return NMI_HANDLED; +} + +static void init_nmi_testsuite(void) +{ + /* trap all the unknown NMIs we may generate */ + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); +} + +static void cleanup_nmi_testsuite(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) + return NMI_HANDLED; + + return NMI_DONE; +} + +static void test_nmi_ipi(struct cpumask *mask) +{ + unsigned long timeout; + + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest")) { + nmi_fail = FAILURE; + return; + } + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_mask(mask, NMI_VECTOR); + + /* Don't wait longer than a second */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(mask) && timeout--) + udelay(1); + + /* What happens if we timeout, do we still unregister?? */ + unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + + if (!timeout) + nmi_fail = TIMEOUT; + return; +} + +static void remote_ipi(void) +{ + cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void local_ipi(void) +{ + cpumask_clear(to_cpumask(nmi_ipi_mask)); + cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void reset_nmi(void) +{ + nmi_fail = 0; +} + +static void dotest(void (*testcase_fn)(void), int expected) +{ + testcase_fn(); + /* + * Filter out expected failures: + */ + if (nmi_fail != expected) { + unexpected_testcase_failures++; + + if (nmi_fail == FAILURE) + printk("FAILED |"); + else if (nmi_fail == TIMEOUT) + printk("TIMEOUT|"); + else + printk("ERROR |"); + dump_stack(); + } else { + testcase_successes++; + printk(" ok |"); + } + testcase_total++; + + reset_nmi(); +} + +static inline void print_testname(const char *testname) +{ + printk("%12s:", testname); +} + +void nmi_selftest(void) +{ + init_nmi_testsuite(); + + /* + * Run the testsuite: + */ + printk("----------------\n"); + printk("| NMI testsuite:\n"); + printk("--------------------\n"); + + print_testname("remote IPI"); + dotest(remote_ipi, SUCCESS); + printk("\n"); + print_testname("local IPI"); + dotest(local_ipi, SUCCESS); + printk("\n"); + + cleanup_nmi_testsuite(); + + if (unexpected_testcase_failures) { + printk("--------------------\n"); + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + } else { + printk("--------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + } +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb4a958..19277817effa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1142,6 +1142,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done.\n"); + nmi_selftest(); impress_friends(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -- cgit v1.2.3 From bda62633983f9db49ce0b1a9235b3709c1cda5f0 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:27 -0400 Subject: x86, NMI: Add knob to disable using NMI IPIs to stop cpus Some machines may exhibit problems using the NMI to stop other cpus. This knob just allows one to revert back to the original behaviour to help diagnose the problem. V2: make function static Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-4-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e72b1754a2d7..113acda5879e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -249,6 +249,11 @@ static void native_irq_stop_other_cpus(int wait) local_irq_restore(flags); } +static void native_smp_disable_nmi_ipi(void) +{ + smp_ops.stop_other_cpus = native_irq_stop_other_cpus; +} + /* * Reschedule call back. */ @@ -280,6 +285,14 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) irq_exit(); } +static int __init nonmi_ipi_setup(char *str) +{ + native_smp_disable_nmi_ipi(); + return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, -- cgit v1.2.3 From 53b5650273fea486ac8ac6c1d1e9a6cd17aa31ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 Dec 2011 12:25:44 +0100 Subject: x86: Fix the 32-bit stackoverflow-debug build The panic_on_stackoverflow variable needs to be avilable on the 32-bit side as well ... Cc: Mitsuo Hayasaka Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e16e99ebd7ad..40fc86161d92 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); #ifdef CONFIG_DEBUG_STACKOVERFLOW + +int sysctl_panic_on_stackoverflow __read_mostly; + /* Debugging check for stack overflow: is there less than 1KB free? */ static int check_stack_overflow(void) { -- cgit v1.2.3 From 1ea7c6737c8f68453f55c894b3d07d7f48fcbef8 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 10 Nov 2011 13:29:14 +0000 Subject: x86/config: Revamp configuration for MID devices This follows on from the patch applied in 3.2rc1 which creates an INTEL_MID configuration. We can now add the entry for Medfield specific code. After this is merged the final patch will be submitted which moves the rest of the device Kconfig dependancies to MRST/MEDFIELD/INTEL_MID as appropriate. Signed-off-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a350f7f9..7a53da03086f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_EARLY_PRINTK_MRST +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); -- cgit v1.2.3 From d1bbdd669298b7ca08284ddb29153dfc039dd89d Mon Sep 17 00:00:00 2001 From: Mike Ditto Date: Tue, 15 Nov 2011 14:46:50 -0800 Subject: arch/x86/kernel/e820.c: Eliminate bubble sort from sanitize_e820_map() Replace the bubble sort in sanitize_e820_map() with a call to the generic kernel sort function to avoid pathological performance with large maps. On large (thousands of entries) E820 maps, the previous code took minutes to run; with this change it's now milliseconds. Signed-off-by: Mike Ditto Cc: sassmann@kpanic.de Cc: yuenn@google.com Cc: Stefan Assmann Cc: Nancy Yuen Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 59 ++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e48f076..f655f802260d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -227,22 +228,38 @@ void __init e820_print_map(char *who) * ____________________33__ * ______________________4_ */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are unequal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +} int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map) { - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; static struct change_member change_point_list[2*E820_X_MAX] __initdata; static struct change_member *change_point[2*E820_X_MAX] __initdata; static struct e820entry *overlap_list[E820_X_MAX] __initdata; static struct e820entry new_bios[E820_X_MAX] __initdata; - struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; - int chgidx, still_changing; + int chgidx; int overlap_entries; int new_bios_entry; int old_nr, new_nr, chg_nr; @@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i = 1; i < chg_nr; i++) { - unsigned long long curaddr, lastaddr; - unsigned long long curpbaddr, lastpbaddr; - - curaddr = change_point[i]->addr; - lastaddr = change_point[i - 1]->addr; - curpbaddr = change_point[i]->pbios->addr; - lastpbaddr = change_point[i - 1]->pbios->addr; - - /* - * swap entries, when: - * - * curaddr > lastaddr or - * curaddr == lastaddr and curaddr == curpbaddr and - * lastaddr != lastpbaddr - */ - if (curaddr < lastaddr || - (curaddr == lastaddr && curaddr == curpbaddr && - lastaddr != lastpbaddr)) { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing = 1; - } - } - } + sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0); /* create a new bios memory map, removing overlaps */ overlap_entries = 0; /* number of entries in the overlap table */ -- cgit v1.2.3 From 706d9a9c8b5758390036b9980a2b12d809599777 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 15 Nov 2011 14:48:56 -0800 Subject: arch/x86/kernel/e820.c: quiet sparse noise about plain integer as NULL pointer The last parameter to sort() is a pointer to the function used to swap items. This parameter should be NULL, not 0, when not used. This quiets the following sparse warning: warning: Using plain integer as NULL pointer Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Cc: hartleys@visionengravers.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f655f802260d..d6bd85352c81 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -296,7 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ - sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0); + sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); /* create a new bios memory map, removing overlaps */ overlap_entries = 0; /* number of entries in the overlap table */ -- cgit v1.2.3 From b565201cf75210614903ef2ae5917b4379681647 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 15 Nov 2011 15:33:56 -0800 Subject: x86: Reduce clock calibration time during slave cpu startup Reduce the startup time for slave cpus. Adds hooks for an arch-specific function for clock calibration. These hooks are used on x86. If a newly started cpu has the same phys_proc_id as a core already active, uses the TSC for the delay loop and has a CONSTANT_TSC, use the already-calculated value of loops_per_jiffy. This patch reduces the time required to start slave cpus on a 4096 cpu system from: 465 sec OLD 62 sec NEW This reduces boot time on a 4096p system by almost 7 minutes. Nice... Signed-off-by: Jack Steiner Cc: "H. Peter Anvin" Cc: John Stultz [fix CONFIG_SMP=n build] Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 16 +++++++++++----- arch/x86/kernel/tsc.c | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb4a958..00eef55c8327 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -207,22 +207,28 @@ static void __cpuinit smp_callin(void) * Need to setup vector mappings before we enable interrupts. */ setup_vector_irq(smp_processor_id()); + + /* + * Save our processor parameters. Note: this information + * is needed for clock calibration. + */ + smp_store_cpu_info(cpuid); + /* * Get our bogomips. + * Update loops_per_jiffy in cpu_data. Previous call to + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. * * Need to enable IRQs because it can take longer and then * the NMI watchdog might kill us. */ local_irq_enable(); calibrate_delay(); + cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); - /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - /* * This must be done before setting cpu_online_mask * or calling notify_cpu_starting. diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db483369f10b..490fb330be87 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -995,3 +995,23 @@ void __init tsc_init(void) check_system_tsc_reliable(); } +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. + */ +unsigned long __cpuinit calibrate_delay_is_known(void) +{ + int i, cpu = smp_processor_id(); + + if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + for_each_online_cpu(i) + if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) + return cpu_data(i).loops_per_jiffy; + return 0; +} +#endif -- cgit v1.2.3 From 565cbc3e934f221369a656b4469a044aa4c3f2a8 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 6 Dec 2011 13:08:59 -0500 Subject: x86, NMI: NMI-selftest should handle the UP case properly If no remote cpus are online, then just quietly skip the remote IPI test for now. Signed-off-by: Don Zickus Cc: andi@firstfloor.org Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: robert.richter@amd.com Link: http://lkml.kernel.org/r/20111206180859.GR1669@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 572adb622251..1e42a23c1f2a 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -90,7 +90,8 @@ static void remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); - test_nmi_ipi(to_cpumask(nmi_ipi_mask)); + if (!cpumask_empty(nmi_ipi_mask)) + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } static void local_ipi(void) -- cgit v1.2.3 From d2db6610219cbcadceea6c43ee03d89068b7d759 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Wed, 7 Dec 2011 17:29:10 +0900 Subject: x86: Add stack top margin for stack overflow checking It seems that a margin for stack overflow checking is added to top of a kernel stack but is not added to IRQ and exception stacks in stack_overflow_check(). Therefore, the overflows of IRQ and exception stacks are always detected only after they actually occurred and data corruption might occur due to them. This patch adds the margin to top of IRQ and exception stacks as well as a kernel stack to enhance reliability. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20111207082910.9847.3359.stgit@ltc219.sdl.hitachi.co.jp [ removed the #undef - we typically don't do that for uncommon names ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 54e2b2b2e250..d04d3ecded62 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -38,6 +38,7 @@ int sysctl_panic_on_stackoverflow; static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW +#define STACK_TOP_MARGIN 128 struct orig_ist *oist; u64 irq_stack_top, irq_stack_bottom; u64 estack_top, estack_bottom; @@ -47,17 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128 && + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) + + STACK_TOP_MARGIN; irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) return; oist = &__get_cpu_var(orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ; + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; -- cgit v1.2.3 From 3d6240b53e34e372be007e08f7066e7625910675 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 7 Dec 2011 14:06:12 +0300 Subject: x86, NMI: Add to_cpumask() to silence compile warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gcc complains if we don't cast this to a struct cpumask pointer. arch/x86/kernel/nmi_selftest.c:93:2: warning: passing argument 1 of ‘cpumask_empty’ from incompatible pointer type [enabled by default] Signed-off-by: Dan Carpenter Cc: Don Zickus Link: http://lkml.kernel.org/r/20111207110612.GA3437@mwanda Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 1e42a23c1f2a..0d01a8ea4e11 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -90,7 +90,7 @@ static void remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); - if (!cpumask_empty(nmi_ipi_mask)) + if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -- cgit v1.2.3 From f7d7d01be53cb47e0ae212c4e968aa28b82d2138 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 15 Nov 2011 12:56:14 +0000 Subject: x86: Don't use magic strings for EFI loader signature Introduce a symbol, EFI_LOADER_SIGNATURE instead of using the magic strings, which also helps to reduce the amount of ifdeffery. Cc: Matthew Garrett Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9a9e40fb091c..4d5243c31ac4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -752,12 +752,7 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, -#ifdef CONFIG_X86_32 - "EL32", -#else - "EL64", -#endif - 4)) { + EFI_LOADER_SIGNATURE, 4)) { efi_enabled = 1; efi_memblock_x86_reserve_range(); } -- cgit v1.2.3 From 291f36325f9f252bd76ef5f603995f37e453fc60 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 12 Dec 2011 21:27:52 +0000 Subject: x86, efi: EFI boot stub support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is currently a large divide between kernel development and the development of EFI boot loaders. The idea behind this patch is to give the kernel developers full control over the EFI boot process. As H. Peter Anvin put it, "The 'kernel carries its own stub' approach been very successful in dealing with BIOS, and would make a lot of sense to me for EFI as well." This patch introduces an EFI boot stub that allows an x86 bzImage to be loaded and executed by EFI firmware. The bzImage appears to the firmware as an EFI application. Luckily there are enough free bits within the bzImage header so that it can masquerade as an EFI application, thereby coercing the EFI firmware into loading it and jumping to its entry point. The beauty of this masquerading approach is that both BIOS and EFI boot loaders can still load and run the same bzImage, thereby allowing a single kernel image to work in any boot environment. The EFI boot stub supports multiple initrds, but they must exist on the same partition as the bzImage. Command-line arguments for the kernel can be appended after the bzImage name when run from the EFI shell, e.g. Shell> bzImage console=ttyS0 root=/dev/sdb initrd=initrd.img v7: - Fix checkpatch warnings. v6: - Try to allocate initrd memory just below hdr->inird_addr_max. v5: - load_options_size is UTF-16, which needs dividing by 2 to convert to the corresponding ASCII size. v4: - Don't read more than image->load_options_size v3: - Fix following warnings when compiling CONFIG_EFI_STUB=n arch/x86/boot/tools/build.c: In function ‘main’: arch/x86/boot/tools/build.c:138:24: warning: unused variable ‘pe_header’ arch/x86/boot/tools/build.c:138:15: warning: unused variable ‘file_sz’ - As reported by Matthew Garrett, some Apple machines have GOPs that don't have hardware attached. We need to weed these out by searching for ones that handle the PCIIO protocol. - Don't allocate memory if no initrds are on cmdline - Don't trust image->load_options_size Maarten Lankhorst noted: - Don't strip first argument when booted from efibootmgr - Don't allocate too much memory for cmdline - Don't update cmdline_size, the kernel considers it read-only - Don't accept '\n' for initrd names v2: - File alignment was too large, was 8192 should be 512. Reported by Maarten Lankhorst on LKML. - Added UGA support for graphics - Use VIDEO_TYPE_EFI instead of hard-coded number. - Move linelength assignment until after we've assigned depth - Dynamically fill out AddressOfEntryPoint in tools/build.c - Don't use magic number for GDT/TSS stuff. Requested by Andi Kleen - The bzImage may need to be relocated as it may have been loaded at a high address address by the firmware. This was required to get my macbook booting because the firmware loaded it at 0x7cxxxxxx, which triggers this error in decompress_kernel(), if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) error("Destination address too large"); Cc: Mike Waychison Cc: Matthew Garrett Tested-by: Henrik Rydberg Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1321383097.2657.9.camel@mfleming-mobl1.ger.corp.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/asm-offsets.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 4f13fafc5264..68de2dc962ec 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -67,4 +67,6 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_pref_address, boot_params, hdr.pref_address); + OFFSET(BP_code32_start, boot_params, hdr.code32_start); } -- cgit v1.2.3 From 549c89b98c4530b278dde1a3f68ce5ebbb1e6304 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2011 12:44:55 -0800 Subject: x86: Do not schedule while still in NMI context The NMI handler uses the paranoid_exit routine that checks the NEED_RESCHED flag, and if it is set and the return is for userspace, then interrupts are enabled, the stack is swapped to the thread's stack, and schedule is called. The problem with this is that we are still in an NMI context until an iret is executed. This means that any new NMIs are now starved until an interrupt or exception occurs and does the iret. As NMIs can not be masked and can interrupt any location, they are treated as a special case. NEED_RESCHED should not be set in an NMI handler. The interruption by the NMI should not disturb the work flow for scheduling. Any IPI sent to a processor after sending the NEED_RESCHED would have to wait for the NMI anyway, and after the IPI finishes the schedule would be called as required. There is no reason to do anything special leaving an NMI. Remove the call to paranoid_exit and do a simple return. This not only fixes the bug of starved NMIs, but it also cleans up the code. Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com Acked-by: Andi Kleen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Paul Turner Signed-off-by: Linus Torvalds Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e74b0b..3819ea907339 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1489,46 +1489,14 @@ ENTRY(nmi) movq %rsp,%rdi movq $-1,%rsi call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS - /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ - DISABLE_INTERRUPTS(CLBR_NONE) testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 jmp irq_return -nmi_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz nmi_schedule - movl %ebx,%edx /* arg3: thread flags */ - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - jmp nmi_userspace -nmi_schedule: - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - jmp nmi_userspace CFI_ENDPROC -#else - jmp paranoid_exit - CFI_ENDPROC -#endif END(nmi) ENTRY(ignore_sysret) -- cgit v1.2.3 From 1fd466efc88c48f50e5ee29f4dbb4e210a889172 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Dec 2011 12:32:27 -0500 Subject: x86: Document the NMI handler about not using paranoid_exit Linus cleaned up the NMI handler but it still needs some comments to explain why it uses save_paranoid but not paranoid_exit. Just to keep others from adding that in the future, document why it's not used. Cc: Linus Torvalds Cc: Andi Kleen Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3819ea907339..d1d5434e7f6a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1480,9 +1480,16 @@ END(error_exit) ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + /* + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ -- cgit v1.2.3 From 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Dec 2011 12:36:23 -0500 Subject: x86: Add workaround to NMI iret woes In x86, when an NMI goes off, the CPU goes into an NMI context that prevents other NMIs to trigger on that CPU. If an NMI is suppose to trigger, it has to wait till the previous NMI leaves NMI context. At that time, the next NMI can trigger (note, only one more NMI will trigger, as only one can be latched at a time). The way x86 gets out of NMI context is by calling iret. The problem with this is that this causes problems if the NMI handle either triggers an exception, or a breakpoint. Both the exception and the breakpoint handlers will finish with an iret. If this happens while in NMI context, the CPU will leave NMI context and a new NMI may come in. As NMI handlers are not made to be re-entrant, this can cause havoc with the system, not to mention, the nested NMI will write all over the previous NMI's stack. Linus Torvalds proposed the following workaround to this problem: https://lkml.org/lkml/2010/7/14/264 "In fact, I wonder if we couldn't just do a software NMI disable instead? Hav ea per-cpu variable (in the _core_ percpu areas that get allocated statically) that points to the NMI stack frame, and just make the NMI code itself do something like NMI entry: - load percpu NMI stack frame pointer - if non-zero we know we're nested, and should ignore this NMI: - we're returning to kernel mode, so return immediately by using "popf/ret", which also keeps NMI's disabled in the hardware until the "real" NMI iret happens. - before the popf/iret, use the NMI stack pointer to make the NMI return stack be invalid and cause a fault - set the NMI stack pointer to the current stack pointer NMI exit (not the above "immediate exit because we nested"): clear the percpu NMI stack pointer Just do the iret. Now, the thing is, now the "iret" is atomic. If we had a nested NMI, we'll take a fault, and that re-does our "delayed" NMI - and NMI's will stay masked. And if we didn't have a nested NMI, that iret will now unmask NMI's, and everything is happy." I first tried to follow this advice but as I started implementing this code, a few gotchas showed up. One, is accessing per-cpu variables in the NMI handler. The problem is that per-cpu variables use the %gs register to get the variable for the given CPU. But as the NMI may happen in userspace, we must first perform a SWAPGS to get to it. The NMI handler already does this later in the code, but its too late as we have saved off all the registers and we don't want to do that for a disabled NMI. Peter Zijlstra suggested to keep all variables on the stack. This simplifies things greatly and it has the added benefit of cache locality. Two, faulting on the iret. I really wanted to make this work, but it was becoming very hacky, and I never got it to be stable. The iret already had a fault handler for userspace faulting with bad segment registers, and getting NMI to trigger a fault and detect it was very tricky. But for strange reasons, the system would usually take a double fault and crash. I never figured out why and decided to go with a simple "jmp" approach. The new approach I took also simplified things. Finally, the last problem with Linus's approach was to have the nested NMI handler do a ret instead of an iret to give the first NMI NMI-context again. The problem is that ret is much more limited than an iret. I couldn't figure out how to get the stack back where it belonged. I could have copied the current stack, pushed the return onto it, but my fear here is that there may be some place that writes data below the stack pointer. I know that is not something code should depend on, but I don't want to chance it. I may add this feature later, but for now, an NMI handler that loses NMI context will not get it back. Here's what is done: When an NMI comes in, the HW pushes the interrupt stack frame onto the per cpu NMI stack that is selected by the IST. A special location on the NMI stack holds a variable that is set when the first NMI handler runs. If this variable is set then we know that this is a nested NMI and we process the nested NMI code. There is still a race when this variable is cleared and an NMI comes in just before the first NMI does the return. For this case, if the variable is cleared, we also check if the interrupted stack is the NMI stack. If it is, then we process the nested NMI code. Why the two tests and not just test the interrupted stack? If the first NMI hits a breakpoint and loses NMI context, and then it hits another breakpoint and while processing that breakpoint we get a nested NMI. When processing a breakpoint, the stack changes to the breakpoint stack. If another NMI comes in here we can't rely on the interrupted stack to be the NMI stack. If the variable is not set and the interrupted task's stack is not the NMI stack, then we know this is the first NMI and we can process things normally. But in order to do so, we need to do a few things first. 1) Set the stack variable that tells us that we are in an NMI handler 2) Make two copies of the interrupt stack frame. One copy is used to return on iret The other is used to restore the first one if we have a nested NMI. This is what the stack will look like: +-------------------------+ | original SS | | original Return RSP | | original RFLAGS | | original CS | | original RIP | +-------------------------+ | temp storage for rdx | +-------------------------+ | NMI executing variable | +-------------------------+ | Saved SS | | Saved Return RSP | | Saved RFLAGS | | Saved CS | | Saved RIP | +-------------------------+ | copied SS | | copied Return RSP | | copied RFLAGS | | copied CS | | copied RIP | +-------------------------+ | pt_regs | +-------------------------+ The original stack frame contains what the HW put in when we entered the NMI. We store %rdx as a temp variable to use. Both the original HW stack frame and this %rdx storage will be clobbered by nested NMIs so we can not rely on them later in the first NMI handler. The next item is the special stack variable that is set when we execute the rest of the NMI handler. Then we have two copies of the interrupt stack. The second copy is modified by any nested NMIs to let the first NMI know that we triggered a second NMI (latched) and that we should repeat the NMI handler. If the first NMI hits an exception or breakpoint that takes it out of NMI context, if a second NMI comes in before the first one finishes, it will update the copied interrupt stack to point to a fix up location to trigger another NMI. When the first NMI calls iret, it will instead jump to the fix up location. This fix up location will copy the saved interrupt stack back to the copy and execute the nmi handler again. Note, the nested NMI knows enough to check if it preempted a previous NMI handler while it is in the fixup location. If it has, it will not modify the copied interrupt stack and will just leave as if nothing happened. As the NMI handle is about to execute again, there's no reason to latch now. To test all this, I forced the NMI handler to call iret and take itself out of NMI context. I also added assemble code to write to the serial to make sure that it hits the nested path as well as the fix up path. Everything seems to be working fine. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 177 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d1d5434e7f6a..b62aa298df7f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1475,11 +1475,166 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ + .macro test_in_nmi reg stack nmi_ret normal_ret + cmpq %\reg, \stack + ja \normal_ret + subq $EXCEPTION_STKSZ, %\reg + cmpq %\reg, \stack + jb \normal_ret + jmp \nmi_ret + .endm /* runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + */ + + /* Use %rdx as out temp variable throughout */ + pushq_cfi %rdx + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmp $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -6*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 6*8 + pushq_cfi $__KERNEL_DS + pushq_cfi %rdx + pushfq_cfi + pushq_cfi $__KERNEL_CS + pushq_cfi $repeat_nmi + + /* Put stack back */ + addq $(11*8), %rsp + CFI_ADJUST_CFA_OFFSET -11*8 + +nested_nmi_out: + popq_cfi %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved RIP is used to fix up the copied RIP that a nested + * NMI may zero out. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Set the NMI executing variable on the stack. */ + pushq_cfi $1 + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq_cfi 6*8(%rsp) + .endr + + /* Make another copy, this one may be modified by nested NMIs */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + /* Do not pop rdx, nested NMIs will corrupt it */ + movq 11*8(%rsp), %rdx + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception. Repeated NMIs + * caused by an exception and nested NMI will start here, and + * can still be preempted by another NMI. + */ +restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1502,10 +1657,32 @@ nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + /* Clear the NMI executing stack variable */ + movq $0, 10*8(%rsp) jmp irq_return CFI_ENDPROC END(nmi) + /* + * If an NMI hit an iret because of an exception or breakpoint, + * it can lose its NMI context, and a nested NMI may come in. + * In that case, the nested NMI will change the preempted NMI's + * stack to jump to here when it does the final iret. + */ +repeat_nmi: + INTR_FRAME + /* Update the stack variable to say we are still in NMI */ + movq $1, 5*8(%rsp) + + /* copy the saved stack back to copy stack */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + jmp restart_nmi + CFI_ENDPROC +end_repeat_nmi: + ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax -- cgit v1.2.3 From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Dec 2011 03:02:19 -0500 Subject: x86: Keep current stack in NMI breakpoints We want to allow NMI handlers to have breakpoints to be able to remove stop_machine from ftrace, kprobes and jump_labels. But if an NMI interrupts a current breakpoint, and then it triggers a breakpoint itself, it will switch to the breakpoint stack and corrupt the data on it for the breakpoint processing that it interrupted. Instead, have the NMI check if it interrupted breakpoint processing by checking if the stack that is currently used is a breakpoint stack. If it is, then load a special IDT that changes the IST for the debug exception to keep the same stack in kernel context. When the NMI is done, it puts it back. This way, if the NMI does trigger a breakpoint, it will keep using the same stack and not stomp on the breakpoint data for the breakpoint it interrupted. Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ arch/x86/kernel/head_64.S | 4 ++++ arch/x86/kernel/nmi.c | 15 +++++++++++++++ arch/x86/kernel/traps.c | 6 ++++++ 4 files changed, 47 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a831..caa404556b9c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1090,6 +1092,24 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); + +int is_debug_stack(unsigned long addr) +{ + return addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e39478a49..40f4eb3766d1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 + .align L1_CACHE_BYTES +ENTRY(nmi_idt_table) + .skip IDT_ENTRIES * 16 + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b58ddd..de8d4b333f40 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { + int update_debug_stack = 0; + + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. + */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + update_debug_stack = 1; + } nmi_enter(); inc_irq_stat(__nmi_count); @@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code) default_do_nmi(regs); nmi_exit(); + + if (unlikely(update_debug_stack)) + debug_stack_reset(); } void stop_nmi(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb83466c..a93c5cabc36a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -723,4 +723,10 @@ void __init trap_init(void) cpu_init(); x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(1, &debug); + set_nmi_gate(3, &int3); +#endif } -- cgit v1.2.3 From ccd49c2391773ffbf52bb80d75c4a92b16972517 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 Dec 2011 16:44:16 -0500 Subject: x86: Allow NMIs to hit breakpoints in i386 With i386, NMIs and breakpoints use the current stack and they do not reset the stack pointer to a fix point that might corrupt a previous NMI or breakpoint (as it does in x86_64). But NMIs are still not made to be re-entrant, and need to prevent the case that an NMI hitting a breakpoint (which does an iret), doesn't allow another NMI to run. The fix is to let the NMI be in 3 different states: 1) not running 2) executing 3) latched When no NMI is executing on a given CPU, the state is "not running". When the first NMI comes in, the state is switched to "executing". On exit of that NMI, a cmpxchg is performed to switch the state back to "not running" and if that fails, the NMI is restarted. If a breakpoint is hit and does an iret, which re-enables NMIs, and another NMI comes in before the first NMI finished, it will detect that the state is not in the "not running" state and the current NMI is nested. In this case, the state is switched to "latched" to let the interrupted NMI know to restart the NMI handler, and the nested NMI exits without doing anything. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index de8d4b333f40..47acaf319165 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -405,11 +405,84 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); } -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - int update_debug_stack = 0; +/* + * NMIs can hit breakpoints which will cause it to lose its + * NMI context with the CPU when the breakpoint does an iret. + */ +#ifdef CONFIG_X86_32 +/* + * For i386, NMIs use the same stack as the kernel, and we can + * add a workaround to the iret problem in C. Simply have 3 states + * the NMI can be in. + * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI hits a breakpoint that executes an iret, another + * NMI can preempt it. We do not want to allow this new NMI + * to run, but we want to execute it when the first one finishes. + * We set the state to "latched", and the first NMI will perform + * an cmpxchg on the state, and if it doesn't successfully + * reset the state to "not running" it will restart the next + * NMI. + */ +enum nmi_states { + NMI_NOT_RUNNING, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); + +#define nmi_nesting_preprocess(regs) \ + do { \ + if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ + __get_cpu_var(nmi_state) = NMI_LATCHED; \ + return; \ + } \ + nmi_restart: \ + __get_cpu_var(nmi_state) = NMI_EXECUTING; \ + } while (0) + +#define nmi_nesting_postprocess() \ + do { \ + if (cmpxchg(&__get_cpu_var(nmi_state), \ + NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + goto nmi_restart; \ + } while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. + */ +static DEFINE_PER_CPU(int, update_debug_stack); +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ /* * If we interrupted a breakpoint, it is possible that * the nmi handler will have breakpoints too. We need to @@ -418,8 +491,22 @@ do_nmi(struct pt_regs *regs, long error_code) */ if (unlikely(is_debug_stack(regs->sp))) { debug_stack_set_zero(); - update_debug_stack = 1; + __get_cpu_var(update_debug_stack) = 1; } +} + +static inline void nmi_nesting_postprocess(void) +{ + if (unlikely(__get_cpu_var(update_debug_stack))) + debug_stack_reset(); +} +#endif + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_nesting_preprocess(regs); + nmi_enter(); inc_irq_stat(__nmi_count); @@ -429,8 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code) nmi_exit(); - if (unlikely(update_debug_stack)) - debug_stack_reset(); + /* On i386, may loop back to preprocess */ + nmi_nesting_postprocess(); } void stop_nmi(void) -- cgit v1.2.3 From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 11:43:02 -0500 Subject: x86: Add counter when debug stack is used with interrupts enabled Mathieu Desnoyers pointed out a case that can cause issues with NMIs running on the debug stack: int3 -> interrupt -> NMI -> int3 Because the interrupt changes the stack, the NMI will not see that it preempted the debug stack. Looking deeper at this case, interrupts only happen when the int3 is from userspace or in an a location in the exception table (fixup). userspace -> int3 -> interurpt -> NMI -> int3 All other int3s that happen in the kernel should be processed without ever enabling interrupts, as the do_trap() call will panic the kernel if it is called to process any other location within the kernel. Adding a counter around the sections that enable interrupts while using the debug stack allows the NMI to also check that case. If the NMI sees that it either interrupted a task using the debug stack or the debug counter is non-zero, then it will have to change the IDT table to make the int3 not change stacks (which will corrupt the stack if it does). Note, I had to move the debug_usage functions out of processor.h and into debugreg.h because of the static inlined functions to inc and dec the debug_usage counter. __get_cpu_var() requires smp.h which includes processor.h, and would fail to build. Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com Reported-by: Mathieu Desnoyers Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- arch/x86/kernel/cpu/common.c | 6 ++++-- arch/x86/kernel/traps.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index caa404556b9c..266e4649b1da 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1093,11 +1093,13 @@ unsigned long kernel_eflags; DEFINE_PER_CPU(struct orig_ist, orig_ist); static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); int is_debug_stack(unsigned long addr) { - return addr <= __get_cpu_var(debug_stack_addr) && - addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } void debug_stack_set_zero(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a93c5cabc36a..0072b38e3ea1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); + debug_stack_usage_dec(); } #ifdef CONFIG_X86_64 @@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); @@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } -- cgit v1.2.3 From 8a25a2fd126c621f44f3aeaef80d51f00fc11639 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 14:29:42 -0800 Subject: cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular subsystem This moves the 'cpu sysdev_class' over to a regular 'cpu' subsystem and converts the devices to regular devices. The sysdev drivers are implemented as subsystem interfaces now. After all sysdev classes are ported to regular driver core entities, the sysdev implementation will be entirely removed from the kernel. Userspace relies on events and generic sysfs subsystem infrastructure from sysdev devices, which are made available with this conversion. Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Tigran Aivazian Cc: Len Brown Cc: Zhang Rui Cc: Dave Jones Cc: Peter Zijlstra Cc: Russell King Cc: Andrew Morton Cc: Arjan van de Ven Cc: "Rafael J. Wysocki" Cc: "Srivatsa S. Bhat" Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 25 +++--- arch/x86/kernel/cpu/mcheck/mce-internal.h | 4 +- arch/x86/kernel/cpu/mcheck/mce.c | 128 +++++++++++++++--------------- arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 ++- arch/x86/kernel/cpu/mcheck/therm_throt.c | 63 ++++++++------- arch/x86/kernel/microcode_core.c | 58 +++++++------- 6 files changed, 144 insertions(+), 145 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a3b0811693c9..6b45e5e7a901 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) #include #include - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include /* pointer to kobject for cpuX/cache */ static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -1073,9 +1072,9 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int __cpuinit cache_add_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i, j; struct _index_kobject *this_object; struct _cpuid4_info *this_leaf; @@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, - &sys_dev->kobj, "%s", "cache"); + &dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return 0; } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i; if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cache_add_dev(sys_dev); + cache_add_dev(dev); break; case CPU_DEAD: case CPU_DEAD_FROZEN: - cache_remove_dev(sys_dev); + cache_remove_dev(dev); break; } return NOTIFY_OK; @@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void) for_each_online_cpu(i) { int err; - struct sys_device *sys_dev = get_cpu_sysdev(i); + struct device *dev = get_cpu_device(i); - err = cache_add_dev(sys_dev); + err = cache_add_dev(dev); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69ee8b5..ed44c8a65858 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include +#include #include enum severity_level { @@ -17,7 +17,7 @@ enum severity_level { struct mce_bank { u64 ctl; /* subevents to enable */ unsigned char init; /* initialise bank? */ - struct sysdev_attribute attr; /* sysdev attribute */ + struct device_attribute attr; /* device attribute */ char attrname[ATTR_LEN]; /* attribute name */ }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 362056aefeb4..0156c6f85d7b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -1751,7 +1751,7 @@ static struct syscore_ops mce_syscore_ops = { }; /* - * mce_sysdev: Sysfs support + * mce_device: Sysfs support */ static void mce_cpu_restart(void *data) @@ -1787,27 +1787,28 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysdev_class = { +static struct bus_type mce_subsys = { .name = "machinecheck", + .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_sysdev); +DEFINE_PER_CPU(struct device, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1822,14 +1823,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf) { strcpy(buf, mce_helper); strcat(buf, "\n"); return strlen(mce_helper) + 1; } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, const char *buf, size_t siz) { char *p; @@ -1844,8 +1845,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return strlen(mce_helper) + !!p; } -static ssize_t set_ignore_ce(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1868,8 +1869,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, return size; } -static ssize_t set_cmci_disabled(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1891,108 +1892,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s, return size; } -static ssize_t store_int_with_restart(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = sysdev_store_int(s, attr, buf, size); + ssize_t ret = device_store_int(s, attr, buf, size); mce_restart(); return ret; } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { - _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, - store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), &check_interval }; -static struct sysdev_ext_attribute attr_ignore_ce = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), &mce_ignore_ce }; -static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), &mce_cmci_disabled }; -static struct sysdev_attribute *mce_sysdev_attrs[] = { - &attr_tolerant.attr, - &attr_check_interval.attr, - &attr_trigger, - &attr_monarch_timeout.attr, - &attr_dont_log_ce.attr, - &attr_ignore_ce.attr, - &attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, NULL }; -static cpumask_var_t mce_sysdev_initialized; +static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_sysdev_create(unsigned int cpu) +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_device_create(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&sysdev->kobj, 0, sizeof(struct kobject)); - sysdev->id = cpu; - sysdev->cls = &mce_sysdev_class; + memset(&dev->kobj, 0, sizeof(struct kobject)); + dev->id = cpu; + dev->bus = &mce_subsys; - err = sysdev_register(sysdev); + err = device_register(dev); if (err) return err; - for (i = 0; mce_sysdev_attrs[i]; i++) { - err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); if (err) goto error; } for (j = 0; j < banks; j++) { - err = sysdev_create_file(sysdev, &mce_banks[j].attr); + err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_sysdev_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); return 0; error2: while (--j >= 0) - sysdev_remove_file(sysdev, &mce_banks[j].attr); + device_remove_file(dev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + device_remove_file(dev, mce_device_attrs[i]); - sysdev_unregister(sysdev); + device_unregister(dev); return err; } -static __cpuinit void mce_sysdev_remove(unsigned int cpu) +static __cpuinit void mce_device_remove(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int i; - if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; - for (i = 0; mce_sysdev_attrs[i]; i++) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(sysdev, &mce_banks[i].attr); + device_remove_file(dev, &mce_banks[i].attr); - sysdev_unregister(sysdev); - cpumask_clear_cpu(cpu, mce_sysdev_initialized); + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ @@ -2042,7 +2042,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - mce_sysdev_create(cpu); + mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; @@ -2050,7 +2050,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_sysdev_remove(cpu); + mce_device_remove(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -2084,7 +2084,7 @@ static __init void mce_init_banks(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; - struct sysdev_attribute *a = &b->attr; + struct device_attribute *a = &b->attr; sysfs_attr_init(&a->attr); a->attr.name = b->attrname; @@ -2104,16 +2104,16 @@ static __init int mcheck_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); mce_init_banks(); - err = sysdev_class_register(&mce_sysdev_class); + err = subsys_system_register(&mce_subsys, NULL); if (err) return err; for_each_online_cpu(i) { - err = mce_sysdev_create(i); + err = mce_device_create(i); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f5474218cffe..56d2aa1acd55 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -548,7 +547,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, b->kobj, name); if (err) goto out; @@ -571,7 +570,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); if (!b->kobj) goto out_free; @@ -591,7 +590,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, + err = sysfs_create_link(&per_cpu(mce_device, i).kobj, b->kobj, name); if (err) goto out; @@ -669,7 +668,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -681,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c84ea6..59e3f6ed265f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0); static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, \ - therm_throt_sysdev_show_##_name, \ +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(event, name) \ +#define define_therm_throt_device_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##event##_##name( \ - struct sys_device *dev, \ - struct sysdev_attribute *attr, \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ @@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_core_throttle_count.attr, + &dev_attr_core_throttle_count.attr, NULL }; @@ -223,36 +222,36 @@ static int thresh_event_valid(int event) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, +static __cpuinit int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); - err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); if (err) return err; if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_core_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_throttle_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); } return err; } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static __cpuinit void thermal_throttle_remove_dev(struct device *dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); + sysfs_remove_group(&dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; int err = 0; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev, cpu); + err = thermal_throttle_add_dev(dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: mutex_lock(&therm_cpu_lock); - thermal_throttle_remove_dev(sys_dev); + thermal_throttle_remove_dev(dev); mutex_unlock(&therm_cpu_lock); break; } @@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f2d2a664e797..cf88f2a16473 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu) return err; } -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) { unsigned long val; @@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev, return ret; } -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, + &dev_attr_reload.attr, + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, NULL }; @@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu) return ustate; } -static int mc_sysdev_add(struct sys_device *sys_dev) +static int mc_device_add(struct device *dev, struct subsys_interface *sif) { - int err, cpu = sys_dev->id; + int err, cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d added\n", cpu); - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + err = sysfs_create_group(&dev->kobj, &mc_attr_group); if (err) return err; if (microcode_init_cpu(cpu) == UCODE_ERROR) { - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return -EINVAL; } return err; } -static int mc_sysdev_remove(struct sys_device *sys_dev) +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) { - int cpu = sys_dev->id; + int cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return 0; } -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, }; /** @@ -464,9 +466,9 @@ static __cpuinit int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("CPU%d added\n", cpu); - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; @@ -527,7 +529,7 @@ static int __init microcode_init(void) get_online_cpus(); mutex_lock(µcode_mutex); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + error = subsys_interface_register(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); @@ -561,7 +563,7 @@ static void __exit microcode_exit(void) get_online_cpus(); mutex_lock(µcode_mutex); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + subsys_interface_unregister(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); -- cgit v1.2.3 From edbaa603eb801655e80808a9cf3d3b622e8ac66b Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 16:26:03 -0800 Subject: driver-core: remove sysdev.h usage. The sysdev.h file should not be needed by any in-kernel code, so remove the .h file from these random files that seem to still want to include it. The sysdev code will be going away soon, so this include needs to be removed no matter what. Cc: Jiandong Zheng Cc: Scott Branden Cc: Russell King Cc: Kukjin Kim Cc: David Brown Cc: Daniel Walker Cc: Bryan Huntsman Cc: Ben Dooks Cc: Wan ZongShun Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Guan Xuetao Cc: "Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Grant Likely Cc: Richard Purdie Cc: Matthew Garrett Signed-off-by: Kay Sievers --- arch/x86/kernel/hpet.c | 1 - arch/x86/kernel/irqinit.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b946a9eac7d9..98094a9580f7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b3300e6bacef..313fb5cddbce 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 5202397df819d3c5a3f201bd4af6b86542115fb6 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 1 Nov 2011 17:28:47 -0700 Subject: KVM guest: remove KVM guest pv mmu support This has not been used for some years now. It's time to remove it. Signed-off-by: Chris Wright Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 181 -------------------------------------------------- 1 file changed, 181 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a9c2116001d6..f0c6fd6f176b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,8 +39,6 @@ #include #include -#define MMU_QUEUE_SIZE 1024 - static int kvmapf = 1; static int parse_no_kvmapf(char *arg) @@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -struct kvm_para_state { - u8 mmu_queue[MMU_QUEUE_SIZE]; - int mmu_queue_len; -}; - -static DEFINE_PER_CPU(struct kvm_para_state, para_state); static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; -static struct kvm_para_state *kvm_para_state(void) -{ - return &per_cpu(para_state, raw_smp_processor_id()); -} - /* * No need for any "IO delay" on KVM */ @@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) } } -static void kvm_mmu_op(void *buffer, unsigned len) -{ - int r; - unsigned long a1, a2; - - do { - a1 = __pa(buffer); - a2 = 0; /* on i386 __pa() always returns <4G */ - r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); - buffer += r; - len -= r; - } while (len); -} - -static void mmu_queue_flush(struct kvm_para_state *state) -{ - if (state->mmu_queue_len) { - kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); - state->mmu_queue_len = 0; - } -} - -static void kvm_deferred_mmu_op(void *buffer, int len) -{ - struct kvm_para_state *state = kvm_para_state(); - - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { - kvm_mmu_op(buffer, len); - return; - } - if (state->mmu_queue_len + len > sizeof state->mmu_queue) - mmu_queue_flush(state); - memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); - state->mmu_queue_len += len; -} - -static void kvm_mmu_write(void *dest, u64 val) -{ - __u64 pte_phys; - struct kvm_mmu_op_write_pte wpte; - -#ifdef CONFIG_HIGHPTE - struct page *page; - unsigned long dst = (unsigned long) dest; - - page = kmap_atomic_to_page(dest); - pte_phys = page_to_pfn(page); - pte_phys <<= PAGE_SHIFT; - pte_phys += (dst & ~(PAGE_MASK)); -#else - pte_phys = (unsigned long)__pa(dest); -#endif - wpte.header.op = KVM_MMU_OP_WRITE_PTE; - wpte.pte_val = val; - wpte.pte_phys = pte_phys; - - kvm_deferred_mmu_op(&wpte, sizeof wpte); -} - -/* - * We only need to hook operations that are MMU writes. We hook these so that - * we can use lazy MMU mode to batch these operations. We could probably - * improve the performance of the host code if we used some of the information - * here to simplify processing of batched writes. - */ -static void kvm_set_pte(pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) -{ - kvm_mmu_write(pmdp, pmd_val(pmd)); -} - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE -static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - kvm_mmu_write(ptep, 0); -} - -static void kvm_pmd_clear(pmd_t *pmdp) -{ - kvm_mmu_write(pmdp, 0); -} -#endif - -static void kvm_set_pud(pud_t *pudp, pud_t pud) -{ - kvm_mmu_write(pudp, pud_val(pud)); -} - -#if PAGETABLE_LEVELS == 4 -static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) -{ - kvm_mmu_write(pgdp, pgd_val(pgd)); -} -#endif -#endif /* PAGETABLE_LEVELS >= 3 */ - -static void kvm_flush_tlb(void) -{ - struct kvm_mmu_op_flush_tlb ftlb = { - .header.op = KVM_MMU_OP_FLUSH_TLB, - }; - - kvm_deferred_mmu_op(&ftlb, sizeof ftlb); -} - -static void kvm_release_pt(unsigned long pfn) -{ - struct kvm_mmu_op_release_pt rpt = { - .header.op = KVM_MMU_OP_RELEASE_PT, - .pt_phys = (u64)pfn << PAGE_SHIFT, - }; - - kvm_mmu_op(&rpt, sizeof rpt); -} - -static void kvm_enter_lazy_mmu(void) -{ - paravirt_enter_lazy_mmu(); -} - -static void kvm_leave_lazy_mmu(void) -{ - struct kvm_para_state *state = kvm_para_state(); - - mmu_queue_flush(state); - paravirt_leave_lazy_mmu(); -} - static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; @@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void) if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; - if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { - pv_mmu_ops.set_pte = kvm_set_pte; - pv_mmu_ops.set_pte_at = kvm_set_pte_at; - pv_mmu_ops.set_pmd = kvm_set_pmd; -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; - pv_mmu_ops.pte_clear = kvm_pte_clear; - pv_mmu_ops.pmd_clear = kvm_pmd_clear; -#endif - pv_mmu_ops.set_pud = kvm_set_pud; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = kvm_set_pgd; -#endif -#endif - pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; - pv_mmu_ops.release_pte = kvm_release_pt; - pv_mmu_ops.release_pmd = kvm_release_pt; - pv_mmu_ops.release_pud = kvm_release_pt; - - pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; - } #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif -- cgit v1.2.3 From 2c9ede55ecec58099b72e4bb8eab719f32f72c31 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Jul 2011 20:24:48 -0400 Subject: switch device_get_devnode() and ->devnode() to umode_t * both callers of device_get_devnode() are only interested in lower 16bits and nobody tries to return anything wider than 16bit anyway. Signed-off-by: Al Viro --- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/msr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 212a6a42527c..a524353d93f2 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_devnode(struct device *dev, mode_t *mode) +static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 12fcbe2c143e..96356762a51d 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_devnode(struct device *dev, mode_t *mode) +static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } -- cgit v1.2.3 From 24d25dbfa63c376323096660bfa9ad45a08870ce Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 5 Jan 2012 14:27:19 -0700 Subject: x86/PCI: amd: factor out MMCONFIG discovery This factors out the AMD native MMCONFIG discovery so we can use it outside amd_bus.c. amd_bus.c reads AMD MSRs so it can remove the MMCONFIG area from the PCI resources. We may also need the MMCONFIG information to work around BIOS defects in the ACPI MCFG table. Cc: Borislav Petkov Cc: Yinghai Lu Cc: stable@kernel.org # 2.6.34+ Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/kernel/amd_nb.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa8facc..bae1efe6d515 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device) return false; } +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; -- cgit v1.2.3 From 76ccc297018d25d55b789bbd508861ef1e2cdb0c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 16 Dec 2011 17:38:18 -0500 Subject: x86/PCI: Expand the x86_msi_ops to have a restore MSIs. The MSI restore function will become a function pointer in an x86_msi_ops struct. It defaults to the implementation in the io_apic.c and msi.c. We piggyback on the indirection mechanism introduced by "x86: Introduce x86_msi_ops". Cc: x86@kernel.org Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: linux-pci@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jesse Barnes --- arch/x86/kernel/x86_init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c1d6cd549397..83b05adaadf1 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -114,4 +114,5 @@ struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, + .restore_msi_irqs = default_restore_msi_irqs, }; -- cgit v1.2.3 From e58d429209105e698e9d0357481d62b37fe9a7dd Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 6 Jan 2012 11:17:51 -0500 Subject: x86, reboot: Fix typo in nmi reboot path It was brought to my attention that my x86 change to use NMI in the reboot path broke Intel Nehalem and Westmere boxes when using kexec. I realized I had mistyped the if statement in commit 3603a2512f9e69dc87914ba922eb4a0812b21cd6 and stuck the ')' in the wrong spot. Putting it in the right spot fixes kexec again. Doh. Reported-by: Yinghai Lu Cc: Linus Torvalds Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1325866671-9797-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 113acda5879e..66c74f481cab 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -176,7 +176,7 @@ static void native_nmi_stop_other_cpus(int wait) */ if (num_online_cpus() > 1) { /* did someone beat us here? */ - if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1)) + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) return; if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, -- cgit v1.2.3 From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb221c1..46a01bdc27e2 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); -- cgit v1.2.3 From 476bc0015bf09dad39d36a8b19f76f0c181d1ec9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: module_param: make bool parameters really bool (arch) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. Signed-off-by: Rusty Russell --- arch/x86/kernel/apm_32.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a46bd383953c..f76623cbe263 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -383,21 +383,21 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -static int debug __read_mostly; -static int smp __read_mostly; +static bool debug __read_mostly; +static bool smp __read_mostly; static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static bool power_off; #else -static int power_off = 1; +static bool power_off = 1; #endif -static int realmode_power_off; +static bool realmode_power_off; #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static bool allow_ints = 1; #else -static int allow_ints; +static bool allow_ints; #endif -static int broken_psr; +static bool broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); -- cgit v1.2.3 From a3301b751b19f0efbafddc4034f8e7ce6bf3007b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 14 Jan 2012 08:11:31 +0530 Subject: x86/mce: Fix CPU hotplug and suspend regression related to MCE Commit 8a25a2fd126c ("cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular subsystem") changed how things are dealt with in the MCE subsystem. Some of the things that got broken due to this are CPU hotplug and suspend/hibernate. MCE uses per_cpu allocations of struct device. So, when a CPU goes offline and comes back online, in order to ensure that we start from a clean slate with respect to the MCE subsystem, zero out the entire per_cpu device structure to 0 before using it. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f22a9f7f6390..29ba3297e480 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2011,7 +2011,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&dev->kobj, 0, sizeof(struct kobject)); + memset(dev, 0, sizeof(struct device)); dev->id = cpu; dev->bus = &mce_subsys; -- cgit v1.2.3 From e032d80774315869aa2285b217fdbbfed86c0b49 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 16 Jan 2012 14:40:28 -0800 Subject: mce: fix warning messages about static struct mce_device When suspending, there was a large list of warnings going something like: Device 'machinecheck1' does not have a release() function, it is broken and must be fixed This patch turns the static mce_devices into dynamically allocated, and properly frees them when they are removed from the system. It solves the warning messages on my laptop here. Reported-by: "Srivatsa S. Bhat" Reported-by: Linus Torvalds Tested-by: Djalal Harouni Cc: Kay Sievers Cc: Tony Luck Cc: Borislav Petkov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 18 +++++++++++------- 2 files changed, 25 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 29ba3297e480..5a11ae2e9e91 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct device, mce_device); +struct device *mce_device[CONFIG_NR_CPUS]; __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2001,19 +2001,27 @@ static struct device_attribute *mce_device_attrs[] = { static cpumask_var_t mce_device_initialized; +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + /* Per cpu device init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_device_create(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev; int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(dev, 0, sizeof(struct device)); + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; dev->id = cpu; dev->bus = &mce_subsys; + dev->release = &mce_device_release; err = device_register(dev); if (err) @@ -2030,6 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); + mce_device[cpu] = dev; return 0; error2: @@ -2046,7 +2055,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev = mce_device[cpu]; int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2060,6 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); + mce_device[cpu] = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ba0b94a7e204..786e76a86322 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; + struct device *dev = mce_device[cpu]; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -543,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, - b->kobj, name); + err = sysfs_create_link(&dev->kobj, b->kobj, name); if (err) goto out; @@ -565,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); + b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) goto out_free; @@ -585,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_device, i).kobj, - b->kobj, name); + dev = mce_device[i]; + if (dev) + err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) goto out; @@ -649,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { struct threshold_bank *b; + struct device *dev; char name[32]; int i = 0; @@ -663,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); + sysfs_remove_link(&mce_device[cpu]->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -675,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); + dev = mce_device[i]; + if (dev) + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } -- cgit v1.2.3 From b54ac6d2a25084667da781c7ca2cebef52a2bcdd Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 8 Dec 2011 11:25:49 +0800 Subject: ACPI, Record ACPI NVS regions Some firmware will access memory in ACPI NVS region via APEI. That is, instructions in APEI ERST/EINJ table will read/write ACPI NVS region. The original resource conflict checking in APEI code will check memory/ioport accessed by APEI via general resource management mechanism. But ACPI NVS region is marked as busy already, so that the false resource conflict will prevent APEI ERST/EINJ to work. To fix this, this patch record ACPI NVS regions, so that we can avoid request resources for memory region inside it. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e48f076..51c3b186e5b9 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -714,7 +714,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_ACPI /** * Mark ACPI NVS memory region, so that we can save/restore it during * hibernation and the subsequent resume. @@ -727,7 +727,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - suspend_nvs_register(ei->addr, ei->size); + acpi_nvs_register(ei->addr, ei->size); } return 0; -- cgit v1.2.3 From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: Audit: push audit success and retcode into arch ptrace.h The audit system previously expected arches calling to audit_syscall_exit to supply as arguments if the syscall was a success and what the return code was. Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things by converting from negative retcodes to an audit internal magic value stating success or failure. This helper was wrong and could indicate that a valid pointer returned to userspace was a failed syscall. The fix is to fix the layering foolishness. We now pass audit_syscall_exit a struct pt_reg and it in turns calls back into arch code to collect the return value and to determine if the syscall was a success or failure. We also define a generic is_syscall_success() macro which determines success/failure based on if the value is < -MAX_ERRNO. This works for arches like x86 which do not use a separate mechanism to indicate syscall failure. We make both the is_syscall_success() and regs_return_value() static inlines instead of macros. The reason is because the audit function must take a void* for the regs. (uml calls theirs struct uml_pt_regs instead of just struct pt_regs so audit_syscall_exit can't take a struct pt_regs). Since the audit function takes a void* we need to use static inlines to cast it back to the arch correct structure to dereference it. The other major change is that on some arches, like ia64, MIPS and ppc, we change regs_return_value() to give us the negative value on syscall failure. THE only other user of this macro, kretprobe_example.c, won't notice and it makes the value signed consistently for the audit functions across all archs. In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old audit code as the return value. But the ptrace_64.h code defined the macro regs_return_value() as regs[3]. I have no idea which one is correct, but this patch now uses the regs_return_value() function, so it now uses regs[3]. For powerpc we previously used regs->result but now use the regs_return_value() function which uses regs->gprs[3]. regs->gprs[3] is always positive so the regs_return_value(), much like ia64 makes it negative before calling the audit code when appropriate. Signed-off-by: Eric Paris Acked-by: H. Peter Anvin [for x86 portion] Acked-by: Tony Luck [for ia64] Acked-by: Richard Weinberger [for uml] Acked-by: David S. Miller [for sparc] Acked-by: Ralf Baechle [for mips] Acked-by: Benjamin Herrenschmidt [for ppc] --- arch/x86/kernel/entry_32.S | 8 ++++---- arch/x86/kernel/entry_64.S | 10 +++++----- arch/x86/kernel/ptrace.c | 3 +-- arch/x86/kernel/vm86_32.c | 4 ++-- 4 files changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 22d0e21b4dd7..a22facf06f0e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include +#include #include #include #include @@ -466,11 +467,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb9dc87..e51393dd93a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -563,17 +564,16 @@ auditsys: jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7b5bb6..8b0218758775 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0a..af17e1c966dc 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); __asm__ __volatile__( "movl %0,%%esp\n\t" -- cgit v1.2.3 From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: inline audit_syscall_entry to reduce burden on archs Every arch calls: if (unlikely(current->audit_context)) audit_syscall_entry() which requires knowledge about audit (the existance of audit_context) in the arch code. Just do it all in static inline in audit.h so that arch's can remain blissfully ignorant. Signed-off-by: Eric Paris --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/ptrace.c | 22 ++++++++++------------ 3 files changed, 13 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a22facf06f0e..1ccd742eba1b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -456,7 +456,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e51393dd93a3..1ca66b650123 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -549,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -559,7 +559,7 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8b0218758775..50267386b766 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } -- cgit v1.2.3 From 6015ff103133c7e50a753c198c69bcabc3a5e3b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Jan 2012 01:51:22 +0000 Subject: x86-32: Fix build failure with AUDIT=y, AUDITSYSCALL=n JONGMAN HEO reports: With current linus git (commit a25a2b84), I got following build error, arch/x86/kernel/vm86_32.c: In function 'do_sys_vm86': arch/x86/kernel/vm86_32.c:340: error: implicit declaration of function '__audit_syscall_exit' make[3]: *** [arch/x86/kernel/vm86_32.o] Error 1 OK, I can reproduce it (32bit allmodconfig with AUDIT=y, AUDITSYSCALL=n) It's due to commit d7e7528bcd45: "Audit: push audit success and retcode into arch ptrace.h". Reported-by: JONGMAN HEO Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index af17e1c966dc..b466cab5ba15 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -336,8 +336,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk mark_screen_rdonly(tsk->mm); /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL if (unlikely(current->audit_context)) __audit_syscall_exit(1, 0); +#endif __asm__ __volatile__( "movl %0,%%esp\n\t" -- cgit v1.2.3