diff options
author | Ingo Molnar <mingo@elte.hu> | 2011-01-09 10:42:21 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-01-09 10:42:21 +0100 |
commit | 4385428a477559b26736cc3c80d8b68f31126c71 (patch) | |
tree | 8eb0cbc78e79c368687fa13a1e0674ae537f830f | |
parent | 047a3772feaae8e43d81d790f3d3f80dae8ae676 (diff) | |
parent | 2d75af2f2a7a6103a6d539a492fe81deacabde44 (diff) | |
download | linux-stable-4385428a477559b26736cc3c80d8b68f31126c71.tar.gz linux-stable-4385428a477559b26736cc3c80d8b68f31126c71.tar.bz2 linux-stable-4385428a477559b26736cc3c80d8b68f31126c71.zip |
Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/urgent
165 files changed, 5763 insertions, 2060 deletions
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index a851118775d8..6a8c73f55b80 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -1,18 +1,22 @@ CONFIG_RCU_TRACE debugfs Files and Formats -The rcutree implementation of RCU provides debugfs trace output that -summarizes counters and state. This information is useful for debugging -RCU itself, and can sometimes also help to debug abuses of RCU. -The following sections describe the debugfs files and formats. +The rcutree and rcutiny implementations of RCU provide debugfs trace +output that summarizes counters and state. This information is useful for +debugging RCU itself, and can sometimes also help to debug abuses of RCU. +The following sections describe the debugfs files and formats, first +for rcutree and next for rcutiny. -Hierarchical RCU debugfs Files and Formats +CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats -This implementation of RCU provides three debugfs files under the +These implementations of RCU provides five debugfs files under the top-level directory RCU: rcu/rcudata (which displays fields in struct -rcu_data), rcu/rcugp (which displays grace-period counters), and -rcu/rcuhier (which displays the struct rcu_node hierarchy). +rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of +rcu/rcudata), rcu/rcugp (which displays grace-period counters), +rcu/rcuhier (which displays the struct rcu_node hierarchy), and +rcu/rcu_pending (which displays counts of the reasons that the +rcu_pending() function decided that there was core RCU work to do). The output of "cat rcu/rcudata" looks as follows: @@ -130,7 +134,8 @@ o "ci" is the number of RCU callbacks that have been invoked for been registered in absence of CPU-hotplug activity. o "co" is the number of RCU callbacks that have been orphaned due to - this CPU going offline. + this CPU going offline. These orphaned callbacks have been moved + to an arbitrarily chosen online CPU. o "ca" is the number of RCU callbacks that have been adopted due to other CPUs going offline. Note that ci+co-ca+ql is the number of @@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started. It is The output of "cat rcu/rcuhier" looks as follows, with very long lines: -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0 +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 1/1 .>. 0:127 ^0 3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 rcu_bh: -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0 +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 0/1 .>. 0:127 ^0 0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 @@ -212,11 +217,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that exited immediately (without even being counted in nfqs above) due to contention on ->fqslock. -o "oqlen" is the number of callbacks on the "orphan" callback - list. RCU callbacks are placed on this list by CPUs going - offline, and are "adopted" either by the CPU helping the outgoing - CPU or by the next rcu_barrier*() call, whichever comes first. - o Each element of the form "1/1 0:127 ^0" represents one struct rcu_node. Each line represents one level of the hierarchy, from root to leaves. It is best to think of the rcu_data structures @@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing. Alert readers will note that the rcu "nn" number for a given CPU very closely matches the rcu_bh "np" number for that same CPU. This is due to short-circuit evaluation in rcu_pending(). + + +CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats + +These implementations of RCU provides a single debugfs file under the +top-level directory RCU, namely rcu/rcudata, which displays fields in +rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU, +rcu_preempt_ctrlblk. + +The output of "cat rcu/rcudata" is as follows: + +rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=... + ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274 + normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0 + exp balk: bt=0 nos=0 +rcu_sched: qlen: 0 +rcu_bh: qlen: 0 + +This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the +rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds. +The last three lines of the rcu_preempt section appear only in +CONFIG_RCU_BOOST kernel builds. The fields are as follows: + +o "qlen" is the number of RCU callbacks currently waiting either + for an RCU grace period or waiting to be invoked. This is the + only field present for rcu_sched and rcu_bh, due to the + short-circuiting of grace period in those two cases. + +o "gp" is the number of grace periods that have completed. + +o "g197/p197/c197" displays the grace-period state, with the + "g" number being the number of grace periods that have started + (mod 256), the "p" number being the number of grace periods + that the CPU has responded to (also mod 256), and the "c" + number being the number of grace periods that have completed + (once again mode 256). + + Why have both "gp" and "g"? Because the data flowing into + "gp" is only present in a CONFIG_RCU_TRACE kernel. + +o "tasks" is a set of bits. The first bit is "T" if there are + currently tasks that have recently blocked within an RCU + read-side critical section, the second bit is "N" if any of the + aforementioned tasks are blocking the current RCU grace period, + and the third bit is "E" if any of the aforementioned tasks are + blocking the current expedited grace period. Each bit is "." + if the corresponding condition does not hold. + +o "ttb" is a single bit. It is "B" if any of the blocked tasks + need to be priority boosted and "." otherwise. + +o "btg" indicates whether boosting has been carried out during + the current grace period, with "exp" indicating that boosting + is in progress for an expedited grace period, "no" indicating + that boosting has not yet started for a normal grace period, + "begun" indicating that boosting has bebug for a normal grace + period, and "done" indicating that boosting has completed for + a normal grace period. + +o "ntb" is the total number of tasks subjected to RCU priority boosting + periods since boot. + +o "neb" is the number of expedited grace periods that have had + to resort to RCU priority boosting since boot. + +o "nnb" is the number of normal grace periods that have had + to resort to RCU priority boosting since boot. + +o "j" is the low-order 12 bits of the jiffies counter in hexadecimal. + +o "bt" is the low-order 12 bits of the value that the jiffies counter + will have at the next time that boosting is scheduled to begin. + +o In the line beginning with "normal balk", the fields are as follows: + + o "nt" is the number of times that the system balked from + boosting because there were no blocked tasks to boost. + Note that the system will balk from boosting even if the + grace period is overdue when the currently running task + is looping within an RCU read-side critical section. + There is no point in boosting in this case, because + boosting a running task won't make it run any faster. + + o "gt" is the number of times that the system balked + from boosting because, although there were blocked tasks, + none of them were preventing the current grace period + from completing. + + o "bt" is the number of times that the system balked + from boosting because boosting was already in progress. + + o "b" is the number of times that the system balked from + boosting because boosting had already completed for + the grace period in question. + + o "ny" is the number of times that the system balked from + boosting because it was not yet time to start boosting + the grace period in question. + + o "nos" is the number of times that the system balked from + boosting for inexplicable ("not otherwise specified") + reasons. This can actually happen due to races involving + increments of the jiffies counter. + +o In the line beginning with "exp balk", the fields are as follows: + + o "bt" is the number of times that the system balked from + boosting because there were no blocked tasks to boost. + + o "nos" is the number of times that the system balked from + boosting for inexplicable ("not otherwise specified") + reasons. diff --git a/Documentation/dontdiff b/Documentation/dontdiff index d9bcffd59433..470d3dba1a69 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -62,6 +62,10 @@ aic7*reg_print.c* aic7*seq.h* aicasm aicdb.h* +altivec1.c +altivec2.c +altivec4.c +altivec8.c asm-offsets.h asm_offsets.h autoconf.h* @@ -76,6 +80,7 @@ btfixupprep build bvmlinux bzImage* +capflags.c classlist.h* comp*.log compile.h* @@ -94,6 +99,7 @@ devlist.h* docproc elf2ecoff elfconfig.h* +evergreen_reg_safe.h fixdep flask.h fore200e_mkfirm @@ -108,9 +114,16 @@ genksyms *_gray256.c ihex2fw ikconfig.h* +inat-tables.c initramfs_data.cpio initramfs_data.cpio.gz initramfs_list +int16.c +int1.c +int2.c +int32.c +int4.c +int8.c kallsyms kconfig keywords.c @@ -140,6 +153,7 @@ mkprep mktables mktree modpost +modules.builtin modules.order modversions.h* ncscope.* @@ -153,14 +167,23 @@ pca200e.bin pca200e_ecd.bin2 piggy.gz piggyback +piggy.S pnmtologo ppc_defs.h* pss_boot.h qconf +r100_reg_safe.h +r200_reg_safe.h +r300_reg_safe.h +r420_reg_safe.h +r600_reg_safe.h raid6altivec*.c raid6int*.c raid6tables.c relocs +rn50_reg_safe.h +rs600_reg_safe.h +rv515_reg_safe.h series setup setup.bin @@ -169,6 +192,7 @@ sImage sm_tbl* split-include syscalltab.h +tables.c tags tftpboot.img timeconst.h @@ -190,6 +214,7 @@ vmlinux vmlinux-* vmlinux.aout vmlinux.lds +voffset.h vsyscall.lds vsyscall_32.lds wanxlfw.inc @@ -200,3 +225,4 @@ wakeup.elf wakeup.lds zImage* zconf.hash.c +zoffset.h diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index 715eaaf1519d..9a8674629a07 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt @@ -537,7 +537,7 @@ Notes: Further information in http://www.oreilly.com/catalog/linuxdrive2/ - * Title: "Linux Device Drivers, 3nd Edition" + * Title: "Linux Device Drivers, 3rd Edition" Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman Publisher: O'Reilly & Associates. Date: 2005. @@ -592,14 +592,6 @@ Pages: 600. ISBN: 0-13-101908-2 - * Title: "The Design and Implementation of the 4.4 BSD UNIX - Operating System" - Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels, - John S. Quarterman. - Publisher: Addison-Wesley. - Date: 1996. - ISBN: 0-201-54979-4 - * Title: "Programming for the real world - POSIX.4" Author: Bill O. Gallmeister. Publisher: O'Reilly & Associates, Inc.. @@ -610,28 +602,13 @@ POSIX. Good reference. * Title: "UNIX Systems for Modern Architectures: Symmetric - Multiprocesssing and Caching for Kernel Programmers" + Multiprocessing and Caching for Kernel Programmers" Author: Curt Schimmel. Publisher: Addison Wesley. Date: June, 1994. Pages: 432. ISBN: 0-201-63338-8 - * Title: "The Design and Implementation of the 4.3 BSD UNIX - Operating System" - Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J. - Karels, John S. Quarterman. - Publisher: Addison-Wesley. - Date: 1989 (reprinted with corrections on October, 1990). - ISBN: 0-201-06196-1 - - * Title: "The Design of the UNIX Operating System" - Author: Maurice J. Bach. - Publisher: Prentice Hall. - Date: 1986. - Pages: 471. - ISBN: 0-13-201757-1 - MISCELLANEOUS: * Name: linux/Documentation diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 992cda68fa63..f3dc951e949f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1614,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file noapic [SMP,APIC] Tells the kernel to not make use of any IOAPICs that may be present in the system. + noautogroup Disable scheduler automatic task group creation. + nobats [PPC] Do not use BATs for mapping kernel lowmem on "Classic" PPC cores. @@ -2459,12 +2461,13 @@ and is between 256 and 4096 characters. It is defined in the file to facilitate early boot debugging. See also Documentation/trace/events.txt - tsc= Disable clocksource-must-verify flag for TSC. + tsc= Disable clocksource stability checks for TSC. Format: <string> [x86] reliable: mark tsc clocksource as reliable, this - disables clocksource verification at runtime. - Used to enable high-resolution timer mode on older - hardware, and in virtualized environment. + disables clocksource verification at runtime, as well + as the stability checks done at bootup. Used to enable + high-resolution timer mode on older hardware, and in + virtualized environment. [x86] noirqtime: Do not use TSC to do irq accounting. Used to run time disable IRQ_TIME_ACCOUNTING on any platforms where RDTSC is slow and this accounting diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 30b43e1b2697..bdeb81ccb5f6 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -600,6 +600,7 @@ Protocol: 2.07+ 0x00000001 lguest 0x00000002 Xen 0x00000003 Moorestown MID + 0x00000004 CE4100 TV Platform Field name: hardware_subarch_data Type: write (subarch-dependent) diff --git a/MAINTAINERS b/MAINTAINERS index b1dda78a1e75..c5c7292daba0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2812,6 +2812,10 @@ M: Thomas Gleixner <tglx@linutronix.de> S: Maintained F: Documentation/timers/ F: kernel/hrtimer.c +F: kernel/time/clockevents.c +F: kernel/time/tick*.* +F: kernel/time/timer_*.c +F include/linux/clockevents.h F: include/linux/hrtimer.h HIGH-SPEED SCC DRIVER FOR AX.25 @@ -5142,6 +5146,18 @@ L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported F: sound/soc/s3c24xx +TIMEKEEPING, NTP +M: John Stultz <johnstul@us.ibm.com> +M: Thomas Gleixner <tglx@linutronix.de> +S: Supported +F: include/linux/clocksource.h +F: include/linux/time.h +F: include/linux/timex.h +F: include/linux/timekeeping.h +F: kernel/time/clocksource.c +F: kernel/time/time*.c +F: kernel/time/ntp.c + TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie <shijie8@gmail.com> M: Kang Yong <kangyong@telegent.com> diff --git a/arch/Kconfig b/arch/Kconfig index 8bf0fa652eb6..f78c2be4242b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI config HAVE_ARCH_JUMP_LABEL bool +config HAVE_ARCH_MUTEX_CPU_RELAX + bool + source "kernel/gcov/Kconfig" diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e0b98e71ff47..6c6d7b339aae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -99,6 +99,7 @@ config S390 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_GET_USER_PAGES_FAST + select HAVE_ARCH_MUTEX_CPU_RELAX select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h index 458c1f7fbc18..688271f5f2e4 100644 --- a/arch/s390/include/asm/mutex.h +++ b/arch/s390/include/asm/mutex.h @@ -7,3 +7,5 @@ */ #include <asm-generic/mutex-dec.h> + +#define arch_mutex_cpu_relax() barrier() diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e330da21b84f..b6fccb07123e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -377,6 +377,18 @@ config X86_ELAN If unsure, choose "PC-compatible" instead. +config X86_INTEL_CE + bool "CE4100 TV platform" + depends on PCI + depends on PCI_GODIRECT + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + select X86_REBOOTFIXUPS + ---help--- + Select for the Intel CE media processor (CE4100) SOC. + This option compiles in support for the CE4100 SOC for settop + boxes and media devices. + config X86_MRST bool "Moorestown MID platform" depends on PCI @@ -385,6 +397,10 @@ config X86_MRST depends on X86_EXTENDED_PLATFORM depends on X86_IO_APIC select APB_TIMER + select I2C + select SPI + select INTEL_SCU_IPC + select X86_PLATFORM_DEVICES ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -466,6 +482,19 @@ config X86_ES7000 Support for Unisys ES7000 systems. Say 'Y' here if this kernel is supposed to run on an IA32-based Unisys ES7000 system. +config X86_32_IRIS + tristate "Eurobraille/Iris poweroff module" + depends on X86_32 + ---help--- + The Iris machines from EuroBraille do not have APM or ACPI support + to shut themselves down properly. A special I/O sequence is + needed to do so, which is what this module does at + kernel shutdown. + + This is only for Iris machines from EuroBraille. + + If unused, say N. + config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" @@ -1141,16 +1170,16 @@ config NUMA comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -config K8_NUMA +config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" depends on X86_64 && NUMA && PCI ---help--- - Enable K8 NUMA node topology detection. You should say Y here if - you have a multi processor AMD K8 system. This uses an old - method to read the NUMA configuration directly from the builtin - Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA - instead, which also takes priority if both are compiled in. + Enable AMD NUMA node topology detection. You should say Y here if + you have a multi processor AMD system. This uses an old method to + read the NUMA configuration directly from the builtin Northbridge + of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, + which also takes priority if both are compiled in. config X86_64_ACPI_NUMA def_bool y diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index b59ee765414e..45143bbcfe5e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_SET_MODULE_RONX + bool "Set loadable kernel module data as NX and text as RO" + depends on MODULES + ---help--- + This option helps catch unintended modifications to loadable + kernel module's text and read-only data. It also prevents execution + of module data. Such protection may interfere with run-time code + patching and dynamic kernel tracing - and they might also protect + against certain classes of kernel exploits. + If in doubt, say "N". + config DEBUG_NX_TEST tristate "Testcase for the NX non-executable stack feature" depends on DEBUG_KERNEL && m diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 52f85a196fa0..35af09d13dc1 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -182,7 +182,7 @@ no_longmode: hlt jmp 1b -#include "../../kernel/verify_cpu_64.S" +#include "../../kernel/verify_cpu.S" /* * Be careful here startup_64 needs to be at a predictable diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4a2adaa9aefc..13009d1af99a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); extern int alternatives_text_reserved(void *start, void *end); +extern bool skip_smp_alternatives; #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index c8517f81b21e..6aee50d655d1 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -3,36 +3,53 @@ #include <linux/pci.h> -extern struct pci_device_id k8_nb_ids[]; +extern struct pci_device_id amd_nb_misc_ids[]; struct bootnode; -extern int early_is_k8_nb(u32 value); -extern int cache_k8_northbridges(void); -extern void k8_flush_garts(void); -extern int k8_get_nodes(struct bootnode *nodes); -extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); -extern int k8_scan_nodes(void); +extern int early_is_amd_nb(u32 value); +extern int amd_cache_northbridges(void); +extern void amd_flush_garts(void); +extern int amd_get_nodes(struct bootnode *nodes); +extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); +extern int amd_scan_nodes(void); -struct k8_northbridge_info { +struct amd_northbridge { + struct pci_dev *misc; +}; + +struct amd_northbridge_info { u16 num; - u8 gart_supported; - struct pci_dev **nb_misc; + u64 flags; + struct amd_northbridge *nb; }; -extern struct k8_northbridge_info k8_northbridges; +extern struct amd_northbridge_info amd_northbridges; + +#define AMD_NB_GART 0x1 +#define AMD_NB_L3_INDEX_DISABLE 0x2 #ifdef CONFIG_AMD_NB -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline int amd_nb_num(void) { - return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; + return amd_northbridges.num; } -#else +static inline int amd_nb_has_feature(int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} -static inline struct pci_dev *node_to_k8_nb_misc(int node) +static inline struct amd_northbridge *node_to_amd_nb(int node) { - return NULL; + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; } + +#else + +#define amd_nb_num(x) 0 +#define amd_nb_has_feature(x) false +#define node_to_amd_nb(x) NULL + #endif diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f6ce0bda3b98..cf12007796db 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); extern void enable_NMI_through_LVT0(void); +extern int apic_force_enable(void); /* * On 32bit this is mach-xxx local diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index a859ca461fb0..47a30ff8e517 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -145,6 +145,7 @@ #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 +# define MAX_LOCAL_APIC 256 #else # define MAX_IO_APICS 128 # define MAX_LOCAL_APIC 32768 diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 8e6218550e77..c8bfe63a06de 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -124,6 +124,7 @@ enum { X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, X86_SUBARCH_MRST, + X86_SUBARCH_CE4100, X86_NR_SUBARCHS, }; diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 9479a037419f..0141b234406f 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -117,6 +117,10 @@ enum fixed_addresses { FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ __end_of_permanent_fixed_addresses, + +#ifdef CONFIG_X86_MRST + FIX_LNW_VRTC, +#endif /* * 256 temporary boot-time mappings, used by early_ioremap(), * before ioremap() is functional. diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 4aa2bb3b242a..ef328901c802 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) int err; /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxrstorq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err) + : [fx] "m" (*fx), "0" (0)); +#else asm volatile("1: rex64/fxrstor (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) : [fx] "R" (fx), "m" (*fx), "0" (0)); +#endif return err; } @@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) return -EFAULT; /* See comment in fxsave() below. */ +#ifdef CONFIG_AS_FXSAVEQ + asm volatile("1: fxsaveq %[fx]\n\t" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl $-1,%[err]\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : [err] "=r" (err), [fx] "=m" (*fx) + : "0" (0)); +#else asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) : [fx] "R" (fx), "0" (0)); +#endif if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) err = -EFAULT; diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a6b28d017c2f..0c5ca4e30d7b 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -159,7 +159,7 @@ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); void setup_IO_APIC_irq_extra(u32 gsi); -extern void ioapic_init_mappings(void); +extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); @@ -168,10 +168,9 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void probe_nr_irqs_gsi(void); extern int get_nr_irqs_gsi(void); - extern void setup_ioapic_ids_from_mpc(void); +extern void setup_ioapic_ids_from_mpc_nocheck(void); struct mp_ioapic_gsi{ u32 gsi_base; @@ -189,9 +188,8 @@ extern void __init pre_init_apic_IRQ0(void); #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; -static inline void ioapic_init_mappings(void) { } +static inline void ioapic_and_gsi_init(void) { } static inline void ioapic_insert_resources(void) { } -static inline void probe_nr_irqs_gsi(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c62c13cb9788..eb16e94ae04f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); +/* Interrupt Handler for core thermal thresholds */ +extern int (*platform_thermal_notify)(__u64 msr_val); + #ifdef CONFIG_X86_THERMAL_VECTOR extern void mcheck_intel_therm_init(void); #else diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index ef51b501e22a..24215072d0e1 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); + +static inline void get_ucode_data(void *to, const u8 *from, size_t n) +{ + memcpy(to, from, n); +} + #else static inline struct microcode_ops * __init init_amd_microcode(void) { diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c82868e9f905..0c90dd9f0505 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -5,8 +5,9 @@ #include <asm/mpspec_def.h> #include <asm/x86_init.h> +#include <asm/apicdef.h> -extern int apic_version[MAX_APICS]; +extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) +#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) struct physid_mask { unsigned long mask[PHYSID_ARRAY_SIZE]; @@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t; test_and_set_bit(physid, (map).mask) #define physids_and(dst, src1, src2) \ - bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) #define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) + bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) #define physids_clear(map) \ - bitmap_zero((map).mask, MAX_APICS) + bitmap_zero((map).mask, MAX_LOCAL_APIC) #define physids_complement(dst, src) \ - bitmap_complement((dst).mask, (src).mask, MAX_APICS) + bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC) #define physids_empty(map) \ - bitmap_empty((map).mask, MAX_APICS) + bitmap_empty((map).mask, MAX_LOCAL_APIC) #define physids_equal(map1, map2) \ - bitmap_equal((map1).mask, (map2).mask, MAX_APICS) + bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC) #define physids_weight(map) \ - bitmap_weight((map).mask, MAX_APICS) + bitmap_weight((map).mask, MAX_LOCAL_APIC) #define physids_shift_right(d, s, n) \ - bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) + bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC) #define physids_shift_left(d, s, n) \ - bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) + bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC) static inline unsigned long physids_coerce(physid_mask_t *map) { @@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map) map->mask[0] = physids; } -/* Note: will create very large stack frames if physid_mask_t is big */ -#define physid_mask_of_physid(physid) \ - ({ \ - physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ - physid_set(physid, __physid_mask); \ - __physid_mask; \ - }) - static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) { physids_clear(*map); diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index 4a7f96d7c188..c0a955a9a087 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -15,13 +15,6 @@ #ifdef CONFIG_X86_32 # define MAX_MPC_ENTRY 1024 -# define MAX_APICS 256 -#else -# if NR_CPUS <= 255 -# define MAX_APICS 255 -# else -# define MAX_APICS 32768 -# endif #endif /* Intel MP Floating Pointer Structure */ diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h new file mode 100644 index 000000000000..73668abdbedf --- /dev/null +++ b/arch/x86/include/asm/mrst-vrtc.h @@ -0,0 +1,9 @@ +#ifndef _MRST_VRTC_H +#define _MRST_VRTC_H + +extern unsigned char vrtc_cmos_read(unsigned char reg); +extern void vrtc_cmos_write(unsigned char val, unsigned char reg); +extern unsigned long vrtc_get_time(void); +extern int vrtc_set_mmss(unsigned long nowtime); + +#endif diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 4a711a684b17..719f00b28ff5 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -14,7 +14,9 @@ #include <linux/sfi.h> extern int pci_mrst_init(void); -int __init sfi_parse_mrtc(struct sfi_table_header *table); +extern int __init sfi_parse_mrtc(struct sfi_table_header *table); +extern int sfi_mrtc_num; +extern struct sfi_rtc_table_entry sfi_mrtc_array[]; /* * Medfield is the follow-up of Moorestown, it combines two chip solution into @@ -50,4 +52,14 @@ extern void mrst_early_console_init(void); extern struct console early_hsu_console; extern void hsu_early_console_init(void); + +extern void intel_scu_devices_create(void); +extern void intel_scu_devices_destroy(void); + +/* VRTC timer */ +#define MRST_VRTC_MAP_SZ (1024) +/*#define MRST_VRTC_PGOFFSET (0xc00) */ + +extern void mrst_rtc_init(void); + #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 86030f63ba02..4d0dfa0d998e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -257,6 +257,18 @@ #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) #define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) +/* Thermal Thresholds Support */ +#define THERM_INT_THRESHOLD0_ENABLE (1 << 15) +#define THERM_SHIFT_THRESHOLD0 8 +#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0) +#define THERM_INT_THRESHOLD1_ENABLE (1 << 23) +#define THERM_SHIFT_THRESHOLD1 16 +#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1) +#define THERM_STATUS_THRESHOLD0 (1 << 6) +#define THERM_LOG_THRESHOLD0 (1 << 7) +#define THERM_STATUS_THRESHOLD1 (1 << 8) +#define THERM_LOG_THRESHOLD1 (1 << 9) + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ef9975812c77..7709c12431b8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -112,7 +112,7 @@ static inline void arch_safe_halt(void) static inline void halt(void) { - PVOP_VCALL0(pv_irq_ops.safe_halt); + PVOP_VCALL0(pv_irq_ops.halt); } static inline void wbinvd(void) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ca0437c714b2..676129229630 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -65,6 +65,7 @@ extern unsigned long pci_mem_start; #define PCIBIOS_MIN_CARDBUS_IO 0x4000 +extern int pcibios_enabled; void pcibios_config_init(void); struct pci_bus *pcibios_scan_root(int bus); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index d6763b139a84..db8aa19a08a2 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void); static inline void x86_mrst_early_setup(void) { } #endif +#ifdef CONFIG_X86_INTEL_CE +extern void x86_ce4100_early_setup(void); +#else +static inline void x86_ce4100_early_setup(void) { } +#endif + #ifndef _SETUP /* diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 42d412fd8b02..ce1d54c8a433 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -26,20 +26,22 @@ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. * - * We will use 31 sets, one for sending BAU messages from each of the 32 + * We will use one set for sending BAU messages from each of the * cpu's on the uvhub. * * TLB shootdown will use the first of the 8 descriptors of each set. * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). */ +#define MAX_CPUS_PER_UVHUB 64 +#define MAX_CPUS_PER_SOCKET 32 +#define UV_ADP_SIZE 64 /* hardware-provided max. */ +#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */ #define UV_ITEMS_PER_DESCRIPTOR 8 /* the 'throttle' to prevent the hardware stay-busy bug */ #define MAX_BAU_CONCURRENT 3 -#define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 -#define UV_ADP_SIZE 32 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 #define UV_NET_ENDPOINT_INTD 0x38 @@ -100,7 +102,6 @@ * number of destination side software ack resources */ #define DEST_NUM_RESOURCES 8 -#define MAX_CPUS_PER_NODE 32 /* * completion statuses for sending a TLB flush message */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 1e994754d323..34244b2cd880 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -85,7 +85,6 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 71232b941b6c..17c8090fabd4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; + if (id >= (MAX_LOCAL_APIC-1)) { + printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + return; + } + if (!enabled) { ++disabled_cpus; return; @@ -910,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_register_lapic_address(acpi_lapic_addr); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, - acpi_parse_sapic, MAX_APICS); + acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_APICS); + acpi_parse_x2apic, MAX_LOCAL_APIC); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_APICS); + acpi_parse_lapic, MAX_LOCAL_APIC); } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 553d0b0d639b..123608531c8f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) mutex_unlock(&smp_alt); } +bool skip_smp_alternatives; void alternatives_smp_switch(int smp) { struct smp_alt_module *mod; @@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp) printk("lockdep: fixing up alternatives.\n"); #endif - if (noreplace_smp || smp_alt_once) + if (noreplace_smp || smp_alt_once || skip_smp_alternatives) return; BUG_ON(!smp && (num_online_cpus() > 1)); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 8f6463d8ed0d..affacb5e0065 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,95 +12,116 @@ static u32 *flush_words; -struct pci_device_id k8_nb_ids[] = { +struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; -EXPORT_SYMBOL(k8_nb_ids); +EXPORT_SYMBOL(amd_nb_misc_ids); -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +static struct pci_dev *next_northbridge(struct pci_dev *dev, + struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); if (!dev) break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); + } while (!pci_match_id(ids, dev)); return dev; } -int cache_k8_northbridges(void) +int amd_cache_northbridges(void) { - int i; - struct pci_dev *dev; + int i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc; - if (k8_northbridges.num) + if (amd_nb_num()) return 0; - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; - /* some CPU families (e.g. family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) - k8_northbridges.gart_supported = 1; + if (i == 0) + return 0; - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) + nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) return -ENOMEM; - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; - return 0; - } + amd_northbridges.nb = nb; + amd_northbridges.num = i; - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges.nb_misc); - return -ENOMEM; - } - } + misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + } + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges.nb_misc[i] = NULL; return 0; } -EXPORT_SYMBOL_GPL(cache_k8_northbridges); +EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* Ignores subdevice/subvendor but as far as I can figure out they're useless anyways */ -int __init early_is_k8_nb(u32 device) +int __init early_is_amd_nb(u32 device) { struct pci_device_id *id; u32 vendor = device & 0xffff; device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return 1; return 0; } -void k8_flush_garts(void) +int amd_cache_gart(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + +void amd_flush_garts(void) { int flushed, i; unsigned long flags; static DEFINE_SPINLOCK(gart_lock); - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; /* Avoid races between AGP and IOMMU. In theory it's not needed @@ -109,16 +130,16 @@ void k8_flush_garts(void) that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); flushed++; } - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w); if (!(w & 1)) break; @@ -129,19 +150,23 @@ void k8_flush_garts(void) if (!flushed) printk("nothing to flush?\n"); } -EXPORT_SYMBOL_GPL(k8_flush_garts); +EXPORT_SYMBOL_GPL(amd_flush_garts); -static __init int init_k8_nbs(void) +static __init int init_amd_nbs(void) { int err = 0; - err = cache_k8_northbridges(); + err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + + if (amd_cache_gart() < 0) + printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " + "GART support disabled.\n"); return err; } /* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 92543c73cf8e..7c9ab59653e8 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -315,6 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev) if (system_state == SYSTEM_BOOTING) { irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* APB timer irqs are set up as mp_irqs, timer is edge type */ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); if (request_irq(adev->irq, apbt_interrupt_handler, diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b3a16e8f0703..dcd7c83e1659 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * Do an PCI bus scan by hand because we're running before the PCI * subsystem. * - * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan * generically. It's probably overkill to always scan all slots because * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. @@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; iommu_detected = 1; @@ -518,7 +518,7 @@ out: dev_base = bus_dev_ranges[i].dev_base; dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fb7657822aad..879999a5230f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -431,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) reserved = reserve_eilvt_offset(offset, new); if (reserved != new) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " - "vector 0x%x was already reserved by another core, " - "APIC%lX=0x%x\n", - smp_processor_id(), new, reserved, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); return -EINVAL; } if (!eilvt_entry_is_changeable(old, new)) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " - "register already in use, APIC%lX=0x%x\n", - smp_processor_id(), new, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); return -EBUSY; } @@ -1532,13 +1533,60 @@ static int __init detect_init_APIC(void) return 0; } #else + +static int apic_verify(void) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warning("Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + pr_info("Found and enabled local APIC!\n"); + return 0; +} + +int apic_force_enable(void) +{ + u32 h, l; + + if (disable_apic) + return -1; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + return apic_verify(); +} + /* * Detect and initialize APIC */ static int __init detect_init_APIC(void) { - u32 h, l, features; - /* Disabled by kernel option? */ if (disable_apic) return -1; @@ -1568,38 +1616,12 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - pr_info("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); - return -1; + if (apic_force_enable()) + return -1; + } else { + if (apic_verify()) + return -1; } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); @@ -1687,7 +1709,7 @@ void __init init_apic_mappings(void) * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int apic_version[MAX_APICS]; +int apic_version[MAX_LOCAL_APIC]; int __init APIC_init_uniprocessor(void) { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 16c2db8750a2..f6cd5b410770 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1933,8 +1933,7 @@ void disable_IO_APIC(void) * * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ - -void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc_nocheck(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -1943,15 +1942,6 @@ void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (acpi_ioapic) - return; - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. @@ -2005,7 +1995,6 @@ void __init setup_ioapic_ids_from_mpc(void) physids_or(phys_id_present_map, phys_id_present_map, tmp); } - /* * We need to adjust the IRQ routing table * if the ID changed. @@ -2041,6 +2030,21 @@ void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} #endif int no_timer_check __initdata; @@ -3593,7 +3597,7 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries + 1; } -void __init probe_nr_irqs_gsi(void) +static void __init probe_nr_irqs_gsi(void) { int nr; @@ -3910,7 +3914,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_init_mappings(void) +void __init ioapic_and_gsi_init(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -3948,6 +3952,8 @@ fake_ioapic_page: ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } + + probe_nr_irqs_gsi(); } void __init ioapic_insert_resources(void) @@ -4057,7 +4063,8 @@ void __init pre_init_apic_IRQ0(void) printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); #endif /* Make sure the irq descriptor is set up */ cfg = alloc_irq_and_cfg_at(0, 0); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 927902d90fe6..936613e77113 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -48,6 +48,16 @@ unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); +static unsigned long __init uv_early_read_mmr(unsigned long addr) +{ + unsigned long val, *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + return val; +} + static inline bool is_GRU_range(u64 start, u64 end) { return start >= gru_start_paddr && end <= gru_end_paddr; @@ -58,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int early_get_nodeid(void) +static int __init early_get_pnodeid(void) { union uvh_node_id_u node_id; - unsigned long *mmr; - - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); - node_id.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + union uvh_rh_gam_config_mmr_u m_n_config; + int pnode; /* Currently, all blades have same revision number */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); uv_min_hub_revision_id = node_id.s.revision; - return node_id.s.node_id; + pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + return pnode; } static void __init early_get_apic_pnode_shift(void) { - unsigned long *mmr; - - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr)); - uvh_apicid.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); if (!uvh_apicid.v) /* * Old bios, use default value @@ -95,21 +101,17 @@ static void __init early_get_apic_pnode_shift(void) static void __init uv_set_apicid_hibit(void) { union uvh_lb_target_physical_apic_id_mask_u apicid_mask; - unsigned long *mmr; - mmr = early_ioremap(UV_LOCAL_MMR_BASE | - UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr)); - apicid_mask.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK); uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; } static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int nodeid; + int pnodeid; if (!strcmp(oem_id, "SGI")) { - nodeid = early_get_nodeid(); + pnodeid = early_get_pnodeid(); early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; @@ -119,7 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - nodeid << (uvh_apicid.s.pnode_shift - 1); + pnodeid << uvh_apicid.s.pnode_shift; uv_system_type = UV_NON_UNIQUE_APIC; uv_set_apicid_hibit(); return 1; @@ -682,27 +684,32 @@ void uv_nmi_init(void) void __init uv_system_init(void) { union uvh_rh_gam_config_mmr_u m_n_config; + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io; int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; - unsigned short pnode_mask; + unsigned short pnode_mask, pnode_io_mask; map_low_mmrs(); m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); + n_io = mmioh.s.n_io; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; pnode_mask = (1 << n_val) - 1; + pnode_io_mask = (1 << n_io) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", - n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n", + n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask); printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); @@ -735,7 +742,7 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - pnode = (i * 64 + j); + pnode = (i * 64 + j) & pnode_mask; uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; @@ -756,6 +763,7 @@ void __init uv_system_init(void) /* * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); */ + uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); @@ -772,7 +780,6 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; @@ -796,7 +803,7 @@ void __init uv_system_init(void) map_gru_high(max_pnode); map_mmr_high(max_pnode); - map_mmioh_high(max_pnode); + map_mmioh_high(max_pnode & pnode_io_mask); uv_cpu_init(); uv_scir_register_cpu_notifier(); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 17ad03366211..9ecf81f9b90f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx { }; struct amd_l3_cache { - struct pci_dev *dev; - bool can_disable; + struct amd_northbridge *nb; unsigned indices; u8 subcaches[4]; }; @@ -311,14 +310,12 @@ struct _cache_attr { /* * L3 cache descriptors */ -static struct amd_l3_cache **__cpuinitdata l3_caches; - static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->dev, 0x1C4, &val); + pci_read_config_dword(l3->nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -{ - struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_k8_nb_misc(node); - - l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); - if (!l3) { - printk(KERN_WARNING "Error allocating L3 struct\n"); - return NULL; - } - - l3->dev = dev; - - amd_calc_l3_indices(l3); - - return l3; -} - -static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, + int index) { + static struct amd_l3_cache *__cpuinitdata l3_caches; int node; - if (boot_cpu_data.x86 != 0x10) - return; - - if (index < 3) - return; - - /* see errata #382 and #388 */ - if (boot_cpu_data.x86_model < 0x8) - return; - - if ((boot_cpu_data.x86_model == 0x8 || - boot_cpu_data.x86_model == 0x9) - && - boot_cpu_data.x86_mask < 0x1) - return; - - /* not in virtualized environments */ - if (k8_northbridges.num == 0) + /* only for L3, and not in virtualized environments */ + if (index < 3 || amd_nb_num() == 0) return; /* @@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); + int size = amd_nb_num() * sizeof(struct amd_l3_cache); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -387,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, node = amd_get_nb_id(smp_processor_id()); - if (!l3_caches[node]) { - l3_caches[node] = amd_init_l3_cache(node); - l3_caches[node]->can_disable = true; + if (!l3_caches[node].nb) { + l3_caches[node].nb = node_to_amd_nb(node); + amd_calc_l3_indices(&l3_caches[node]); } - WARN_ON(!l3_caches[node]); - - this_leaf->l3 = l3_caches[node]; + this_leaf->l3 = &l3_caches[node]; } /* @@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; index = amd_get_l3_disable_slot(this_leaf->l3, slot); @@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, if (!l3->subcaches[i]) continue; - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); } } @@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); #else /* CONFIG_AMD_NB */ -static void __cpuinit -amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) -{ -}; +#define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ static int @@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(this_leaf, index); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -983,30 +944,48 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -#define DEFAULT_SYSFS_CACHE_ATTRS \ - &type.attr, \ - &level.attr, \ - &coherency_line_size.attr, \ - &physical_line_partition.attr, \ - &ways_of_associativity.attr, \ - &number_of_sets.attr, \ - &size.attr, \ - &shared_cpu_map.attr, \ - &shared_cpu_list.attr - static struct attribute *default_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, NULL }; -static struct attribute *default_l3_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, #ifdef CONFIG_AMD_NB - &cache_disable_0.attr, - &cache_disable_1.attr, +static struct attribute ** __cpuinit amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = sizeof (default_attrs) / sizeof (struct attribute *); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + return attrs; +} #endif - NULL -}; static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->l3 && this_leaf->l3->can_disable) - ktype_cache.default_attrs = default_l3_attrs; - else - ktype_cache.default_attrs = default_attrs; - + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->l3) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 80c482382d5c..5bf2fac52aca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -31,8 +31,6 @@ #include <asm/mce.h> #include <asm/msr.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" #define NR_BANKS 6 #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -59,12 +57,6 @@ struct threshold_block { struct list_head miscj; }; -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - struct threshold_bank { struct kobject *kobj; struct threshold_block *blocks; @@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void); struct thresh_restart { struct threshold_block *b; int reset; + int set_lvt_off; + int lvt_off; u16 old_limit; }; +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + /* must be called with correct cpu affinity */ /* Called via smp_call_function_single() */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; + u32 hi, lo; - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) tr->reset = 1; /* limit cannot be lower than err count */ if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | (THRESHOLD_MAX - tr->b->threshold_limit); } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + + int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + hi = (hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); + (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (hi &= ~MASK_INT_TYPE_HI); - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; } /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { + struct threshold_block b; unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - struct thresh_restart tr; - int lvt_off = -1; - u8 offset; + int offset = -1; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif - offset = (high & MASK_LVTOFF_HI) >> 20; - if (lvt_off < 0) { - if (setup_APIC_eilvt(offset, - THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0)) { - pr_err(FW_BUG "cpu %d, failed to " - "setup threshold interrupt " - "for bank %d, block %d " - "(MSR%08X=0x%x%08x)", - smp_processor_id(), bank, block, - address, high, low); - continue; - } - lvt_off = offset; - } else if (lvt_off != offset) { - pr_err(FW_BUG "cpu %d, invalid threshold " - "interrupt offset %d for bank %d," - "block %d (MSR%08X=0x%x%08x)", - smp_processor_id(), lvt_off, bank, - block, address, high, low); - continue; - } - - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); + offset = setup_APIC_mce(offset, + (high & MASK_LVTOFF_HI) >> 20); - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; } } @@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) b->interrupt_enable = !!new; + memset(&tr, 0, sizeof(tr)); tr.b = b; - tr.reset = 0; - tr.old_limit = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) if (new < 1) new = 1; + memset(&tr, 0, sizeof(tr)); tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; - tr.reset = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu) continue; err = threshold_create_bank(cpu, bank); if (err) - goto out; + return err; } -out: + return err; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4b683267eca5..e12246ff5aa6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -53,8 +53,13 @@ struct thermal_state { struct _thermal_state core_power_limit; struct _thermal_state package_throttle; struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; }; +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -200,6 +205,22 @@ static int therm_throt_process(bool new_event, int event, int level) return 0; } +static int thresh_event_valid(int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + return 1; +} + #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, @@ -313,6 +334,22 @@ device_initcall(thermal_throttle_init_device); #define PACKAGE_THROTTLED ((__u64)2 << 62) #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1)) + platform_thermal_notify(msr_val); +} + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { @@ -321,6 +358,9 @@ static void intel_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 4572f25f9325..cd28a350f7f9 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_X86_MRST_EARLY_PRINTK +#ifdef CONFIG_EARLY_PRINTK_MRST if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); @@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf) hsu_early_console_init(); early_console_register(&early_hsu_console, keep); } - #endif buf++; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3afb33f14d2d..298448656b60 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> +#include <linux/module.h> #include <trace/syscall.h> @@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code); int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + set_all_modules_text_rw(); modifying_code = 1; return 0; } @@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void) int ftrace_arch_code_modify_post_process(void) { modifying_code = 0; + set_all_modules_text_ro(); set_kernel_text_ro(); return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 763310165fa0..7f138b3c3c52 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -61,6 +61,9 @@ void __init i386_start_kernel(void) case X86_SUBARCH_MRST: x86_mrst_early_setup(); break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; default: i386_default_early_setup(); break; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index c0dbd9ac24f0..9f54b209c378 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -139,39 +139,6 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_PARAVIRT - /* This is can only trip for a broken bootloader... */ - cmpw $0x207, pa(boot_params + BP_version) - jb default_entry - - /* Paravirt-compatible boot parameters. Look to see what architecture - we're booting under. */ - movl pa(boot_params + BP_hardware_subarch), %eax - cmpl $num_subarch_entries, %eax - jae bad_subarch - - movl pa(subarch_entries)(,%eax,4), %eax - subl $__PAGE_OFFSET, %eax - jmp *%eax - -bad_subarch: -WEAK(lguest_entry) -WEAK(xen_entry) - /* Unknown implementation; there's really - nothing we can do at this point. */ - ud2a - - __INITDATA - -subarch_entries: - .long default_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ - .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ -num_subarch_entries = (. - subarch_entries) / 4 -.previous -#endif /* CONFIG_PARAVIRT */ - /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond __brk_base. The variable @@ -181,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -default_entry: #ifdef CONFIG_X86_PAE /* @@ -261,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(initial_page_table+0xffc) #endif - jmp 3f + +#ifdef CONFIG_PARAVIRT + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) + jb default_entry + + /* Paravirt-compatible boot parameters. Look to see what architecture + we're booting under. */ + movl pa(boot_params + BP_hardware_subarch), %eax + cmpl $num_subarch_entries, %eax + jae bad_subarch + + movl pa(subarch_entries)(,%eax,4), %eax + subl $__PAGE_OFFSET, %eax + jmp *%eax + +bad_subarch: +WEAK(lguest_entry) +WEAK(xen_entry) + /* Unknown implementation; there's really + nothing we can do at this point. */ + ud2a + + __INITDATA + +subarch_entries: + .long default_entry /* normal x86/PC */ + .long lguest_entry /* lguest hypervisor */ + .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ +num_subarch_entries = (. - subarch_entries) / 4 +.previous +#else + jmp default_entry +#endif /* CONFIG_PARAVIRT */ + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but @@ -282,7 +283,7 @@ ENTRY(startup_32_smp) movl %eax,%fs movl %eax,%gs #endif /* CONFIG_SMP */ -3: +default_entry: /* * New page tables may be in 4Mbyte page mode and may @@ -316,6 +317,10 @@ ENTRY(startup_32_smp) subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax ja 6f + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + mov $0x80000001, %eax cpuid /* Execute Disable bit supported? */ @@ -611,6 +616,8 @@ ignore_int: #endif iret +#include "verify_cpu.S" + __REFDATA .align 4 ENTRY(initial_code) @@ -622,13 +629,13 @@ ENTRY(initial_code) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -ENTRY(initial_pg_pmd) +initial_pg_pmd: .fill 1024*KPMDS,4,0 #else ENTRY(initial_page_table) .fill 1024,4,0 #endif -ENTRY(initial_pg_fixmap) +initial_pg_fixmap: .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index ce0cb4721c9a..0fe6d1a66c38 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu) return 0; } -static int get_ucode_data(void *to, const u8 *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static void * get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) { @@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; - if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) - return NULL; + get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); if (section_hdr[0] != UCODE_UCODE_TYPE) { pr_err("error: invalid type field in container file section header\n"); @@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; } - mc = vmalloc(UCODE_MAX_SIZE); - if (mc) { - memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, - total_size)) { - vfree(mc); - mc = NULL; - } else - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; - } + mc = vzalloc(UCODE_MAX_SIZE); + if (!mc) + return NULL; + + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; + return mc; } @@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf) unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; - if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) - return 0; + get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); size = buf_pos[2]; @@ -219,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf) } buf += UCODE_CONTAINER_HEADER_SIZE; - if (get_ucode_data(equiv_cpu_table, buf, size)) { - vfree(equiv_cpu_table); - return 0; - } + get_ucode_data(equiv_cpu_table, buf, size); return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba0f0ca9f280..c01ffa5b9b87 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -143,7 +143,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { - k8_flush_garts(); + amd_flush_garts(); need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -561,17 +561,17 @@ static void enable_gart_translations(void) { int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; enable_gart_translation(dev, __pa(agp_gatt_table)); } /* Flush the GART-TLB to remove stale entries */ - k8_flush_garts(); + amd_flush_garts(); } /* @@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* * Don't enable translations just yet. That is the next @@ -644,7 +644,7 @@ static struct sys_device device_gart = { * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. */ -static __init int init_k8_gatt(struct agp_kern_info *info) +static __init int init_amd_gatt(struct agp_kern_info *info) { unsigned aper_size, gatt_size, new_aper_size; unsigned aper_base, new_aper_base; @@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < k8_northbridges.num; i++) { - dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 ctl; - dev = k8_northbridges.nb_misc[i]; + dev = node_to_amd_nb(i)->misc; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -749,14 +749,14 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ + /* Add other AMD AGP bridge drivers here */ no_agp = no_agp || (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); @@ -765,7 +765,7 @@ int __init gart_iommu_init(void) if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { + (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); pr_warning("falling back to iommu=soft.\n"); diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index fda313ebbb03..c8e41e90f59c 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev) outb(1, 0x92); } +static void ce4100_reset(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < 10; i++) { + outb(0x2, 0xcf9); + udelay(50); + } +} + struct device_fixup { unsigned int vendor; unsigned int device; void (*reboot_fixup)(struct pci_dev *); }; +/* + * PCI ids solely used for fixups_table go here + */ +#define PCI_DEVICE_ID_INTEL_CE4100 0x0708 + static const struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, { PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, +{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset }, }; /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a0f52af256a0..d3cfe26c0252 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -705,7 +705,7 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { int acpi = 0; - int k8 = 0; + int amd = 0; unsigned long flags; #ifdef CONFIG_X86_32 @@ -991,12 +991,12 @@ void __init setup_arch(char **cmdline_p) acpi = acpi_numa_init(); #endif -#ifdef CONFIG_K8_NUMA +#ifdef CONFIG_AMD_NUMA if (!acpi) - k8 = !k8_numa_init(0, max_pfn); + amd = !amd_numa_init(0, max_pfn); #endif - initmem_init(0, max_pfn, acpi, k8); + initmem_init(0, max_pfn, acpi, amd); memblock_find_dma_reserve(); dma32_reserve_bootmem(); @@ -1045,10 +1045,7 @@ void __init setup_arch(char **cmdline_p) #endif init_apic_mappings(); - ioapic_init_mappings(); - - /* need to wait for io_apic is mapped */ - probe_nr_irqs_gsi(); + ioapic_and_gsi_init(); kvm_guest_init(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 68f61ac632e1..ee886fe10ef4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1161,6 +1161,20 @@ out: preempt_enable(); } +void arch_disable_nonboot_cpus_begin(void) +{ + /* + * Avoid the smp alternatives switch during the disable_nonboot_cpus(). + * In the suspend path, we will be back in the SMP mode shortly anyways. + */ + skip_smp_alternatives = true; +} + +void arch_disable_nonboot_cpus_end(void) +{ + skip_smp_alternatives = false; +} + void arch_enable_nonboot_cpus_begin(void) { set_mtrr_aps_delayed_init(); diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 3af2dff58b21..075d130efcf9 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -127,7 +127,7 @@ startup_64: no_longmode: hlt jmp no_longmode -#include "verify_cpu_64.S" +#include "verify_cpu.S" # Careful these need to be in the same 64K segment as the above; tidt: diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0c40d8b72416..356a0d455cf9 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; + + if (tsc_clocksource_reliable) + return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: @@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (num_possible_cpus() > 1) - tsc_unstable = 1; + return 1; } - return tsc_unstable; + return 0; +} + + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomolies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. + */ +static void tsc_refine_calibration_work(struct work_struct *work) +{ + static u64 tsc_start = -1, ref_start; + static int hpet; + u64 tsc_stop, ref_stop, delta; + unsigned long freq; + + /* Don't bother refining TSC on unstable systems */ + if (check_tsc_unstable()) + goto out; + + /* + * Since the work is started early in boot, we may be + * delayed the first time we expire. So set the workqueue + * again once we know timers are working. + */ + if (tsc_start == -1) { + /* + * Only set hpet once, to avoid mixing hardware + * if the hpet becomes enabled later. + */ + hpet = is_hpet_enabled(); + schedule_delayed_work(&tsc_irqwork, HZ); + tsc_start = tsc_read_refs(&ref_start, hpet); + return; + } + + tsc_stop = tsc_read_refs(&ref_stop, hpet); + + /* hpet or pmtimer available ? */ + if (!hpet && !ref_start && !ref_stop) + goto out; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) + goto out; + + delta = tsc_stop - tsc_start; + delta *= 1000000LL; + if (hpet) + freq = calc_hpet_ref(delta, ref_start, ref_stop); + else + freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + + /* Make sure we're within 1% */ + if (abs(tsc_khz - freq) > tsc_khz/100) + goto out; + + tsc_khz = freq; + printk(KERN_INFO "Refined TSC clocksource calibration: " + "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + +out: + clocksource_register_khz(&clocksource_tsc, tsc_khz); } -static void __init init_tsc_clocksource(void) + +static int __init init_tsc_clocksource(void) { + if (!cpu_has_tsc || tsc_disabled > 0) + return 0; + if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register_khz(&clocksource_tsc, tsc_khz); + schedule_delayed_work(&tsc_irqwork, 0); + return 0; } +/* + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. + */ +device_initcall(init_tsc_clocksource); void __init tsc_init(void) { @@ -949,6 +1036,5 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); - init_tsc_clocksource(); } diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S index 56a8c2a867d9..0edefc19a113 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu.S @@ -7,6 +7,7 @@ * Copyright (c) 2007 Andi Kleen (ak@suse.de) * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. @@ -14,18 +15,17 @@ * This is a common code for verification whether CPU supports * long mode and SSE or not. It is not called directly instead this * file is included at various places and compiled in that context. - * Following are the current usage. + * This file is expected to run in 32bit code. Currently: * - * This file is included by both 16bit and 32bit code. + * arch/x86/boot/compressed/head_64.S: Boot cpu verification + * arch/x86/kernel/trampoline_64.S: secondary processor verfication + * arch/x86/kernel/head_32.S: processor startup * - * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) - * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) - * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) - * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) - * - * verify_cpu, returns the status of cpu check in register %eax. + * verify_cpu, returns the status of longmode and SSE in register %eax. * 0: Success 1: Failure * + * On Intel, the XD_DISABLE flag will be cleared as a side-effect. + * * The caller needs to check for the error code and take the action * appropriately. Either display a message or halt. */ @@ -62,8 +62,41 @@ verify_cpu: cmpl $0x444d4163,%ecx jnz verify_cpu_noamd mov $1,%di # cpu is from AMD + jmp verify_cpu_check verify_cpu_noamd: + cmpl $0x756e6547,%ebx # GenuineIntel? + jnz verify_cpu_check + cmpl $0x49656e69,%edx + jnz verify_cpu_check + cmpl $0x6c65746e,%ecx + jnz verify_cpu_check + + # only call IA32_MISC_ENABLE when: + # family > 6 || (family == 6 && model >= 0xd) + movl $0x1, %eax # check CPU family and model + cpuid + movl %eax, %ecx + + andl $0x0ff00f00, %eax # mask family and extended family + shrl $8, %eax + cmpl $6, %eax + ja verify_cpu_clear_xd # family > 6, ok + jb verify_cpu_check # family < 6, skip + + andl $0x000f00f0, %ecx # mask model and extended model + shrl $4, %ecx + cmpl $0xd, %ecx + jb verify_cpu_check # family == 6, model < 0xd, skip + +verify_cpu_clear_xd: + movl $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE + jnc verify_cpu_check # only write MSR if bit was changed + wrmsr + +verify_cpu_check: movl $0x1,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK0,%edx diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e03530aebfd0..bf4700755184 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -69,7 +69,7 @@ jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ + data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP @@ -116,6 +116,10 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 +#if defined(CONFIG_DEBUG_RODATA) + /* .text should occupy whole number of pages */ + . = ALIGN(PAGE_SIZE); +#endif X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) X64_ALIGN_DEBUG_RODATA_END @@ -335,7 +339,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) - . = ALIGN(4); + . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index e7d5382ef263..4f420c2f2d55 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -4,7 +4,6 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/processor-flags.h> -#include <asm/pgtable.h> /*G:020 * Our story starts with the kernel booting into startup_32 in @@ -38,113 +37,9 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - call init_pagetables - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond __brk_base. The variable - * _brk_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end. - * - * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they - * don't have a stack at this point, so we can't just use call and ret. - */ -init_pagetables: -#if PTRS_PER_PMD > 1 -#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) -#else -#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) -#endif -#define pa(X) ((X) - __PAGE_OFFSET) - -/* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = \ - PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT -#ifdef CONFIG_X86_PAE - - /* - * In PAE mode initial_page_table is statically defined to contain - * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD entries - * to the first kernel PMD. - * - * Note the upper half of each PMD or PTE are always zero at this stage. - */ - -#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ - - xorl %ebx,%ebx /* %ebx is kept at zero */ - - movl $pa(__brk_base), %edi - movl $pa(initial_pg_pmd), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ - movl %ecx,(%edx) /* Store PMD entry */ - /* Upper half already zero */ - addl $8,%edx - movl $512,%ecx -11: - stosl - xchgl %eax,%ebx - stosl - xchgl %eax,%ebx - addl $0x1000,%eax - loop 11b - - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b -1: - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) -#else /* Not PAE */ - -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $pa(__brk_base), %edi - movl $pa(initial_page_table), %edx - movl $PTE_IDENT_ATTR, %eax -10: - leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* - * End condition: we must map up to the end + MAPPING_BEYOND_END. - */ - movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp - cmpl %ebp,%eax - jb 10b - addl $__PAGE_OFFSET, %edi - movl %edi, pa(_brk_end) - shrl $12, %eax - movl %eax, pa(max_pfn_mapped) - - /* Do early initialization of the fixmap area */ - movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(initial_page_table+0xffc) -#endif - ret - /*G:055 * We create a macro which puts the assembler code between lgstart_ and lgend_ * markers. These templates are put in the .text section: they can't be diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 55543397a8a7..09df2f9a3d69 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c index 804a3b6c6e14..51fae9cfdecb 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -1,8 +1,8 @@ /* - * AMD K8 NUMA support. + * AMD NUMA support. * Discover the memory map and associated nodes. * - * This version reads it directly from the K8 northbridge. + * This version reads it directly from the AMD northbridge. * * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ @@ -57,7 +57,7 @@ static __init void early_get_boot_cpu_id(void) { /* * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in k8_scan_nodes() + * create apicid_to_node in amd_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -69,7 +69,7 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -int __init k8_get_nodes(struct bootnode *physnodes) +int __init amd_get_nodes(struct bootnode *physnodes) { int i; int ret = 0; @@ -82,7 +82,7 @@ int __init k8_get_nodes(struct bootnode *physnodes) return ret; } -int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) +int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long start = PFN_PHYS(start_pfn); unsigned long end = PFN_PHYS(end_pfn); @@ -194,7 +194,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) return 0; } -int __init k8_scan_nodes(void) +int __init amd_scan_nodes(void) { unsigned int bits; unsigned int cores; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c0e28a13de7d..947f42abe820 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that - * writeable first. + * writeable and non-executable first. */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e969f9f401b..f89b5bb4e93f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) static inline int is_kernel_text(unsigned long addr) { - if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) return 1; return 0; } @@ -912,6 +912,23 @@ void set_kernel_text_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); } +static void mark_nxdata_nx(void) +{ + /* + * When this called, init has already been executed and released, + * so everything past _etext sould be NX. + */ + unsigned long start = PFN_ALIGN(_etext); + /* + * This comes from is_kernel_text upper limit. Also HPAGE where used: + */ + unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); + set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); @@ -946,6 +963,7 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif + mark_nxdata_nx(); } #endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7ffc9b727efd..7762a517d69d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int k8) + int acpi, int amd) { int nr_nodes = 0; int ret = 0; @@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, if (acpi) nr_nodes = acpi_get_nodes(physnodes); #endif -#ifdef CONFIG_K8_NUMA - if (k8) - nr_nodes = k8_get_nodes(physnodes); +#ifdef CONFIG_AMD_NUMA + if (amd) + nr_nodes = amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or K8 incorrectly reported the topology or the mem= + * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ for (i = 0; i < nr_nodes; i++) { @@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * numa=fake command-line option. */ static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int k8) + unsigned long last_pfn, int acpi, int amd) { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; @@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn, int num_nodes; int i; - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. Otherwise, if it is just a single number N, @@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn, #endif /* CONFIG_NUMA_EMU */ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int k8) + int acpi, int amd) { int i; @@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #endif -#ifdef CONFIG_K8_NUMA - if (!numa_off && k8 && !k8_scan_nodes()) +#ifdef CONFIG_AMD_NUMA + if (!numa_off && amd && !amd_scan_nodes()) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 532e7933d606..8b830ca14ac4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -13,6 +13,7 @@ #include <linux/pfn.h> #include <linux/percpu.h> #include <linux/gfp.h> +#include <linux/pci.h> #include <asm/e820.h> #include <asm/processor.h> @@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, unsigned long pfn) { pgprot_t forbidden = __pgprot(0); + pgprot_t required = __pgprot(0); /* * The BIOS area between 640k and 1Mb needs to be executable for * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. */ - if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) +#ifdef CONFIG_PCI_BIOS + if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_NX; +#endif /* * The kernel text needs to be executable for obvious reasons @@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; + /* + * .data and .bss should always be writable. + */ + if (within(address, (unsigned long)_sdata, (unsigned long)_edata) || + within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) + pgprot_val(required) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) /* @@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + prot = __pgprot(pgprot_val(prot) | pgprot_val(required)); return prot; } @@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, { unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; pte_t new_pte, old_pte, *tmp; - pgprot_t old_prot, new_prot; + pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; unsigned int level; @@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * We are safe now. Check whether the new pgprot is the same: */ old_pte = *kpte; - old_prot = new_prot = pte_pgprot(old_pte); + old_prot = new_prot = req_prot = pte_pgprot(old_pte); - pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); - pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * old_pte points to the large page base address. So we need @@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; - new_prot = static_protections(new_prot, address, pfn); + new_prot = static_protections(req_prot, address, pfn); /* * We need to check the full range, whether * static_protection() requires a different pgprot for one of * the pages in the range we try to preserve: */ - addr = address + PAGE_SIZE; - pfn++; - for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(new_prot, addr, pfn); + addr = address & pmask; + pfn = pte_pfn(old_pte); + for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { + pgprot_t chk_prot = static_protections(req_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) goto out_unlock; @@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * that we limited the number of possible pages already to * the number of pages in the large page. */ - if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { /* * The address is aligned and the number of pages * covers the full page. diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa34086..410531d3c292 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -41,7 +41,7 @@ void __init x86_report_nx(void) { if (!cpu_has_nx) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU or disabled in BIOS!\n"); + "missing in CPU!\n"); } else { #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) if (disable_nx) { diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index a17dffd136c1..f16434568a51 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) /* mark this node as "seen" in node bitmap */ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + /* don't need to check apic_id here, because it is always 8 bits */ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..171a0aacb99a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; @@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 51104b33fd51..c3b8e24f2b16 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -610,6 +610,7 @@ static int force_ibs_eilvt_setup(void) ret = setup_ibs_ctl(i); if (ret) return ret; + pr_err(FW_BUG "using offset %d for IBS interrupts\n", i); return 0; } diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index effd96e33f16..6b8759f7634e 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o obj-$(CONFIG_PCI_XEN) += xen.o obj-y += fixup.o +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c new file mode 100644 index 000000000000..85b68ef5e809 --- /dev/null +++ b/arch/x86/pci/ce4100.c @@ -0,0 +1,315 @@ +/* + * GPL LICENSE SUMMARY + * + * Copyright(c) 2010 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * The full GNU General Public License is included in this distribution + * in the file called LICENSE.GPL. + * + * Contact Information: + * Intel Corporation + * 2200 Mission College Blvd. + * Santa Clara, CA 97052 + * + * This provides access methods for PCI registers that mis-behave on + * the CE4100. Each register can be assigned a private init, read and + * write routine. The exception to this is the bridge device. The + * bridge device is the only device on bus zero (0) that requires any + * fixup so it is a special case ATM + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> + +#include <asm/pci_x86.h> + +struct sim_reg { + u32 value; + u32 mask; +}; + +struct sim_dev_reg { + int dev_func; + int reg; + void (*init)(struct sim_dev_reg *reg); + void (*read)(struct sim_dev_reg *reg, u32 *value); + void (*write)(struct sim_dev_reg *reg, u32 value); + struct sim_reg sim_reg; +}; + +struct sim_reg_op { + void (*init)(struct sim_dev_reg *reg); + void (*read)(struct sim_dev_reg *reg, u32 value); + void (*write)(struct sim_dev_reg *reg, u32 value); +}; + +#define MB (1024 * 1024) +#define KB (1024) +#define SIZE_TO_MASK(size) (~(size - 1)) + +#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\ +{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\ + {0, SIZE_TO_MASK(size)} }, + +static void reg_init(struct sim_dev_reg *reg) +{ + pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4, + ®->sim_reg.value); +} + +static void reg_read(struct sim_dev_reg *reg, u32 *value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + *value = reg->sim_reg.value; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + +static void reg_write(struct sim_dev_reg *reg, u32 value) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&pci_config_lock, flags); + reg->sim_reg.value = (value & reg->sim_reg.mask) | + (reg->sim_reg.value & ~reg->sim_reg.mask); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); +} + +static void sata_reg_init(struct sim_dev_reg *reg) +{ + pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4, + ®->sim_reg.value); + reg->sim_reg.value += 0x400; +} + +static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value) +{ + reg_read(reg, value); + if (*value != reg->sim_reg.mask) + *value |= 0x100; +} + +void sata_revid_init(struct sim_dev_reg *reg) +{ + reg->sim_reg.value = 0x01060100; + reg->sim_reg.mask = 0; +} + +static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) +{ + reg_read(reg, value); +} + +static struct sim_dev_reg bus1_fixups[] = { + DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write) + DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write) + DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write) + DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write) + DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write) + DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write) + DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write) + DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write) + DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0) + DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write) + DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write) + DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write) + DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write) + DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write) +}; + +static void __init init_sim_regs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].init) + bus1_fixups[i].init(&bus1_fixups[i]); + } +} + +static inline void extract_bytes(u32 *value, int reg, int len) +{ + uint32_t mask; + + *value >>= ((reg & 3) * 8); + mask = 0xFFFFFFFF >> ((4 - len) * 8); + *value &= mask; +} + +int bridge_read(unsigned int devfn, int reg, int len, u32 *value) +{ + u32 av_bridge_base, av_bridge_limit; + int retval = 0; + + switch (reg) { + /* Make BARs appear to not request any memory. */ + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_0 + 1: + case PCI_BASE_ADDRESS_0 + 2: + case PCI_BASE_ADDRESS_0 + 3: + *value = 0; + break; + + /* Since subordinate bus number register is hardwired + * to zero and read only, so do the simulation. + */ + case PCI_PRIMARY_BUS: + if (len == 4) + *value = 0x00010100; + break; + + case PCI_SUBORDINATE_BUS: + *value = 1; + break; + + case PCI_MEMORY_BASE: + case PCI_MEMORY_LIMIT: + /* Get the A/V bridge base address. */ + pci_direct_conf1.read(0, 0, devfn, + PCI_BASE_ADDRESS_0, 4, &av_bridge_base); + + av_bridge_limit = av_bridge_base + (512*MB - 1); + av_bridge_limit >>= 16; + av_bridge_limit &= 0xFFF0; + + av_bridge_base >>= 16; + av_bridge_base &= 0xFFF0; + + if (reg == PCI_MEMORY_LIMIT) + *value = av_bridge_limit; + else if (len == 2) + *value = av_bridge_base; + else + *value = (av_bridge_limit << 16) | av_bridge_base; + break; + /* Make prefetchable memory limit smaller than prefetchable + * memory base, so not claim prefetchable memory space. + */ + case PCI_PREF_MEMORY_BASE: + *value = 0xFFF0; + break; + case PCI_PREF_MEMORY_LIMIT: + *value = 0x0; + break; + /* Make IO limit smaller than IO base, so not claim IO space. */ + case PCI_IO_BASE: + *value = 0xF0; + break; + case PCI_IO_LIMIT: + *value = 0; + break; + default: + retval = 1; + } + return retval; +} + +static int ce4100_conf_read(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + int i, retval = 1; + + if (bus == 1) { + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].read) { + bus1_fixups[i].read(&(bus1_fixups[i]), + value); + extract_bytes(value, reg, len); + return 0; + } + } + } + + if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) && + !bridge_read(devfn, reg, len, value)) + return 0; + + return pci_direct_conf1.read(seg, bus, devfn, reg, len, value); +} + +static int ce4100_conf_write(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + int i; + + if (bus == 1) { + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].write) { + bus1_fixups[i].write(&(bus1_fixups[i]), + value); + return 0; + } + } + } + + /* Discard writes to A/V bridge BAR. */ + if (bus == 0 && PCI_DEVFN(1, 0) == devfn && + ((reg & ~3) == PCI_BASE_ADDRESS_0)) + return 0; + + return pci_direct_conf1.write(seg, bus, devfn, reg, len, value); +} + +struct pci_raw_ops ce4100_pci_conf = { + .read = ce4100_conf_read, + .write = ce4100_conf_write, +}; + +static int __init ce4100_pci_init(void) +{ + init_sim_regs(); + raw_pci_ops = &ce4100_pci_conf; + return 0; +} +subsys_initcall(ce4100_pci_init); diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 2492d165096a..a5f7d0d63de0 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -9,6 +9,7 @@ #include <linux/uaccess.h> #include <asm/pci_x86.h> #include <asm/pci-functions.h> +#include <asm/cacheflush.h> /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) @@ -25,6 +26,27 @@ #define PCIBIOS_HW_TYPE1_SPEC 0x10 #define PCIBIOS_HW_TYPE2_SPEC 0x20 +int pcibios_enabled; + +/* According to the BIOS specification at: + * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could + * restrict the x zone to some pages and make it ro. But this may be + * broken on some bios, complex to handle with static_protections. + * We could make the 0xe0000-0x100000 range rox, but this can break + * some ISA mapping. + * + * So we let's an rw and x hole when pcibios is used. This shouldn't + * happen for modern system with mmconfig, and if you don't want it + * you could disable pcibios... + */ +static inline void set_bios_x(void) +{ + pcibios_enabled = 1; + set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n"); +} + /* * This is the standard structure used to identify the entry point * to the BIOS32 Service Directory, as documented in @@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); bios32_indirect.address = bios32_entry + PAGE_OFFSET; + set_bios_x(); if (check_pcibios()) return &pci_bios_access; } diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 7bf70b812fa2..021eee91c056 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,5 +1,7 @@ # Platform specific code goes here +obj-y += ce4100/ obj-y += efi/ +obj-y += iris/ obj-y += mrst/ obj-y += olpc/ obj-y += scx200/ diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile new file mode 100644 index 000000000000..91fc92971d94 --- /dev/null +++ b/arch/x86/platform/ce4100/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c new file mode 100644 index 000000000000..d2c0d51a7178 --- /dev/null +++ b/arch/x86/platform/ce4100/ce4100.c @@ -0,0 +1,132 @@ +/* + * Intel CE4100 platform specific setup code + * + * (C) Copyright 2010 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/serial_reg.h> +#include <linux/serial_8250.h> + +#include <asm/setup.h> +#include <asm/io.h> + +static int ce4100_i8042_detect(void) +{ + return 0; +} + +static void __init sdv_find_smp_config(void) +{ +} + +#ifdef CONFIG_SERIAL_8250 + + +static unsigned int mem_serial_in(struct uart_port *p, int offset) +{ + offset = offset << p->regshift; + return readl(p->membase + offset); +} + +/* + * The UART Tx interrupts are not set under some conditions and therefore serial + * transmission hangs. This is a silicon issue and has not been root caused. The + * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT + * bit of LSR register in interrupt handler to see whether at least one of these + * two bits is set, if so then process the transmit request. If this workaround + * is not applied, then the serial transmission may hang. This workaround is for + * errata number 9 in Errata - B step. +*/ + +static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset) +{ + unsigned int ret, ier, lsr; + + if (offset == UART_IIR) { + offset = offset << p->regshift; + ret = readl(p->membase + offset); + if (ret & UART_IIR_NO_INT) { + /* see if the TX interrupt should have really set */ + ier = mem_serial_in(p, UART_IER); + /* see if the UART's XMIT interrupt is enabled */ + if (ier & UART_IER_THRI) { + lsr = mem_serial_in(p, UART_LSR); + /* now check to see if the UART should be + generating an interrupt (but isn't) */ + if (lsr & (UART_LSR_THRE | UART_LSR_TEMT)) + ret &= ~UART_IIR_NO_INT; + } + } + } else + ret = mem_serial_in(p, offset); + return ret; +} + +static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) +{ + offset = offset << p->regshift; + writel(value, p->membase + offset); +} + +static void ce4100_serial_fixup(int port, struct uart_port *up, + unsigned short *capabilites) +{ +#ifdef CONFIG_EARLY_PRINTK + /* + * Over ride the legacy port configuration that comes from + * asm/serial.h. Using the ioport driver then switching to the + * PCI memmaped driver hangs the IOAPIC + */ + if (up->iotype != UPIO_MEM32) { + up->uartclk = 14745600; + up->mapbase = 0xdffe0200; + set_fixmap_nocache(FIX_EARLYCON_MEM_BASE, + up->mapbase & PAGE_MASK); + up->membase = + (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE); + up->membase += up->mapbase & ~PAGE_MASK; + up->iotype = UPIO_MEM32; + up->regshift = 2; + } +#endif + up->iobase = 0; + up->serial_in = ce4100_mem_serial_in; + up->serial_out = ce4100_mem_serial_out; + + *capabilites |= (1 << 12); +} + +static __init void sdv_serial_fixup(void) +{ + serial8250_set_isa_configurator(ce4100_serial_fixup); +} + +#else +static inline void sdv_serial_fixup(void); +#endif + +static void __init sdv_arch_setup(void) +{ + sdv_serial_fixup(); +} + +/* + * CE4100 specific x86_init function overrides and early setup + * calls. + */ +void __init x86_ce4100_early_setup(void) +{ + x86_init.oem.arch_setup = sdv_arch_setup; + x86_platform.i8042_detect = ce4100_i8042_detect; + x86_init.resources.probe_roms = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_smp_config = sdv_find_smp_config; +} diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile new file mode 100644 index 000000000000..db921983a102 --- /dev/null +++ b/arch/x86/platform/iris/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_32_IRIS) += iris.o diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c new file mode 100644 index 000000000000..1ba7f5ed8c9b --- /dev/null +++ b/arch/x86/platform/iris/iris.c @@ -0,0 +1,91 @@ +/* + * Eurobraille/Iris power off support. + * + * Eurobraille's Iris machine is a PC with no APM or ACPI support. + * It is shutdown by a special I/O sequence which this module provides. + * + * Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org> + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/moduleparam.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <asm/io.h> + +#define IRIS_GIO_BASE 0x340 +#define IRIS_GIO_INPUT IRIS_GIO_BASE +#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1) +#define IRIS_GIO_PULSE 0x80 /* First byte to send */ +#define IRIS_GIO_REST 0x00 /* Second byte to send */ +#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>"); +MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); +MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); + +static int force; + +module_param(force, bool, 0); +MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); + +static void (*old_pm_power_off)(void); + +static void iris_power_off(void) +{ + outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT); + msleep(850); + outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT); +} + +/* + * Before installing the power_off handler, try to make sure the OS is + * running on an Iris. Since Iris does not support DMI, this is done + * by reading its input port and seeing whether the read value is + * meaningful. + */ +static int iris_init(void) +{ + unsigned char status; + if (force != 1) { + printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n"); + return -ENODEV; + } + status = inb(IRIS_GIO_INPUT); + if (status == IRIS_GIO_NODEV) { + printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n"); + return -ENODEV; + } + old_pm_power_off = pm_power_off; + pm_power_off = &iris_power_off; + printk(KERN_INFO "Iris power_off handler installed.\n"); + + return 0; +} + +static void iris_exit(void) +{ + pm_power_off = old_pm_power_off; + printk(KERN_INFO "Iris power_off handler uninstalled.\n"); +} + +module_init(iris_init); +module_exit(iris_exit); diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index efbbc552fa95..f61ccdd49341 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1 +1,3 @@ obj-$(CONFIG_X86_MRST) += mrst.o +obj-$(CONFIG_X86_MRST) += vrtc.o +obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 65df603622b2..65df603622b2 100644 --- a/arch/x86/kernel/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 79ae68154e87..fee0b4914e07 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -9,9 +9,19 @@ * as published by the Free Software Foundation; version 2 * of the License. */ + +#define pr_fmt(fmt) "mrst: " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/sfi.h> +#include <linux/intel_pmic_gpio.h> +#include <linux/spi/spi.h> +#include <linux/i2c.h> +#include <linux/i2c/pca953x.h> +#include <linux/gpio_keys.h> +#include <linux/input.h> +#include <linux/platform_device.h> #include <linux/irq.h> #include <linux/module.h> @@ -23,7 +33,9 @@ #include <asm/mrst.h> #include <asm/io.h> #include <asm/i8259.h> +#include <asm/intel_scu_ipc.h> #include <asm/apb_timer.h> +#include <asm/reboot.h> /* * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, @@ -102,10 +114,10 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table) memcpy(sfi_mtimer_array, pentry, totallen); } - printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num); pentry = sfi_mtimer_array; for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz," " irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); if (!pentry->irq) @@ -176,14 +188,14 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) memcpy(sfi_mrtc_array, pentry, totallen); } - printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num); pentry = sfi_mrtc_array; for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->irq); mp_irq.type = MP_IOAPIC; mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0; + mp_irq.irqflag = 0xf; /* level trigger and active low */ mp_irq.srcbus = 0; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; @@ -209,6 +221,7 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); switch (mrst_timer_options) { case MRST_TIMER_APBT_ONLY: break; @@ -224,16 +237,10 @@ void __init mrst_time_init(void) return; } /* we need at least one APB timer */ - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); } -void __init mrst_rtc_init(void) -{ - sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); -} - void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) @@ -256,6 +263,17 @@ static int mrst_i8042_detect(void) return 0; } +/* Reboot and power off are handled by the SCU on a MID device */ +static void mrst_power_off(void) +{ + intel_scu_ipc_simple_command(0xf1, 1); +} + +static void mrst_reboot(void) +{ + intel_scu_ipc_simple_command(0xf1, 0); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -281,6 +299,10 @@ void __init x86_mrst_early_setup(void) legacy_pic = &null_legacy_pic; + /* Moorestown specific power_off/restart method */ + pm_power_off = mrst_power_off; + machine_ops.emergency_restart = mrst_reboot; + /* Avoid searching for BIOS MP tables */ x86_init.mpparse.find_smp_config = x86_init_noop; x86_init.mpparse.get_smp_config = x86_init_uint_noop; @@ -309,3 +331,505 @@ static inline int __init setup_x86_mrst_timer(char *arg) return 0; } __setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * Parsing GPIO table first, since the DEVS table will need this table + * to map the pin name to the actual pin. + */ +static struct sfi_gpio_table_entry *gpio_table; +static int gpio_num_entry; + +static int __init sfi_parse_gpio(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_gpio_table_entry *pentry; + int num, i; + + if (gpio_table) + return 0; + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); + pentry = (struct sfi_gpio_table_entry *)sb->pentry; + + gpio_table = (struct sfi_gpio_table_entry *) + kmalloc(num * sizeof(*pentry), GFP_KERNEL); + if (!gpio_table) + return -1; + memcpy(gpio_table, pentry, num * sizeof(*pentry)); + gpio_num_entry = num; + + pr_debug("GPIO pin info:\n"); + for (i = 0; i < num; i++, pentry++) + pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s," + " pin = %d\n", i, + pentry->controller_name, + pentry->pin_name, + pentry->pin_no); + return 0; +} + +static int get_gpio_by_name(const char *name) +{ + struct sfi_gpio_table_entry *pentry = gpio_table; + int i; + + if (!pentry) + return -1; + for (i = 0; i < gpio_num_entry; i++, pentry++) { + if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN)) + return pentry->pin_no; + } + return -1; +} + +/* + * Here defines the array of devices platform data that IAFW would export + * through SFI "DEVS" table, we use name and type to match the device and + * its platform data. + */ +struct devs_id { + char name[SFI_NAME_LEN + 1]; + u8 type; + u8 delay; + void *(*get_platform_data)(void *info); +}; + +/* the offset for the mapping of global gpio pin to irq */ +#define MRST_IRQ_OFFSET 0x100 + +static void __init *pmic_gpio_platform_data(void *info) +{ + static struct intel_pmic_gpio_platform_data pmic_gpio_pdata; + int gpio_base = get_gpio_by_name("pmic_gpio_base"); + + if (gpio_base == -1) + gpio_base = 64; + pmic_gpio_pdata.gpio_base = gpio_base; + pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET; + pmic_gpio_pdata.gpiointr = 0xffffeff8; + + return &pmic_gpio_pdata; +} + +static void __init *max3111_platform_data(void *info) +{ + struct spi_board_info *spi_info = info; + int intr = get_gpio_by_name("max3111_int"); + + if (intr == -1) + return NULL; + spi_info->irq = intr + MRST_IRQ_OFFSET; + return NULL; +} + +/* we have multiple max7315 on the board ... */ +#define MAX7315_NUM 2 +static void __init *max7315_platform_data(void *info) +{ + static struct pca953x_platform_data max7315_pdata[MAX7315_NUM]; + static int nr; + struct pca953x_platform_data *max7315 = &max7315_pdata[nr]; + struct i2c_board_info *i2c_info = info; + int gpio_base, intr; + char base_pin_name[SFI_NAME_LEN + 1]; + char intr_pin_name[SFI_NAME_LEN + 1]; + + if (nr == MAX7315_NUM) { + pr_err("too many max7315s, we only support %d\n", + MAX7315_NUM); + return NULL; + } + /* we have several max7315 on the board, we only need load several + * instances of the same pca953x driver to cover them + */ + strcpy(i2c_info->type, "max7315"); + if (nr++) { + sprintf(base_pin_name, "max7315_%d_base", nr); + sprintf(intr_pin_name, "max7315_%d_int", nr); + } else { + strcpy(base_pin_name, "max7315_base"); + strcpy(intr_pin_name, "max7315_int"); + } + + gpio_base = get_gpio_by_name(base_pin_name); + intr = get_gpio_by_name(intr_pin_name); + + if (gpio_base == -1) + return NULL; + max7315->gpio_base = gpio_base; + if (intr != -1) { + i2c_info->irq = intr + MRST_IRQ_OFFSET; + max7315->irq_base = gpio_base + MRST_IRQ_OFFSET; + } else { + i2c_info->irq = -1; + max7315->irq_base = -1; + } + return max7315; +} + +static void __init *emc1403_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("thermal_int"); + int intr2nd = get_gpio_by_name("thermal_alert"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + MRST_IRQ_OFFSET; + intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static void __init *lis331dl_platform_data(void *info) +{ + static short intr2nd_pdata; + struct i2c_board_info *i2c_info = info; + int intr = get_gpio_by_name("accel_int"); + int intr2nd = get_gpio_by_name("accel_2"); + + if (intr == -1 || intr2nd == -1) + return NULL; + + i2c_info->irq = intr + MRST_IRQ_OFFSET; + intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET; + + return &intr2nd_pdata; +} + +static void __init *no_platform_data(void *info) +{ + return NULL; +} + +static const struct devs_id __initconst device_ids[] = { + {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, + {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, + {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, + {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, + {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, + {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, + {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, + {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, + {}, +}; + +#define MAX_IPCDEVS 24 +static struct platform_device *ipc_devs[MAX_IPCDEVS]; +static int ipc_next_dev; + +#define MAX_SCU_SPI 24 +static struct spi_board_info *spi_devs[MAX_SCU_SPI]; +static int spi_next_dev; + +#define MAX_SCU_I2C 24 +static struct i2c_board_info *i2c_devs[MAX_SCU_I2C]; +static int i2c_bus[MAX_SCU_I2C]; +static int i2c_next_dev; + +static void __init intel_scu_device_register(struct platform_device *pdev) +{ + if(ipc_next_dev == MAX_IPCDEVS) + pr_err("too many SCU IPC devices"); + else + ipc_devs[ipc_next_dev++] = pdev; +} + +static void __init intel_scu_spi_device_register(struct spi_board_info *sdev) +{ + struct spi_board_info *new_dev; + + if (spi_next_dev == MAX_SCU_SPI) { + pr_err("too many SCU SPI devices"); + return; + } + + new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed spi dev %s\n", + sdev->modalias); + return; + } + memcpy(new_dev, sdev, sizeof(*sdev)); + + spi_devs[spi_next_dev++] = new_dev; +} + +static void __init intel_scu_i2c_device_register(int bus, + struct i2c_board_info *idev) +{ + struct i2c_board_info *new_dev; + + if (i2c_next_dev == MAX_SCU_I2C) { + pr_err("too many SCU I2C devices"); + return; + } + + new_dev = kzalloc(sizeof(*idev), GFP_KERNEL); + if (!new_dev) { + pr_err("failed to alloc mem for delayed i2c dev %s\n", + idev->type); + return; + } + memcpy(new_dev, idev, sizeof(*idev)); + + i2c_bus[i2c_next_dev] = bus; + i2c_devs[i2c_next_dev++] = new_dev; +} + +/* Called by IPC driver */ +void intel_scu_devices_create(void) +{ + int i; + + for (i = 0; i < ipc_next_dev; i++) + platform_device_add(ipc_devs[i]); + + for (i = 0; i < spi_next_dev; i++) + spi_register_board_info(spi_devs[i], 1); + + for (i = 0; i < i2c_next_dev; i++) { + struct i2c_adapter *adapter; + struct i2c_client *client; + + adapter = i2c_get_adapter(i2c_bus[i]); + if (adapter) { + client = i2c_new_device(adapter, i2c_devs[i]); + if (!client) + pr_err("can't create i2c device %s\n", + i2c_devs[i]->type); + } else + i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); + } +} +EXPORT_SYMBOL_GPL(intel_scu_devices_create); + +/* Called by IPC driver */ +void intel_scu_devices_destroy(void) +{ + int i; + + for (i = 0; i < ipc_next_dev; i++) + platform_device_del(ipc_devs[i]); +} +EXPORT_SYMBOL_GPL(intel_scu_devices_destroy); + +static void __init install_irq_resource(struct platform_device *pdev, int irq) +{ + /* Single threaded */ + static struct resource __initdata res = { + .name = "IRQ", + .flags = IORESOURCE_IRQ, + }; + res.start = irq; + platform_device_add_resources(pdev, &res, 1); +} + +static void __init sfi_handle_ipc_dev(struct platform_device *pdev) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_IPC && + !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(pdev); + break; + } + dev++; + } + pdev->dev.platform_data = pdata; + intel_scu_device_register(pdev); +} + +static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_SPI && + !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(spi_info); + break; + } + dev++; + } + spi_info->platform_data = pdata; + if (dev->delay) + intel_scu_spi_device_register(spi_info); + else + spi_register_board_info(spi_info, 1); +} + +static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info) +{ + const struct devs_id *dev = device_ids; + void *pdata = NULL; + + while (dev->name[0]) { + if (dev->type == SFI_DEV_TYPE_I2C && + !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(i2c_info); + break; + } + dev++; + } + i2c_info->platform_data = pdata; + + if (dev->delay) + intel_scu_i2c_device_register(bus, i2c_info); + else + i2c_register_board_info(bus, i2c_info, 1); + } + + +static int __init sfi_parse_devs(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_device_table_entry *pentry; + struct spi_board_info spi_info; + struct i2c_board_info i2c_info; + struct platform_device *pdev; + int num, i, bus; + int ioapic; + struct io_apic_irq_attr irq_attr; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); + pentry = (struct sfi_device_table_entry *)sb->pentry; + + for (i = 0; i < num; i++, pentry++) { + if (pentry->irq != (u8)0xff) { /* native RTE case */ + /* these SPI2 devices are not exposed to system as PCI + * devices, but they have separate RTE entry in IOAPIC + * so we have to enable them one by one here + */ + ioapic = mp_find_ioapic(pentry->irq); + irq_attr.ioapic = ioapic; + irq_attr.ioapic_pin = pentry->irq; + irq_attr.trigger = 1; + irq_attr.polarity = 1; + io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr); + } + switch (pentry->type) { + case SFI_DEV_TYPE_IPC: + /* ID as IRQ is a hack that will go away */ + pdev = platform_device_alloc(pentry->name, pentry->irq); + if (pdev == NULL) { + pr_err("out of memory for SFI platform device '%s'.\n", + pentry->name); + continue; + } + install_irq_resource(pdev, pentry->irq); + pr_debug("info[%2d]: IPC bus, name = %16.16s, " + "irq = 0x%2x\n", i, pentry->name, pentry->irq); + sfi_handle_ipc_dev(pdev); + break; + case SFI_DEV_TYPE_SPI: + memset(&spi_info, 0, sizeof(spi_info)); + strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN); + spi_info.irq = pentry->irq; + spi_info.bus_num = pentry->host_num; + spi_info.chip_select = pentry->addr; + spi_info.max_speed_hz = pentry->max_freq; + pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, " + "irq = 0x%2x, max_freq = %d, cs = %d\n", i, + spi_info.bus_num, + spi_info.modalias, + spi_info.irq, + spi_info.max_speed_hz, + spi_info.chip_select); + sfi_handle_spi_dev(&spi_info); + break; + case SFI_DEV_TYPE_I2C: + memset(&i2c_info, 0, sizeof(i2c_info)); + bus = pentry->host_num; + strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN); + i2c_info.irq = pentry->irq; + i2c_info.addr = pentry->addr; + pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, " + "irq = 0x%2x, addr = 0x%x\n", i, bus, + i2c_info.type, + i2c_info.irq, + i2c_info.addr); + sfi_handle_i2c_dev(bus, &i2c_info); + break; + case SFI_DEV_TYPE_UART: + case SFI_DEV_TYPE_HSI: + default: + ; + } + } + return 0; +} + +static int __init mrst_platform_init(void) +{ + sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio); + sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs); + return 0; +} +arch_initcall(mrst_platform_init); + +/* + * we will search these buttons in SFI GPIO table (by name) + * and register them dynamically. Please add all possible + * buttons here, we will shrink them if no GPIO found. + */ +static struct gpio_keys_button gpio_button[] = { + {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000}, + {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20}, + {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20}, + {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20}, + {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20}, + {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20}, + {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20}, + {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20}, + {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20}, +}; + +static struct gpio_keys_platform_data mrst_gpio_keys = { + .buttons = gpio_button, + .rep = 1, + .nbuttons = -1, /* will fill it after search */ +}; + +static struct platform_device pb_device = { + .name = "gpio-keys", + .id = -1, + .dev = { + .platform_data = &mrst_gpio_keys, + }, +}; + +/* + * Shrink the non-existent buttons, register the gpio button + * device if there is some + */ +static int __init pb_keys_init(void) +{ + struct gpio_keys_button *gb = gpio_button; + int i, num, good = 0; + + num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); + for (i = 0; i < num; i++) { + gb[i].gpio = get_gpio_by_name(gb[i].desc); + if (gb[i].gpio == -1) + continue; + + if (i != good) + gb[good] = gb[i]; + good++; + } + + if (good) { + mrst_gpio_keys.nbuttons = good; + return platform_device_register(&pb_device); + } + return 0; +} +late_initcall(pb_keys_init); diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c new file mode 100644 index 000000000000..32cd7edd71a0 --- /dev/null +++ b/arch/x86/platform/mrst/vrtc.c @@ -0,0 +1,165 @@ +/* + * vrtc.c: Driver for virtual RTC device on Intel MID platform + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * VRTC is emulated by system controller firmware, the real HW + * RTC is located in the PMIC device. SCU FW shadows PMIC RTC + * in a memory mapped IO space that is visible to the host IA + * processor. + * + * This driver is based on RTC CMOS driver. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sfi.h> +#include <linux/platform_device.h> + +#include <asm/mrst.h> +#include <asm/mrst-vrtc.h> +#include <asm/time.h> +#include <asm/fixmap.h> + +static unsigned char __iomem *vrtc_virt_base; + +unsigned char vrtc_cmos_read(unsigned char reg) +{ + unsigned char retval; + + /* vRTC's registers range from 0x0 to 0xD */ + if (reg > 0xd || !vrtc_virt_base) + return 0xff; + + lock_cmos_prefix(reg); + retval = __raw_readb(vrtc_virt_base + (reg << 2)); + lock_cmos_suffix(reg); + return retval; +} +EXPORT_SYMBOL_GPL(vrtc_cmos_read); + +void vrtc_cmos_write(unsigned char val, unsigned char reg) +{ + if (reg > 0xd || !vrtc_virt_base) + return; + + lock_cmos_prefix(reg); + __raw_writeb(val, vrtc_virt_base + (reg << 2)); + lock_cmos_suffix(reg); +} +EXPORT_SYMBOL_GPL(vrtc_cmos_write); + +unsigned long vrtc_get_time(void) +{ + u8 sec, min, hour, mday, mon; + u32 year; + + while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP)) + cpu_relax(); + + sec = vrtc_cmos_read(RTC_SECONDS); + min = vrtc_cmos_read(RTC_MINUTES); + hour = vrtc_cmos_read(RTC_HOURS); + mday = vrtc_cmos_read(RTC_DAY_OF_MONTH); + mon = vrtc_cmos_read(RTC_MONTH); + year = vrtc_cmos_read(RTC_YEAR); + + /* vRTC YEAR reg contains the offset to 1960 */ + year += 1960; + + printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " + "mon: %d year: %d\n", sec, min, hour, mday, mon, year); + + return mktime(year, mon, mday, hour, min, sec); +} + +/* Only care about the minutes and seconds */ +int vrtc_set_mmss(unsigned long nowtime) +{ + int real_sec, real_min; + int vrtc_min; + + vrtc_min = vrtc_cmos_read(RTC_MINUTES); + + real_sec = nowtime % 60; + real_min = nowtime / 60; + if (((abs(real_min - vrtc_min) + 15)/30) & 1) + real_min += 30; + real_min %= 60; + + vrtc_cmos_write(real_sec, RTC_SECONDS); + vrtc_cmos_write(real_min, RTC_MINUTES); + return 0; +} + +void __init mrst_rtc_init(void) +{ + unsigned long rtc_paddr; + void __iomem *virt_base; + + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); + if (!sfi_mrtc_num) + return; + + rtc_paddr = sfi_mrtc_array[0].phys_addr; + + /* vRTC's register address may not be page aligned */ + set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr); + + virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC); + virt_base += rtc_paddr & ~PAGE_MASK; + vrtc_virt_base = virt_base; + + x86_platform.get_wallclock = vrtc_get_time; + x86_platform.set_wallclock = vrtc_set_mmss; +} + +/* + * The Moorestown platform has a memory mapped virtual RTC device that emulates + * the programming interface of the RTC. + */ + +static struct resource vrtc_resources[] = { + [0] = { + .flags = IORESOURCE_MEM, + }, + [1] = { + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device vrtc_device = { + .name = "rtc_mrst", + .id = -1, + .resource = vrtc_resources, + .num_resources = ARRAY_SIZE(vrtc_resources), +}; + +/* Register the RTC device if appropriate */ +static int __init mrst_device_create(void) +{ + /* No Moorestown, no device */ + if (!mrst_identify_cpu()) + return -ENODEV; + /* No timer, no device */ + if (!sfi_mrtc_num) + return -ENODEV; + + /* iomem resource */ + vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr; + vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr + + MRST_VRTC_MAP_SZ; + /* irq resource */ + vrtc_resources[1].start = sfi_mrtc_array[0].irq; + vrtc_resources[1].end = sfi_mrtc_array[0].irq; + + return platform_device_register(&vrtc_device); +} + +module_init(mrst_device_create); diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index dd4c281ffe57..ca54875ac795 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address) /* All CPUs enumerated by SFI must be present and enabled */ static void __cpuinit mp_sfi_register_lapic(u8 id) { - if (MAX_APICS - id <= 0) { + if (MAX_LOCAL_APIC - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_APICS); + id, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index ba9caa808a9c..df58e9cad96a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode) /* * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub + * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE) */ bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); @@ -1490,7 +1490,7 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static void __init uv_init_per_cpu(int nuvhubs) +static int __init uv_init_per_cpu(int nuvhubs) { int i; int cpu; @@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs) struct bau_control *smaster = NULL; struct socket_desc { short num_cpus; - short cpu_number[16]; + short cpu_number[MAX_CPUS_PER_SOCKET]; }; struct uvhub_desc { unsigned short socket_mask; @@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs) sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; + if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { + printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus); + return 1; + } } for (uvhub = 0; uvhub < nuvhubs; uvhub++) { if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) @@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs) bcp->uvhub_master = hmaster; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> blade_processor_id; + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { + printk(KERN_EMERG + "%d cpus per uvhub invalid\n", + bcp->uvhub_cpu); + return 1; + } } nextsocket: socket++; @@ -1595,6 +1605,7 @@ nextsocket: bcp->congested_reps = congested_reps; bcp->congested_period = congested_period; } + return 0; } /* @@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void) spin_lock_init(&disable_lock); congested_cycles = microsec_2_cycles(congested_response_us); - uv_init_per_cpu(nuvhubs); + if (uv_init_per_cpu(nuvhubs)) { + nobau = 1; + return 0; + } uv_partition_base_pnode = 0x7fffffff; for (uvhub = 0; uvhub < nuvhubs; uvhub++) diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 3371bd053b89..632037671746 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) ver = m->apicver; if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_APICS); + m->apicid, MAX_LOCAL_APIC); return; } diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 5718566e00f9..d9926afec110 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_type id, int __init acpi_numa_init(void) { int ret = 0; + int nr_cpu_entries = nr_cpu_ids; + +#ifdef CONFIG_X86 + /* + * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= + * SRAT cpu entries could have different order with that in MADT. + * So go over all cpu entries in SRAT to get apicid to node mapping. + */ + nr_cpu_entries = MAX_LOCAL_APIC; +#endif /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, - acpi_parse_x2apic_affinity, nr_cpu_ids); + acpi_parse_x2apic_affinity, nr_cpu_entries); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, - acpi_parse_processor_affinity, nr_cpu_ids); + acpi_parse_processor_affinity, nr_cpu_entries); ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 42396df55556..9252e85706ef 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -38,7 +38,7 @@ static int agp_bridges_found; static void amd64_tlbflush(struct agp_memory *temp) { - k8_flush_garts(); + amd_flush_garts(); } static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) @@ -124,7 +124,7 @@ static int amd64_fetch_size(void) u32 temp; struct aper_size_info_32 *values; - dev = k8_northbridges.nb_misc[0]; + dev = node_to_amd_nb(0)->misc; if (dev==NULL) return 0; @@ -181,16 +181,15 @@ static int amd_8151_configure(void) unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; /* Configure AGP regs in each x86-64 host bridge. */ - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { agp_bridge->gart_bus_addr = - amd64_configure(k8_northbridges.nb_misc[i], - gatt_bus); + amd64_configure(node_to_amd_nb(i)->misc, gatt_bus); } - k8_flush_garts(); + amd_flush_garts(); return 0; } @@ -200,11 +199,11 @@ static void amd64_cleanup(void) u32 tmp; int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* disable gart translation */ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); tmp &= ~GARTEN; @@ -331,15 +330,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr) { int i; - if (cache_k8_northbridges() < 0) + if (amd_cache_northbridges() < 0) return -ENODEV; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return -ENODEV; i = 0; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; if (fix_northbridge(dev, pdev, cap_ptr) < 0) { dev_err(&dev->dev, "no usable aperture found\n"); #ifdef __x86_64__ @@ -416,7 +415,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev) } /* shadow x86-64 registers into ULi registers */ - pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE, &httfea); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -484,7 +483,7 @@ static int nforce3_agp_init(struct pci_dev *pdev) pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp); /* shadow x86-64 registers into NVIDIA registers */ - pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE, + pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE, &apbase); /* if x86-64 aperture base is beyond 4G, exit here */ @@ -778,7 +777,7 @@ int __init agp_amd64_init(void) } /* First check that we have at least one AMD64 NB */ - if (!pci_dev_present(k8_nb_ids)) + if (!pci_dev_present(amd_nb_misc_ids)) return -ENODEV; /* Look for any AGP bridge */ diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index eca9ba193e94..df211181fca4 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void) opstate_init(); - if (cache_k8_northbridges() < 0) + if (amd_cache_northbridges() < 0) goto err_ret; msrs = msrs_alloc(); @@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void) * to finish initialization of the MC instances. */ err = -ENODEV; - for (nb = 0; nb < k8_northbridges.num; nb++) { + for (nb = 0; nb < amd_nb_num(); nb++) { if (!pvt_lookup[nb]) continue; diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 41a9e34899ac..ca35b0ce944a 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -26,6 +26,7 @@ #include <linux/sfi.h> #include <asm/mrst.h> #include <asm/intel_scu_ipc.h> +#include <asm/mrst.h> /* IPC defines the following message types */ #define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */ @@ -699,6 +700,9 @@ static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id) iounmap(ipcdev.ipc_base); return -ENOMEM; } + + intel_scu_devices_create(); + return 0; } @@ -720,6 +724,7 @@ static void ipc_remove(struct pci_dev *pdev) iounmap(ipcdev.ipc_base); iounmap(ipcdev.i2c_base); ipcdev.pdev = NULL; + intel_scu_devices_destroy(); } static const struct pci_device_id pci_ids[] = { diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 2883428d5ac8..4941cade319f 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -463,6 +463,18 @@ config RTC_DRV_CMOS This driver can also be built as a module. If so, the module will be called rtc-cmos. +config RTC_DRV_VRTC + tristate "Virtual RTC for Moorestown platforms" + depends on X86_MRST + default y if X86_MRST + + help + Say "yes" here to get direct support for the real time clock + found on Moorestown platforms. The VRTC is a emulated RTC that + derives its clock source from a real RTC in the PMIC. The MC146818 + style programming interface is mostly conserved, but any + updates are done via IPC calls to the system controller FW. + config RTC_DRV_DS1216 tristate "Dallas DS1216" depends on SNI_RM diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 4c2832df4697..2afdaf3ff986 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_RTC_DRV_CMOS) += rtc-cmos.o obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o +obj-$(CONFIG_RTC_DRV_VRTC) += rtc-mrst.o obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c new file mode 100644 index 000000000000..bcd0cf63eb16 --- /dev/null +++ b/drivers/rtc/rtc-mrst.c @@ -0,0 +1,582 @@ +/* + * rtc-mrst.c: Driver for Moorestown virtual RTC + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * Feng Tang (feng.tang@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * VRTC is emulated by system controller firmware, the real HW + * RTC is located in the PMIC device. SCU FW shadows PMIC RTC + * in a memory mapped IO space that is visible to the host IA + * processor. + * + * This driver is based upon drivers/rtc/rtc-cmos.c + */ + +/* + * Note: + * * vRTC only supports binary mode and 24H mode + * * vRTC only support PIE and AIE, no UIE, and its PIE only happens + * at 23:59:59pm everyday, no support for adjustable frequency + * * Alarm function is also limited to hr/min/sec. + */ + +#include <linux/mod_devicetable.h> +#include <linux/platform_device.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sfi.h> + +#include <asm-generic/rtc.h> +#include <asm/intel_scu_ipc.h> +#include <asm/mrst.h> +#include <asm/mrst-vrtc.h> + +struct mrst_rtc { + struct rtc_device *rtc; + struct device *dev; + int irq; + struct resource *iomem; + + u8 enabled_wake; + u8 suspend_ctrl; +}; + +static const char driver_name[] = "rtc_mrst"; + +#define RTC_IRQMASK (RTC_PF | RTC_AF) + +static inline int is_intr(u8 rtc_intr) +{ + if (!(rtc_intr & RTC_IRQF)) + return 0; + return rtc_intr & RTC_IRQMASK; +} + +/* + * rtc_time's year contains the increment over 1900, but vRTC's YEAR + * register can't be programmed to value larger than 0x64, so vRTC + * driver chose to use 1960 (1970 is UNIX time start point) as the base, + * and does the translation at read/write time. + * + * Why not just use 1970 as the offset? it's because using 1960 will + * make it consistent in leap year setting for both vrtc and low-level + * physical rtc devices. + */ +static int mrst_read_time(struct device *dev, struct rtc_time *time) +{ + unsigned long flags; + + if (rtc_is_updating()) + mdelay(20); + + spin_lock_irqsave(&rtc_lock, flags); + time->tm_sec = vrtc_cmos_read(RTC_SECONDS); + time->tm_min = vrtc_cmos_read(RTC_MINUTES); + time->tm_hour = vrtc_cmos_read(RTC_HOURS); + time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH); + time->tm_mon = vrtc_cmos_read(RTC_MONTH); + time->tm_year = vrtc_cmos_read(RTC_YEAR); + spin_unlock_irqrestore(&rtc_lock, flags); + + /* Adjust for the 1960/1900 */ + time->tm_year += 60; + time->tm_mon--; + return RTC_24H; +} + +static int mrst_set_time(struct device *dev, struct rtc_time *time) +{ + int ret; + unsigned long flags; + unsigned char mon, day, hrs, min, sec; + unsigned int yrs; + + yrs = time->tm_year; + mon = time->tm_mon + 1; /* tm_mon starts at zero */ + day = time->tm_mday; + hrs = time->tm_hour; + min = time->tm_min; + sec = time->tm_sec; + + if (yrs < 70 || yrs > 138) + return -EINVAL; + yrs -= 60; + + spin_lock_irqsave(&rtc_lock, flags); + + vrtc_cmos_write(yrs, RTC_YEAR); + vrtc_cmos_write(mon, RTC_MONTH); + vrtc_cmos_write(day, RTC_DAY_OF_MONTH); + vrtc_cmos_write(hrs, RTC_HOURS); + vrtc_cmos_write(min, RTC_MINUTES); + vrtc_cmos_write(sec, RTC_SECONDS); + + spin_unlock_irqrestore(&rtc_lock, flags); + + ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME); + return ret; +} + +static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char rtc_control; + + if (mrst->irq <= 0) + return -EIO; + + /* Basic alarms only support hour, minute, and seconds fields. + * Some also support day and month, for alarms up to a year in + * the future. + */ + t->time.tm_mday = -1; + t->time.tm_mon = -1; + t->time.tm_year = -1; + + /* vRTC only supports binary mode */ + spin_lock_irq(&rtc_lock); + t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM); + t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM); + t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM); + + rtc_control = vrtc_cmos_read(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + t->enabled = !!(rtc_control & RTC_AIE); + t->pending = 0; + + return 0; +} + +static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control) +{ + unsigned char rtc_intr; + + /* + * NOTE after changing RTC_xIE bits we always read INTR_FLAGS; + * allegedly some older rtcs need that to handle irqs properly + */ + rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS); + rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF; + if (is_intr(rtc_intr)) + rtc_update_irq(mrst->rtc, 1, rtc_intr); +} + +static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask) +{ + unsigned char rtc_control; + + /* + * Flush any pending IRQ status, notably for update irqs, + * before we enable new IRQs + */ + rtc_control = vrtc_cmos_read(RTC_CONTROL); + mrst_checkintr(mrst, rtc_control); + + rtc_control |= mask; + vrtc_cmos_write(rtc_control, RTC_CONTROL); + + mrst_checkintr(mrst, rtc_control); +} + +static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask) +{ + unsigned char rtc_control; + + rtc_control = vrtc_cmos_read(RTC_CONTROL); + rtc_control &= ~mask; + vrtc_cmos_write(rtc_control, RTC_CONTROL); + mrst_checkintr(mrst, rtc_control); +} + +static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char hrs, min, sec; + int ret = 0; + + if (!mrst->irq) + return -EIO; + + hrs = t->time.tm_hour; + min = t->time.tm_min; + sec = t->time.tm_sec; + + spin_lock_irq(&rtc_lock); + /* Next rtc irq must not be from previous alarm setting */ + mrst_irq_disable(mrst, RTC_AIE); + + /* Update alarm */ + vrtc_cmos_write(hrs, RTC_HOURS_ALARM); + vrtc_cmos_write(min, RTC_MINUTES_ALARM); + vrtc_cmos_write(sec, RTC_SECONDS_ALARM); + + spin_unlock_irq(&rtc_lock); + + ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM); + if (ret) + return ret; + + spin_lock_irq(&rtc_lock); + if (t->enabled) + mrst_irq_enable(mrst, RTC_AIE); + + spin_unlock_irq(&rtc_lock); + + return 0; +} + +static int mrst_irq_set_state(struct device *dev, int enabled) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned long flags; + + if (!mrst->irq) + return -ENXIO; + + spin_lock_irqsave(&rtc_lock, flags); + + if (enabled) + mrst_irq_enable(mrst, RTC_PIE); + else + mrst_irq_disable(mrst, RTC_PIE); + + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; +} + +#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE) + +/* Currently, the vRTC doesn't support UIE ON/OFF */ +static int +mrst_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned long flags; + + switch (cmd) { + case RTC_AIE_OFF: + case RTC_AIE_ON: + if (!mrst->irq) + return -EINVAL; + break; + default: + /* PIE ON/OFF is handled by mrst_irq_set_state() */ + return -ENOIOCTLCMD; + } + + spin_lock_irqsave(&rtc_lock, flags); + switch (cmd) { + case RTC_AIE_OFF: /* alarm off */ + mrst_irq_disable(mrst, RTC_AIE); + break; + case RTC_AIE_ON: /* alarm on */ + mrst_irq_enable(mrst, RTC_AIE); + break; + } + spin_unlock_irqrestore(&rtc_lock, flags); + return 0; +} + +#else +#define mrst_rtc_ioctl NULL +#endif + +#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE) + +static int mrst_procfs(struct device *dev, struct seq_file *seq) +{ + unsigned char rtc_control, valid; + + spin_lock_irq(&rtc_lock); + rtc_control = vrtc_cmos_read(RTC_CONTROL); + valid = vrtc_cmos_read(RTC_VALID); + spin_unlock_irq(&rtc_lock); + + return seq_printf(seq, + "periodic_IRQ\t: %s\n" + "alarm\t\t: %s\n" + "BCD\t\t: no\n" + "periodic_freq\t: daily (not adjustable)\n", + (rtc_control & RTC_PIE) ? "on" : "off", + (rtc_control & RTC_AIE) ? "on" : "off"); +} + +#else +#define mrst_procfs NULL +#endif + +static const struct rtc_class_ops mrst_rtc_ops = { + .ioctl = mrst_rtc_ioctl, + .read_time = mrst_read_time, + .set_time = mrst_set_time, + .read_alarm = mrst_read_alarm, + .set_alarm = mrst_set_alarm, + .proc = mrst_procfs, + .irq_set_state = mrst_irq_set_state, +}; + +static struct mrst_rtc mrst_rtc; + +/* + * When vRTC IRQ is captured by SCU FW, FW will clear the AIE bit in + * Reg B, so no need for this driver to clear it + */ +static irqreturn_t mrst_rtc_irq(int irq, void *p) +{ + u8 irqstat; + + spin_lock(&rtc_lock); + /* This read will clear all IRQ flags inside Reg C */ + irqstat = vrtc_cmos_read(RTC_INTR_FLAGS); + spin_unlock(&rtc_lock); + + irqstat &= RTC_IRQMASK | RTC_IRQF; + if (is_intr(irqstat)) { + rtc_update_irq(p, 1, irqstat); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static int __init +vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq) +{ + int retval = 0; + unsigned char rtc_control; + + /* There can be only one ... */ + if (mrst_rtc.dev) + return -EBUSY; + + if (!iomem) + return -ENODEV; + + iomem = request_mem_region(iomem->start, + iomem->end + 1 - iomem->start, + driver_name); + if (!iomem) { + dev_dbg(dev, "i/o mem already in use.\n"); + return -EBUSY; + } + + mrst_rtc.irq = rtc_irq; + mrst_rtc.iomem = iomem; + + mrst_rtc.rtc = rtc_device_register(driver_name, dev, + &mrst_rtc_ops, THIS_MODULE); + if (IS_ERR(mrst_rtc.rtc)) { + retval = PTR_ERR(mrst_rtc.rtc); + goto cleanup0; + } + + mrst_rtc.dev = dev; + dev_set_drvdata(dev, &mrst_rtc); + rename_region(iomem, dev_name(&mrst_rtc.rtc->dev)); + + spin_lock_irq(&rtc_lock); + mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE); + rtc_control = vrtc_cmos_read(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + + if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY))) + dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n"); + + if (rtc_irq) { + retval = request_irq(rtc_irq, mrst_rtc_irq, + IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev), + mrst_rtc.rtc); + if (retval < 0) { + dev_dbg(dev, "IRQ %d is already in use, err %d\n", + rtc_irq, retval); + goto cleanup1; + } + } + dev_dbg(dev, "initialised\n"); + return 0; + +cleanup1: + mrst_rtc.dev = NULL; + rtc_device_unregister(mrst_rtc.rtc); +cleanup0: + release_region(iomem->start, iomem->end + 1 - iomem->start); + dev_err(dev, "rtc-mrst: unable to initialise\n"); + return retval; +} + +static void rtc_mrst_do_shutdown(void) +{ + spin_lock_irq(&rtc_lock); + mrst_irq_disable(&mrst_rtc, RTC_IRQMASK); + spin_unlock_irq(&rtc_lock); +} + +static void __exit rtc_mrst_do_remove(struct device *dev) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + struct resource *iomem; + + rtc_mrst_do_shutdown(); + + if (mrst->irq) + free_irq(mrst->irq, mrst->rtc); + + rtc_device_unregister(mrst->rtc); + mrst->rtc = NULL; + + iomem = mrst->iomem; + release_region(iomem->start, iomem->end + 1 - iomem->start); + mrst->iomem = NULL; + + mrst->dev = NULL; + dev_set_drvdata(dev, NULL); +} + +#ifdef CONFIG_PM +static int mrst_suspend(struct device *dev, pm_message_t mesg) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char tmp; + + /* Only the alarm might be a wakeup event source */ + spin_lock_irq(&rtc_lock); + mrst->suspend_ctrl = tmp = vrtc_cmos_read(RTC_CONTROL); + if (tmp & (RTC_PIE | RTC_AIE)) { + unsigned char mask; + + if (device_may_wakeup(dev)) + mask = RTC_IRQMASK & ~RTC_AIE; + else + mask = RTC_IRQMASK; + tmp &= ~mask; + vrtc_cmos_write(tmp, RTC_CONTROL); + + mrst_checkintr(mrst, tmp); + } + spin_unlock_irq(&rtc_lock); + + if (tmp & RTC_AIE) { + mrst->enabled_wake = 1; + enable_irq_wake(mrst->irq); + } + + dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n", + (tmp & RTC_AIE) ? ", alarm may wake" : "", + tmp); + + return 0; +} + +/* + * We want RTC alarms to wake us from the deep power saving state + */ +static inline int mrst_poweroff(struct device *dev) +{ + return mrst_suspend(dev, PMSG_HIBERNATE); +} + +static int mrst_resume(struct device *dev) +{ + struct mrst_rtc *mrst = dev_get_drvdata(dev); + unsigned char tmp = mrst->suspend_ctrl; + + /* Re-enable any irqs previously active */ + if (tmp & RTC_IRQMASK) { + unsigned char mask; + + if (mrst->enabled_wake) { + disable_irq_wake(mrst->irq); + mrst->enabled_wake = 0; + } + + spin_lock_irq(&rtc_lock); + do { + vrtc_cmos_write(tmp, RTC_CONTROL); + + mask = vrtc_cmos_read(RTC_INTR_FLAGS); + mask &= (tmp & RTC_IRQMASK) | RTC_IRQF; + if (!is_intr(mask)) + break; + + rtc_update_irq(mrst->rtc, 1, mask); + tmp &= ~RTC_AIE; + } while (mask & RTC_AIE); + spin_unlock_irq(&rtc_lock); + } + + dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp); + + return 0; +} + +#else +#define mrst_suspend NULL +#define mrst_resume NULL + +static inline int mrst_poweroff(struct device *dev) +{ + return -ENOSYS; +} + +#endif + +static int __init vrtc_mrst_platform_probe(struct platform_device *pdev) +{ + return vrtc_mrst_do_probe(&pdev->dev, + platform_get_resource(pdev, IORESOURCE_MEM, 0), + platform_get_irq(pdev, 0)); +} + +static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev) +{ + rtc_mrst_do_remove(&pdev->dev); + return 0; +} + +static void vrtc_mrst_platform_shutdown(struct platform_device *pdev) +{ + if (system_state == SYSTEM_POWER_OFF && !mrst_poweroff(&pdev->dev)) + return; + + rtc_mrst_do_shutdown(); +} + +MODULE_ALIAS("platform:vrtc_mrst"); + +static struct platform_driver vrtc_mrst_platform_driver = { + .probe = vrtc_mrst_platform_probe, + .remove = __exit_p(vrtc_mrst_platform_remove), + .shutdown = vrtc_mrst_platform_shutdown, + .driver = { + .name = (char *) driver_name, + .suspend = mrst_suspend, + .resume = mrst_resume, + } +}; + +static int __init vrtc_mrst_init(void) +{ + return platform_driver_register(&vrtc_mrst_platform_driver); +} + +static void __exit vrtc_mrst_exit(void) +{ + platform_driver_unregister(&vrtc_mrst_platform_driver); +} + +module_init(vrtc_mrst_init); +module_exit(vrtc_mrst_exit); + +MODULE_AUTHOR("Jacob Pan; Feng Tang"); +MODULE_DESCRIPTION("Driver for Moorestown virtual RTC"); +MODULE_LICENSE("GPL"); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5476c066d4ee..3c4039d5eef1 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, int metadata; unsigned int revokes = 0; int x; - int error; + int error = 0; if (!*top) sm->sm_first = 0; @@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; - error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); + else if (!sdp->sd_rgrps) + error = gfs2_ri_update(ip); + if (error) return error; @@ -879,7 +883,8 @@ out_rg_gunlock: out_rlist: gfs2_rlist_free(&rlist); out: - gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); + if (ip != GFS2_I(sdp->sd_rindex)) + gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); return error; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f92c17704169..08a8beb152e6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -541,21 +541,6 @@ out_locked: spin_unlock(&gl->gl_spin); } -static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, - unsigned int req_state, - unsigned int flags) -{ - int ret = LM_OUT_ERROR; - - if (!sdp->sd_lockstruct.ls_ops->lm_lock) - return req_state == LM_ST_UNLOCKED ? 0 : req_state; - - if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, - req_state, flags); - return ret; -} - /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state @@ -575,13 +560,14 @@ __acquires(&gl->gl_spin) lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); - BUG_ON(gl->gl_state == target); - BUG_ON(gl->gl_state == gl->gl_target); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); do_error(gl, 0); /* Fail queued try locks */ } + gl->gl_req = target; spin_unlock(&gl->gl_spin); if (glops->go_xmote_th) glops->go_xmote_th(gl); @@ -594,15 +580,17 @@ __acquires(&gl->gl_spin) gl->gl_state == LM_ST_DEFERRED) && !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) lck_flags |= LM_FLAG_TRY_1CB; - ret = gfs2_lm_lock(sdp, gl, target, lck_flags); - if (!(ret & LM_OUT_ASYNC)) { - finish_xmote(gl, ret); + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + GLOCK_BUG_ON(gl, ret); + } else { /* lock_nolock */ + finish_xmote(gl, target); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); - } else { - GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC); } + spin_lock(&gl->gl_spin); } @@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); + if (seq) { struct gfs2_glock_iter *gi = seq->private; vsprintf(gi->string, fmt, args); seq_printf(seq, gi->string); } else { - printk(KERN_ERR " "); - vprintk(fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR " %pV", &vaf); } + va_end(args); } @@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl) * @gl: Pointer to the glock * @ret: The return value from the dlm * + * The gl_reply field is under the gl_spin lock so that it is ok + * to use a bitfield shared with other glock state fields. */ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + spin_lock(&gl->gl_spin); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { - spin_lock(&gl->gl_spin); if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); spin_unlock(&gl->gl_spin); return; } - spin_unlock(&gl->gl_spin); } + + spin_unlock(&gl->gl_spin); set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + smp_wmb(); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gfs2_glock_put(gl); @@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; - char buffer[KSYM_SYMBOL_LEN]; char flags_buf[32]; - sprint_symbol(buffer, gh->gh_ip); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); - gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n", - state2str(gh->gh_state), - hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), - gh->gh_error, - gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, - gh_owner ? gh_owner->comm : "(ended)", buffer); + gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, + gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, + gh_owner ? gh_owner->comm : "(ended)", + (void *)gh->gh_ip); return 0; } @@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void) } #endif - glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER | + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZEABLE, 0); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER | - WQ_FREEZEABLE, 0); + gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", + WQ_MEM_RECLAIM | WQ_FREEZEABLE, + 0); if (IS_ERR(gfs2_delete_workqueue)) { destroy_workqueue(glock_workqueue); return PTR_ERR(gfs2_delete_workqueue); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index db1c26d6d220..691851ceb615 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -87,11 +87,10 @@ enum { #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 -#define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 /* - * lm_lock() and lm_async_cb return flags + * lm_async_cb return flags * * LM_OUT_ST_MASK * Masks the lower two bits of lock state in the returned value. @@ -99,15 +98,11 @@ enum { * LM_OUT_CANCELED * The lock request was canceled. * - * LM_OUT_ASYNC - * The result of the request will be returned in an LM_CB_ASYNC callback. - * */ #define LM_OUT_ST_MASK 0x00000003 #define LM_OUT_CANCELED 0x00000008 -#define LM_OUT_ASYNC 0x00000080 -#define LM_OUT_ERROR 0x00000100 +#define LM_OUT_ERROR 0x00000004 /* * lm_recovery_done() messages @@ -124,25 +119,12 @@ struct lm_lockops { void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); - unsigned int (*lm_lock) (struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); const match_table_t *lm_tokens; }; -#define LM_FLAG_TRY 0x00000001 -#define LM_FLAG_TRY_1CB 0x00000002 -#define LM_FLAG_NOEXP 0x00000004 -#define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 - -#define GL_ASYNC 0x00000040 -#define GL_EXACT 0x00000080 -#define GL_SKIP 0x00000100 -#define GL_NOCACHE 0x00000400 - -#define GLR_TRYFAILED 13 - extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { @@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +__attribute__ ((format(printf, 2, 3))) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0d149dcc04e5..263561bf1a50 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - flush_workqueue(gfs2_delete_workqueue); gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 764fbb49efc8..8d3d2b4a0a7d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,12 +207,14 @@ struct gfs2_glock { spinlock_t gl_spin; - unsigned int gl_state; - unsigned int gl_target; - unsigned int gl_reply; + /* State fields protected by gl_spin */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + unsigned int gl_hash; - unsigned int gl_req; - unsigned int gl_demote_state; /* state requested by remote node */ unsigned long gl_demote_time; /* time of first demote request */ struct list_head gl_holders; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1213f7f9217..14e682dbe8bf 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); - - gfs2_assert_warn(GFS2_SB(inode), !error); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 1c09425b45fd..6e493aee28f8 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, return lkf; } -static unsigned int gdlm_lock(struct gfs2_glock *gl, - unsigned int req_state, unsigned int flags) +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; - int error; int req; u32 lkf; - gl->gl_req = req_state; req = make_mode(req_state); lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); @@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, * Submit the actual lock request. */ - error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, - GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); - if (error == -EAGAIN) - return 0; - if (error) - return LM_OUT_ERROR; - return LM_OUT_ASYNC; + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); } static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 12cbea7502c2..1db6b7343229 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh; u32 ouid, ogid, nuid, ngid; int error; @@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out_gunlock_q; - error = gfs2_meta_inode_buffer(ip, &dibh); + error = gfs2_setattr_simple(ip, attr); if (error) goto out_end_trans; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - gfs2_assert_warn(sdp, !error); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); gfs2_quota_change(ip, -blocks, ouid, ogid); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index f606baf9ba72..a689901963de 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } + if (fdq->d_fieldmask & FS_DQ_BCOUNT) { + qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = qp->qu_value; + } } /* Write the quota into the quota file on disk */ @@ -1509,7 +1513,7 @@ out: } /* GFS2 only supports a subset of the XFS fields */ -#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) +#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, struct fs_disk_quota *fdq) @@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, if ((fdq->d_fieldmask & FS_DQ_BSOFT) && ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & FS_DQ_BHARD) && ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; + + if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && + ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= FS_DQ_BCOUNT; + if (fdq->d_fieldmask == 0) goto out_i; @@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; - diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 33c8407b876f..7293ea27020c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode)) + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); @@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, * Returns: 0 on successful update, error code otherwise */ -static int gfs2_ri_update(struct gfs2_inode *ip) +int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; @@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip) } /** - * gfs2_ri_update_special - Pull in a new resource index from the disk - * - * This is a special version that's safe to call from gfs2_inplace_reserve_i. - * In this case we know that we don't have any resource groups in memory yet. - * - * @ip: pointer to the rindex inode - * - * Returns: 0 on successful update, error code otherwise - */ -static int gfs2_ri_update_special(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct inode *inode = &ip->i_inode; - struct file_ra_state ra_state; - struct gfs2_rgrpd *rgd; - unsigned int max_data = 0; - int error; - - file_ra_state_init(&ra_state, inode->i_mapping); - for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { - /* Ignore partials */ - if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - i_size_read(inode)) - break; - error = read_rindex_entry(ip, &ra_state); - if (error) { - clear_rgrpdi(sdp); - return error; - } - } - list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) - if (rgd->rd_data > max_data) - max_data = rgd->rd_data; - sdp->sd_max_rg_data = max_data; - - sdp->sd_rindex_uptodate = 1; - return 0; -} - -/** * gfs2_rindex_hold - Grab a lock on the rindex * @sdp: The GFS2 superblock * @ri_gh: the glock holder @@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, error = gfs2_rindex_hold(sdp, &al->al_ri_gh); else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ - error = gfs2_ri_update_special(ip); + error = gfs2_ri_update(ip); if (error) return error; } +try_again: do { error = get_local_rgrp(ip, &last_unlinked); /* If there is no space, flushing the log may release some */ - if (error) + if (error) { + if (ip == GFS2_I(sdp->sd_rindex) && + !sdp->sd_rindex_uptodate) { + error = gfs2_ri_update(ip); + if (error) + return error; + goto try_again; + } gfs2_log_flush(sdp, NULL); + } } while (error && tries++ < 3); if (error) { diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 0e35c0466f9a..50c2bb04369c 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, extern void gfs2_inplace_release(struct gfs2_inode *ip); +extern int gfs2_ri_update(struct gfs2_inode *ip); extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 30b58f07c8a6..439b61c03262 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1296,10 +1296,8 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { - struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_location el; - struct buffer_head *dibh; int error; error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); @@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (error) return error; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_trans_end; - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = vmtruncate(inode, attr->ia_size); - gfs2_assert_warn(GFS2_SB(inode), !error); - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - -out_trans_end: + error = gfs2_setattr_simple(ip, attr); gfs2_trans_end(sdp); return error; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 182845147fe4..08cba2c3b612 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + long nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &nice); + if (err) + return -EINVAL; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = nice; + err = proc_sched_autogroup_set_nice(p, &err); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -2733,6 +2809,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK INF("syscall", S_IRUSR, proc_pid_syscall), diff --git a/include/linux/completion.h b/include/linux/completion.h index 36d57f74cd01..51494e6b5548 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); -extern unsigned long wait_for_completion_interruptible_timeout( - struct completion *x, unsigned long timeout); -extern unsigned long wait_for_completion_killable_timeout( - struct completion *x, unsigned long timeout); +extern long wait_for_completion_interruptible_timeout( + struct completion *x, unsigned long timeout); +extern long wait_for_completion_killable_timeout( + struct completion *x, unsigned long timeout); extern bool try_wait_for_completion(struct completion *x); extern bool completion_done(struct completion *x); diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index a90b3892074a..1c70028f81f9 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, extern int ddebug_remove_module(const char *mod_name); #define dynamic_pr_debug(fmt, ...) do { \ - __label__ do_printk; \ - __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - JUMP_LABEL(&descriptor.enabled, do_printk); \ - goto out; \ -do_printk: \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -out: ; \ + if (unlikely(descriptor.enabled)) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #define dynamic_dev_dbg(dev, fmt, ...) do { \ - __label__ do_printk; \ - __label__ out; \ static struct _ddebug descriptor \ __used \ __attribute__((section("__verbose"), aligned(8))) = \ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \ _DPRINTK_FLAGS_DEFAULT }; \ - JUMP_LABEL(&descriptor.enabled, do_printk); \ - goto out; \ -do_printk: \ - dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ -out: ; \ + if (unlikely(descriptor.enabled)) \ + dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \ } while (0) #else diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0c1b857d3d..330586ffffbb 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -22,7 +22,7 @@ #include <linux/wait.h> #include <linux/percpu.h> #include <linux/timer.h> - +#include <linux/timerqueue.h> struct hrtimer_clock_base; struct hrtimer_cpu_base; @@ -79,8 +79,8 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure - * @node: red black tree node for time ordered insertion - * @_expires: the absolute expiry time in the hrtimers internal + * @node: timerqueue node, which also manages node.expires, + * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding * slack to the _softexpires value. For non range timers @@ -101,8 +101,7 @@ enum hrtimer_restart { * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { - struct rb_node node; - ktime_t _expires; + struct timerqueue_node node; ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; @@ -141,8 +140,7 @@ struct hrtimer_sleeper { struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; clockid_t index; - struct rb_root active; - struct rb_node *first; + struct timerqueue_head active; ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; @@ -158,7 +156,6 @@ struct hrtimer_clock_base { * @lock: lock protecting the base and associated clock bases * and timers * @clock_base: array of clock bases for this cpu - * @curr_timer: the timer which is executing a callback right now * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode @@ -184,43 +181,43 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { - timer->_expires = time; + timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; - timer->_expires = ktime_add_safe(time, delta); + timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) { timer->_softexpires = time; - timer->_expires = ktime_add_safe(time, ns_to_ktime(delta)); + timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { - timer->_expires.tv64 = tv64; + timer->node.expires.tv64 = tv64; timer->_softexpires.tv64 = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { - timer->_expires = ktime_add_safe(timer->_expires, time); + timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { - timer->_expires = ktime_add_ns(timer->_expires, ns); + timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { - return timer->_expires; + return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) @@ -230,7 +227,7 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { - return timer->_expires.tv64; + return timer->node.expires.tv64; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { @@ -239,12 +236,12 @@ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { - return ktime_to_ns(timer->_expires); + return ktime_to_ns(timer->node.expires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return ktime_sub(timer->_expires, timer->base->get_time()); + return ktime_sub(timer->node.expires, timer->base->get_time()); } #ifdef CONFIG_HIGH_RES_TIMERS diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f8c06ce0fa6..caa151fbebb7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -12,6 +12,13 @@ #include <linux/securebits.h> #include <net/net_namespace.h> +#ifdef CONFIG_SMP +# define INIT_PUSHABLE_TASKS(tsk) \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), +#else +# define INIT_PUSHABLE_TASKS(tsk) +#endif + extern struct files_struct init_files; extern struct fs_struct init_fs; @@ -83,6 +90,12 @@ extern struct group_info init_groups; */ # define CAP_INIT_BSET CAP_FULL_SET +#ifdef CONFIG_RCU_BOOST +#define INIT_TASK_RCU_BOOST() \ + .rcu_boost_mutex = NULL, +#else +#define INIT_TASK_RCU_BOOST() +#endif #ifdef CONFIG_TREE_PREEMPT_RCU #define INIT_TASK_RCU_TREE_PREEMPT() \ .rcu_blocked_node = NULL, @@ -94,7 +107,8 @@ extern struct group_info init_groups; .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ - INIT_TASK_RCU_TREE_PREEMPT() + INIT_TASK_RCU_TREE_PREEMPT() \ + INIT_TASK_RCU_BOOST() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif @@ -137,7 +151,7 @@ extern struct cred init_cred; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ - .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ + INIT_PUSHABLE_TASKS(tsk) \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 79d0c4f6d071..55e0d4253e49 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -114,15 +114,15 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); struct irqaction { irq_handler_t handler; unsigned long flags; - const char *name; void *dev_id; struct irqaction *next; int irq; - struct proc_dir_entry *dir; irq_handler_t thread_fn; struct task_struct *thread; unsigned long thread_flags; -}; + const char *name; + struct proc_dir_entry *dir; +} ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); diff --git a/include/linux/module.h b/include/linux/module.h index 7575bbbdf2a2..8b17fd8c790d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -308,6 +308,9 @@ struct module /* The size of the executable code in each section. */ unsigned int init_text_size, core_text_size; + /* Size of RO sections of the module (text+rodata) */ + unsigned int init_ro_size, core_ro_size; + /* Arch-specific module values */ struct mod_arch_specific arch; @@ -672,7 +675,6 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter) { return 0; } - #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS @@ -687,6 +689,13 @@ extern int module_sysfs_initialized; #define __MODULE_STRING(x) __stringify(x) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +extern void set_all_modules_text_rw(void); +extern void set_all_modules_text_ro(void); +#else +static inline void set_all_modules_text_rw(void) { } +static inline void set_all_modules_text_ro(void) { } +#endif #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f363bc8fdc74..94b48bd40dd7 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); +#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX +#define arch_mutex_cpu_relax() cpu_relax() +#endif + #endif diff --git a/include/linux/rculist.h b/include/linux/rculist.h index f31ef61f1c65..2dea94fc4402 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list, #define list_first_entry_rcu(ptr, type, member) \ list_entry_rcu((ptr)->next, type, member) -#define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference_raw(list_next_rcu(head)); \ - pos != (head); \ - pos = rcu_dereference_raw(list_next_rcu((pos))) - /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 03cda7bed985..af5614856285 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,6 +47,8 @@ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) @@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head, extern void synchronize_sched(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); -extern void synchronize_sched_expedited(void); extern int sched_expedited_torture_stats(char *page); static inline void __rcu_read_lock_bh(void) @@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ -extern void rcu_init(void); extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern void rcu_check_callbacks(int cpu, int user); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 13877cb93a60..30ebd7c8d874 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,7 +27,9 @@ #include <linux/cache.h> -#define rcu_init_sched() do { } while (0) +static inline void rcu_init(void) +{ +} #ifdef CONFIG_TINY_RCU @@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void) synchronize_sched(); } +static inline void synchronize_sched_expedited(void) +{ + synchronize_sched(); +} + #ifdef CONFIG_TINY_RCU static inline void rcu_preempt_note_context_switch(void) @@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void) } #ifdef CONFIG_DEBUG_LOCK_ALLOC - extern int rcu_scheduler_active __read_mostly; extern void rcu_scheduler_starting(void); - #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - static inline void rcu_scheduler_starting(void) { } - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 95518e628794..3a933482734a 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,6 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H +extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern void rcu_cpu_stall_reset(void); @@ -47,6 +48,7 @@ static inline void exit_rcu(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ extern void synchronize_rcu_bh(void); +extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index a99d735db3df..777cd01e240e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -513,6 +513,8 @@ struct thread_group_cputimer { spinlock_t lock; }; +struct autogroup; + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -580,6 +582,9 @@ struct signal_struct { struct tty_struct *tty; /* NULL if no tty */ +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. @@ -1233,13 +1238,18 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rcu_boost_mutex; +#endif /* #ifdef CONFIG_RCU_BOOST */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif struct list_head tasks; +#ifdef CONFIG_SMP struct plist_node pushable_tasks; +#endif struct mm_struct *mm, *active_mm; #if defined(SPLIT_RSS_COUNTING) @@ -1763,7 +1773,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ +#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */ +#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */ static inline void rcu_copy_process(struct task_struct *p) { @@ -1771,7 +1782,10 @@ static inline void rcu_copy_process(struct task_struct *p) p->rcu_read_unlock_special = 0; #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; -#endif +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + p->rcu_boost_mutex = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); } @@ -1876,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU -extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} #endif -extern void sched_idle_next(void); - #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) extern void wake_up_idle_cpu(int cpu); #else @@ -1893,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { } extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_shares_ratelimit; -extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_child_runs_first; enum sched_tunable_scaling { @@ -1910,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; +extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, @@ -1935,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_compat_yield; +#ifdef CONFIG_SCHED_AUTOGROUP +extern unsigned int sysctl_sched_autogroup_enabled; + +extern void sched_autogroup_create_attach(struct task_struct *p); +extern void sched_autogroup_detach(struct task_struct *p); +extern void sched_autogroup_fork(struct signal_struct *sig); +extern void sched_autogroup_exit(struct signal_struct *sig); +#ifdef CONFIG_PROC_FS +extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); +#endif +#else +static inline void sched_autogroup_create_attach(struct task_struct *p) { } +static inline void sched_autogroup_detach(struct task_struct *p) { } +static inline void sched_autogroup_fork(struct signal_struct *sig) { } +static inline void sched_autogroup_exit(struct signal_struct *sig) { } +#endif + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); @@ -1953,9 +1981,10 @@ extern int task_nice(const struct task_struct *p); extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); -extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); +extern int sched_setscheduler(struct task_struct *, int, + const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, - struct sched_param *); + const struct sched_param *); extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); diff --git a/include/linux/sfi.h b/include/linux/sfi.h index 7f770c638e99..fe817918b30e 100644 --- a/include/linux/sfi.h +++ b/include/linux/sfi.h @@ -77,6 +77,8 @@ #define SFI_OEM_ID_SIZE 6 #define SFI_OEM_TABLE_ID_SIZE 8 +#define SFI_NAME_LEN 16 + #define SFI_SYST_SEARCH_BEGIN 0x000E0000 #define SFI_SYST_SEARCH_END 0x000FFFFF @@ -156,13 +158,13 @@ struct sfi_device_table_entry { u16 addr; u8 irq; u32 max_freq; - char name[16]; + char name[SFI_NAME_LEN]; } __packed; struct sfi_gpio_table_entry { - char controller_name[16]; + char controller_name[SFI_NAME_LEN]; u16 pin_no; - char pin_name[16]; + char pin_name[SFI_NAME_LEN]; } __packed; typedef int (*sfi_table_handler) (struct sfi_table_header *table); diff --git a/include/linux/timer.h b/include/linux/timer.h index 38cf093ef62c..6abd9138beda 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -24,9 +24,9 @@ struct timer_list { int slack; #ifdef CONFIG_TIMER_STATS + int start_pid; void *start_site; char start_comm[16]; - int start_pid; #endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; @@ -48,12 +48,38 @@ extern struct tvec_base boot_tvec_bases; #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif +/* + * Note that all tvec_bases are 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .entry = { .prev = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ .base = &boot_tvec_bases, \ + .slack = -1, \ + __TIMER_LOCKDEP_MAP_INITIALIZER( \ + __FILE__ ":" __stringify(__LINE__)) \ + } + +#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *) \ + ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG)) + +#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\ + .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .function = (_function), \ + .expires = (_expires), \ + .data = (_data), \ + .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases), \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ } @@ -248,11 +274,11 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) extern void add_timer(struct timer_list *timer); +extern int try_to_del_timer_sync(struct timer_list *timer); + #ifdef CONFIG_SMP - extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else -# define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h new file mode 100644 index 000000000000..d24aabaca474 --- /dev/null +++ b/include/linux/timerqueue.h @@ -0,0 +1,50 @@ +#ifndef _LINUX_TIMERQUEUE_H +#define _LINUX_TIMERQUEUE_H + +#include <linux/rbtree.h> +#include <linux/ktime.h> + + +struct timerqueue_node { + struct rb_node node; + ktime_t expires; +}; + +struct timerqueue_head { + struct rb_root head; + struct timerqueue_node *next; +}; + + +extern void timerqueue_add(struct timerqueue_head *head, + struct timerqueue_node *node); +extern void timerqueue_del(struct timerqueue_head *head, + struct timerqueue_node *node); +extern struct timerqueue_node *timerqueue_iterate_next( + struct timerqueue_node *node); + +/** + * timerqueue_getnext - Returns the timer with the earlies expiration time + * + * @head: head of timerqueue + * + * Returns a pointer to the timer node that has the + * earliest expiration time. + */ +static inline +struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +{ + return head->next; +} + +static inline void timerqueue_init(struct timerqueue_node *node) +{ + RB_CLEAR_NODE(&node->node); +} + +static inline void timerqueue_init_head(struct timerqueue_head *head) +{ + head->head = RB_ROOT; + head->next = NULL; +} +#endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d3e4f87e95c0..c6814616653b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -32,7 +32,7 @@ struct tracepoint { int state; /* State. */ void (*regfunc)(void); void (*unregfunc)(void); - struct tracepoint_func *funcs; + struct tracepoint_func __rcu *funcs; } __attribute__((aligned(32))); /* * Aligned on 32 bytes because it is * globally visible and gcc happily @@ -326,7 +326,7 @@ do_trace: \ * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; * __entry->next_prio = next->prio; - * ) + * ), * * * * * Formatted output of a trace record via TP_printk(). diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0c0771f06bfa..bd257fee6031 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -127,12 +127,20 @@ struct execute_work { .timer = TIMER_INITIALIZER(NULL, 0, 0), \ } +#define __DEFERRED_WORK_INITIALIZER(n, f) { \ + .work = __WORK_INITIALIZER((n).work, (f)), \ + .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0), \ + } + #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) +#define DECLARE_DEFERRED_WORK(n, f) \ + struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f) + /* * initialize a work item's function pointer */ diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index b0b4eb24d592..da39b22636f7 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -21,6 +21,16 @@ #undef CREATE_TRACE_POINTS #include <linux/stringify.h> +/* + * module.h includes tracepoints, and because ftrace.h + * pulls in module.h: + * trace/ftrace.h -> linux/ftrace_event.h -> linux/perf_event.h -> + * linux/ftrace.h -> linux/module.h + * we must include module.h here before we play with any of + * the TRACE_EVENT() macros, otherwise the tracepoints included + * by module.h may break the build. + */ +#include <linux/module.h> #undef TRACE_EVENT #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 75ce9d500d8e..f10293c41b1e 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -25,9 +25,7 @@ TRACE_EVENT(kfree_skb, TP_fast_assign( __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } + __entry->protocol = ntohs(skb->protocol); __entry->location = location; ), diff --git a/init/Kconfig b/init/Kconfig index c9728992a776..8dfd094e6875 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -393,7 +393,6 @@ config PREEMPT_RCU config RCU_TRACE bool "Enable tracing for RCU" - depends on TREE_RCU || TREE_PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. @@ -459,6 +458,60 @@ config TREE_RCU_TRACE TREE_PREEMPT_RCU implementations, permitting Makefile to trivially select kernel/rcutree_trace.c. +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && TINY_PREEMPT_RCU + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_PRIO + int "Real-time priority to boost RCU readers to" + range 1 99 + depends on RCU_BOOST + default 1 + help + This option specifies the real-time priority to which preempted + RCU readers are to be boosted. If you are working with CPU-bound + real-time applications, you should specify a priority higher then + the highest-priority CPU-bound application. + + Specify the real-time priority, or take the default if unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config SRCU_SYNCHRONIZE_DELAY + int "Microseconds to delay before waiting for readers" + range 0 20 + default 10 + help + This option controls how long SRCU delays before entering its + loop waiting on SRCU readers. The purpose of this loop is + to avoid the unconditional context-switch penalty that would + otherwise be incurred if there was an active SRCU reader, + in a manner similar to adaptive locking schemes. This should + be set to be a bit longer than the common-case SRCU read-side + critical-section overhead. + + Accept the default if unsure. + endmenu # "RCU Subsystem" config IKCONFIG @@ -741,6 +794,19 @@ config NET_NS endif # NAMESPACES +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select EVENTFD + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. + config MM_OWNER bool diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff083fa22..e0f2831634b4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f18491..156cc5556140 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { - struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -198,7 +197,6 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; - unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. */ @@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param) cpu_notify(CPU_DYING | param->mod, param->hcpu); - if (task_cpu(param->caller) == cpu) - move_task_off_dead_cpu(cpu, param->caller); - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. */ - sched_idle_next(); return 0; } @@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { - .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } BUG_ON(cpu_online(cpu)); - /* Wait for it to sleep (leaving idle task). */ + /* + * The migration_call() CPU_DYING callback will have removed all + * runnable tasks from the cpu, there's only the idle task left now + * that the migration thread is done doing the stop_machine thing. + * + * Wait for the stop thread to go away. + */ while (!idle_cpu(cpu)) - yield(); + cpu_relax(); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -386,6 +384,14 @@ out: #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; +void __weak arch_disable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_disable_nonboot_cpus_end(void) +{ +} + int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; @@ -397,6 +403,7 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + arch_disable_nonboot_cpus_begin(); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { @@ -412,6 +419,8 @@ int disable_nonboot_cpus(void) } } + arch_disable_nonboot_cpus_end(); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ diff --git a/kernel/fork.c b/kernel/fork.c index 5447dc7defa9..7d164e25b0f0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (atomic_dec_and_test(&sig->sigcnt)) { + sched_autogroup_exit(sig); free_signal_struct(sig); + } } void __put_task_struct(struct task_struct *tsk) @@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) posix_cpu_timers_init_group(sig); tty_audit_fork(sig); + sched_autogroup_fork(sig); sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; @@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm: } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - free_signal_struct(p->signal); + put_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: diff --git a/kernel/futex.c b/kernel/futex.c index 40a8777a27d0..3019b92e6917 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled; #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* + * Futex flags used to encode options to functions and preserve them across + * restarts. + */ +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +/* * Priority Inheritance state: */ struct futex_pi_state { @@ -123,6 +131,12 @@ struct futex_q { u32 bitset; }; +static const struct futex_q futex_q_init = { + /* list gets initialized in queue_me()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY +}; + /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task @@ -283,8 +297,7 @@ again: return 0; } -static inline -void put_futex_key(int fshared, union futex_key *key) +static inline void put_futex_key(union futex_key *key) { drop_futex_key_refs(key); } @@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) /* * Wake up waiters matching bitset queued on this futex (uaddr). */ -static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; } @@ -917,7 +931,7 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -962,11 +976,11 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } @@ -996,9 +1010,9 @@ retry_private: double_unlock_hb(hb1, hb2); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: return ret; } @@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @flags: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) + * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. @@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * >=0 - on success, the number of tasks requeued or woken * <0 - on error */ -static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval, - int requeue_pi) +static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_requeue, + u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int drop_count = 0, task_count = 0, ret; @@ -1191,10 +1205,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1216,11 +1230,11 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -1260,8 +1274,8 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -1269,8 +1283,8 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again. */ double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); cond_resched(); goto retry; default: @@ -1352,9 +1366,9 @@ out_unlock: drop_futex_key_refs(&key1); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: if (pi_state != NULL) free_pi_state(pi_state); @@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, int fshared) + struct task_struct *newowner) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1587,20 +1601,11 @@ handle_fault: goto retry; } -/* - * In case we must use restart_block to restart a futex_wait, - * we encode in the 'flags' shared capability - */ -#define FLAGS_SHARED 0x01 -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 - static long futex_wait_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management * @uaddr: user address of the futex - * @fshared: whether the futex is shared (1) or not (0) * @q: futex_q (contains pi_state and access to the rt_mutex) * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) * @@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart); * 0 - success, lock not taken * <0 - on error (-EFAULT) */ -static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, - int locked) +static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { struct task_struct *owner; int ret = 0; @@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * did a lock-steal - fix up the PI-state in that case: */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current, fshared); + ret = fixup_pi_state_owner(uaddr, q, current); goto out; } @@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * lock. Fix the state up. */ owner = rt_mutex_owner(&q->pi_state->pi_mutex); - ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value - * @fshared: whether the futex is shared (1) or not (0) + * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @hb: storage for hash_bucket pointer to be returned to caller * @@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * 0 - uaddr contains val and hb has been locked * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; @@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, * rare, but normal. */ retry: - q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); if (unlikely(ret != 0)) return ret; @@ -1769,10 +1772,10 @@ retry_private: if (ret) goto out; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); goto retry; } @@ -1783,32 +1786,29 @@ retry_private: out: if (ret) - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); return ret; } -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to = NULL; struct restart_block *restart; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; - - q.pi_state = NULL; q.bitset = bitset; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -1819,7 +1819,7 @@ retry: * Prepare to wait on uaddr. On success, holds hb lock and increments * q.key refs. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out; @@ -1852,12 +1852,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = FLAGS_HAS_TIMEOUT; - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; + restart->futex.flags = flags; ret = -ERESTART_RESTARTBLOCK; @@ -1873,7 +1868,6 @@ out: static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - int fshared = 0; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { @@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart) tp = &t; } restart->fn = do_no_restart_syscall; - if (restart->futex.flags & FLAGS_SHARED) - fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, - restart->futex.bitset, - restart->futex.flags & FLAGS_CLOCKRT); + + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); } @@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int fshared, - int detect, ktime_t *time, int trylock) +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, + ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int res, ret; if (refill_pi_state_cache()) @@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, hrtimer_set_expires(&to->timer, *time); } - q.pi_state = NULL; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); if (unlikely(ret != 0)) goto out; @@ -1941,7 +1929,7 @@ retry_private: * exit to complete. */ queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); cond_resched(); goto retry; default: @@ -1971,7 +1959,7 @@ retry_private: * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr, fshared, &q, !ret); + res = fixup_owner(uaddr, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it acquired * the lock, clear our -ETIMEDOUT or -EINTR. @@ -1995,7 +1983,7 @@ out_unlock_put_key: queue_unlock(&q, hb); out_put_key: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out: if (to) destroy_hrtimer_on_stack(&to->timer); @@ -2008,10 +1996,10 @@ uaddr_faulted: if (ret) goto out_put_key; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); goto retry; } @@ -2020,7 +2008,7 @@ uaddr_faulted: * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr, int fshared) +static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -2038,7 +2026,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -2093,14 +2081,14 @@ retry: out_unlock: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; pi_faulted: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) - * @fshared: whether the futexes are shared (1) or not (0). They must be + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout @@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * 0 - On success * <0 - On error */ -static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, +static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, - int clockrt, u32 __user *uaddr2) + u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; - union futex_key key2; - struct futex_q q; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; int res, ret; if (!bitset) @@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out; - q.pi_state = NULL; q.bitset = bitset; q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; @@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Prepare to wait on uaddr. On success, increments q.key (key1) ref * count. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out_key2; @@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); - ret = fixup_pi_state_owner(uaddr2, &q, current, - fshared); + ret = fixup_pi_state_owner(uaddr2, &q, current); spin_unlock(q.lock_ptr); } } else { @@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr2, fshared, &q, !ret); + res = fixup_owner(uaddr2, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. @@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, } out_put_keys: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out_key2: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out: if (to) { @@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int clockrt, ret = -ENOSYS; - int cmd = op & FUTEX_CMD_MASK; - int fshared = 0; + int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = 1; + flags |= FLAGS_SHARED; - clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) - return -ENOSYS; + if (op & FUTEX_CLOCK_REALTIME) { + flags |= FLAGS_CLOCKRT; + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); + ret = futex_wait(uaddr, flags, val, timeout, val3); break; case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, fshared, val, val3); + ret = futex_wake(uaddr, flags, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + ret = futex_lock_pi(uaddr, flags, val, timeout, 0); break; case FUTEX_UNLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, fshared); + ret = futex_unlock_pi(uaddr, flags); break; case FUTEX_TRYLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); break; case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; - ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, - clockrt, uaddr2); + ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); break; case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 1); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); break; default: ret = -ENOSYS; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6cf..f2429fc3438c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); /* * clock_was_set() has changed base->offset so the @@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct rb_node **link = &base->active.rb_node; - struct rb_node *parent = NULL; - struct hrtimer *entry; - int leftmost = 1; - debug_activate(timer); - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct hrtimer, node); - /* - * We dont care about collisions. Nodes with - * the same expiry time stay together. - */ - if (hrtimer_get_expires_tv64(timer) < - hrtimer_get_expires_tv64(entry)) { - link = &(*link)->rb_left; - } else { - link = &(*link)->rb_right; - leftmost = 0; - } - } - - /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - if (leftmost) - base->first = &timer->node; + timerqueue_add(&base->active, &timer->node); - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); /* * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the * state of a possibly running callback. */ timer->state |= HRTIMER_STATE_ENQUEUED; - return leftmost; + return (&timer->node == base->active.next); } /* @@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - /* - * Remove the timer from the rbtree and replace the first - * entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); + if (&timer->node == timerqueue_getnext(&base->active)) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ if (reprogram && hrtimer_hres_active()) { @@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - rb_erase(&timer->node, &base->active); + timerqueue_del(&base->active, &timer->node); out: timer->state = newstate; } @@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void) if (!hrtimer_hres_active()) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, node); delta.tv64 = hrtimer_get_expires_tv64(timer); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) @@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->base = &cpu_base->clock_base[clock_id]; hrtimer_init_timer_hres(timer); + timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -1278,14 +1248,14 @@ retry: for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; - struct rb_node *node; + struct timerqueue_node *node; basenow = ktime_add(now, base->offset); - while ((node = base->first)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is @@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void) */ void hrtimer_run_queues(void) { - struct rb_node *node; + struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void) for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { base = &cpu_base->clock_base[index]; - - if (!base->first) + if (!timerqueue_getnext(&base->active)) continue; if (gettime) { @@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void) raw_spin_lock(&cpu_base->lock); - while ((node = base->first)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); if (base->softirq_time.tv64 <= hrtimer_get_expires_tv64(timer)) break; @@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } hrtimer_init_hres(cpu_base); } @@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; - struct rb_node *node; + struct timerqueue_node *node; - while ((node = rb_first(&old_base->active))) { - timer = rb_entry(node, struct hrtimer, node); + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f92acc5f952..91a5fa25054e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } */ static int irq_thread(void *data) { - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + static struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); int wake, oneshot = desc->status & IRQ_ONESHOT; diff --git a/kernel/kthread.c b/kernel/kthread.c index ca61bbdd44b2..5355cfd44a3f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), wait_for_completion(&create.done); if (!IS_ERR(create.result)) { - struct sched_param param = { .sched_priority = 0 }; + static struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8ce9d7..1969d2fc4b36 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) namelen += 2; for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contention_point[i] == 0) @@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contention_point[i]); snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contention_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contention_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contention_point[i], + ip, (void *)class->contention_point[i]); } for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contending_point[i] == 0) @@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contending_point[i]); snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contending_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contending_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contending_point[i], + ip, (void *)class->contending_point[i]); } if (i) { seq_puts(m, "\n"); diff --git a/kernel/module.c b/kernel/module.c index d190664f25ff..34e00b708fad 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,6 +56,7 @@ #include <linux/percpu.h> #include <linux/kmemleak.h> #include <linux/jump_label.h> +#include <linux/pfn.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -70,6 +71,26 @@ #define ARCH_SHF_SMALL 0 #endif +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_DEBUG_SET_MODULE_RONX=y + */ +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +# define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif + +/* + * Given BASE and SIZE this macro calculates the number of pages the + * memory regions occupies + */ +#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ + (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ + PFN_DOWN((unsigned long)BASE) + 1) \ + : (0UL)) + /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod) return 0; } +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + */ +void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +{ + unsigned long begin_pfn = PFN_DOWN((unsigned long)start); + unsigned long end_pfn = PFN_DOWN((unsigned long)end); + + if (end_pfn > begin_pfn) + set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +} + +static void set_section_ro_nx(void *base, + unsigned long text_size, + unsigned long ro_size, + unsigned long total_size) +{ + /* begin and end PFNs of the current subsection */ + unsigned long begin_pfn; + unsigned long end_pfn; + + /* + * Set RO for module text and RO-data: + * - Always protect first page. + * - Do not protect last partial page. + */ + if (ro_size > 0) + set_page_attributes(base, base + ro_size, set_memory_ro); + + /* + * Set NX permissions for module data: + * - Do not protect first partial page. + * - Always protect last page. + */ + if (total_size > text_size) { + begin_pfn = PFN_UP((unsigned long)base + text_size); + end_pfn = PFN_UP((unsigned long)base + total_size); + if (end_pfn > begin_pfn) + set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + } +} + +/* Setting memory back to RW+NX before releasing it */ +void unset_section_ro_nx(struct module *mod, void *module_region) +{ + unsigned long total_pages; + + if (mod->module_core == module_region) { + /* Set core as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); + set_memory_nx((unsigned long)mod->module_core, total_pages); + set_memory_rw((unsigned long)mod->module_core, total_pages); + + } else if (mod->module_init == module_region) { + /* Set init as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); + set_memory_nx((unsigned long)mod->module_init, total_pages); + set_memory_rw((unsigned long)mod->module_init, total_pages); + } +} + +/* Iterate through all modules and set each module's text as RW */ +void set_all_modules_text_rw() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_rw); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_rw); + } + } + mutex_unlock(&module_mutex); +} + +/* Iterate through all modules and set each module's text as RO */ +void set_all_modules_text_ro() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_ro); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_ro); + } + } + mutex_unlock(&module_mutex); +} +#else +static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } +static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } +#endif + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1566,6 +1696,7 @@ static void free_module(struct module *mod) destroy_params(mod->kp, mod->num_kp); /* This may be NULL, but that's OK */ + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1574,6 +1705,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ + unset_section_ro_nx(mod, mod->module_core); module_free(mod, mod->module_core); #ifdef CONFIG_MPU @@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info) s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", name); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->core_size = debug_align(mod->core_size); mod->core_text_size = mod->core_size; + break; + case 1: /* RO: text and ro-data */ + mod->core_size = debug_align(mod->core_size); + mod->core_ro_size = mod->core_size; + break; + case 3: /* whole core */ + mod->core_size = debug_align(mod->core_size); + break; + } } DEBUGP("Init section allocation order:\n"); @@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | INIT_OFFSET_MASK); DEBUGP("\t%s\n", sname); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->init_size = debug_align(mod->init_size); mod->init_text_size = mod->init_size; + break; + case 1: /* RO: text and ro-data */ + mod->init_size = debug_align(mod->init_size); + mod->init_ro_size = mod->init_size; + break; + case 3: /* whole init */ + mod->init_size = debug_align(mod->init_size); + break; + } } } @@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->symtab = mod->core_symtab; mod->strtab = mod->core_strtab; #endif + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502f..a5889fb28ecf 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax(); + arch_mutex_cpu_relax(); } #endif spin_lock_mutex(&lock->wait_lock, flags); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973f736d..93bd2eb2bc53 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { @@ -619,7 +625,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* diff --git a/kernel/printk.c b/kernel/printk.c index a23315dc4498..ab3ffc5b3b64 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending); void printk_tick(void) { - if (__get_cpu_var(printk_pending)) { - __get_cpu_var(printk_pending) = 0; + if (__this_cpu_read(printk_pending)) { + __this_cpu_write(printk_pending, 0); wake_up_interruptible(&log_wait); } } int printk_needs_cpu(int cpu) { - if (unlikely(cpu_is_offline(cpu))) + if (cpu_is_offline(cpu)) printk_tick(); - return per_cpu(printk_pending, cpu); + return __this_cpu_read(printk_pending); } void wake_up_klogd(void) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index d806735342ac..034493724749 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -36,31 +36,16 @@ #include <linux/time.h> #include <linux/cpu.h> -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ +static struct task_struct *rcu_kthread_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static unsigned long have_rcu_kthread_work; +static void invoke_rcu_kthread(void); /* Forward declarations for rcutiny_plugin.h. */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +struct rcu_ctrlblk; +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static int rcu_kthread(void *arg); static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); @@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu) { if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu) void rcu_bh_qs(int cpu) { if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user) } /* - * Helper function for rcu_process_callbacks() that operates on the - * specified rcu_ctrlkblk structure. + * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure + * whose grace period has elapsed. */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) { struct rcu_head *next, *list; unsigned long flags; + RCU_TRACE(int cb_count = 0); /* If no RCU callbacks ready to invoke, just return. */ if (&rcp->rcucblist == rcp->donetail) @@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); + local_bh_disable(); list->func(list); + local_bh_enable(); list = next; + RCU_TRACE(cb_count++); } + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); } /* - * Invoke any callbacks whose grace period has completed. + * This kthread invokes RCU callbacks whose grace periods have + * elapsed. It is awakened as needed, and takes the place of the + * RCU_SOFTIRQ that was used previously for this purpose. + * This is a kthread, but it is never stopped, at least not until + * the system goes down. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static int rcu_kthread(void *arg) { - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); + unsigned long work; + unsigned long morework; + unsigned long flags; + + for (;;) { + wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); + morework = rcu_boost(); + local_irq_save(flags); + work = have_rcu_kthread_work; + have_rcu_kthread_work = morework; + local_irq_restore(flags); + if (work) { + rcu_process_callbacks(&rcu_sched_ctrlblk); + rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); + } + schedule_timeout_interruptible(1); /* Leave CPU for others. */ + } + + return 0; /* Not reached, but needed to shut gcc up. */ +} + +/* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); + local_irq_restore(flags); } /* @@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; + RCU_TRACE(rcp->qlen++); local_irq_restore(flags); } @@ -282,7 +308,16 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); -void __init rcu_init(void) +/* + * Spawn the kthread that invokes RCU callbacks. + */ +static int __init rcu_spawn_kthreads(void) { - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + struct sched_param sp; + + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); + return 0; } +early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 6ceca4f745ff..015abaea962a 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -22,6 +22,40 @@ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/kthread.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ + RCU_TRACE(long qlen); /* Number of pending CBs. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #ifdef CONFIG_TINY_PREEMPT_RCU #include <linux/delay.h> @@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk { struct list_head *gp_tasks; /* Pointer to the first task blocking the */ /* current grace period, or NULL if there */ - /* is not such task. */ + /* is no such task. */ struct list_head *exp_tasks; /* Pointer to first task blocking the */ /* current expedited grace period, or NULL */ /* if there is no such task. If there */ /* is no current expedited grace period, */ /* then there cannot be any such task. */ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority-boosted, or NULL if no priority */ + /* boosting is needed. If there is no */ + /* current or expedited grace period, there */ + /* can be no such task. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ u8 gpnum; /* Current grace period. */ u8 gpcpu; /* Last grace period blocked by the CPU. */ u8 completed; /* Last grace period completed. */ /* If all three are equal, RCU is idle. */ +#ifdef CONFIG_RCU_BOOST + s8 boosted_this_gp; /* Has boosting already happened? */ + unsigned long boost_time; /* When to start boosting (jiffies) */ +#endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_TRACE + unsigned long n_grace_periods; +#ifdef CONFIG_RCU_BOOST + unsigned long n_tasks_boosted; + unsigned long n_exp_boosts; + unsigned long n_normal_boosts; + unsigned long n_normal_balk_blkd_tasks; + unsigned long n_normal_balk_gp_tasks; + unsigned long n_normal_balk_boost_tasks; + unsigned long n_normal_balk_boosted; + unsigned long n_normal_balk_notyet; + unsigned long n_normal_balk_nos; + unsigned long n_exp_balk_blkd_tasks; + unsigned long n_exp_balk_nos; +#endif /* #ifdef CONFIG_RCU_BOOST */ +#endif /* #ifdef CONFIG_RCU_TRACE */ }; static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { @@ -122,6 +184,210 @@ static int rcu_preempt_gp_in_progress(void) } /* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + return np; +} + +#ifdef CONFIG_RCU_TRACE + +#ifdef CONFIG_RCU_BOOST +static void rcu_initiate_boost_trace(void); +static void rcu_initiate_exp_boost_trace(void); +#endif /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Dump additional statistice for TINY_PREEMPT_RCU. + */ +static void show_tiny_preempt_stats(struct seq_file *m) +{ + seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", + rcu_preempt_ctrlblk.rcb.qlen, + rcu_preempt_ctrlblk.n_grace_periods, + rcu_preempt_ctrlblk.gpnum, + rcu_preempt_ctrlblk.gpcpu, + rcu_preempt_ctrlblk.completed, + "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], + "N."[!rcu_preempt_ctrlblk.gp_tasks], + "E."[!rcu_preempt_ctrlblk.exp_tasks]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, " ttb=%c btg=", + "B."[!rcu_preempt_ctrlblk.boost_tasks]); + switch (rcu_preempt_ctrlblk.boosted_this_gp) { + case -1: + seq_puts(m, "exp"); + break; + case 0: + seq_puts(m, "no"); + break; + case 1: + seq_puts(m, "begun"); + break; + case 2: + seq_puts(m, "done"); + break; + default: + seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); + } + seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + rcu_preempt_ctrlblk.n_tasks_boosted, + rcu_preempt_ctrlblk.n_exp_boosts, + rcu_preempt_ctrlblk.n_normal_boosts, + (int)(jiffies & 0xffff), + (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); + seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", + "normal balk", + rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, + rcu_preempt_ctrlblk.n_normal_balk_boosted, + rcu_preempt_ctrlblk.n_normal_balk_notyet, + rcu_preempt_ctrlblk.n_normal_balk_nos); + seq_printf(m, " exp balk: bt=%lu nos=%lu\n", + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_exp_balk_nos); +#endif /* #ifdef CONFIG_RCU_BOOST */ +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +/* + * Carry out RCU priority boosting on the task indicated by ->boost_tasks, + * and advance ->boost_tasks to the next task in the ->blkd_tasks list. + */ +static int rcu_boost(void) +{ + unsigned long flags; + struct rt_mutex mtx; + struct list_head *np; + struct task_struct *t; + + if (rcu_preempt_ctrlblk.boost_tasks == NULL) + return 0; /* Nothing to boost. */ + raw_local_irq_save(flags); + rcu_preempt_ctrlblk.boosted_this_gp++; + t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, + rcu_node_entry); + np = rcu_next_node_entry(t); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_local_irq_restore(flags); + rt_mutex_lock(&mtx); + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); + rcu_preempt_ctrlblk.boosted_this_gp++; + rt_mutex_unlock(&mtx); + return rcu_preempt_ctrlblk.boost_tasks != NULL; +} + +/* + * Check to see if it is now time to start boosting RCU readers blocking + * the current grace period, and, if so, tell the rcu_kthread_task to + * start boosting them. If there is an expedited boost in progress, + * we wait for it to complete. + * + * If there are no blocked readers blocking the current grace period, + * return 0 to let the caller know, otherwise return 1. Note that this + * return value is independent of whether or not boosting was done. + */ +static int rcu_initiate_boost(void) +{ + if (!rcu_preempt_blocked_readers_cgp()) { + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); + return 0; + } + if (rcu_preempt_ctrlblk.gp_tasks != NULL && + rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.boosted_this_gp == 0 && + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { + rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; + invoke_rcu_kthread(); + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); + } else + RCU_TRACE(rcu_initiate_boost_trace()); + return 1; +} + +/* + * Initiate boosting for an expedited grace period. + */ +static void rcu_initiate_expedited_boost(void) +{ + unsigned long flags; + + raw_local_irq_save(flags); + if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { + rcu_preempt_ctrlblk.boost_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + rcu_preempt_ctrlblk.boosted_this_gp = -1; + invoke_rcu_kthread(); + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); + } else + RCU_TRACE(rcu_initiate_exp_boost_trace()); + raw_local_irq_restore(flags); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(void) +{ + rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; + if (rcu_preempt_ctrlblk.boosted_this_gp > 0) + rcu_preempt_ctrlblk.boosted_this_gp = 0; +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * If there is no RCU priority boosting, we don't boost. + */ +static int rcu_boost(void) +{ + return 0; +} + +/* + * If there is no RCU priority boosting, we don't initiate boosting, + * but we do indicate whether there are blocked readers blocking the + * current grace period. + */ +static int rcu_initiate_boost(void) +{ + return rcu_preempt_blocked_readers_cgp(); +} + +/* + * If there is no RCU priority boosting, we don't initiate expedited boosting. + */ +static void rcu_initiate_expedited_boost(void) +{ +} + +/* + * If there is no RCU priority boosting, nothing to do at grace-period start. + */ +static void rcu_preempt_boost_start_gp(void) +{ +} + +#endif /* else #ifdef CONFIG_RCU_BOOST */ + +/* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * in a quiescent state. There might be any number of tasks blocked @@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void) rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + /* If there is no GP then there is nothing more to do. */ + if (!rcu_preempt_gp_in_progress()) + return; /* - * If there is no GP, or if blocked readers are still blocking GP, - * then there is nothing more to do. + * Check up on boosting. If there are no readers blocking the + * current grace period, leave. */ - if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + if (rcu_initiate_boost()) return; /* Advance callbacks. */ @@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void) if (!rcu_preempt_blocked_readers_any()) rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; - /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + /* If there are done callbacks, cause them to be invoked. */ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); } /* @@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void) /* Official start of GP. */ rcu_preempt_ctrlblk.gpnum++; + RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); /* Any blocked RCU readers block new GP. */ if (rcu_preempt_blocked_readers_any()) rcu_preempt_ctrlblk.gp_tasks = rcu_preempt_ctrlblk.blkd_tasks.next; + /* Set up for RCU priority boosting. */ + rcu_preempt_boost_start_gp(); + /* If there is no running reader, CPU is done with GP. */ if (!rcu_preempt_running_reader()) rcu_preempt_cpu_qs(); @@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t) */ empty = !rcu_preempt_blocked_readers_cgp(); empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; - np = t->rcu_node_entry.next; - if (np == &rcu_preempt_ctrlblk.blkd_tasks) - np = NULL; + np = rcu_next_node_entry(t); list_del(&t->rcu_node_entry); if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) rcu_preempt_ctrlblk.gp_tasks = np; if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) rcu_preempt_ctrlblk.exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) + rcu_preempt_ctrlblk.boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&t->rcu_node_entry); /* @@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t) if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) rcu_report_exp_done(); } +#ifdef CONFIG_RCU_BOOST + /* Unboost self if was boosted. */ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ local_irq_restore(flags); } @@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) - raise_softirq(RCU_SOFTIRQ); + invoke_rcu_kthread(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && rcu_preempt_running_reader()) @@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void) /* * TINY_PREEMPT_RCU has an extra callback-list tail pointer to - * update, so this is invoked from __rcu_process_callbacks() to + * update, so this is invoked from rcu_process_callbacks() to * handle that case. Of course, it is invoked for all flavors of * RCU, but RCU callbacks can appear only on one of the lists, and * neither ->nexttail nor ->donetail can possibly be NULL, so there @@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_preempt_process_callbacks(void) { - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); + rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); } /* @@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) local_irq_save(flags); *rcu_preempt_ctrlblk.nexttail = head; rcu_preempt_ctrlblk.nexttail = &head->next; + RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); rcu_preempt_start_gp(); /* checks to see if GP needed. */ local_irq_restore(flags); } @@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void) /* Wait for tail of ->blkd_tasks list to drain. */ if (rcu_preempted_readers_exp()) + rcu_initiate_expedited_boost(); wait_event(sync_rcu_preempt_exp_wq, !rcu_preempted_readers_exp()); @@ -572,6 +857,27 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +#ifdef CONFIG_RCU_TRACE + +/* + * Because preemptible RCU does not exist, it is not necessary to + * dump out its statistics. + */ +static void show_tiny_preempt_stats(struct seq_file *m) +{ +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Because preemptible RCU does not exist, it is never necessary to + * boost preempted RCU readers. + */ +static int rcu_boost(void) +{ + return 0; +} + /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void) #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC - #include <linux/kernel_stat.h> /* * During boot, we forgive RCU lockdep issues. After this function is * invoked, we start taking RCU lockdep issues seriously. */ -void rcu_scheduler_starting(void) +void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); rcu_scheduler_active = 1; } #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else /* #ifdef CONFIG_RCU_BOOST */ +#define RCU_BOOST_PRIO 1 +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +#ifdef CONFIG_RCU_TRACE + +#ifdef CONFIG_RCU_BOOST + +static void rcu_initiate_boost_trace(void) +{ + if (rcu_preempt_ctrlblk.gp_tasks == NULL) + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; + else if (rcu_preempt_ctrlblk.boost_tasks != NULL) + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; + else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) + rcu_preempt_ctrlblk.n_normal_balk_boosted++; + else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) + rcu_preempt_ctrlblk.n_normal_balk_notyet++; + else + rcu_preempt_ctrlblk.n_normal_balk_nos++; +} + +static void rcu_initiate_exp_boost_trace(void) +{ + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; + else + rcu_preempt_ctrlblk.n_exp_balk_nos++; +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) +{ + unsigned long flags; + + raw_local_irq_save(flags); + rcp->qlen -= n; + raw_local_irq_restore(flags); +} + +/* + * Dump statistics for TINY_RCU, such as they are. + */ +static int show_tiny_stats(struct seq_file *m, void *unused) +{ + show_tiny_preempt_stats(m); + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); + return 0; +} + +static int show_tiny_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_tiny_stats, NULL); +} + +static const struct file_operations show_tiny_stats_fops = { + .owner = THIS_MODULE, + .open = show_tiny_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; + +static int __init rcutiny_trace_init(void) +{ + struct dentry *retval; + + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &show_tiny_stats_fops); + if (!retval) + goto free_out; + return 0; +free_out: + debugfs_remove_recursive(rcudir); + return 1; +} + +static void __exit rcutiny_trace_cleanup(void) +{ + debugfs_remove_recursive(rcudir); +} + +module_init(rcutiny_trace_init); +module_exit(rcutiny_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); +MODULE_LICENSE("GPL"); + +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9d8e8fb2515f..89613f97ff26 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,6 +47,7 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <asm/byteorder.h> +#include <linux/sched.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " @@ -64,6 +65,9 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff = 0; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ +static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ +static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -109,6 +119,7 @@ static struct task_struct *stats_task; static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; #define RCU_TORTURE_PIPE_LEN 10 @@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_allocerror; +static long n_rcu_torture_boost_afferror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static struct list_head rcu_torture_removed; static cpumask_var_t shuffle_tmp_mask; @@ -147,6 +164,16 @@ static int stutter_pause_test; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +#ifdef CONFIG_RCU_BOOST +#define rcu_can_boost() 1 +#else /* #ifdef CONFIG_RCU_BOOST */ +#define rcu_can_boost() 0 +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ + /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ @@ -277,6 +304,7 @@ struct rcu_torture_ops { void (*fqs)(void); int (*stats)(char *page); int irq_capable; + int can_boost; char *name; }; @@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu" }; @@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu_sync" }; @@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, + .can_boost = rcu_can_boost(), .name = "rcu_expedited" }; @@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = { }; /* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. + */ + +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; + +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} + +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. */ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (jiffies - oldstarttime > ULONG_MAX / 2) { + schedule_timeout_uninterruptible(1); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (jiffies - endtime > ULONG_MAX / 2) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1. */ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. */ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* Clean up and exit. */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + return 0; +} + +/* * RCU torture force-quiescent-state kthread. Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability * of occurrence of some important types of race conditions. @@ -933,7 +1068,8 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d nt: %ld", + "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " + "rtbf: %ld rtb: %ld nt: %ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), @@ -941,8 +1077,19 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free), atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror, + n_rcu_torture_boost_allocerror, + n_rcu_torture_boost_afferror, + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, n_rcu_torture_timers); - if (atomic_read(&n_rcu_torture_mberror) != 0) + if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_boost_ktrerror != 0 || + n_rcu_torture_boost_rterror != 0 || + n_rcu_torture_boost_allocerror != 0 || + n_rcu_torture_boost_afferror != 0 || + n_rcu_torture_boost_failure != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (i > 1) { @@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg) } static inline void -rcu_torture_print_module_parms(char *tag) +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration); } -static struct notifier_block rcutorture_nb = { +static struct notifier_block rcutorture_shutdown_nb = { .notifier_call = rcutorture_shutdown_notify, }; +static void rcutorture_booster_cleanup(int cpu) +{ + struct task_struct *t; + + if (boost_tasks[cpu] == NULL) + return; + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + t = boost_tasks[cpu]; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + + /* This must be outside of the mutex, otherwise deadlock! */ + kthread_stop(t); +} + +static int rcutorture_booster_init(int cpu) +{ + int retval; + + if (boost_tasks[cpu] != NULL) + return 0; /* Already created, nothing more to do. */ + + /* Don't allow time recalculation while creating a new task. */ + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, + "rcu_torture_boost"); + if (IS_ERR(boost_tasks[cpu])) { + retval = PTR_ERR(boost_tasks[cpu]); + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + n_rcu_torture_boost_ktrerror++; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + return retval; + } + kthread_bind(boost_tasks[cpu], cpu); + wake_up_process(boost_tasks[cpu]); + mutex_unlock(&boost_mutex); + return 0; +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + static void rcu_torture_cleanup(void) { @@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void) } fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_nb); + unregister_reboot_notifier(&rcutorture_shutdown_nb); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); @@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void) kthread_stop(fqs_task); } fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } /* Wait for all RCU callbacks to fire. */ @@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void) if (cur_ops->cleanup) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) - rcu_torture_print_module_parms("End of test: FAILURE"); + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else - rcu_torture_print_module_parms("End of test: SUCCESS"); + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } static int __init @@ -1242,7 +1464,7 @@ rcu_torture_init(void) nrealreaders = nreaders; else nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms("Start of test"); + rcu_torture_print_module_parms(cur_ops, "Start of test"); fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ @@ -1263,6 +1485,12 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_allocerror = 0; + n_rcu_torture_boost_afferror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) atomic_set(&rcu_torture_wcount[i], 0); for_each_possible_cpu(cpu) { @@ -1376,7 +1604,27 @@ rcu_torture_init(void) goto unwind; } } - register_reboot_notifier(&rcutorture_nb); + if (test_boost_interval < 1) + test_boost_interval = 1; + if (test_boost_duration < 2) + test_boost_duration = 2; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + int retval; + + boost_starttime = jiffies + test_boost_interval * HZ; + register_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) { + if (cpu_is_offline(i)) + continue; /* Heuristic: CPU can go offline. */ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + register_reboot_notifier(&rcutorture_shutdown_nb); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ccdc04c47981..d0ddfea6579d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ - .orphan_cbs_list = NULL, \ - .orphan_cbs_tail = &structname.orphan_cbs_list, \ - .orphan_qlen = 0, \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void) static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { if (rdp->gpnum != rnp->gpnum) { - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. + */ rdp->gpnum = rnp->gpnum; + if (rnp->qsmask & rdp->grpmask) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + } else + rdp->qs_pending = 0; } } @@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + + /* + * If we were in an extended quiescent state, we may have + * missed some grace periods that others CPUs handled on + * our behalf. Catch up with this state to avoid noting + * spurious new grace periods. If another grace period + * has started, then rnp->gpnum will have advanced, so + * we will detect this later on. + */ + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) + rdp->gpnum = rdp->completed; + + /* + * If RCU does not need a quiescent state from this CPU, + * then make sure that this CPU doesn't go looking for one. + */ + if ((rnp->qsmask & rdp->grpmask) == 0) + rdp->qs_pending = 0; } } @@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* - * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the - * specified flavor of RCU. The callbacks will be adopted by the next - * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever - * comes first. Because this is invoked from the CPU_DYING notifier, - * irqs are already disabled. + * Move a dying CPU's RCU callbacks to online CPU's callback list. + * Synchronization is not required because this function executes + * in stop_machine() context. */ -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +static void rcu_send_cbs_to_online(struct rcu_state *rsp) { int i; + /* current DYING CPU is cleared in the cpu_online_mask */ + int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - *rsp->orphan_cbs_tail = rdp->nxtlist; - rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen += rdp->qlen; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - rsp->orphan_qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ -} - -/* - * Adopt previously orphaned RCU callbacks. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_data *rdp; - - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = this_cpu_ptr(rsp->rda); - if (rsp->orphan_cbs_list == NULL) { - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - return; - } - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; - rdp->qlen += rsp->orphan_qlen; - rdp->n_cbs_adopted += rsp->orphan_qlen; - rsp->orphan_cbs_list = NULL; - rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; - rsp->orphan_qlen = 0; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); - - rcu_adopt_orphan_cbs(rsp); } /* @@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) -{ -} - -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_send_cbs_to_online(struct rcu_state *rsp) { } @@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); /* Add the callback to our list. */ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ - } - /* * Force the grace period if too many callbacks or too long waiting. * Enforce hysteresis, and don't invoke force_quiescent_state() @@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * is the only one waiting for a grace period to complete. */ if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + + /* Are we ignoring a completed grace period? */ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp, * decrement rcu_barrier_cpu_count -- otherwise the first CPU * might complete its grace period before all of the other CPUs * did their increment, causing this function to return too - * early. + * early. Note that on_each_cpu() disables irqs, which prevents + * any CPUs from coming online or going offline until each online + * CPU has queued its RCU-barrier callback. */ atomic_set(&rcu_barrier_cpu_count, 1); - preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ - rcu_adopt_orphan_cbs(rsp); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); - preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); @@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_DYING: case CPU_DYING_FROZEN: /* - * preempt_disable() in _rcu_barrier() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(). - * The dying CPU clears its cpu_online_mask bit and - * moves all of its RCU callbacks to ->orphan_cbs_list - * in the context of stop_machine(), so subsequent calls - * to _rcu_barrier() will adopt these callbacks and only - * then queue rcu_barrier_func() on all remaining CPUs. + * The whole machine is "stopped" except this CPU, so we can + * touch any data without introducing corruption. We send the + * dying CPU's callbacks to an arbitrarily chosen online CPU. */ - rcu_send_cbs_to_orphanage(&rcu_bh_state); - rcu_send_cbs_to_orphanage(&rcu_sched_state); - rcu_preempt_send_cbs_to_orphanage(); + rcu_send_cbs_to_online(&rcu_bh_state); + rcu_send_cbs_to_online(&rcu_sched_state); + rcu_preempt_send_cbs_to_online(); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) + for (i = NUM_RCU_LVLS - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; + rsp->levelspread[0] = RCU_FANOUT_LEAF; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 91d4170c5c13..e8f057e44e3e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -31,46 +31,51 @@ /* * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. * In theory, it should be possible to add more levels straightforwardly. - * In practice, this has not been tested, so there is probably some - * bug somewhere. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. */ #define MAX_RCU_LVLS 4 -#define RCU_FANOUT (CONFIG_RCU_FANOUT) -#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) -#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) -#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT +#if CONFIG_RCU_FANOUT > 16 +#define RCU_FANOUT_LEAF 16 +#else /* #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) +#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 # define NUM_RCU_LVLS 1 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_SQ +#elif NR_CPUS <= RCU_FANOUT_2 # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_CUBE +#elif NR_CPUS <= RCU_FANOUT_3 # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) -# define NUM_RCU_LVL_3 NR_CPUS +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_3 (NR_CPUS) # define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_FOURTH +#elif NR_CPUS <= RCU_FANOUT_4 # define NUM_RCU_LVLS 4 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) -# define NUM_RCU_LVL_4 NR_CPUS +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_4 (NR_CPUS) #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT */ +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) @@ -203,8 +208,8 @@ struct rcu_data { long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -309,15 +314,7 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ raw_spinlock_t onofflock; /* exclude on/offline and */ - /* starting new GP. Also */ - /* protects the following */ - /* orphan_cbs fields. */ - struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ - /* orphaned by all CPUs in */ - /* a given leaf rcu_node */ - /* going offline. */ - struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ - long orphan_qlen; /* Number of orphaned cbs. */ + /* starting new GP. */ raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_send_cbs_to_orphanage(void); +static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 71a4147473f9..a3638710dc67 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,6 +25,7 @@ */ #include <linux/delay.h> +#include <linux/stop_machine.h> /* * Check the RCU kernel configuration parameters and print informative @@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptable RCU's callbacks to ->orphan_cbs_list. + * Move preemptable RCU's callbacks from dying CPU to other online CPU. */ -static void rcu_preempt_send_cbs_to_orphanage(void) +static void rcu_preempt_send_cbs_to_online(void) { - rcu_send_cbs_to_orphanage(&rcu_preempt_state); + rcu_send_cbs_to_online(&rcu_preempt_state); } /* @@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) /* * Because there is no preemptable RCU, there are no callbacks to move. */ -static void rcu_preempt_send_cbs_to_orphanage(void) +static void rcu_preempt_send_cbs_to_online(void) { } @@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void) #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifndef CONFIG_SMP + +void synchronize_sched_expedited(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + int firstsnap, s, snap, trycount = 0; + + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + get_online_cpus(); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). */ + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. + */ + get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started) - 1; + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. + */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ + #if !defined(CONFIG_RCU_FAST_NO_HZ) /* diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d15430b9d122..c8e97853b970 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->orphan_qlen); + rsp->n_force_qs_lh); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); @@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = { static struct dentry *rcudir; -static int __init rcuclassic_trace_init(void) +static int __init rcutree_trace_init(void) { struct dentry *retval; @@ -337,14 +337,14 @@ free_out: return 1; } -static void __exit rcuclassic_trace_cleanup(void) +static void __exit rcutree_trace_cleanup(void) { debugfs_remove_recursive(rcudir); } -module_init(rcuclassic_trace_init); -module_exit(rcuclassic_trace_cleanup); +module_init(rcutree_trace_init); +module_exit(rcutree_trace_cleanup); MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); diff --git a/kernel/sched.c b/kernel/sched.c index c68cead94dd7..04949089e760 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,9 +75,11 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #include "sched_cpupri.h" #include "workqueue_sched.h" +#include "sched_autogroup.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -253,6 +255,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + + atomic_t load_weight; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -268,24 +272,19 @@ struct task_group { struct task_group *parent; struct list_head siblings; struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif }; #define root_task_group init_task_group -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. - */ +/* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return list_empty(&root_task_group.children); -} -#endif - # define INIT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -342,6 +341,7 @@ struct cfs_rq { * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This * list is used during load balance. */ + int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ @@ -360,14 +360,17 @@ struct cfs_rq { unsigned long h_load; /* - * this cpu's part of tg->shares + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time */ - unsigned long shares; + u64 load_avg; + u64 load_period; + u64 load_stamp, load_last, load_unacc_exec_time; - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; + unsigned long load_contribution; #endif #endif }; @@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq) */ static inline struct task_group *task_group(struct task_struct *p) { + struct task_group *tg; struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); - return container_of(css, struct task_group, css); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -793,20 +799,6 @@ late_initcall(sched_init_debug); const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - -/* * period over which we average the RT time consumption, measured * in ms. * @@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long __percpu *update_shares_data; - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, - unsigned long sd_rq_weight, - unsigned long *usd_rq_weight) -{ - unsigned long shares, rq_weight; - int boost = 0; - - rq_weight = usd_rq_weight[cpu]; - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum_j shares_j * rq_weight_i - * shares_i = ----------------------------- - * \Sum_j rq_weight_j - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - __set_se_shares(tg->se[cpu], shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} - -/* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. - */ -static int tg_shares_up(struct task_group *tg, void *data) -{ - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; - unsigned long *usd_rq_weight; - struct sched_domain *sd = data; - unsigned long flags; - int i; - - if (!tg->se[0]) - return 0; - - local_irq_save(flags); - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); - - for_each_cpu(i, sched_domain_span(sd)) { - weight = tg->cfs_rq[i]->load.weight; - usd_rq_weight[i] = weight; - - rq_weight += weight; - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - if (!weight) - weight = NICE_0_LOAD; - - sum_weight += weight; - shares += tg->cfs_rq[i]->shares; - } - - if (!rq_weight) - rq_weight = sum_weight; - - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); - - local_irq_restore(flags); - - return 0; -} - /* * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child @@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data) load = cpu_rq(cpu)->load.weight; } else { load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; + load *= tg->se[cpu]->load.weight; load /= tg->parent->cfs_rq[cpu]->load.weight + 1; } @@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(struct sched_domain *sd) -{ - s64 elapsed; - u64 now; - - if (root_task_group_empty()) - return; - - now = local_clock(); - elapsed = now - sd->last_update; - - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } -} - static void update_h_load(long cpu) { walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -#else - -static inline void update_shares(struct sched_domain *sd) -{ -} - #endif #ifdef CONFIG_PREEMPT @@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_autogroup.c" #include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" @@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, int dest_cpu) +static bool migrate_task(struct task_struct *p, struct rq *rq) { - struct rq *rq = task_rq(p); - /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. @@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (unlikely(dest_cpu >= nr_cpu_ids)) { - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif put_cpu(); } @@ -3549,7 +3418,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) if (task_thread_info(rq->curr) != owner || need_resched()) return 0; - cpu_relax(); + arch_mutex_cpu_relax(); } return 1; @@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) { @@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_killable_timeout(struct completion *x, unsigned long timeout) { @@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p) } static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) + const struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -5056,7 +4925,7 @@ recheck: * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, true); } @@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * but our caller might not have that capability. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, false); } @@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) @@ -5754,7 +5623,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_min_granularity); SET_SYSCTL(sched_latency); SET_SYSCTL(sched_wakeup_granularity); - SET_SYSCTL(sched_shares_ratelimit); #undef SET_SYSCTL } @@ -5830,7 +5698,7 @@ again: goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, dest_cpu)) { + if (migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); @@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data) } #ifdef CONFIG_HOTPLUG_CPU + /* - * Figure out where task on dead CPU should go, use force if necessary. + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. */ -void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void idle_task_exit(void) { - struct rq *rq = cpu_rq(dead_cpu); - int needs_cpu, uninitialized_var(dest_cpu); - unsigned long flags; + struct mm_struct *mm = current->active_mm; - local_irq_save(flags); + BUG_ON(cpu_online(smp_processor_id())); - raw_spin_lock(&rq->lock); - needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); - if (needs_cpu) - dest_cpu = select_fallback_rq(dead_cpu, p); - raw_spin_unlock(&rq->lock); - /* - * It can only fail if we race with set_cpus_allowed(), - * in the racer should migrate the task anyway. - */ - if (needs_cpu) - __migrate_task(p, dead_cpu, dest_cpu); - local_irq_restore(flags); + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); } /* @@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) static void migrate_nr_uninterruptible(struct rq *rq_src) { struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - unsigned long flags; - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); } /* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. + * remove the tasks which were accounted by rq from calc_load_tasks. */ -void sched_idle_next(void) +static void calc_global_load_remove(struct rq *rq) { - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - raw_spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - activate_task(rq, p, 0); - - raw_spin_unlock_irqrestore(&rq->lock, flags); + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } /* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +static void migrate_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. */ - raw_spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - raw_spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + rq->stop = NULL; for ( ; ; ) { - if (!rq->nr_running) + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) break; + next = pick_next_task(rq); - if (!next) - break; + BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); } -} -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) -{ - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + rq->stop = stop; } + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: - case CPU_ONLINE_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_live_tasks(cpu); - /* Idle task back to normal (off runqueue, low prio) */ - raw_spin_lock_irq(&rq->lock); - deactivate_task(rq, rq->idle, 0); - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - raw_spin_unlock_irq(&rq->lock); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - calc_global_load_remove(rq); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + + migrate_nr_uninterruptible(rq); + calc_global_load_remove(rq); break; #endif } @@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, + struct sched_entity *se, int cpu, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - if (add) - list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; /* se could be NULL for init_task_group */ @@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; + update_load_set(&se->load, 0); se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent) { struct rq *rq = cpu_rq(cpu); @@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; if (!rt_se) @@ -8164,13 +7946,9 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); - + autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), - __alignof__(unsigned long)); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8184,7 +7962,6 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED /* * How much cpu bandwidth does init_task_group get? * @@ -8204,16 +7981,13 @@ void __init sched_init(void) * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#endif + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#endif + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } return 1; @@ -8497,15 +8271,21 @@ err: return 0; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #else /* !CONFG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) @@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } @@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } return 1; @@ -8586,17 +8362,6 @@ err_free_rq: err: return 0; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} #else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { @@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED @@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; - int i; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) goto err; spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } list_add_rcu(&tg->list, &task_groups); WARN_ON(!parent); /* root should already exist */ @@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) unsigned long flags; int i; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { + /* end participation in shares distribution */ + for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } + + spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); @@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - int on_rq; - - on_rq = se->on_rq; - if (on_rq) - dequeue_entity(cfs_rq, se, 0); - - se->load.weight = shares; - se->load.inv_weight = 0; - - if (on_rq) - enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) @@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ tg->shares = shares; for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares); + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); return 0; @@ -9532,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = { }; #endif /* CONFIG_CGROUP_CPUACCT */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ - barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ - return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. - */ -void synchronize_sched_expedited(void) -{ - int snap, trycount = 0; - - smp_mb(); /* ensure prior mod happens before capturing snap. */ - snap = atomic_read(&synchronize_sched_expedited_count) + 1; - get_online_cpus(); - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - if (trycount++ < 10) - udelay(trycount * num_online_cpus()); - else { - synchronize_sched(); - return; - } - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { - smp_mb(); /* ensure test happens before caller kfree */ - return; - } - get_online_cpus(); - } - atomic_inc(&synchronize_sched_expedited_count); - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#endif /* #else #ifndef CONFIG_SMP */ diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 000000000000..c80fedcd476b --- /dev/null +++ b/kernel/sched_autogroup.c @@ -0,0 +1,238 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/utsname.h> + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +static void autogroup_init(struct task_struct *init_task) +{ + autogroup_default.tg = &init_task_group; + init_task_group.autogroup = &autogroup_default; + kref_init(&autogroup_default.kref); + init_rwsem(&autogroup_default.lock); + init_task->signal->autogroup = &autogroup_default; +} + +static inline void autogroup_free(struct task_group *tg) +{ + kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ + struct autogroup *ag = container_of(kref, struct autogroup, kref); + + sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ + kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ + kref_get(&ag->kref); + return ag; +} + +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ + struct autogroup *ag; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return autogroup_kref_get(&autogroup_default); + + ag = autogroup_kref_get(p->signal->autogroup); + unlock_task_sighand(p, &flags); + + return ag; +} + +static inline struct autogroup *autogroup_create(void) +{ + struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); + struct task_group *tg; + + if (!ag) + goto out_fail; + + tg = sched_create_group(&init_task_group); + + if (IS_ERR(tg)) + goto out_free; + + kref_init(&ag->kref); + init_rwsem(&ag->lock); + ag->id = atomic_inc_return(&autogroup_seq_nr); + ag->tg = tg; + tg->autogroup = ag; + + return ag; + +out_free: + kfree(ag); +out_fail: + if (printk_ratelimit()) { + printk(KERN_WARNING "autogroup_create: %s failure.\n", + ag ? "sched_create_group()" : "kmalloc()"); + } + + return autogroup_kref_get(&autogroup_default); +} + +static inline bool +task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ + if (tg != &root_task_group) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + /* + * We can only assume the task group can't go away on us if + * autogroup_move_group() can see us on ->thread_group list. + */ + if (p->flags & PF_EXITING) + return false; + + return true; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ + struct autogroup *prev; + struct task_struct *t; + unsigned long flags; + + BUG_ON(!lock_task_sighand(p, &flags)); + + prev = p->signal->autogroup; + if (prev == ag) { + unlock_task_sighand(p, &flags); + return; + } + + p->signal->autogroup = autogroup_kref_get(ag); + + t = p; + do { + sched_move_task(t); + } while_each_thread(p, t); + + unlock_task_sighand(p, &flags); + autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ + struct autogroup *ag = autogroup_create(); + + autogroup_move_group(p, ag); + /* drop extra refrence added by autogroup_create() */ + autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock. Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ + autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ + sig->autogroup = autogroup_task_get(current); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ + autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ + sysctl_sched_autogroup_enabled = 0; + + return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (*nice < -20 || *nice > 19) + return -EINVAL; + + err = security_task_setnice(current, *nice); + if (err) + return err; + + if (*nice < 0 && !can_nice(current, *nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. */ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + if (!err) + ag->nice = *nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 000000000000..5358e241cb20 --- /dev/null +++ b/kernel/sched_autogroup.h @@ -0,0 +1,32 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 52f1a149bfb1..9d8af0b3fb64 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -static __read_mostly int sched_clock_running; +__read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK __read_mostly int sched_clock_stable; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9b..1dfae3d014b5 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) #ifdef CONFIG_FAIR_GROUP_SCHED -static void print_cfs_group_stats(struct seq_file *m, int cpu, - struct task_group *tg) +static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; if (!se) @@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif -#ifdef CONFIG_CGROUP_SCHED - { - char path[64]; - - rcu_read_lock(); - cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); - rcu_read_unlock(); - SEQ_printf(m, " %s", path); - } -#endif SEQ_printf(m, "\n"); } @@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irqrestore(&tasklist_lock, flags); } -#if defined(CONFIG_CGROUP_SCHED) && \ - (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) -static void task_group_path(struct task_group *tg, char *buf, int buflen) -{ - /* may be NULL if the underlying cgroup isn't fully-created yet */ - if (!tg->css.cgroup) { - buf[0] = '\0'; - return; - } - cgroup_path(tg->css.cgroup, buf, buflen); -} -#endif - void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128]; - struct task_group *tg = cfs_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", + SPLIT_NS(cfs_rq->load_avg)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", + SPLIT_NS(cfs_rq->load_period)); + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", + cfs_rq->load_contribution); + SEQ_printf(m, " .%-30s: %d\n", "load_tg", + atomic_read(&cfs_rq->tg->load_weight)); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128]; - struct task_group *tg = rt_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); -#endif - #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +extern __read_mostly int sched_clock_running; + static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); @@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = { static int sched_debug_show(struct seq_file *m, void *v) { - u64 now = ktime_to_ns(ktime_get()); + u64 ktime, sched_clk, cpu_clk; + unsigned long flags; int cpu; - SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + local_irq_save(flags); + ktime = ktime_to_ns(ktime_get()); + sched_clk = sched_clock(); + cpu_clk = local_clock(); + local_irq_restore(flags); + + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); +#define P(x) \ + SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(ktime); + PN(sched_clk); + PN(cpu_clk); + P(jiffies); +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + P(sched_clock_stable); +#endif +#undef PN +#undef P + + SEQ_printf(m, "\n"); + SEQ_printf(m, "sysctl_sched\n"); #define P(x) \ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - P(jiffies); PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 00ebd7686676..c62ebae65cf0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +/* + * The exponential sliding window over which load is averaged for shares + * distribution. + * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + static const struct sched_class fair_sched_class; /************************************************************** @@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return cfs_rq->tg->cfs_rq[this_cpu]; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases. + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } else { + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } + + cfs_rq->on_list = 1; + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) @@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return &cpu_rq(this_cpu)->cfs; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) @@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); + +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED + cfs_rq->load_unacc_exec_time += delta_exec; +#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, + int global_update) +{ + struct task_group *tg = cfs_rq->tg; + long load_avg; + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + + if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + } +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ + u64 period = sysctl_sched_shares_window; + u64 now, delta; + unsigned long load = cfs_rq->load.weight; + + if (!cfs_rq) + return; + + now = rq_of(cfs_rq)->clock; + delta = now - cfs_rq->load_stamp; + + /* truncate load history at 4 idle periods */ + if (cfs_rq->load_stamp > cfs_rq->load_last && + now - cfs_rq->load_last > 4 * period) { + cfs_rq->load_period = 0; + cfs_rq->load_avg = 0; + } + + cfs_rq->load_stamp = now; + cfs_rq->load_unacc_exec_time = 0; + cfs_rq->load_period += delta; + if (load) { + cfs_rq->load_last = now; + cfs_rq->load_avg += delta * load; + } + + /* consider updating load contribution on each fold or truncate */ + if (global_update || cfs_rq->load_period > period + || !cfs_rq->load_period) + update_cfs_rq_load_contribution(cfs_rq, global_update); + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } + + if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) + list_del_leaf_cfs_rq(cfs_rq); +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + } + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +{ + struct task_group *tg; + struct sched_entity *se; + long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +{ +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) { @@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; + + if (cfs_rq->nr_running == 1) + list_add_leaf_cfs_rq(cfs_rq); } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); + /* + * Update share accounting for long-running entities. + */ + update_entity_shares_tick(cfs_rq); + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } + hrtick_update(rq); } @@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } + hrtick_update(rq); } @@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -1909,6 +2071,48 @@ out: } #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int update_shares_cpu(struct task_group *tg, int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned long flags; + struct rq *rq; + + if (!tg->se[cpu]) + return 0; + + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + update_cfs_load(cfs_rq, 1); + + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. + */ + update_cfs_shares(cfs_rq, 0); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return 0; +} + +static void update_shares(int cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(cpu); + + rcu_read_lock(); + for_each_leaf_cfs_rq(rq, cfs_rq) + update_shares_cpu(cfs_rq->tg, cpu); + rcu_read_unlock(); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return max_load_move - rem_load_move; } #else +static inline void update_shares(int cpu) +{ +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3174,8 +3381,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); + update_shares(this_cpu); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a2..68e69acc29b9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_SHARES_UPDATE, 1) -SCHED_FEAT(ASYM_EFF_LOAD, 1) /* * Spin-wait on mutex acquisition when the mutex owner is running on diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9c..c914ec747ca6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_add_rcu(&rt_rq->leaf_rt_rq_list, + &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) @@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(def_rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) @@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; + if (!rt_rq->rt_nr_running) + list_add_leaf_rt_rq(rt_rq); + if (head) list_add(&rt_se->run_list, queue); else @@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); + if (!rt_rq->rt_nr_running) + list_del_leaf_rt_rq(rt_rq); } /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 18f4be0d5fe0..d4d918a91881 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { + .sched_priority = MAX_RT_PRIO-1 + }; p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; diff --git a/kernel/srcu.c b/kernel/srcu.c index c71e07500536..98d8c1e80edb 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -31,6 +31,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/smp.h> +#include <linux/delay.h> #include <linux/srcu.h> static int init_srcu_struct_fields(struct srcu_struct *sp) @@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) * all srcu_read_lock() calls using the old counters have completed. * Their corresponding critical sections might well be still * executing, but the srcu_read_lock() primitives themselves - * will have finished executing. + * will have finished executing. We initially give readers + * an arbitrarily chosen 10 microseconds to get out of their + * SRCU read-side critical sections, then loop waiting 1/HZ + * seconds per iteration. */ + if (srcu_readers_active_idx(sp, idx)) + udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a9..2745dcdb6c6c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); - if (err > 0) + if (err > 0) { proc_sid_connector(group_leader); + sched_autogroup_create_attach(group_leader); + } return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 46404414d8a7..ae5cbb1e3ced 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -static int min_sched_shares_ratelimit = 100000; /* 100 usec */ -static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif #ifdef CONFIG_COMPACTION @@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_wakeup_granularity_ns, }, { - .procname = "sched_shares_ratelimit", - .data = &sysctl_sched_shares_ratelimit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_shares_ratelimit, - .extra2 = &max_sched_shares_ratelimit, - }, - { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), @@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), @@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "sched_shares_window", + .data = &sysctl_sched_shares_window, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), @@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb176cc..a9ae369925ce 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/kernel.h> /* * fixed point arithmetic scale factor for skew @@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, int index; int num_samples = sync->num_samples; - if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { + if (num_samples > ARRAY_SIZE(buffer)) { samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); if (!samples) { samples = buffer; - num_samples = sizeof(buffer)/sizeof(buffer[0]); + num_samples = ARRAY_SIZE(buffer); } } else { samples = buffer; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f72..5bb86da82003 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,6 +32,8 @@ struct timekeeper { cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ u64 xtime_interval; + /* shifted nano seconds left over when rounding cycle_interval */ + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ u32 raw_interval; @@ -62,7 +64,7 @@ struct timekeeper timekeeper; static void timekeeper_setup_internals(struct clocksource *clock) { cycle_t interval; - u64 tmp; + u64 tmp, ntpinterval; timekeeper.clock = clock; clock->cycle_last = clock->read(clock); @@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; + ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) @@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Go back from cycles -> shifted ns */ timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; timekeeper.raw_interval = ((u64) interval * clock->mult) >> clock->shift; @@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; - timekeeper.ntp_error -= timekeeper.xtime_interval << + timekeeper.ntp_error -= + (timekeeper.xtime_interval + timekeeper.xtime_remainder) << (timekeeper.ntp_error_shift + shift); return offset; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa92..32a19f9397fc 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, { struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct rb_node *curr; + struct timerqueue_node *curr; unsigned long flags; next_one: i = 0; raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = base->first; + curr = timerqueue_getnext(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = rb_next(curr); + curr = timerqueue_iterate_next(curr); i++; } if (curr) { - timer = rb_entry(curr, struct hrtimer, node); + timer = container_of(curr, struct hrtimer, node); tmp = *timer; raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); diff --git a/kernel/timer.c b/kernel/timer.c index 353b9227c2ec..43ca9936f2d0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; -/* - * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB to - * indicate whether the timer is deferrable. - * - * A deferrable timer will work normally when the system is busy, but - * will not cause a CPU to come out of idle just to service it; instead, - * the timer will be serviced when the CPU eventually wakes up with a - * subsequent non-deferrable timer. - */ -#define TBASE_DEFERRABLE_FLAG (0x1) - /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { @@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + timer->base = TBASE_MAKE_DEFERRED(timer->base); } static inline void @@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); - -static inline void set_running_timer(struct tvec_base *base, - struct timer_list *timer) -{ -#ifdef CONFIG_SMP - base->running_timer = timer; -#endif -} - static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer) } EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. - * - * It must not be called from interrupt contexts. */ int try_to_del_timer_sync(struct timer_list *timer) { @@ -973,6 +948,7 @@ out: } EXPORT_SYMBOL(try_to_del_timer_sync); +#ifdef CONFIG_SMP /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent + * hardirq contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. @@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync); int del_timer_sync(struct timer_list *timer) { #ifdef CONFIG_LOCKDEP - unsigned long flags; - - local_irq_save(flags); + local_bh_disable(); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - local_irq_restore(flags); + local_bh_enable(); #endif - + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_irq()); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); - set_running_timer(base, timer); + base->running_timer = timer; detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base) spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + base->running_timer = NULL; spin_unlock_irq(&base->lock); } @@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, */ unsigned long get_next_timer_interrupt(unsigned long now) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); unsigned long expires; /* @@ -1298,7 +1276,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); hrtimer_run_pending(); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b26..761c510a06c5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_EVENT_TRACING) += power-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959bad45..dc53ecb80589 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) __this_cpu_inc(user_stack_count); - - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) - return; + goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; @@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + out_drop_count: __this_cpu_dec(user_stack_count); - out: preempt_enable(); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b3209..562c56e048fd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; + static struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index aaa8dae08236..6e7b575ac33c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/lib/Makefile b/lib/Makefile index e6a3763b8212..9e2db72d128e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o \ + rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o ratelimit.o show_mem.o \ diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 3094318bfea7..b335acb43be2 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -141,11 +141,10 @@ static void ddebug_change(const struct ddebug_query *query, else if (!dp->flags) dt->num_enabled++; dp->flags = newflags; - if (newflags) { - jump_label_enable(&dp->enabled); - } else { - jump_label_disable(&dp->enabled); - } + if (newflags) + dp->enabled = 1; + else + dp->enabled = 0; if (verbose) printk(KERN_INFO "ddebug: changed %s:%d [%s]%s %s\n", diff --git a/lib/timerqueue.c b/lib/timerqueue.c new file mode 100644 index 000000000000..e3a1050e6820 --- /dev/null +++ b/lib/timerqueue.c @@ -0,0 +1,107 @@ +/* + * Generic Timer-queue + * + * Manages a simple queue of timers, ordered by expiration time. + * Uses rbtrees for quick list adds and expiration. + * + * NOTE: All of the following functions need to be serialized + * to avoid races. No locking is done by this libary code. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/timerqueue.h> +#include <linux/rbtree.h> +#include <linux/module.h> + +/** + * timerqueue_add - Adds timer to timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be added + * + * Adds the timer node to the timerqueue, sorted by the + * node's expires value. + */ +void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) +{ + struct rb_node **p = &head->head.rb_node; + struct rb_node *parent = NULL; + struct timerqueue_node *ptr; + + /* Make sure we don't add nodes that are already added */ + WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); + + while (*p) { + parent = *p; + ptr = rb_entry(parent, struct timerqueue_node, node); + if (node->expires.tv64 < ptr->expires.tv64) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->node, parent, p); + rb_insert_color(&node->node, &head->head); + + if (!head->next || node->expires.tv64 < head->next->expires.tv64) + head->next = node; +} +EXPORT_SYMBOL_GPL(timerqueue_add); + +/** + * timerqueue_del - Removes a timer from the timerqueue. + * + * @head: head of timerqueue + * @node: timer node to be removed + * + * Removes the timer node from the timerqueue. + */ +void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); + + /* update next pointer */ + if (head->next == node) { + struct rb_node *rbn = rb_next(&node->node); + + head->next = rbn ? + rb_entry(rbn, struct timerqueue_node, node) : NULL; + } + rb_erase(&node->node, &head->head); + RB_CLEAR_NODE(&node->node); +} +EXPORT_SYMBOL_GPL(timerqueue_del); + +/** + * timerqueue_iterate_next - Returns the timer after the provided timer + * + * @node: Pointer to a timer. + * + * Provides the timer that is after the given node. This is used, when + * necessary, to iterate through the list of timers in a timer list + * without modifying the list. + */ +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) +{ + struct rb_node *next; + + if (!node) + return NULL; + next = rb_next(&node->node); + if (!next) + return NULL; + return container_of(next, struct timerqueue_node, node); +} +EXPORT_SYMBOL_GPL(timerqueue_iterate_next); diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 39580a5dc5df..9f85012acf0d 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -155,6 +155,8 @@ use strict; # '@parameter' - name of a parameter # '%CONST' - name of a constant. +## init lots of data + my $errors = 0; my $warnings = 0; my $anon_struct_union = 0; @@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1", $type_param, "\$1" ); my $blankline_list = ""; -sub usage { - print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n"; - print " [ -no-doc-sections ]\n"; - print " [ -function funcname [ -function funcname ...] ]\n"; - print " [ -nofunction funcname [ -nofunction funcname ...] ]\n"; - print " c source file(s) > outputfile\n"; - print " -v : verbose output, more warnings & other info listed\n"; - exit 1; -} - # read arguments if ($#ARGV == -1) { usage(); } +my $kernelversion; +my $dohighlight = ""; + my $verbose = 0; my $output_mode = "man"; my $no_doc_sections = 0; @@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June', 'November', 'December')[(localtime)[4]] . " " . ((localtime)[5]+1900); -# Essentially these are globals +# Essentially these are globals. # They probably want to be tidied up, made more localised or something. # CAVEAT EMPTOR! Some of the others I localised may not want to be, which # could cause "use of undefined value" or other bugs. @@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) { } } +# continue execution near EOF; + +sub usage { + print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n"; + print " [ -no-doc-sections ]\n"; + print " [ -function funcname [ -function funcname ...] ]\n"; + print " [ -nofunction funcname [ -nofunction funcname ...] ]\n"; + print " c source file(s) > outputfile\n"; + print " -v : verbose output, more warnings & other info listed\n"; + exit 1; +} + # get kernel version from env sub get_kernel_version() { my $version = 'unknown kernel version'; @@ -362,15 +369,6 @@ sub get_kernel_version() { } return $version; } -my $kernelversion = get_kernel_version(); - -# generate a sequence of code that will splice in highlighting information -# using the s// operator. -my $dohighlight = ""; -foreach my $pattern (keys %highlights) { -# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n"; - $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n"; -} ## # dumps section contents to arrays/hashes intended for that purpose. @@ -1851,34 +1849,6 @@ sub dump_function($$) { }); } -sub process_file($); - -# Read the file that maps relative names to absolute names for -# separate source and object directories and for shadow trees. -if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { - my ($relname, $absname); - while(<SOURCE_MAP>) { - chop(); - ($relname, $absname) = (split())[0..1]; - $relname =~ s:^/+::; - $source_map{$relname} = $absname; - } - close(SOURCE_MAP); -} - -foreach (@ARGV) { - chomp; - process_file($_); -} -if ($verbose && $errors) { - print STDERR "$errors errors\n"; -} -if ($verbose && $warnings) { - print STDERR "$warnings warnings\n"; -} - -exit($errors); - sub reset_state { $function = ""; %constants = (); @@ -2285,3 +2255,39 @@ sub process_file($) { } } } + + +$kernelversion = get_kernel_version(); + +# generate a sequence of code that will splice in highlighting information +# using the s// operator. +foreach my $pattern (keys %highlights) { +# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n"; + $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n"; +} + +# Read the file that maps relative names to absolute names for +# separate source and object directories and for shadow trees. +if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { + my ($relname, $absname); + while(<SOURCE_MAP>) { + chop(); + ($relname, $absname) = (split())[0..1]; + $relname =~ s:^/+::; + $source_map{$relname} = $absname; + } + close(SOURCE_MAP); +} + +foreach (@ARGV) { + chomp; + process_file($_); +} +if ($verbose && $errors) { + print STDERR "$errors errors\n"; +} +if ($verbose && $warnings) { + print STDERR "$warnings warnings\n"; +} + +exit($errors); |