summaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile4
-rw-r--r--arch/i386/kernel/acpi/boot.c2
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c21
-rw-r--r--arch/i386/kernel/alternative.c102
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c10
-rw-r--r--arch/i386/kernel/asm-offsets.c18
-rw-r--r--arch/i386/kernel/cpu/Makefile4
-rw-r--r--arch/i386/kernel/cpu/amd.c15
-rw-r--r--arch/i386/kernel/cpu/bugs.c191
-rw-r--r--arch/i386/kernel/cpu/centaur.c10
-rw-r--r--arch/i386/kernel/cpu/common.c217
-rw-r--r--arch/i386/kernel/cpu/cyrix.c21
-rw-r--r--arch/i386/kernel/cpu/intel.c4
-rw-r--r--arch/i386/kernel/cpu/mcheck/k7.c13
-rw-r--r--arch/i386/kernel/cpu/mcheck/mce.c3
-rw-r--r--arch/i386/kernel/cpu/mcheck/p4.c16
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c101
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c11
-rw-r--r--arch/i386/kernel/cpu/nexgen.c10
-rw-r--r--arch/i386/kernel/cpu/perfctr-watchdog.c658
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpu/rise.c9
-rw-r--r--arch/i386/kernel/cpu/transmeta.c10
-rw-r--r--arch/i386/kernel/cpu/umc.c10
-rw-r--r--arch/i386/kernel/doublefault.c29
-rw-r--r--arch/i386/kernel/e820.c64
-rw-r--r--arch/i386/kernel/efi.c16
-rw-r--r--arch/i386/kernel/entry.S23
-rw-r--r--arch/i386/kernel/head.S118
-rw-r--r--arch/i386/kernel/i386_ksyms.c2
-rw-r--r--arch/i386/kernel/io_apic.c38
-rw-r--r--arch/i386/kernel/ioport.c3
-rw-r--r--arch/i386/kernel/irq.c3
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/nmi.c832
-rw-r--r--arch/i386/kernel/paravirt.c522
-rw-r--r--arch/i386/kernel/process.c37
-rw-r--r--arch/i386/kernel/quirks.c69
-rw-r--r--arch/i386/kernel/reboot.c55
-rw-r--r--arch/i386/kernel/reboot_fixups.c2
-rw-r--r--arch/i386/kernel/smp.c304
-rw-r--r--arch/i386/kernel/smpboot.c146
-rw-r--r--arch/i386/kernel/sysenter.c269
-rw-r--r--arch/i386/kernel/time.c2
-rw-r--r--arch/i386/kernel/trampoline.S12
-rw-r--r--arch/i386/kernel/traps.c32
-rw-r--r--arch/i386/kernel/tsc.c13
-rw-r--r--arch/i386/kernel/verify_cpu.S65
-rw-r--r--arch/i386/kernel/vmi.c131
-rw-r--r--arch/i386/kernel/vmiclock.c318
-rw-r--r--arch/i386/kernel/vmitime.c482
-rw-r--r--arch/i386/kernel/vmlinux.lds.S24
-rw-r--r--arch/i386/kernel/vsyscall.lds.S4
54 files changed, 2406 insertions, 2696 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 4ae3dcf1d2f0..4f98516b9f94 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,12 +39,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
-obj-$(CONFIG_VMI) += vmi.o vmitime.o
+obj-$(CONFIG_VMI) += vmi.o vmiclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o
obj-y += pcspeaker.o
-EXTRA_AFLAGS := -traditional
-
obj-$(CONFIG_SCx200) += scx200.o
# vsyscall.o contains the vsyscall DSO images as __initdata.
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 9ea5b8ecc7e1..280898b045b2 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -874,7 +874,7 @@ static void __init acpi_process_madt(void)
acpi_ioapic = 1;
smp_found_config = 1;
- clustered_apic_check();
+ setup_apic_routing();
}
}
if (error == -EINVAL) {
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 8f7efd38254d..23f78efc577d 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,7 +10,6 @@
#include <asm/pci-direct.h>
#include <asm/acpi.h>
#include <asm/apic.h>
-#include <asm/irq.h>
#ifdef CONFIG_ACPI
@@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device)
return 0;
}
-static void check_intel(void)
-{
- u16 vendor, device;
-
- vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
-
- if (vendor != PCI_VENDOR_ID_INTEL)
- return;
-
- device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-#ifdef CONFIG_SMP
- if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
- device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
- device == PCI_DEVICE_ID_INTEL_E7525_MCH)
- quirk_intel_irqbalance();
-#endif
-}
-
void __init check_acpi_pci(void)
{
int num, slot, func;
@@ -77,8 +58,6 @@ void __init check_acpi_pci(void)
if (!early_pci_allowed())
return;
- check_intel();
-
/* Poor man's PCI discovery */
for (num = 0; num < 32; num++) {
for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 426f59b0106b..e5cec6685cc5 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -5,6 +5,7 @@
#include <asm/alternative.h>
#include <asm/sections.h>
+static int noreplace_smp = 0;
static int smp_alt_once = 0;
static int debug_alternative = 0;
@@ -13,15 +14,33 @@ static int __init bootonly(char *str)
smp_alt_once = 1;
return 1;
}
+__setup("smp-alt-boot", bootonly);
+
static int __init debug_alt(char *str)
{
debug_alternative = 1;
return 1;
}
-
-__setup("smp-alt-boot", bootonly);
__setup("debug-alternative", debug_alt);
+static int __init setup_noreplace_smp(char *str)
+{
+ noreplace_smp = 1;
+ return 1;
+}
+__setup("noreplace-smp", setup_noreplace_smp);
+
+#ifdef CONFIG_PARAVIRT
+static int noreplace_paravirt = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+ noreplace_paravirt = 1;
+ return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+#endif
+
#define DPRINTK(fmt, args...) if (debug_alternative) \
printk(KERN_DEBUG fmt, args)
@@ -132,11 +151,8 @@ static void nop_out(void *insns, unsigned int len)
}
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
-extern u8 __smp_alt_begin[], __smp_alt_end[];
-
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that assymetric systems where
@@ -171,29 +187,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
#ifdef CONFIG_SMP
-static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end)
-{
- struct alt_instr *a;
-
- DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end);
- for (a = start; a < end; a++) {
- memcpy(a->replacement + a->replacementlen,
- a->instr,
- a->instrlen);
- }
-}
-
-static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end)
-{
- struct alt_instr *a;
-
- for (a = start; a < end; a++) {
- memcpy(a->instr,
- a->replacement + a->replacementlen,
- a->instrlen);
- }
-}
-
static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
u8 **ptr;
@@ -211,6 +204,9 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
{
u8 **ptr;
+ if (noreplace_smp)
+ return;
+
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
@@ -245,6 +241,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,
struct smp_alt_module *smp;
unsigned long flags;
+ if (noreplace_smp)
+ return;
+
if (smp_alt_once) {
if (boot_cpu_has(X86_FEATURE_UP))
alternatives_smp_unlock(locks, locks_end,
@@ -279,7 +278,7 @@ void alternatives_smp_module_del(struct module *mod)
struct smp_alt_module *item;
unsigned long flags;
- if (smp_alt_once)
+ if (smp_alt_once || noreplace_smp)
return;
spin_lock_irqsave(&smp_alt, flags);
@@ -310,7 +309,7 @@ void alternatives_smp_switch(int smp)
return;
#endif
- if (smp_alt_once)
+ if (noreplace_smp || smp_alt_once)
return;
BUG_ON(!smp && (num_online_cpus() > 1));
@@ -319,8 +318,6 @@ void alternatives_smp_switch(int smp)
printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
- alternatives_smp_apply(__smp_alt_instructions,
- __smp_alt_instructions_end);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
@@ -328,8 +325,6 @@ void alternatives_smp_switch(int smp)
printk(KERN_INFO "SMP alternatives: switching to UP code\n");
set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
- apply_alternatives(__smp_alt_instructions,
- __smp_alt_instructions_end);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_unlock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
@@ -340,36 +335,31 @@ void alternatives_smp_switch(int smp)
#endif
#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
{
- struct paravirt_patch *p;
+ struct paravirt_patch_site *p;
+
+ if (noreplace_paravirt)
+ return;
for (p = start; p < end; p++) {
unsigned int used;
used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
p->len);
-#ifdef CONFIG_DEBUG_PARAVIRT
- {
- int i;
- /* Deliberately clobber regs using "not %reg" to find bugs. */
- for (i = 0; i < 3; i++) {
- if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
- memcpy(p->instr + used, "\xf7\xd0", 2);
- p->instr[used+1] |= i;
- used += 2;
- }
- }
- }
-#endif
+
+ BUG_ON(used > p->len);
+
/* Pad the rest with nops */
nop_out(p->instr + used, p->len - used);
}
- /* Sync to be conservative, in case we patched following instructions */
+ /* Sync to be conservative, in case we patched following
+ * instructions */
sync_core();
}
-extern struct paravirt_patch __start_parainstructions[],
+extern struct paravirt_patch_site __start_parainstructions[],
__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */
@@ -396,23 +386,19 @@ void __init alternative_instructions(void)
printk(KERN_INFO "SMP alternatives: switching to UP code\n");
set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
- apply_alternatives(__smp_alt_instructions,
- __smp_alt_instructions_end);
alternatives_smp_unlock(__smp_locks, __smp_locks_end,
_text, _etext);
}
free_init_pages("SMP alternatives",
- (unsigned long)__smp_alt_begin,
- (unsigned long)__smp_alt_end);
+ __pa_symbol(&__smp_locks),
+ __pa_symbol(&__smp_locks_end));
} else {
- alternatives_smp_save(__smp_alt_instructions,
- __smp_alt_instructions_end);
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
alternatives_smp_switch(0);
}
#endif
- apply_paravirt(__start_parainstructions, __stop_parainstructions);
+ apply_paravirt(__parainstructions, __parainstructions_end);
local_irq_restore(flags);
}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 93aa911646ad..aca054cc0552 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -129,6 +129,28 @@ static int modern_apic(void)
return lapic_get_version() >= 0x14;
}
+void apic_wait_icr_idle(void)
+{
+ while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+unsigned long safe_apic_wait_icr_idle(void)
+{
+ unsigned long send_status;
+ int timeout;
+
+ timeout = 0;
+ do {
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ if (!send_status)
+ break;
+ udelay(100);
+ } while (timeout++ < 1000);
+
+ return send_status;
+}
+
/**
* enable_NMI_through_LVT0 - enable NMI through local vector table 0
*/
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 064bbf2861f4..367ff1d930cb 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -233,11 +233,10 @@
#include <asm/desc.h>
#include <asm/i8253.h>
#include <asm/paravirt.h>
+#include <asm/reboot.h>
#include "io_ports.h"
-extern void machine_real_restart(unsigned char *, int);
-
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
extern int (*console_blank_hook)(int);
#endif
@@ -384,13 +383,6 @@ static int ignore_sys_suspend;
static int ignore_normal_resume;
static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
-#ifdef CONFIG_APM_RTC_IS_GMT
-# define clock_cmos_diff 0
-# define got_clock_diff 1
-#else
-static long clock_cmos_diff;
-static int got_clock_diff;
-#endif
static int debug __read_mostly;
static int smp __read_mostly;
static int apm_disabled = -1;
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c37535163bfc..27a776c9044d 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -11,11 +11,11 @@
#include <linux/suspend.h>
#include <asm/ucontext.h>
#include "sigframe.h"
+#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/elf.h>
-#include <asm/pda.h>
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -25,6 +25,9 @@
#define OFFSET(sym, str, mem) \
DEFINE(sym, offsetof(struct str, mem));
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
void foo(void)
{
OFFSET(SIGCONTEXT_eax, sigcontext, eax);
@@ -90,17 +93,18 @@ void foo(void)
OFFSET(pbe_next, pbe, next);
/* Offset from the sysenter stack to tss.esp0 */
- DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
+ DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
sizeof(struct tss_struct));
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
- DEFINE(VDSO_PRELINK, VDSO_PRELINK);
+ DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
+ DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
+ DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
+ DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
- OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+ DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
- BLANK();
- OFFSET(PDA_cpu, i386_pda, cpu_number);
- OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+ OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
#ifdef CONFIG_PARAVIRT
BLANK();
diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile
index 010aecfffbc1..74f27a463db0 100644
--- a/arch/i386/kernel/cpu/Makefile
+++ b/arch/i386/kernel/cpu/Makefile
@@ -2,7 +2,7 @@
# Makefile for x86-compatible CPU details and quirks
#
-obj-y := common.o proc.o
+obj-y := common.o proc.o bugs.o
obj-y += amd.o
obj-y += cyrix.o
@@ -17,3 +17,5 @@ obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 2d47db482972..4fec702afd7e 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void)
return 0;
}
+int force_mwait __cpuinitdata;
+
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
if (amd_apic_timer_broken())
set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability);
+
+ if (c->x86 == 0x10 && !force_mwait)
+ clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
}
static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -314,13 +319,3 @@ int __init amd_init_cpu(void)
cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
return 0;
}
-
-//early_arch_initcall(amd_init_cpu);
-
-static int __init amd_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_AMD] = NULL;
- return 0;
-}
-
-late_initcall(amd_exit_cpu);
diff --git a/arch/i386/kernel/cpu/bugs.c b/arch/i386/kernel/cpu/bugs.c
new file mode 100644
index 000000000000..54428a2500f3
--- /dev/null
+++ b/arch/i386/kernel/cpu/bugs.c
@@ -0,0 +1,191 @@
+/*
+ * arch/i386/cpu/bugs.c
+ *
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Cyrix stuff, June 1998 by:
+ * - Rafael R. Reilova (moved everything from head.S),
+ * <rreilova@ececs.uc.edu>
+ * - Channing Corn (tests & fixes),
+ * - Andrew D. Balsa (code cleanup).
+ */
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/paravirt.h>
+#include <asm/alternative.h>
+
+static int __init no_halt(char *s)
+{
+ boot_cpu_data.hlt_works_ok = 0;
+ return 1;
+}
+
+__setup("no-hlt", no_halt);
+
+static int __init mca_pentium(char *s)
+{
+ mca_pentium_flag = 1;
+ return 1;
+}
+
+__setup("mca-pentium", mca_pentium);
+
+static int __init no_387(char *s)
+{
+ boot_cpu_data.hard_math = 0;
+ write_cr0(0xE | read_cr0());
+ return 1;
+}
+
+__setup("no387", no_387);
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+static void __init check_fpu(void)
+{
+ if (!boot_cpu_data.hard_math) {
+#ifndef CONFIG_MATH_EMULATION
+ printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
+ printk(KERN_EMERG "Giving up.\n");
+ for (;;) ;
+#endif
+ return;
+ }
+
+/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
+ /* Test for the divl bug.. */
+ __asm__("fninit\n\t"
+ "fldl %1\n\t"
+ "fdivl %2\n\t"
+ "fmull %2\n\t"
+ "fldl %1\n\t"
+ "fsubp %%st,%%st(1)\n\t"
+ "fistpl %0\n\t"
+ "fwait\n\t"
+ "fninit"
+ : "=m" (*&boot_cpu_data.fdiv_bug)
+ : "m" (*&x), "m" (*&y));
+ if (boot_cpu_data.fdiv_bug)
+ printk("Hmm, FPU with FDIV bug.\n");
+}
+
+static void __init check_hlt(void)
+{
+ if (paravirt_enabled())
+ return;
+
+ printk(KERN_INFO "Checking 'hlt' instruction... ");
+ if (!boot_cpu_data.hlt_works_ok) {
+ printk("disabled\n");
+ return;
+ }
+ halt();
+ halt();
+ halt();
+ halt();
+ printk("OK.\n");
+}
+
+/*
+ * Most 386 processors have a bug where a POPAD can lock the
+ * machine even from user space.
+ */
+
+static void __init check_popad(void)
+{
+#ifndef CONFIG_X86_POPAD_OK
+ int res, inp = (int) &res;
+
+ printk(KERN_INFO "Checking for popad bug... ");
+ __asm__ __volatile__(
+ "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
+ : "=&a" (res)
+ : "d" (inp)
+ : "ecx", "edi" );
+ /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
+ if (res != 12345678) printk( "Buggy.\n" );
+ else printk( "OK.\n" );
+#endif
+}
+
+/*
+ * Check whether we are able to run this kernel safely on SMP.
+ *
+ * - In order to run on a i386, we need to be compiled for i386
+ * (for due to lack of "invlpg" and working WP on a i386)
+ * - In order to run on anything without a TSC, we need to be
+ * compiled for a i486.
+ * - In order to support the local APIC on a buggy Pentium machine,
+ * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
+ * which happens implicitly if compiled for a Pentium or lower
+ * (unless an advanced selection of CPU features is used) as an
+ * otherwise config implies a properly working local APIC without
+ * the need to do extra reads from the APIC.
+*/
+
+static void __init check_config(void)
+{
+/*
+ * We'd better not be a i386 if we're configured to use some
+ * i486+ only features! (WP works in supervisor mode and the
+ * new "invlpg" and "bswap" instructions)
+ */
+#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
+ if (boot_cpu_data.x86 == 3)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+#endif
+
+/*
+ * If we configured ourselves for a TSC, we'd better have one!
+ */
+#ifdef CONFIG_X86_TSC
+ if (!cpu_has_tsc && !tsc_disable)
+ panic("Kernel compiled for Pentium+, requires TSC feature!");
+#endif
+
+/*
+ * If we were told we had a good local APIC, check for buggy Pentia,
+ * i.e. all B steppings and the C2 stepping of P54C when using their
+ * integrated APIC (see 11AP erratum in "Pentium Processor
+ * Specification Update").
+ */
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
+ && cpu_has_apic
+ && boot_cpu_data.x86 == 5
+ && boot_cpu_data.x86_model == 2
+ && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
+ panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
+#endif
+}
+
+
+void __init check_bugs(void)
+{
+ identify_boot_cpu();
+#ifndef CONFIG_SMP
+ printk("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+#endif
+ check_config();
+ check_fpu();
+ check_hlt();
+ check_popad();
+ init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ alternative_instructions();
+}
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
index 8c25047975c0..473eac883c7b 100644
--- a/arch/i386/kernel/cpu/centaur.c
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -469,13 +469,3 @@ int __init centaur_init_cpu(void)
cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
return 0;
}
-
-//early_arch_initcall(centaur_init_cpu);
-
-static int __init centaur_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_CENTAUR] = NULL;
- return 0;
-}
-
-late_initcall(centaur_exit_cpu);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index dcbbd0a8bfc2..794d593c47eb 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,15 +18,37 @@
#include <asm/apic.h>
#include <mach_apic.h>
#endif
-#include <asm/pda.h>
#include "cpu.h"
-DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
-EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+ [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
+ [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
+ [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
+ [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+ /*
+ * Segments used for calling PnP BIOS have byte granularity.
+ * They code segments and data segments have fixed 64k limits,
+ * the transfer segment sizes are set at run time.
+ */
+ [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+ [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
+ [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+ /*
+ * The APM segments have byte granularity and their bases
+ * are set at run time. All have 64k limits.
+ */
+ [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+ /* 16-bit code */
+ [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
+ [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
-struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+ [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
+ [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_fxsr __cpuinitdata;
@@ -368,7 +390,7 @@ __setup("serialnumber", x86_serial_nr_setup);
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
@@ -479,15 +501,22 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Init Machine Check Exception if available. */
mcheck_init(c);
+}
- if (c == &boot_cpu_data)
- sysenter_setup();
+void __init identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+ sysenter_setup();
enable_sep_cpu();
+ mtrr_bp_init();
+}
- if (c == &boot_cpu_data)
- mtrr_bp_init();
- else
- mtrr_ap_init();
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ enable_sep_cpu();
+ mtrr_ap_init();
}
#ifdef CONFIG_X86_HT
@@ -601,129 +630,36 @@ void __init early_cpu_init(void)
#endif
}
-/* Make sure %gs is initialized properly in idle threads */
+/* Make sure %fs is initialized properly in idle threads */
struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
- regs->xfs = __KERNEL_PDA;
+ regs->xfs = __KERNEL_PERCPU;
return regs;
}
-static __cpuinit int alloc_gdt(int cpu)
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
{
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
- struct desc_struct *gdt;
- struct i386_pda *pda;
-
- gdt = (struct desc_struct *)cpu_gdt_descr->address;
- pda = cpu_pda(cpu);
-
- /*
- * This is a horrible hack to allocate the GDT. The problem
- * is that cpu_init() is called really early for the boot CPU
- * (and hence needs bootmem) but much later for the secondary
- * CPUs, when bootmem will have gone away
- */
- if (NODE_DATA(0)->bdata->node_bootmem_map) {
- BUG_ON(gdt != NULL || pda != NULL);
-
- gdt = alloc_bootmem_pages(PAGE_SIZE);
- pda = alloc_bootmem(sizeof(*pda));
- /* alloc_bootmem(_pages) panics on failure, so no check */
-
- memset(gdt, 0, PAGE_SIZE);
- memset(pda, 0, sizeof(*pda));
- } else {
- /* GDT and PDA might already have been allocated if
- this is a CPU hotplug re-insertion. */
- if (gdt == NULL)
- gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
-
- if (pda == NULL)
- pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
-
- if (unlikely(!gdt || !pda)) {
- free_pages((unsigned long)gdt, 0);
- kfree(pda);
- return 0;
- }
- }
-
- cpu_gdt_descr->address = (unsigned long)gdt;
- cpu_pda(cpu) = pda;
-
- return 1;
-}
+ struct Xgt_desc_struct gdt_descr;
-/* Initial PDA used by boot CPU */
-struct i386_pda boot_pda = {
- ._pda = &boot_pda,
- .cpu_number = 0,
- .pcurrent = &init_task,
-};
-
-static inline void set_kernel_fs(void)
-{
- /* Set %fs for this CPU's PDA. Memory clobber is to create a
- barrier with respect to any PDA operations, so the compiler
- doesn't move any before here. */
- asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
+ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+ asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
}
-/* Initialize the CPU's GDT and PDA. The boot CPU does this for
- itself, but secondaries find this done for them. */
-__cpuinit int init_gdt(int cpu, struct task_struct *idle)
-{
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
- struct desc_struct *gdt;
- struct i386_pda *pda;
-
- /* For non-boot CPUs, the GDT and PDA should already have been
- allocated. */
- if (!alloc_gdt(cpu)) {
- printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
- return 0;
- }
-
- gdt = (struct desc_struct *)cpu_gdt_descr->address;
- pda = cpu_pda(cpu);
-
- BUG_ON(gdt == NULL || pda == NULL);
-
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
- memcpy(gdt, cpu_gdt_table, GDT_SIZE);
- cpu_gdt_descr->size = GDT_SIZE - 1;
-
- pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
- (u32 *)&gdt[GDT_ENTRY_PDA].b,
- (unsigned long)pda, sizeof(*pda) - 1,
- 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
-
- memset(pda, 0, sizeof(*pda));
- pda->_pda = pda;
- pda->cpu_number = cpu;
- pda->pcurrent = idle;
-
- return 1;
-}
-
-void __cpuinit cpu_set_gdt(int cpu)
-{
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-
- /* Reinit these anyway, even if they've already been done (on
- the boot CPU, this will transition from the boot gdt+pda to
- the real ones). */
- load_gdt(cpu_gdt_descr);
- set_kernel_fs();
-}
-
-/* Common CPU init for both boot and secondary CPUs */
-static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
{
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
struct tss_struct * t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &curr->thread;
@@ -744,6 +680,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
}
load_idt(&idt_descr);
+ switch_to_new_gdt();
/*
* Set up and load the per-CPU TSS and LDT
@@ -783,38 +720,6 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
mxcsr_feature_mask_init();
}
-/* Entrypoint to initialize secondary CPU */
-void __cpuinit secondary_cpu_init(void)
-{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
-
- _cpu_init(cpu, curr);
-}
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- */
-void __cpuinit cpu_init(void)
-{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
-
- /* Set up the real GDT and PDA, so we can transition from the
- boot versions. */
- if (!init_gdt(cpu, curr)) {
- /* failed to allocate something; not much we can do... */
- for (;;)
- local_irq_enable();
- }
-
- cpu_set_gdt(cpu);
- _cpu_init(cpu, curr);
-}
-
#ifdef CONFIG_HOTPLUG_CPU
void __cpuinit cpu_uninit(void)
{
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index de27bd07bc9c..0b8411a864fb 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
*/
if (vendor == PCI_VENDOR_ID_CYRIX &&
(device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
- pit_latch_buggy = 1;
+ mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
c->x86_cache_size=16; /* Yep 16K integrated cache thats it */
@@ -448,16 +448,6 @@ int __init cyrix_init_cpu(void)
return 0;
}
-//early_arch_initcall(cyrix_init_cpu);
-
-static int __init cyrix_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_CYRIX] = NULL;
- return 0;
-}
-
-late_initcall(cyrix_exit_cpu);
-
static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
@@ -470,12 +460,3 @@ int __init nsc_init_cpu(void)
return 0;
}
-//early_arch_initcall(nsc_init_cpu);
-
-static int __init nsc_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_NSC] = NULL;
- return 0;
-}
-
-late_initcall(nsc_exit_cpu);
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 56fe26584957..dc4e08147b1f 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -188,8 +188,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
#endif
- if (c->x86 == 15)
+ if (c->x86 == 15) {
set_bit(X86_FEATURE_P4, c->x86_capability);
+ set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
+ }
if (c->x86 == 6)
set_bit(X86_FEATURE_P3, c->x86_capability);
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
index b0862af595aa..f9fa4142551e 100644
--- a/arch/i386/kernel/cpu/mcheck/k7.c
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
machine_check_vector = k7_machine_check;
wmb();
+ if (!cpu_has(c, X86_FEATURE_MCE))
+ return;
+
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
@@ -82,9 +85,13 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
nr_mce_banks = l & 0xff;
/* Clear status for MC index 0 separately, we don't touch CTL,
- * as some Athlons cause spurious MCEs when its enabled. */
- wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
- for (i=1; i<nr_mce_banks; i++) {
+ * as some K7 Athlons cause spurious MCEs when its enabled. */
+ if (boot_cpu_data.x86 == 6) {
+ wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+ i = 1;
+ } else
+ i = 0;
+ for (; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 4f10c62d180c..56cd485b127c 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -38,8 +38,7 @@ void mcheck_init(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
- if (c->x86==6 || c->x86==15)
- amd_mcheck_init(c);
+ amd_mcheck_init(c);
break;
case X86_VENDOR_INTEL:
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a46011..1509edfb2313 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -124,13 +124,10 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
-static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
+static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
{
u32 h;
- if (mce_num_extended_msrs == 0)
- goto done;
-
rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
@@ -141,12 +138,6 @@ static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
-
- /* can we rely on kmalloc to do a dynamic
- * allocation for the reserved registers?
- */
-done:
- return mce_num_extended_msrs;
}
static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
@@ -155,7 +146,6 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
- struct intel_mce_extended_msrs dbg;
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
@@ -164,7 +154,9 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
- if (intel_get_extended_msrs(&dbg)) {
+ if (mce_num_extended_msrs > 0) {
+ struct intel_mce_extended_msrs dbg;
+ intel_get_extended_msrs(&dbg);
printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
smp_processor_id(), dbg.eip, dbg.eflags);
printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index f77fc53db654..5367e32e0403 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -20,13 +20,25 @@ struct mtrr_state {
mtrr_type def_type;
};
+struct fixed_range_block {
+ int base_msr; /* start address of an MTRR block */
+ int ranges; /* number of MTRRs in this block */
+};
+
+static struct fixed_range_block fixed_range_blocks[] = {
+ { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
+ { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
+ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
+ {}
+};
+
static unsigned long smp_changes_mask;
static struct mtrr_state mtrr_state = {};
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "mtrr."
-static __initdata int mtrr_show;
+static int mtrr_show;
module_param_named(show, mtrr_show, bool, 0);
/* Get the MSR pair relating to a var range */
@@ -37,7 +49,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
}
-static void __init
+static void
get_fixed_ranges(mtrr_type * frs)
{
unsigned int *p = (unsigned int *) frs;
@@ -51,12 +63,18 @@ get_fixed_ranges(mtrr_type * frs)
rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
}
-static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+void mtrr_save_fixed_ranges(void *info)
+{
+ get_fixed_ranges(mtrr_state.fixed_ranges);
+}
+
+static void __cpuinit print_fixed(unsigned base, unsigned step, const mtrr_type*types)
{
unsigned i;
for (i = 0; i < 8; ++i, ++types, base += step)
- printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
+ printk(KERN_INFO "MTRR %05X-%05X %s\n",
+ base, base + step - 1, mtrr_attrib_to_str(*types));
}
/* Grab all of the MTRR state for this CPU into *state */
@@ -147,6 +165,44 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
smp_processor_id(), msr, a, b);
}
+/**
+ * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
+ * see AMD publication no. 24593, chapter 3.2.1 for more information
+ */
+static inline void k8_enable_fixed_iorrs(void)
+{
+ unsigned lo, hi;
+
+ rdmsr(MSR_K8_SYSCFG, lo, hi);
+ mtrr_wrmsr(MSR_K8_SYSCFG, lo
+ | K8_MTRRFIXRANGE_DRAM_ENABLE
+ | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
+}
+
+/**
+ * Checks and updates an fixed-range MTRR if it differs from the value it
+ * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also.
+ * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information
+ * \param msr MSR address of the MTTR which should be checked and updated
+ * \param changed pointer which indicates whether the MTRR needed to be changed
+ * \param msrwords pointer to the MSR values which the MSR should have
+ */
+static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
+{
+ unsigned lo, hi;
+
+ rdmsr(msr, lo, hi);
+
+ if (lo != msrwords[0] || hi != msrwords[1]) {
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 == 15 &&
+ ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
+ k8_enable_fixed_iorrs();
+ mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
+ *changed = TRUE;
+ }
+}
+
int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
/* [SUMMARY] Get a free MTRR.
<base> The starting (base) address of the region.
@@ -196,36 +252,21 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
*type = base_lo & 0xff;
}
+/**
+ * Checks and updates the fixed-range MTRRs if they differ from the saved set
+ * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
+ */
static int set_fixed_ranges(mtrr_type * frs)
{
- unsigned int *p = (unsigned int *) frs;
+ unsigned long long *saved = (unsigned long long *) frs;
int changed = FALSE;
- int i;
- unsigned int lo, hi;
+ int block=-1, range;
- rdmsr(MTRRfix64K_00000_MSR, lo, hi);
- if (p[0] != lo || p[1] != hi) {
- mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
- changed = TRUE;
- }
+ while (fixed_range_blocks[++block].ranges)
+ for (range=0; range < fixed_range_blocks[block].ranges; range++)
+ set_fixed_range(fixed_range_blocks[block].base_msr + range,
+ &changed, (unsigned int *) saved++);
- for (i = 0; i < 2; i++) {
- rdmsr(MTRRfix16K_80000_MSR + i, lo, hi);
- if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) {
- mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2],
- p[3 + i * 2]);
- changed = TRUE;
- }
- }
-
- for (i = 0; i < 8; i++) {
- rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi);
- if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) {
- mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2],
- p[7 + i * 2]);
- changed = TRUE;
- }
- }
return changed;
}
@@ -428,7 +469,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
}
}
- if (base + size < 0x100) {
+ if (base < 0x100) {
printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
base, size);
return -EINVAL;
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 0acfb6a5a220..02a2f39e5e0a 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -729,6 +729,17 @@ void mtrr_ap_init(void)
local_irq_restore(flags);
}
+/**
+ * Save current fixed-range MTRR state of the BSP
+ */
+void mtrr_save_state(void)
+{
+ if (smp_processor_id() == 0)
+ mtrr_save_fixed_ranges(NULL);
+ else
+ smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
+}
+
static int __init mtrr_init_finialize(void)
{
if (!mtrr_if)
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
index 8bf23cc80c63..961fbe1a748f 100644
--- a/arch/i386/kernel/cpu/nexgen.c
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -58,13 +58,3 @@ int __init nexgen_init_cpu(void)
cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
return 0;
}
-
-//early_arch_initcall(nexgen_init_cpu);
-
-static int __init nexgen_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_NEXGEN] = NULL;
- return 0;
-}
-
-late_initcall(nexgen_exit_cpu);
diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 000000000000..2b04c8f1db62
--- /dev/null
+++ b/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,658 @@
+/* local apic based NMI watchdog for various CPUs.
+ This file also handles reservation of performance counters for coordination
+ with other users (like oprofile).
+
+ Note that these events normally don't tick when the CPU idles. This means
+ the frequency varies with CPU load.
+
+ Original code for K7/P6 written by Keith Owens */
+
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/apic.h>
+#include <asm/intel_arch_perfmon.h>
+
+struct nmi_watchdog_ctlblk {
+ unsigned int cccr_msr;
+ unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
+ unsigned int evntsel_msr; /* the MSR to select the events to handle */
+};
+
+/* Interface defining a CPU specific perfctr watchdog */
+struct wd_ops {
+ int (*reserve)(void);
+ void (*unreserve)(void);
+ int (*setup)(unsigned nmi_hz);
+ void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
+ void (*stop)(void *);
+ unsigned perfctr;
+ unsigned evntsel;
+ u64 checkbit;
+};
+
+static struct wd_ops *wd_ops;
+
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
+
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ * different subsystems this reservation system just tries to coordinate
+ * things a little
+ */
+static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
+static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
+
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+ return wd_ops ? msr - wd_ops->perfctr : 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+/* returns the bit offset of the event selection register */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+ return wd_ops ? msr - wd_ops->evntsel : 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, perfctr_nmi_owner));
+}
+
+/* checks the an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, perfctr_nmi_owner));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, perfctr_nmi_owner))
+ return 1;
+ return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, perfctr_nmi_owner);
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, evntsel_nmi_owner))
+ return 1;
+ return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, evntsel_nmi_owner);
+}
+
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
+
+void disable_lapic_nmi_watchdog(void)
+{
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
+ return;
+
+ on_each_cpu(wd_ops->stop, NULL, 0, 1);
+ wd_ops->unreserve();
+
+ BUG_ON(atomic_read(&nmi_active) != 0);
+}
+
+void enable_lapic_nmi_watchdog(void)
+{
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+ /* are we already enabled */
+ if (atomic_read(&nmi_active) != 0)
+ return;
+
+ /* are we lapic aware */
+ if (!wd_ops)
+ return;
+ if (!wd_ops->reserve()) {
+ printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
+ return;
+ }
+
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ touch_nmi_watchdog();
+}
+
+/*
+ * Activate the NMI watchdog via the local APIC.
+ */
+
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+ u64 counter_val;
+ unsigned int retval = hz;
+
+ /*
+ * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+ * So, we can only program the counter with 31 bit values and
+ * 32nd bit should be 1, for 33.. to be 1.
+ * Find the appropriate nmi_hz
+ */
+ counter_val = (u64)cpu_khz * 1000;
+ do_div(counter_val, retval);
+ if (counter_val > 0x7fffffffULL) {
+ u64 count = (u64)cpu_khz * 1000;
+ do_div(count, 0x7fffffffUL);
+ retval = count + 1;
+ }
+ return retval;
+}
+
+static void
+write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
+{
+ u64 count = (u64)cpu_khz * 1000;
+
+ do_div(count, nmi_hz);
+ if(descr)
+ Dprintk("setting %s to -0x%08Lx\n", descr, count);
+ wrmsrl(perfctr_msr, 0 - count);
+}
+
+static void write_watchdog_counter32(unsigned int perfctr_msr,
+ const char *descr, unsigned nmi_hz)
+{
+ u64 count = (u64)cpu_khz * 1000;
+
+ do_div(count, nmi_hz);
+ if(descr)
+ Dprintk("setting %s to -0x%08Lx\n", descr, count);
+ wrmsr(perfctr_msr, (u32)(-count), 0);
+}
+
+/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
+ nicely stable so there is not much variety */
+
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+static int setup_k7_watchdog(unsigned nmi_hz)
+{
+ unsigned int perfctr_msr, evntsel_msr;
+ unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ perfctr_msr = MSR_K7_PERFCTR0;
+ evntsel_msr = MSR_K7_EVNTSEL0;
+
+ wrmsrl(perfctr_msr, 0UL);
+
+ evntsel = K7_EVNTSEL_INT
+ | K7_EVNTSEL_OS
+ | K7_EVNTSEL_USR
+ | K7_NMI_EVENT;
+
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ return 1;
+}
+
+static void single_msr_stop_watchdog(void *arg)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+}
+
+static int single_msr_reserve(void)
+{
+ if (!reserve_perfctr_nmi(wd_ops->perfctr))
+ return 0;
+
+ if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
+ release_perfctr_nmi(wd_ops->perfctr);
+ return 0;
+ }
+ return 1;
+}
+
+static void single_msr_unreserve(void)
+{
+ release_evntsel_nmi(wd_ops->perfctr);
+ release_perfctr_nmi(wd_ops->evntsel);
+}
+
+static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+ /* start the cycle over again */
+ write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
+}
+
+static struct wd_ops k7_wd_ops = {
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_k7_watchdog,
+ .rearm = single_msr_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_K7_PERFCTR0,
+ .evntsel = MSR_K7_EVNTSEL0,
+ .checkbit = 1ULL<<63,
+};
+
+/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
+
+#define P6_EVNTSEL0_ENABLE (1 << 22)
+#define P6_EVNTSEL_INT (1 << 20)
+#define P6_EVNTSEL_OS (1 << 17)
+#define P6_EVNTSEL_USR (1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
+#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+static int setup_p6_watchdog(unsigned nmi_hz)
+{
+ unsigned int perfctr_msr, evntsel_msr;
+ unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ perfctr_msr = MSR_P6_PERFCTR0;
+ evntsel_msr = MSR_P6_EVNTSEL0;
+
+ wrmsrl(perfctr_msr, 0UL);
+
+ evntsel = P6_EVNTSEL_INT
+ | P6_EVNTSEL_OS
+ | P6_EVNTSEL_USR
+ | P6_NMI_EVENT;
+
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+ write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ return 1;
+}
+
+static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+ /* P6 based Pentium M need to re-unmask
+ * the apic vector but it doesn't hurt
+ * other P6 variant.
+ * ArchPerfom/Core Duo also needs this */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* P6/ARCH_PERFMON has 32 bit counter write */
+ write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
+}
+
+static struct wd_ops p6_wd_ops = {
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_p6_watchdog,
+ .rearm = p6_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_P6_PERFCTR0,
+ .evntsel = MSR_P6_EVNTSEL0,
+ .checkbit = 1ULL<<39,
+};
+
+/* Intel P4 performance counters. By far the most complicated of all. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI0 (1<<26)
+#define P4_CCCR_OVF_PMI1 (1<<27)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+#define P4_CCCR_OVF (1<<31)
+
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
+
+static int setup_p4_watchdog(unsigned nmi_hz)
+{
+ unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+ unsigned int evntsel, cccr_val;
+ unsigned int misc_enable, dummy;
+ unsigned int ht_num;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+ return 0;
+
+#ifdef CONFIG_SMP
+ /* detect which hyperthread we are on */
+ if (smp_num_siblings == 2) {
+ unsigned int ebx, apicid;
+
+ ebx = cpuid_ebx(1);
+ apicid = (ebx >> 24) & 0xff;
+ ht_num = apicid & 1;
+ } else
+#endif
+ ht_num = 0;
+
+ /* performance counters are shared resources
+ * assign each hyperthread its own set
+ * (re-use the ESCR0 register, seems safe
+ * and keeps the cccr_val the same)
+ */
+ if (!ht_num) {
+ /* logical cpu 0 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR0;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR0;
+ cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+ } else {
+ /* logical cpu 1 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR1;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR1;
+ cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+ }
+
+ evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+ | P4_ESCR_OS
+ | P4_ESCR_USR;
+
+ cccr_val |= P4_CCCR_THRESHOLD(15)
+ | P4_CCCR_COMPLEMENT
+ | P4_CCCR_COMPARE
+ | P4_CCCR_REQUIRED;
+
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsr(cccr_msr, cccr_val, 0);
+ write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = cccr_msr;
+ return 1;
+}
+
+static void stop_p4_watchdog(void *arg)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+ wrmsr(wd->cccr_msr, 0, 0);
+ wrmsr(wd->evntsel_msr, 0, 0);
+}
+
+static int p4_reserve(void)
+{
+ if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
+ return 0;
+#ifdef CONFIG_SMP
+ if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
+ goto fail1;
+#endif
+ if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
+ goto fail2;
+ /* RED-PEN why is ESCR1 not reserved here? */
+ return 1;
+ fail2:
+#ifdef CONFIG_SMP
+ if (smp_num_siblings > 1)
+ release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
+ fail1:
+#endif
+ release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
+ return 0;
+}
+
+static void p4_unreserve(void)
+{
+#ifdef CONFIG_SMP
+ if (smp_num_siblings > 1)
+ release_evntsel_nmi(MSR_P4_IQ_PERFCTR1);
+#endif
+ release_evntsel_nmi(MSR_P4_IQ_PERFCTR0);
+ release_perfctr_nmi(MSR_P4_CRU_ESCR0);
+}
+
+static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+ unsigned dummy;
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ rdmsrl(wd->cccr_msr, dummy);
+ dummy &= ~P4_CCCR_OVF;
+ wrmsrl(wd->cccr_msr, dummy);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /* start the cycle over again */
+ write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
+}
+
+static struct wd_ops p4_wd_ops = {
+ .reserve = p4_reserve,
+ .unreserve = p4_unreserve,
+ .setup = setup_p4_watchdog,
+ .rearm = p4_rearm,
+ .stop = stop_p4_watchdog,
+ /* RED-PEN this is wrong for the other sibling */
+ .perfctr = MSR_P4_BPU_PERFCTR0,
+ .evntsel = MSR_P4_BSU_ESCR0,
+ .checkbit = 1ULL<<39,
+};
+
+/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
+ all future Intel CPUs. */
+
+#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
+static int setup_intel_arch_watchdog(unsigned nmi_hz)
+{
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int perfctr_msr, evntsel_msr;
+ unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Unhalted Core Cycles Event or not.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ return 0;
+
+ perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1;
+ evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1;
+
+ wrmsrl(perfctr_msr, 0UL);
+
+ evntsel = ARCH_PERFMON_EVENTSEL_INT
+ | ARCH_PERFMON_EVENTSEL_OS
+ | ARCH_PERFMON_EVENTSEL_USR
+ | ARCH_PERFMON_NMI_EVENT_SEL
+ | ARCH_PERFMON_NMI_EVENT_UMASK;
+
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+ write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1);
+ return 1;
+}
+
+static struct wd_ops intel_arch_wd_ops = {
+ .reserve = single_msr_reserve,
+ .unreserve = single_msr_unreserve,
+ .setup = setup_intel_arch_watchdog,
+ .rearm = p6_rearm,
+ .stop = single_msr_stop_watchdog,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
+};
+
+static void probe_nmi_watchdog(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
+ boot_cpu_data.x86 != 16)
+ return;
+ wd_ops = &k7_wd_ops;
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ wd_ops = &intel_arch_wd_ops;
+ break;
+ }
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ return;
+
+ wd_ops = &p6_wd_ops;
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x4)
+ return;
+
+ wd_ops = &p4_wd_ops;
+ break;
+ default:
+ return;
+ }
+ break;
+ }
+}
+
+/* Interface to nmi.c */
+
+int lapic_watchdog_init(unsigned nmi_hz)
+{
+ if (!wd_ops) {
+ probe_nmi_watchdog();
+ if (!wd_ops)
+ return -1;
+ }
+
+ if (!(wd_ops->setup(nmi_hz))) {
+ printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
+ raw_smp_processor_id());
+ return -1;
+ }
+
+ return 0;
+}
+
+void lapic_watchdog_stop(void)
+{
+ if (wd_ops)
+ wd_ops->stop(NULL);
+}
+
+unsigned lapic_adjust_nmi_hz(unsigned hz)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+ if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+ wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
+ hz = adjust_for_32bit_ctr(hz);
+ return hz;
+}
+
+int lapic_wd_event(unsigned nmi_hz)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+ u64 ctr;
+ rdmsrl(wd->perfctr_msr, ctr);
+ if (ctr & wd_ops->checkbit) { /* perfctr still running? */
+ return 0;
+ }
+ wd_ops->rearm(wd, nmi_hz);
+ return 1;
+}
+
+int lapic_watchdog_ok(void)
+{
+ return wd_ops != NULL;
+}
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 47e3ebbfb28d..89d91e6cc972 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
"stc",
"100mhzsteps",
"hwpstate",
- NULL,
- NULL, /* constant_tsc - moved to flags */
+ "", /* constant_tsc - moved to flags */
/* nothing */
};
struct cpuinfo_x86 *c = v;
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
index 9317f7414989..50076f22e90f 100644
--- a/arch/i386/kernel/cpu/rise.c
+++ b/arch/i386/kernel/cpu/rise.c
@@ -50,12 +50,3 @@ int __init rise_init_cpu(void)
return 0;
}
-//early_arch_initcall(rise_init_cpu);
-
-static int __init rise_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_RISE] = NULL;
- return 0;
-}
-
-late_initcall(rise_exit_cpu);
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 5678d46863c6..6471a5a13202 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -112,13 +112,3 @@ int __init transmeta_init_cpu(void)
cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
return 0;
}
-
-//early_arch_initcall(transmeta_init_cpu);
-
-static int __init transmeta_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_TRANSMETA] = NULL;
- return 0;
-}
-
-late_initcall(transmeta_exit_cpu);
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
index 1bf3f87e9c5b..a7a4e75bdcd7 100644
--- a/arch/i386/kernel/cpu/umc.c
+++ b/arch/i386/kernel/cpu/umc.c
@@ -24,13 +24,3 @@ int __init umc_init_cpu(void)
cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
return 0;
}
-
-//early_arch_initcall(umc_init_cpu);
-
-static int __init umc_exit_cpu(void)
-{
- cpu_devs[X86_VENDOR_UMC] = NULL;
- return 0;
-}
-
-late_initcall(umc_exit_cpu);
diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c
index b4d14c2eb345..265c5597efb0 100644
--- a/arch/i386/kernel/doublefault.c
+++ b/arch/i386/kernel/doublefault.c
@@ -33,7 +33,7 @@ static void doublefault_fn(void)
printk("double fault, tss at %08lx\n", tss);
if (ptr_ok(tss)) {
- struct tss_struct *t = (struct tss_struct *)tss;
+ struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp);
@@ -49,18 +49,21 @@ static void doublefault_fn(void)
}
struct tss_struct doublefault_tss __cacheline_aligned = {
- .esp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+ .x86_tss = {
+ .esp0 = STACK_START,
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
- .eip = (unsigned long) doublefault_fn,
- .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
- .esp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
+ .eip = (unsigned long) doublefault_fn,
+ /* 0x2 bit is always set */
+ .eflags = X86_EFLAGS_SF | 0x2,
+ .esp = STACK_START,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
- .__cr3 = __pa(swapper_pg_dir)
+ .__cr3 = __pa(swapper_pg_dir)
+ }
};
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 70f39560846a..9645bb51f76a 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -161,26 +161,27 @@ static struct resource standard_io_resources[] = { {
static int __init romsignature(const unsigned char *rom)
{
+ const unsigned short * const ptr = (const unsigned short *)rom;
unsigned short sig;
- return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
- sig == ROMSIGNATURE;
+ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
}
-static int __init romchecksum(unsigned char *rom, unsigned long length)
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
{
- unsigned char sum;
+ unsigned char sum, c;
- for (sum = 0; length; length--)
- sum += *rom++;
- return sum == 0;
+ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ sum += c;
+ return !length && !sum;
}
static void __init probe_roms(void)
{
+ const unsigned char *rom;
unsigned long start, length, upper;
- unsigned char *rom;
- int i;
+ unsigned char c;
+ int i;
/* video rom */
upper = adapter_rom_resources[0].start;
@@ -191,8 +192,11 @@ static void __init probe_roms(void)
video_rom_resource.start = start;
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
/* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
+ length = c * 512;
/* if checksum okay, trust length byte */
if (length && romchecksum(rom, length))
@@ -226,8 +230,11 @@ static void __init probe_roms(void)
if (!romsignature(rom))
continue;
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
/* 0 < length <= 0x7f * 512, historically */
- length = rom[2] * 512;
+ length = c * 512;
/* but accept any length that fits if checksum okay */
if (!length || start + length > upper || !romchecksum(rom, length))
@@ -386,10 +393,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
____________________33__
______________________4_
*/
- printk("sanitize start\n");
/* if there's only one memory region, don't bother */
if (*pnr_map < 2) {
- printk("sanitize bail 0\n");
return -1;
}
@@ -398,7 +403,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
/* bail out if we find any unreasonable addresses in bios map */
for (i=0; i<old_nr; i++)
if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
- printk("sanitize bail 1\n");
return -1;
}
@@ -494,7 +498,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
*pnr_map = new_nr;
- printk("sanitize end\n");
return 0;
}
@@ -525,7 +528,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
unsigned long long size = biosmap->size;
unsigned long long end = start + size;
unsigned long type = biosmap->type;
- printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
/* Overflow in 64 bits? Ignore the memory map. */
if (start > end)
@@ -536,17 +538,11 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
* Not right. Fix it up.
*/
if (type == E820_RAM) {
- printk("copy_e820_map() type is E820_RAM\n");
if (start < 0x100000ULL && end > 0xA0000ULL) {
- printk("copy_e820_map() lies in range...\n");
- if (start < 0xA0000ULL) {
- printk("copy_e820_map() start < 0xA0000ULL\n");
+ if (start < 0xA0000ULL)
add_memory_region(start, 0xA0000ULL-start, type);
- }
- if (end <= 0x100000ULL) {
- printk("copy_e820_map() end <= 0x100000ULL\n");
+ if (end <= 0x100000ULL)
continue;
- }
start = 0x100000ULL;
size = end - start;
}
@@ -818,6 +814,26 @@ void __init limit_regions(unsigned long long size)
print_memory_map("limit_regions endfunc");
}
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ const struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
/*
* This function checks if the entire range <start,end> is mapped with type.
*
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8f9c624ace6f..dd9e7faafa7c 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -69,13 +69,11 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
{
unsigned long cr4;
unsigned long temp;
- struct Xgt_desc_struct *cpu_gdt_descr;
+ struct Xgt_desc_struct gdt_descr;
spin_lock(&efi_rt_lock);
local_irq_save(efi_rt_eflags);
- cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
-
/*
* If I don't have PSE, I should just duplicate two entries in page
* directory. If I have PSE, I just need to duplicate one entry in
@@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
*/
local_flush_tlb();
- cpu_gdt_descr->address = __pa(cpu_gdt_descr->address);
- load_gdt(cpu_gdt_descr);
+ gdt_descr.address = __pa(get_cpu_gdt_table(0));
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
}
static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
{
unsigned long cr4;
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
+ struct Xgt_desc_struct gdt_descr;
- cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address);
- load_gdt(cpu_gdt_descr);
+ gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
cr4 = read_cr4();
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 18bddcb8e9e8..b1f16ee65e4d 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -15,7 +15,7 @@
* I changed all the .align's to 4 (16 byte alignment), as that's faster
* on a 486.
*
- * Stack layout in 'ret_from_system_call':
+ * Stack layout in 'syscall_exit':
* ptrace needs to have all regs on the stack.
* if the order here is changed, it needs to be
* updated in fork.c:copy_process, signal.c:do_signal,
@@ -132,7 +132,7 @@ VM_MASK = 0x00020000
movl $(__USER_DS), %edx; \
movl %edx, %ds; \
movl %edx, %es; \
- movl $(__KERNEL_PDA), %edx; \
+ movl $(__KERNEL_PERCPU), %edx; \
movl %edx, %fs
#define RESTORE_INT_REGS \
@@ -305,16 +305,12 @@ sysenter_past_esp:
pushl $(__USER_CS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET cs, 0*/
-#ifndef CONFIG_COMPAT_VDSO
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
-#else
- pushl $SYSENTER_RETURN
-#endif
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
@@ -342,7 +338,7 @@ sysenter_past_esp:
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp)
- DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
@@ -560,9 +556,7 @@ END(syscall_badsys)
#define FIXUP_ESPFIX_STACK \
/* since we are on a wrong stack, we cant make it a C code :( */ \
- movl %fs:PDA_cpu, %ebx; \
- PER_CPU(cpu_gdt_descr, %ebx); \
- movl GDS_address(%ebx), %ebx; \
+ PER_CPU(gdt_page, %ebx); \
GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
addl %esp, %eax; \
pushl $__KERNEL_DS; \
@@ -635,7 +629,7 @@ ENTRY(name) \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
- call smp_/**/name; \
+ call smp_##name; \
jmp ret_from_intr; \
CFI_ENDPROC; \
ENDPROC(name)
@@ -643,11 +637,6 @@ ENDPROC(name)
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
-/* This alternate entry is needed because we hijack the apic LVTT */
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
-BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
-#endif
-
KPROBE_ENTRY(page_fault)
RING0_EC_FRAME
pushl $do_page_fault
@@ -686,7 +675,7 @@ error_code:
pushl %fs
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET fs, 0*/
- movl $(__KERNEL_PDA), %ecx
+ movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
popl %ecx
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 3fa7f9389afe..9b10af65faaa 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -34,17 +34,32 @@
/*
* This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially. We need one bit for
- * each possible page, but only in low memory, which means
- * 2^32/4096/8 = 128K worst case (4G/4G split.)
+ * and including _end* we need mapped initially.
+ * We need:
+ * - one bit for each possible page, but only in low memory, which means
+ * 2^32/4096/8 = 128K worst case (4G/4G split.)
+ * - enough space to map all low memory, which means
+ * (2^32/4096) / 1024 pages (worst case, non PAE)
+ * (2^32/4096) / 512 + 4 pages (worst case for PAE)
+ * - a few pages for allocator use before the kernel pagetable has
+ * been set up
*
* Modulo rounding, each megabyte assigned here requires a kilobyte of
* memory, which is currently unreclaimed.
*
* This should be a multiple of a page.
*/
-#define INIT_MAP_BEYOND_END (128*1024)
+LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
+#if PTRS_PER_PMD > 1
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#else
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#endif
+BOOTBITMAP_SIZE = LOW_PAGES / 8
+ALLOCATOR_SLOP = 4
+
+INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
/*
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -147,8 +162,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
- * we know the trampoline has already loaded the boot_gdt_table GDT
- * for us.
+ * we know the trampoline has already loaded the boot_gdt for us.
*
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
@@ -318,12 +332,12 @@ is386: movl $2,%ecx # set MP
movl %eax,%cr0
call check_x87
- call setup_pda
lgdt early_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
+ movl %eax,%fs # gets reset once there's real percpu
movl $(__USER_DS),%eax # DS/ES contains default USER segment
movl %eax,%ds
@@ -333,16 +347,17 @@ is386: movl $2,%ecx # set MP
movl %eax,%gs
lldt %ax
- movl $(__KERNEL_PDA),%eax
- mov %eax,%fs
-
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
#ifdef CONFIG_SMP
movb ready, %cl
movb $1, ready
cmpb $0,%cl # the first CPU calls start_kernel
- jne initialize_secondary # all other CPUs call initialize_secondary
+ je 1f
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+ jmp initialize_secondary # all other CPUs call initialize_secondary
+1:
#endif /* CONFIG_SMP */
jmp start_kernel
@@ -366,23 +381,6 @@ check_x87:
ret
/*
- * Point the GDT at this CPU's PDA. On boot this will be
- * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
- * that CPU's GDT and PDA.
- */
-ENTRY(setup_pda)
- /* get the PDA pointer */
- movl start_pda, %eax
-
- /* slot the PDA address into the GDT */
- mov early_gdt_descr+2, %ecx
- mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
- shr $16, %eax
- mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
- mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
- ret
-
-/*
* setup_idt
*
* sets up a idt with 256 entries pointing to
@@ -554,9 +552,6 @@ ENTRY(empty_zero_page)
* This starts the data section.
*/
.data
-ENTRY(start_pda)
- .long boot_pda
-
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
.long __BOOT_DS
@@ -588,7 +583,7 @@ fault_msg:
.word 0 # 32 bit align gdt_desc.address
boot_gdt_descr:
.word __BOOT_DS+7
- .long boot_gdt_table - __PAGE_OFFSET
+ .long boot_gdt - __PAGE_OFFSET
.word 0 # 32-bit align idt_desc.address
idt_descr:
@@ -599,67 +594,14 @@ idt_descr:
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
- .long cpu_gdt_table
+ .long per_cpu__gdt_page /* Overwritten for secondary CPUs */
/*
- * The boot_gdt_table must mirror the equivalent in setup.S and is
+ * The boot_gdt must mirror the equivalent in setup.S and is
* used only for booting.
*/
.align L1_CACHE_BYTES
-ENTRY(boot_gdt_table)
+ENTRY(boot_gdt)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
-
-/*
- * The Global Descriptor Table contains 28 quadwords, per-CPU.
- */
- .align L1_CACHE_BYTES
-ENTRY(cpu_gdt_table)
- .quad 0x0000000000000000 /* NULL descriptor */
- .quad 0x0000000000000000 /* 0x0b reserved */
- .quad 0x0000000000000000 /* 0x13 reserved */
- .quad 0x0000000000000000 /* 0x1b reserved */
- .quad 0x0000000000000000 /* 0x20 unused */
- .quad 0x0000000000000000 /* 0x28 unused */
- .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
- .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
- .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
- .quad 0x0000000000000000 /* 0x4b reserved */
- .quad 0x0000000000000000 /* 0x53 reserved */
- .quad 0x0000000000000000 /* 0x5b reserved */
-
- .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
- .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
- .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
- .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
-
- .quad 0x0000000000000000 /* 0x80 TSS descriptor */
- .quad 0x0000000000000000 /* 0x88 LDT descriptor */
-
- /*
- * Segments used for calling PnP BIOS have byte granularity.
- * They code segments and data segments have fixed 64k limits,
- * the transfer segment sizes are set at run time.
- */
- .quad 0x00409a000000ffff /* 0x90 32-bit code */
- .quad 0x00009a000000ffff /* 0x98 16-bit code */
- .quad 0x000092000000ffff /* 0xa0 16-bit data */
- .quad 0x0000920000000000 /* 0xa8 16-bit data */
- .quad 0x0000920000000000 /* 0xb0 16-bit data */
-
- /*
- * The APM segments have byte granularity and their bases
- * are set at run time. All have 64k limits.
- */
- .quad 0x00409a000000ffff /* 0xb8 APM CS code */
- .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
- .quad 0x004092000000ffff /* 0xc8 APM DS data */
-
- .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
- .quad 0x00cf92000000ffff /* 0xd8 - PDA */
- .quad 0x0000000000000000 /* 0xe0 - unused */
- .quad 0x0000000000000000 /* 0xe8 - unused */
- .quad 0x0000000000000000 /* 0xf0 - unused */
- .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
-
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 4afe26e86260..e3d4b73bfdb0 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed);
#endif
EXPORT_SYMBOL(csum_partial);
-
-EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 89d85d244926..1b623cda3a64 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -35,6 +35,7 @@
#include <linux/msi.h>
#include <linux/htirq.h>
#include <linux/freezer.h>
+#include <linux/kthread.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -661,8 +662,6 @@ static int balanced_irq(void *unused)
unsigned long prev_balance_time = jiffies;
long time_remaining = balanced_irq_interval;
- daemonize("kirqd");
-
/* push everything to CPU 0 to give us a starting point. */
for (i = 0 ; i < NR_IRQS ; i++) {
irq_desc[i].pending_mask = cpumask_of_cpu(0);
@@ -722,10 +721,9 @@ static int __init balanced_irq_init(void)
}
printk(KERN_INFO "Starting balanced_irq\n");
- if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
+ if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
return 0;
- else
- printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
failed:
for_each_possible_cpu(i) {
kfree(irq_cpu_data[i].irq_delta);
@@ -1403,10 +1401,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
enable_8259A_irq(0);
}
-static inline void UNEXPECTED_IO_APIC(void)
-{
-}
-
void __init print_IO_APIC(void)
{
int apic, i;
@@ -1446,34 +1440,12 @@ void __init print_IO_APIC(void)
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
- if (reg_00.bits.ID >= get_physical_broadcast())
- UNEXPECTED_IO_APIC();
- if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
- UNEXPECTED_IO_APIC();
printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
- if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
- (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
- (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
- (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
- (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
- (reg_01.bits.entries != 0x2E) &&
- (reg_01.bits.entries != 0x3F)
- )
- UNEXPECTED_IO_APIC();
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
- if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
- (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
- (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
- (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
- (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
- )
- UNEXPECTED_IO_APIC();
- if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
- UNEXPECTED_IO_APIC();
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1483,8 +1455,6 @@ void __init print_IO_APIC(void)
if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
- if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
- UNEXPECTED_IO_APIC();
}
/*
@@ -1496,8 +1466,6 @@ void __init print_IO_APIC(void)
reg_03.raw != reg_01.raw) {
printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
- if (reg_03.bits.__reserved_1)
- UNEXPECTED_IO_APIC();
}
printk(KERN_DEBUG ".... IRQ redirection table:\n");
diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
index 498e8bc197d5..d1e42e0dbe67 100644
--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -16,6 +16,7 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/thread_info.h>
+#include <linux/syscalls.h>
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
@@ -113,7 +114,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* Reset the owner so that a process switch will not set
* tss->io_bitmap_base to IO_BITMAP_OFFSET.
*/
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
tss->io_bitmap_owner = NULL;
put_cpu();
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 8db8d514c9c0..d2daf672f4a2 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -24,6 +24,9 @@
DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
EXPORT_PER_CPU_SYMBOL(irq_stat);
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
* each architecture has to answer this themselves.
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 4f5983c98669..0952eccd8f28 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
}
++mpc_record;
}
- clustered_apic_check();
+ setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "SMP mptable: no processors registered!\n");
return num_processors;
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 84c3497efb60..33cf2f3c444f 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -20,7 +20,6 @@
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
-#include <linux/dmi.h>
#include <linux/kprobes.h>
#include <linux/cpumask.h>
#include <linux/kernel_stat.h>
@@ -28,30 +27,14 @@
#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/kdebug.h>
-#include <asm/intel_arch_perfmon.h>
#include "mach_traps.h"
int unknown_nmi_panic;
int nmi_watchdog_enabled;
-/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
- * evtsel_nmi_owner tracks the ownership of the event selection
- * - different performance counters/ event selection may be reserved for
- * different subsystems this reservation system just tries to coordinate
- * things a little
- */
-
-/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
- * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
- */
-#define NMI_MAX_COUNTER_BITS 66
-#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)
-
-static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);
-static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]);
-
static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
/* nmi_active:
* >0: the lapic NMI watchdog is active, but can be disabled
* <0: the lapic NMI watchdog has not been set up, and cannot
@@ -63,206 +46,11 @@ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;
-struct nmi_watchdog_ctlblk {
- int enabled;
- u64 check_bit;
- unsigned int cccr_msr;
- unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
- unsigned int evntsel_msr; /* the MSR to select the events to handle */
-};
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
+static DEFINE_PER_CPU(short, wd_enabled);
/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
-extern void show_registers(struct pt_regs *regs);
-extern int unknown_nmi_panic;
-
-/* converts an msr to an appropriate reservation bit */
-static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
-{
- /* returns the bit offset of the performance counter register */
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- return (msr - MSR_K7_PERFCTR0);
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
- return (msr - MSR_ARCH_PERFMON_PERFCTR0);
-
- switch (boot_cpu_data.x86) {
- case 6:
- return (msr - MSR_P6_PERFCTR0);
- case 15:
- return (msr - MSR_P4_BPU_PERFCTR0);
- }
- }
- return 0;
-}
-
-/* converts an msr to an appropriate reservation bit */
-static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
-{
- /* returns the bit offset of the event selection register */
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- return (msr - MSR_K7_EVNTSEL0);
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
- return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
-
- switch (boot_cpu_data.x86) {
- case 6:
- return (msr - MSR_P6_EVNTSEL0);
- case 15:
- return (msr - MSR_P4_BSU_ESCR0);
- }
- }
- return 0;
-}
-
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
- int cpu;
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
- for_each_possible_cpu (cpu) {
- if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
- return 0;
- }
- return 1;
-}
-
-/* checks the an msr for availability */
-int avail_to_resrv_perfctr_nmi(unsigned int msr)
-{
- unsigned int counter;
- int cpu;
-
- counter = nmi_perfctr_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- for_each_possible_cpu (cpu) {
- if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
- return 0;
- }
- return 1;
-}
-
-static int __reserve_perfctr_nmi(int cpu, unsigned int msr)
-{
- unsigned int counter;
- if (cpu < 0)
- cpu = smp_processor_id();
-
- counter = nmi_perfctr_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
- return 1;
- return 0;
-}
-
-static void __release_perfctr_nmi(int cpu, unsigned int msr)
-{
- unsigned int counter;
- if (cpu < 0)
- cpu = smp_processor_id();
-
- counter = nmi_perfctr_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]);
-}
-
-int reserve_perfctr_nmi(unsigned int msr)
-{
- int cpu, i;
- for_each_possible_cpu (cpu) {
- if (!__reserve_perfctr_nmi(cpu, msr)) {
- for_each_possible_cpu (i) {
- if (i >= cpu)
- break;
- __release_perfctr_nmi(i, msr);
- }
- return 0;
- }
- }
- return 1;
-}
-
-void release_perfctr_nmi(unsigned int msr)
-{
- int cpu;
- for_each_possible_cpu (cpu) {
- __release_perfctr_nmi(cpu, msr);
- }
-}
-
-int __reserve_evntsel_nmi(int cpu, unsigned int msr)
-{
- unsigned int counter;
- if (cpu < 0)
- cpu = smp_processor_id();
-
- counter = nmi_evntsel_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]))
- return 1;
- return 0;
-}
-
-static void __release_evntsel_nmi(int cpu, unsigned int msr)
-{
- unsigned int counter;
- if (cpu < 0)
- cpu = smp_processor_id();
-
- counter = nmi_evntsel_msr_to_bit(msr);
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]);
-}
-
-int reserve_evntsel_nmi(unsigned int msr)
-{
- int cpu, i;
- for_each_possible_cpu (cpu) {
- if (!__reserve_evntsel_nmi(cpu, msr)) {
- for_each_possible_cpu (i) {
- if (i >= cpu)
- break;
- __release_evntsel_nmi(i, msr);
- }
- return 0;
- }
- }
- return 1;
-}
-
-void release_evntsel_nmi(unsigned int msr)
-{
- int cpu;
- for_each_possible_cpu (cpu) {
- __release_evntsel_nmi(cpu, msr);
- }
-}
-
-static __cpuinit inline int nmi_known_cpu(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
- || (boot_cpu_data.x86 == 16));
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
- return 1;
- else
- return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
- }
- return 0;
-}
-
static int endflag __initdata = 0;
#ifdef CONFIG_SMP
@@ -284,28 +72,6 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
- u64 counter_val;
- unsigned int retval = hz;
-
- /*
- * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So, we can only program the counter with 31 bit values and
- * 32nd bit should be 1, for 33.. to be 1.
- * Find the appropriate nmi_hz
- */
- counter_val = (u64)cpu_khz * 1000;
- do_div(counter_val, retval);
- if (counter_val > 0x7fffffffULL) {
- u64 count = (u64)cpu_khz * 1000;
- do_div(count, 0x7fffffffUL);
- retval = count + 1;
- }
- return retval;
-}
-
static int __init check_nmi_watchdog(void)
{
unsigned int *prev_nmi_count;
@@ -338,14 +104,14 @@ static int __init check_nmi_watchdog(void)
if (!cpu_isset(cpu, cpu_callin_map))
continue;
#endif
- if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+ if (!per_cpu(wd_enabled, cpu))
continue;
if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu,
prev_nmi_count[cpu],
nmi_count(cpu));
- per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+ per_cpu(wd_enabled, cpu) = 0;
atomic_dec(&nmi_active);
}
}
@@ -359,16 +125,8 @@ static int __init check_nmi_watchdog(void)
/* now that we know it works we can reduce NMI frequency to
something more reasonable; makes a difference in some configs */
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- nmi_hz = 1;
-
- if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
- wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- }
- }
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ nmi_hz = lapic_adjust_nmi_hz(1);
kfree(prev_nmi_count);
return 0;
@@ -391,85 +149,8 @@ static int __init setup_nmi_watchdog(char *str)
__setup("nmi_watchdog=", setup_nmi_watchdog);
-static void disable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- if (atomic_read(&nmi_active) <= 0)
- return;
-
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
-
- BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-static void enable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- /* are we already enabled */
- if (atomic_read(&nmi_active) != 0)
- return;
-
- /* are we lapic aware */
- if (nmi_known_cpu() <= 0)
- return;
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
- touch_nmi_watchdog();
-}
-
-void disable_timer_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_IO_APIC);
-
- if (atomic_read(&nmi_active) <= 0)
- return;
-
- disable_irq(0);
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
-
- BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_timer_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_IO_APIC);
-
- if (atomic_read(&nmi_active) == 0) {
- touch_nmi_watchdog();
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
- enable_irq(0);
- }
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_enable(void *__unused)
-{
- apic_write_around(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
+/* Suspend/resume support */
#ifdef CONFIG_PM
@@ -516,7 +197,7 @@ static int __init init_lapic_nmi_sysfs(void)
if (nmi_watchdog != NMI_LOCAL_APIC)
return 0;
- if ( atomic_read(&nmi_active) < 0 )
+ if (atomic_read(&nmi_active) < 0)
return 0;
error = sysdev_class_register(&nmi_sysclass);
@@ -529,433 +210,69 @@ late_initcall(init_lapic_nmi_sysfs);
#endif /* CONFIG_PM */
-/*
- * Activate the NMI watchdog via the local APIC.
- * Original code written by Keith Owens.
- */
-
-static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if(descr)
- Dprintk("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if(descr)
- Dprintk("setting %s to -0x%08Lx\n", descr, count);
- wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/* Note that these events don't tick when the CPU idles. This means
- the frequency varies with CPU load. */
-
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(void)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = MSR_K7_PERFCTR0;
- evntsel_msr = MSR_K7_EVNTSEL0;
- if (!__reserve_perfctr_nmi(-1, perfctr_msr))
- goto fail;
-
- if (!__reserve_evntsel_nmi(-1, evntsel_msr))
- goto fail1;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = K7_EVNTSEL_INT
- | K7_EVNTSEL_OS
- | K7_EVNTSEL_USR
- | K7_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
- wd->check_bit = 1ULL<<63;
- return 1;
-fail1:
- __release_perfctr_nmi(-1, perfctr_msr);
-fail:
- return 0;
-}
-
-static void stop_k7_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->evntsel_msr, 0, 0);
-
- __release_evntsel_nmi(-1, wd->evntsel_msr);
- __release_perfctr_nmi(-1, wd->perfctr_msr);
-}
-
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(void)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = MSR_P6_PERFCTR0;
- evntsel_msr = MSR_P6_EVNTSEL0;
- if (!__reserve_perfctr_nmi(-1, perfctr_msr))
- goto fail;
-
- if (!__reserve_evntsel_nmi(-1, evntsel_msr))
- goto fail1;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = P6_EVNTSEL_INT
- | P6_EVNTSEL_OS
- | P6_EVNTSEL_USR
- | P6_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
- wd->check_bit = 1ULL<<39;
- return 1;
-fail1:
- __release_perfctr_nmi(-1, perfctr_msr);
-fail:
- return 0;
-}
-
-static void stop_p6_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->evntsel_msr, 0, 0);
-
- __release_evntsel_nmi(-1, wd->evntsel_msr);
- __release_perfctr_nmi(-1, wd->perfctr_msr);
-}
-
-/* Note that these events don't tick when the CPU idles. This means
- the frequency varies with CPU load. */
-
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI0 (1<<26)
-#define P4_CCCR_OVF_PMI1 (1<<27)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-#define P4_CCCR_OVF (1<<31)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
-
-static int setup_p4_watchdog(void)
+static void __acpi_nmi_enable(void *__unused)
{
- unsigned int perfctr_msr, evntsel_msr, cccr_msr;
- unsigned int evntsel, cccr_val;
- unsigned int misc_enable, dummy;
- unsigned int ht_num;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
-
-#ifdef CONFIG_SMP
- /* detect which hyperthread we are on */
- if (smp_num_siblings == 2) {
- unsigned int ebx, apicid;
-
- ebx = cpuid_ebx(1);
- apicid = (ebx >> 24) & 0xff;
- ht_num = apicid & 1;
- } else
-#endif
- ht_num = 0;
-
- /* performance counters are shared resources
- * assign each hyperthread its own set
- * (re-use the ESCR0 register, seems safe
- * and keeps the cccr_val the same)
- */
- if (!ht_num) {
- /* logical cpu 0 */
- perfctr_msr = MSR_P4_IQ_PERFCTR0;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR0;
- cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
- } else {
- /* logical cpu 1 */
- perfctr_msr = MSR_P4_IQ_PERFCTR1;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR1;
- cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
- }
-
- if (!__reserve_perfctr_nmi(-1, perfctr_msr))
- goto fail;
-
- if (!__reserve_evntsel_nmi(-1, evntsel_msr))
- goto fail1;
-
- evntsel = P4_ESCR_EVENT_SELECT(0x3F)
- | P4_ESCR_OS
- | P4_ESCR_USR;
-
- cccr_val |= P4_CCCR_THRESHOLD(15)
- | P4_CCCR_COMPLEMENT
- | P4_CCCR_COMPARE
- | P4_CCCR_REQUIRED;
-
- wrmsr(evntsel_msr, evntsel, 0);
- wrmsr(cccr_msr, cccr_val, 0);
- write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = cccr_msr;
- wd->check_bit = 1ULL<<39;
- return 1;
-fail1:
- __release_perfctr_nmi(-1, perfctr_msr);
-fail:
- return 0;
+ apic_write_around(APIC_LVT0, APIC_DM_NMI);
}
-static void stop_p4_watchdog(void)
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->cccr_msr, 0, 0);
- wrmsr(wd->evntsel_msr, 0, 0);
-
- __release_evntsel_nmi(-1, wd->evntsel_msr);
- __release_perfctr_nmi(-1, wd->perfctr_msr);
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
}
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static int setup_intel_arch_watchdog(void)
+static void __acpi_nmi_disable(void *__unused)
{
- unsigned int ebx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebx indicates event present.
- */
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
- if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
- (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- goto fail;
-
- perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
- evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
-
- if (!__reserve_perfctr_nmi(-1, perfctr_msr))
- goto fail;
-
- if (!__reserve_evntsel_nmi(-1, evntsel_msr))
- goto fail1;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = ARCH_PERFMON_EVENTSEL_INT
- | ARCH_PERFMON_EVENTSEL_OS
- | ARCH_PERFMON_EVENTSEL_USR
- | ARCH_PERFMON_NMI_EVENT_SEL
- | ARCH_PERFMON_NMI_EVENT_UMASK;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; //unused
- wd->check_bit = 1ULL << (eax.split.bit_width - 1);
- return 1;
-fail1:
- __release_perfctr_nmi(-1, perfctr_msr);
-fail:
- return 0;
+ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
}
-static void stop_intel_arch_watchdog(void)
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
{
- unsigned int ebx;
- union cpuid10_eax eax;
- unsigned int unused;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebx indicates event present.
- */
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
- if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
- (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return;
-
- wrmsr(wd->evntsel_msr, 0, 0);
- __release_evntsel_nmi(-1, wd->evntsel_msr);
- __release_perfctr_nmi(-1, wd->perfctr_msr);
+ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+ on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
}
void setup_apic_nmi_watchdog (void *unused)
{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /* only support LOCAL and IO APICs for now */
- if ((nmi_watchdog != NMI_LOCAL_APIC) &&
- (nmi_watchdog != NMI_IO_APIC))
- return;
-
- if (wd->enabled == 1)
- return;
+ if (__get_cpu_var(wd_enabled))
+ return;
/* cheap hack to support suspend/resume */
/* if cpu0 is not active neither should the other cpus */
if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
return;
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16)
- return;
- if (!setup_k7_watchdog())
- return;
- break;
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- if (!setup_intel_arch_watchdog())
- return;
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 0xd)
- return;
-
- if (!setup_p6_watchdog())
- return;
- break;
- case 15:
- if (boot_cpu_data.x86_model > 0x4)
- return;
-
- if (!setup_p4_watchdog())
- return;
- break;
- default:
- return;
- }
- break;
- default:
+ switch (nmi_watchdog) {
+ case NMI_LOCAL_APIC:
+ __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
+ if (lapic_watchdog_init(nmi_hz) < 0) {
+ __get_cpu_var(wd_enabled) = 0;
return;
}
+ /* FALL THROUGH */
+ case NMI_IO_APIC:
+ __get_cpu_var(wd_enabled) = 1;
+ atomic_inc(&nmi_active);
}
- wd->enabled = 1;
- atomic_inc(&nmi_active);
}
void stop_apic_nmi_watchdog(void *unused)
{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
/* only support LOCAL and IO APICs for now */
if ((nmi_watchdog != NMI_LOCAL_APIC) &&
(nmi_watchdog != NMI_IO_APIC))
return;
-
- if (wd->enabled == 0)
+ if (__get_cpu_var(wd_enabled) == 0)
return;
-
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- stop_k7_watchdog();
- break;
- case X86_VENDOR_INTEL:
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- stop_intel_arch_watchdog();
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 0xd)
- break;
- stop_p6_watchdog();
- break;
- case 15:
- if (boot_cpu_data.x86_model > 0x4)
- break;
- stop_p4_watchdog();
- break;
- }
- break;
- default:
- return;
- }
- }
- wd->enabled = 0;
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ lapic_watchdog_stop();
+ __get_cpu_var(wd_enabled) = 0;
atomic_dec(&nmi_active);
}
@@ -1011,8 +328,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
unsigned int sum;
int touched = 0;
int cpu = smp_processor_id();
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- u64 dummy;
int rc=0;
/* check for other users first */
@@ -1055,53 +370,20 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
alert_counter[cpu] = 0;
}
/* see if the nmi watchdog went off */
- if (wd->enabled) {
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- rdmsrl(wd->perfctr_msr, dummy);
- if (dummy & wd->check_bit){
- /* this wasn't a watchdog timer interrupt */
- goto done;
- }
-
- /* only Intel P4 uses the cccr msr */
- if (wd->cccr_msr != 0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- rdmsrl(wd->cccr_msr, dummy);
- dummy &= ~P4_CCCR_OVF;
- wrmsrl(wd->cccr_msr, dummy);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL);
- }
- else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
- wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
- /* P6 based Pentium M need to re-unmask
- * the apic vector but it doesn't hurt
- * other P6 variant.
- * ArchPerfom/Core Duo also needs this */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* P6/ARCH_PERFMON has 32 bit counter write */
- write_watchdog_counter32(wd->perfctr_msr, NULL);
- } else {
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL);
- }
- rc = 1;
- } else if (nmi_watchdog == NMI_IO_APIC) {
- /* don't know how to accurately check for this.
- * just assume it was a watchdog timer interrupt
- * This matches the old behaviour.
- */
- rc = 1;
- }
+ if (!__get_cpu_var(wd_enabled))
+ return rc;
+ switch (nmi_watchdog) {
+ case NMI_LOCAL_APIC:
+ rc |= lapic_wd_event(nmi_hz);
+ break;
+ case NMI_IO_APIC:
+ /* don't know how to accurately check for this.
+ * just assume it was a watchdog timer interrupt
+ * This matches the old behaviour.
+ */
+ rc = 1;
+ break;
}
-done:
return rc;
}
@@ -1146,7 +428,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
}
if (nmi_watchdog == NMI_DEFAULT) {
- if (nmi_known_cpu() > 0)
+ if (lapic_watchdog_ok())
nmi_watchdog = NMI_LOCAL_APIC;
else
nmi_watchdog = NMI_IO_APIC;
@@ -1182,11 +464,3 @@ void __trigger_all_cpu_backtrace(void)
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-EXPORT_SYMBOL(reserve_perfctr_nmi);
-EXPORT_SYMBOL(release_perfctr_nmi);
-EXPORT_SYMBOL(reserve_evntsel_nmi);
-EXPORT_SYMBOL(release_evntsel_nmi);
-EXPORT_SYMBOL(disable_timer_nmi_watchdog);
-EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 2ec331e03fa9..5c10f376bce1 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/start_kernel.h>
+#include <linux/highmem.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -35,7 +36,7 @@
#include <asm/timer.h>
/* nop stub */
-static void native_nop(void)
+void _paravirt_nop(void)
{
}
@@ -54,331 +55,148 @@ char *memory_setup(void)
#define DEF_NATIVE(name, code) \
extern const char start_##name[], end_##name[]; \
asm("start_" #name ": " code "; end_" #name ":")
-DEF_NATIVE(cli, "cli");
-DEF_NATIVE(sti, "sti");
-DEF_NATIVE(popf, "push %eax; popf");
-DEF_NATIVE(pushf, "pushf; pop %eax");
-DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+
+DEF_NATIVE(irq_disable, "cli");
+DEF_NATIVE(irq_enable, "sti");
+DEF_NATIVE(restore_fl, "push %eax; popf");
+DEF_NATIVE(save_fl, "pushf; pop %eax");
DEF_NATIVE(iret, "iret");
-DEF_NATIVE(sti_sysexit, "sti; sysexit");
+DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(clts, "clts");
+DEF_NATIVE(read_tsc, "rdtsc");
-static const struct native_insns
-{
- const char *start, *end;
-} native_insns[] = {
- [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
- [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
- [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
- [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
- [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
- [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
- [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
-};
+DEF_NATIVE(ud2a, "ud2a");
static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
{
- unsigned int insn_len;
-
- /* Don't touch it if we don't have a replacement */
- if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
- return len;
-
- insn_len = native_insns[type].end - native_insns[type].start;
-
- /* Similarly if we can't fit replacement. */
- if (len < insn_len)
- return len;
+ const unsigned char *start, *end;
+ unsigned ret;
+
+ switch(type) {
+#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site
+ SITE(irq_disable);
+ SITE(irq_enable);
+ SITE(restore_fl);
+ SITE(save_fl);
+ SITE(iret);
+ SITE(irq_enable_sysexit);
+ SITE(read_cr2);
+ SITE(read_cr3);
+ SITE(write_cr3);
+ SITE(clts);
+ SITE(read_tsc);
+#undef SITE
+
+ patch_site:
+ ret = paravirt_patch_insns(insns, len, start, end);
+ break;
- memcpy(insns, native_insns[type].start, insn_len);
- return insn_len;
-}
+ case PARAVIRT_PATCH(make_pgd):
+ case PARAVIRT_PATCH(make_pte):
+ case PARAVIRT_PATCH(pgd_val):
+ case PARAVIRT_PATCH(pte_val):
+#ifdef CONFIG_X86_PAE
+ case PARAVIRT_PATCH(make_pmd):
+ case PARAVIRT_PATCH(pmd_val):
+#endif
+ /* These functions end up returning exactly what
+ they're passed, in the same registers. */
+ ret = paravirt_patch_nop();
+ break;
-static unsigned long native_get_debugreg(int regno)
-{
- unsigned long val = 0; /* Damn you, gcc! */
-
- switch (regno) {
- case 0:
- asm("movl %%db0, %0" :"=r" (val)); break;
- case 1:
- asm("movl %%db1, %0" :"=r" (val)); break;
- case 2:
- asm("movl %%db2, %0" :"=r" (val)); break;
- case 3:
- asm("movl %%db3, %0" :"=r" (val)); break;
- case 6:
- asm("movl %%db6, %0" :"=r" (val)); break;
- case 7:
- asm("movl %%db7, %0" :"=r" (val)); break;
default:
- BUG();
- }
- return val;
-}
-
-static void native_set_debugreg(int regno, unsigned long value)
-{
- switch (regno) {
- case 0:
- asm("movl %0,%%db0" : /* no output */ :"r" (value));
- break;
- case 1:
- asm("movl %0,%%db1" : /* no output */ :"r" (value));
- break;
- case 2:
- asm("movl %0,%%db2" : /* no output */ :"r" (value));
+ ret = paravirt_patch_default(type, clobbers, insns, len);
break;
- case 3:
- asm("movl %0,%%db3" : /* no output */ :"r" (value));
- break;
- case 6:
- asm("movl %0,%%db6" : /* no output */ :"r" (value));
- break;
- case 7:
- asm("movl %0,%%db7" : /* no output */ :"r" (value));
- break;
- default:
- BUG();
}
-}
-
-void init_IRQ(void)
-{
- paravirt_ops.init_IRQ();
-}
-
-static void native_clts(void)
-{
- asm volatile ("clts");
-}
-
-static unsigned long native_read_cr0(void)
-{
- unsigned long val;
- asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
- return val;
-}
-
-static void native_write_cr0(unsigned long val)
-{
- asm volatile("movl %0,%%cr0": :"r" (val));
-}
-
-static unsigned long native_read_cr2(void)
-{
- unsigned long val;
- asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
- return val;
-}
-
-static void native_write_cr2(unsigned long val)
-{
- asm volatile("movl %0,%%cr2": :"r" (val));
-}
-
-static unsigned long native_read_cr3(void)
-{
- unsigned long val;
- asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
- return val;
-}
-
-static void native_write_cr3(unsigned long val)
-{
- asm volatile("movl %0,%%cr3": :"r" (val));
-}
-
-static unsigned long native_read_cr4(void)
-{
- unsigned long val;
- asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
- return val;
-}
-
-static unsigned long native_read_cr4_safe(void)
-{
- unsigned long val;
- /* This could fault if %cr4 does not exist */
- asm("1: movl %%cr4, %0 \n"
- "2: \n"
- ".section __ex_table,\"a\" \n"
- ".long 1b,2b \n"
- ".previous \n"
- : "=r" (val): "0" (0));
- return val;
-}
-
-static void native_write_cr4(unsigned long val)
-{
- asm volatile("movl %0,%%cr4": :"r" (val));
-}
-
-static unsigned long native_save_fl(void)
-{
- unsigned long f;
- asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
- return f;
-}
-
-static void native_restore_fl(unsigned long f)
-{
- asm volatile("pushl %0 ; popfl": /* no output */
- :"g" (f)
- :"memory", "cc");
-}
-
-static void native_irq_disable(void)
-{
- asm volatile("cli": : :"memory");
-}
-
-static void native_irq_enable(void)
-{
- asm volatile("sti": : :"memory");
-}
-
-static void native_safe_halt(void)
-{
- asm volatile("sti; hlt": : :"memory");
-}
-static void native_halt(void)
-{
- asm volatile("hlt": : :"memory");
+ return ret;
}
-static void native_wbinvd(void)
+unsigned paravirt_patch_nop(void)
{
- asm volatile("wbinvd": : :"memory");
+ return 0;
}
-static unsigned long long native_read_msr(unsigned int msr, int *err)
+unsigned paravirt_patch_ignore(unsigned len)
{
- unsigned long long val;
-
- asm volatile("2: rdmsr ; xorl %0,%0\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: movl %3,%0 ; jmp 1b\n\t"
- ".previous\n\t"
- ".section __ex_table,\"a\"\n"
- " .align 4\n\t"
- " .long 2b,3b\n\t"
- ".previous"
- : "=r" (*err), "=A" (val)
- : "c" (msr), "i" (-EFAULT));
-
- return val;
+ return len;
}
-static int native_write_msr(unsigned int msr, unsigned long long val)
+unsigned paravirt_patch_call(void *target, u16 tgt_clobbers,
+ void *site, u16 site_clobbers,
+ unsigned len)
{
- int err;
- asm volatile("2: wrmsr ; xorl %0,%0\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: movl %4,%0 ; jmp 1b\n\t"
- ".previous\n\t"
- ".section __ex_table,\"a\"\n"
- " .align 4\n\t"
- " .long 2b,3b\n\t"
- ".previous"
- : "=a" (err)
- : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
- "i" (-EFAULT));
- return err;
-}
+ unsigned char *call = site;
+ unsigned long delta = (unsigned long)target - (unsigned long)(call+5);
-static unsigned long long native_read_tsc(void)
-{
- unsigned long long val;
- asm volatile("rdtsc" : "=A" (val));
- return val;
-}
+ if (tgt_clobbers & ~site_clobbers)
+ return len; /* target would clobber too much for this site */
+ if (len < 5)
+ return len; /* call too long for patch site */
-static unsigned long long native_read_pmc(void)
-{
- unsigned long long val;
- asm volatile("rdpmc" : "=A" (val));
- return val;
-}
+ *call++ = 0xe8; /* call */
+ *(unsigned long *)call = delta;
-static void native_load_tr_desc(void)
-{
- asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+ return 5;
}
-static void native_load_gdt(const struct Xgt_desc_struct *dtr)
+unsigned paravirt_patch_jmp(void *target, void *site, unsigned len)
{
- asm volatile("lgdt %0"::"m" (*dtr));
-}
+ unsigned char *jmp = site;
+ unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5);
-static void native_load_idt(const struct Xgt_desc_struct *dtr)
-{
- asm volatile("lidt %0"::"m" (*dtr));
-}
+ if (len < 5)
+ return len; /* call too long for patch site */
-static void native_store_gdt(struct Xgt_desc_struct *dtr)
-{
- asm ("sgdt %0":"=m" (*dtr));
-}
+ *jmp++ = 0xe9; /* jmp */
+ *(unsigned long *)jmp = delta;
-static void native_store_idt(struct Xgt_desc_struct *dtr)
-{
- asm ("sidt %0":"=m" (*dtr));
+ return 5;
}
-static unsigned long native_store_tr(void)
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
{
- unsigned long tr;
- asm ("str %0":"=r" (tr));
- return tr;
-}
+ void *opfunc = *((void **)&paravirt_ops + type);
+ unsigned ret;
-static void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
- C(0); C(1); C(2);
-#undef C
-}
+ if (opfunc == NULL)
+ /* If there's no function, patch it with a ud2a (BUG) */
+ ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a);
+ else if (opfunc == paravirt_nop)
+ /* If the operation is a nop, then nop the callsite */
+ ret = paravirt_patch_nop();
+ else if (type == PARAVIRT_PATCH(iret) ||
+ type == PARAVIRT_PATCH(irq_enable_sysexit))
+ /* If operation requires a jmp, then jmp */
+ ret = paravirt_patch_jmp(opfunc, site, len);
+ else
+ /* Otherwise call the function; assume target could
+ clobber any caller-save reg */
+ ret = paravirt_patch_call(opfunc, CLBR_ANY,
+ site, clobbers, len);
-static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
-{
- u32 *lp = (u32 *)((char *)dt + entry*8);
- lp[0] = entry_low;
- lp[1] = entry_high;
+ return ret;
}
-static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+unsigned paravirt_patch_insns(void *site, unsigned len,
+ const char *start, const char *end)
{
- native_write_dt_entry(dt, entrynum, low, high);
-}
+ unsigned insn_len = end - start;
-static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
-{
- native_write_dt_entry(dt, entrynum, low, high);
-}
-
-static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
-{
- native_write_dt_entry(dt, entrynum, low, high);
-}
+ if (insn_len > len || start == NULL)
+ insn_len = len;
+ else
+ memcpy(site, start, insn_len);
-static void native_load_esp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- tss->esp0 = thread->esp0;
-
- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->ss1 != thread->sysenter_cs)) {
- tss->ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
+ return insn_len;
}
-static void native_io_delay(void)
+void init_IRQ(void)
{
- asm volatile("outb %al,$0x80");
+ paravirt_ops.init_IRQ();
}
static void native_flush_tlb(void)
@@ -395,83 +213,11 @@ static void native_flush_tlb_global(void)
__native_flush_tlb_global();
}
-static void native_flush_tlb_single(u32 addr)
+static void native_flush_tlb_single(unsigned long addr)
{
__native_flush_tlb_single(addr);
}
-#ifndef CONFIG_X86_PAE
-static void native_set_pte(pte_t *ptep, pte_t pteval)
-{
- *ptep = pteval;
-}
-
-static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
-{
- *ptep = pteval;
-}
-
-static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- *pmdp = pmdval;
-}
-
-#else /* CONFIG_X86_PAE */
-
-static void native_set_pte(pte_t *ptep, pte_t pte)
-{
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-}
-
-static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
-{
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-}
-
-static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- ptep->pte_low = 0;
- smp_wmb();
- ptep->pte_high = pte.pte_high;
- smp_wmb();
- ptep->pte_low = pte.pte_low;
-}
-
-static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
- set_64bit((unsigned long long *)ptep,pte_val(pteval));
-}
-
-static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
- set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
-}
-
-static void native_set_pud(pud_t *pudp, pud_t pudval)
-{
- *pudp = pudval;
-}
-
-static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- ptep->pte_low = 0;
- smp_wmb();
- ptep->pte_high = 0;
-}
-
-static void native_pmd_clear(pmd_t *pmd)
-{
- u32 *tmp = (u32 *)pmd;
- *tmp = 0;
- smp_wmb();
- *(tmp + 1) = 0;
-}
-#endif /* CONFIG_X86_PAE */
-
/* These are in entry.S */
extern void native_iret(void);
extern void native_irq_enable_sysexit(void);
@@ -487,10 +233,11 @@ struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
.kernel_rpl = 0,
+ .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
.patch = native_patch,
.banner = default_banner,
- .arch_setup = native_nop,
+ .arch_setup = paravirt_nop,
.memory_setup = machine_specific_memory_setup,
.get_wallclock = native_get_wallclock,
.set_wallclock = native_set_wallclock,
@@ -517,8 +264,8 @@ struct paravirt_ops paravirt_ops = {
.safe_halt = native_safe_halt,
.halt = native_halt,
.wbinvd = native_wbinvd,
- .read_msr = native_read_msr,
- .write_msr = native_write_msr,
+ .read_msr = native_read_msr_safe,
+ .write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.get_scheduled_cycles = native_read_tsc,
@@ -531,9 +278,9 @@ struct paravirt_ops paravirt_ops = {
.store_idt = native_store_idt,
.store_tr = native_store_tr,
.load_tls = native_load_tls,
- .write_ldt_entry = native_write_ldt_entry,
- .write_gdt_entry = native_write_gdt_entry,
- .write_idt_entry = native_write_idt_entry,
+ .write_ldt_entry = write_dt_entry,
+ .write_gdt_entry = write_dt_entry,
+ .write_idt_entry = write_dt_entry,
.load_esp0 = native_load_esp0,
.set_iopl_mask = native_set_iopl_mask,
@@ -545,44 +292,57 @@ struct paravirt_ops paravirt_ops = {
.apic_read = native_apic_read,
.setup_boot_clock = setup_boot_APIC_clock,
.setup_secondary_clock = setup_secondary_APIC_clock,
+ .startup_ipi_hook = paravirt_nop,
#endif
- .set_lazy_mode = (void *)native_nop,
+ .set_lazy_mode = paravirt_nop,
+
+ .pagetable_setup_start = native_pagetable_setup_start,
+ .pagetable_setup_done = native_pagetable_setup_done,
.flush_tlb_user = native_flush_tlb,
.flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_single = native_flush_tlb_single,
+ .flush_tlb_others = native_flush_tlb_others,
- .map_pt_hook = (void *)native_nop,
-
- .alloc_pt = (void *)native_nop,
- .alloc_pd = (void *)native_nop,
- .alloc_pd_clone = (void *)native_nop,
- .release_pt = (void *)native_nop,
- .release_pd = (void *)native_nop,
+ .alloc_pt = paravirt_nop,
+ .alloc_pd = paravirt_nop,
+ .alloc_pd_clone = paravirt_nop,
+ .release_pt = paravirt_nop,
+ .release_pd = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
- .pte_update = (void *)native_nop,
- .pte_update_defer = (void *)native_nop,
+ .pte_update = paravirt_nop,
+ .pte_update_defer = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+ .kmap_atomic_pte = kmap_atomic,
+#endif
+
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
.set_pte_present = native_set_pte_present,
.set_pud = native_set_pud,
.pte_clear = native_pte_clear,
.pmd_clear = native_pmd_clear,
+
+ .pmd_val = native_pmd_val,
+ .make_pmd = native_make_pmd,
#endif
+ .pte_val = native_pte_val,
+ .pgd_val = native_pgd_val,
+
+ .make_pte = native_make_pte,
+ .make_pgd = native_make_pgd,
+
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
- .startup_ipi_hook = (void *)native_nop,
+ .dup_mmap = paravirt_nop,
+ .exit_mmap = paravirt_nop,
+ .activate_mm = paravirt_nop,
};
-/*
- * NOTE: CONFIG_PARAVIRT is experimental and the paravirt_ops
- * semantics are subject to change. Hence we only do this
- * internal-only export of this, until it gets sorted out and
- * all lowlevel CPU ops used by modules are separately exported.
- */
-EXPORT_SYMBOL_GPL(paravirt_ops);
+EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 393a67d5d943..61999479b7a4 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -39,6 +39,7 @@
#include <linux/random.h>
#include <linux/personality.h>
#include <linux/tick.h>
+#include <linux/percpu.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -57,7 +58,6 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/pda.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -66,6 +66,12 @@ static int hlt_counter;
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
/*
* Return saved PC of a blocked thread.
*/
@@ -272,25 +278,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
}
}
-static int __init idle_setup (char *str)
+static int __init idle_setup(char *str)
{
- if (!strncmp(str, "poll", 4)) {
+ if (!strcmp(str, "poll")) {
printk("using polling idle threads.\n");
pm_idle = poll_idle;
#ifdef CONFIG_X86_SMP
if (smp_num_siblings > 1)
printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
#endif
- } else if (!strncmp(str, "halt", 4)) {
- printk("using halt in idle threads.\n");
- pm_idle = default_idle;
- }
+ } else if (!strcmp(str, "mwait"))
+ force_mwait = 1;
+ else
+ return -1;
boot_option_idle_override = 1;
- return 1;
+ return 0;
}
-
-__setup("idle=", idle_setup);
+early_param("idle", idle_setup);
void show_regs(struct pt_regs * regs)
{
@@ -343,7 +348,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
regs.xds = __USER_DS;
regs.xes = __USER_DS;
- regs.xfs = __KERNEL_PDA;
+ regs.xfs = __KERNEL_PERCPU;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper;
regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -376,7 +381,7 @@ void exit_thread(void)
t->io_bitmap_max = 0;
tss->io_bitmap_owner = NULL;
tss->io_bitmap_max = 0;
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
}
@@ -555,7 +560,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
* Disable the bitmap via an invalid offset. We still cache
* the previous bitmap owner and the IO bitmap contents:
*/
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
return;
}
@@ -565,7 +570,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
* matches the next task, we dont have to do anything but
* to set a valid offset in the TSS:
*/
- tss->io_bitmap_base = IO_BITMAP_OFFSET;
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
return;
}
/*
@@ -577,7 +582,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
* redundant copies when the currently switched task does not
* perform any I/O during its timeslice.
*/
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
}
/*
@@ -712,7 +717,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
if (prev->gs | next->gs)
loadsegment(gs, next->gs);
- write_pda(pcurrent, next_p);
+ x86_write_percpu(current_task, next_p);
return prev_p;
}
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 34874c398b44..9f6ab1789bb0 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,12 +3,10 @@
*/
#include <linux/pci.h>
#include <linux/irq.h>
-#include <asm/pci-direct.h>
-#include <asm/genapic.h>
-#include <asm/cpu.h>
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
+
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
{
u8 config, rev;
u32 word;
@@ -16,12 +14,14 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
/* BIOS may enable hardware IRQ balancing for
* E7520/E7320/E7525(revision ID 0x9 and below)
* based platforms.
- * For those platforms, make sure that the genapic is set to 'flat'
+ * Disable SW irqbalance/affinity on those platforms.
*/
pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
if (rev > 0x9)
return;
+ printk(KERN_INFO "Intel E7520/7320/7525 detected.");
+
/* enable access to config space*/
pci_read_config_byte(dev, 0xf4, &config);
pci_write_config_byte(dev, 0xf4, config|0x2);
@@ -30,44 +30,6 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
if (!(word & (1 << 13))) {
-#ifdef CONFIG_X86_64
- if (genapic != &apic_flat)
- panic("APIC mode must be flat on this system\n");
-#elif defined(CONFIG_X86_GENERICARCH)
- if (genapic != &apic_default)
- panic("APIC mode must be default(flat) on this system. Use apic=default\n");
-#endif
- }
-
- /* put back the original value for config space*/
- if (!(config & 0x2))
- pci_write_config_byte(dev, 0xf4, config);
-}
-
-void __init quirk_intel_irqbalance(void)
-{
- u8 config, rev;
- u32 word;
-
- /* BIOS may enable hardware IRQ balancing for
- * E7520/E7320/E7525(revision ID 0x9 and below)
- * based platforms.
- * Disable SW irqbalance/affinity on those platforms.
- */
- rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
- if (rev > 0x9)
- return;
-
- printk(KERN_INFO "Intel E7520/7320/7525 detected.");
-
- /* enable access to config space */
- config = read_pci_config_byte(0, 0, 0, 0xf4);
- write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
-
- /* read xTPR register */
- word = read_pci_config_16(0, 0, 0x40, 0x4c);
-
- if (!(word & (1 << 13))) {
printk(KERN_INFO "Disabling irq balancing and affinity\n");
#ifdef CONFIG_IRQBALANCE
irqbalance_disable("");
@@ -76,24 +38,13 @@ void __init quirk_intel_irqbalance(void)
#ifdef CONFIG_PROC_FS
no_irq_affinity = 1;
#endif
-#ifdef CONFIG_HOTPLUG_CPU
- printk(KERN_INFO "Disabling cpu hotplug control\n");
- enable_cpu_hotplug = 0;
-#endif
-#ifdef CONFIG_X86_64
- /* force the genapic selection to flat mode so that
- * interrupts can be redirected to more than one CPU.
- */
- genapic_force = &apic_flat;
-#endif
}
- /* put back the original value for config space */
+ /* put back the original value for config space*/
if (!(config & 0x2))
- write_pci_config_byte(0, 0, 0, 0xf4, config);
+ pci_write_config_byte(dev, 0xf4, config);
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
-
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 3514b4153f7f..50dfc65319cd 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -17,7 +17,8 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include "mach_reboot.h"
-#include <linux/reboot_fixups.h>
+#include <asm/reboot_fixups.h>
+#include <asm/reboot.h>
/*
* Power off function, if any
@@ -197,8 +198,6 @@ static unsigned char jump_to_bios [] =
*/
void machine_real_restart(unsigned char *code, int length)
{
- unsigned long flags;
-
local_irq_disable();
/* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -211,9 +210,9 @@ void machine_real_restart(unsigned char *code, int length)
safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
*/
- spin_lock_irqsave(&rtc_lock, flags);
+ spin_lock(&rtc_lock);
CMOS_WRITE(0x00, 0x8f);
- spin_unlock_irqrestore(&rtc_lock, flags);
+ spin_unlock(&rtc_lock);
/* Remap the kernel at virtual address zero, as well as offset zero
from the kernel segment. This assumes the kernel segment starts at
@@ -280,7 +279,7 @@ void machine_real_restart(unsigned char *code, int length)
EXPORT_SYMBOL(machine_real_restart);
#endif
-void machine_shutdown(void)
+static void native_machine_shutdown(void)
{
#ifdef CONFIG_SMP
int reboot_cpu_id;
@@ -316,7 +315,11 @@ void machine_shutdown(void)
#endif
}
-void machine_emergency_restart(void)
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
+static void native_machine_emergency_restart(void)
{
if (!reboot_thru_bios) {
if (efi_enabled) {
@@ -340,17 +343,17 @@ void machine_emergency_restart(void)
machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
}
-void machine_restart(char * __unused)
+static void native_machine_restart(char * __unused)
{
machine_shutdown();
machine_emergency_restart();
}
-void machine_halt(void)
+static void native_machine_halt(void)
{
}
-void machine_power_off(void)
+static void native_machine_power_off(void)
{
if (pm_power_off) {
machine_shutdown();
@@ -359,3 +362,35 @@ void machine_power_off(void)
}
+struct machine_ops machine_ops = {
+ .power_off = native_machine_power_off,
+ .shutdown = native_machine_shutdown,
+ .emergency_restart = native_machine_emergency_restart,
+ .restart = native_machine_restart,
+ .halt = native_machine_halt,
+};
+
+void machine_power_off(void)
+{
+ machine_ops.power_off();
+}
+
+void machine_shutdown(void)
+{
+ machine_ops.shutdown();
+}
+
+void machine_emergency_restart(void)
+{
+ machine_ops.emergency_restart();
+}
+
+void machine_restart(char *cmd)
+{
+ machine_ops.restart(cmd);
+}
+
+void machine_halt(void)
+{
+ machine_ops.halt();
+}
diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c
index 99aab41a05b0..2d78d918340f 100644
--- a/arch/i386/kernel/reboot_fixups.c
+++ b/arch/i386/kernel/reboot_fixups.c
@@ -10,7 +10,7 @@
#include <asm/delay.h>
#include <linux/pci.h>
-#include <linux/reboot_fixups.h>
+#include <asm/reboot_fixups.h>
static void cs5530a_warm_reset(struct pci_dev *dev)
{
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 0e8977871b1f..89a45a9ddcd4 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -165,20 +165,20 @@ void fastcall send_IPI_self(int vector)
}
/*
- * This is only used on smaller machines.
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
*/
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
{
- unsigned long mask = cpus_addr(cpumask)[0];
unsigned long cfg;
- unsigned long flags;
- local_irq_save(flags);
- WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
/*
* Wait for idle.
*/
- apic_wait_icr_idle();
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ apic_wait_icr_idle();
/*
* prepare target chip field
@@ -195,13 +195,25 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
+}
+
+/*
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+ unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned long flags;
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+ __send_IPI_dest_field(mask, vector);
local_irq_restore(flags);
}
void send_IPI_mask_sequence(cpumask_t mask, int vector)
{
- unsigned long cfg, flags;
+ unsigned long flags;
unsigned int query_cpu;
/*
@@ -211,30 +223,10 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
*/
local_irq_save(flags);
-
for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
if (cpu_isset(query_cpu, mask)) {
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
- apic_write_around(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
+ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
+ vector);
}
}
local_irq_restore(flags);
@@ -256,7 +248,6 @@ static cpumask_t flush_cpumask;
static struct mm_struct * flush_mm;
static unsigned long flush_va;
static DEFINE_SPINLOCK(tlbstate_lock);
-#define FLUSH_ALL 0xffffffff
/*
* We cannot call mmdrop() because we are in interrupt context,
@@ -338,7 +329,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
- if (flush_va == FLUSH_ALL)
+ if (flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
__flush_tlb_one(flush_va);
@@ -353,9 +344,11 @@ out:
put_cpu_no_resched();
}
-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
- unsigned long va)
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+ unsigned long va)
{
+ cpumask_t cpumask = *cpumaskp;
+
/*
* A couple of (to be removed) sanity checks:
*
@@ -366,10 +359,12 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
BUG_ON(cpu_isset(smp_processor_id(), cpumask));
BUG_ON(!mm);
+#ifdef CONFIG_HOTPLUG_CPU
/* If a CPU which we ran on has gone down, OK. */
cpus_and(cpumask, cpumask, cpu_online_map);
- if (cpus_empty(cpumask))
+ if (unlikely(cpus_empty(cpumask)))
return;
+#endif
/*
* i'm not happy about this global shared spinlock in the
@@ -380,17 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
flush_mm = mm;
flush_va = va;
-#if NR_CPUS <= BITS_PER_LONG
- atomic_set_mask(cpumask, &flush_cpumask);
-#else
- {
- int k;
- unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
- unsigned long *cpu_mask = (unsigned long *)&cpumask;
- for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
- atomic_set_mask(cpu_mask[k], &flush_mask[k]);
- }
-#endif
+ cpus_or(flush_cpumask, cpumask, flush_cpumask);
/*
* We have to send the IPI only to
* CPUs affected.
@@ -417,7 +402,7 @@ void flush_tlb_current_task(void)
local_flush_tlb();
if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
preempt_enable();
}
@@ -436,7 +421,7 @@ void flush_tlb_mm (struct mm_struct * mm)
leave_mm(smp_processor_id());
}
if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
preempt_enable();
}
@@ -483,7 +468,7 @@ void flush_tlb_all(void)
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
-void smp_send_reschedule(int cpu)
+void native_smp_send_reschedule(int cpu)
{
WARN_ON(cpu_is_offline(cpu));
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
@@ -515,36 +500,78 @@ void unlock_ipi_call_lock(void)
static struct call_data_struct *call_data;
+static void __smp_call_function(void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = num_online_cpus() - 1;
+
+ if (!cpus)
+ return;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+
/**
- * smp_call_function(): Run a function on all other CPUs.
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on. Must not include the current cpu.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @nonatomic: currently unused.
* @wait: If true, wait (atomically) until function has completed on other CPUs.
*
- * Returns 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute <<func>> or are or have executed.
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
- int wait)
+int native_smp_call_function_mask(cpumask_t mask,
+ void (*func)(void *), void *info,
+ int wait)
{
struct call_data_struct data;
+ cpumask_t allbutself;
int cpus;
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
/* Holding any lock stops cpus from going down. */
spin_lock(&call_lock);
- cpus = num_online_cpus() - 1;
+
+ allbutself = cpu_online_map;
+ cpu_clear(smp_processor_id(), allbutself);
+
+ cpus_and(mask, mask, allbutself);
+ cpus = cpus_weight(mask);
+
if (!cpus) {
spin_unlock(&call_lock);
return 0;
}
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
data.func = func;
data.info = info;
atomic_set(&data.started, 0);
@@ -554,9 +581,12 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
call_data = &data;
mb();
-
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Send a message to other CPUs */
+ if (cpus_equal(mask, allbutself))
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ else
+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
/* Wait for response */
while (atomic_read(&data.started) != cpus)
@@ -569,15 +599,68 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
return 0;
}
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
+ int wait)
+{
+ return smp_call_function_mask(cpu_online_map, func, info, wait);
+}
EXPORT_SYMBOL(smp_call_function);
+/**
+ * smp_call_function_single - Run a function on another CPU
+ * @cpu: The target CPU. Cannot be the calling CPU.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ /* prevent preemption and reschedule on another processor */
+ int ret;
+ int me = get_cpu();
+ if (cpu == me) {
+ WARN_ON(1);
+ put_cpu();
+ return -EBUSY;
+ }
+
+ ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
+
+ put_cpu();
+ return ret;
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
static void stop_this_cpu (void * dummy)
{
+ local_irq_disable();
/*
* Remove this CPU:
*/
cpu_clear(smp_processor_id(), cpu_online_map);
- local_irq_disable();
disable_local_APIC();
if (cpu_data[smp_processor_id()].hlt_works_ok)
for(;;) halt();
@@ -588,13 +671,18 @@ static void stop_this_cpu (void * dummy)
* this function calls the 'stop' function on all other CPUs in the system.
*/
-void smp_send_stop(void)
+void native_smp_send_stop(void)
{
- smp_call_function(stop_this_cpu, NULL, 1, 0);
+ /* Don't deadlock on the call lock in panic */
+ int nolock = !spin_trylock(&call_lock);
+ unsigned long flags;
- local_irq_disable();
+ local_irq_save(flags);
+ __smp_call_function(stop_this_cpu, NULL, 0, 0);
+ if (!nolock)
+ spin_unlock(&call_lock);
disable_local_APIC();
- local_irq_enable();
+ local_irq_restore(flags);
}
/*
@@ -633,77 +721,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
}
}
-/*
- * this function sends a 'generic call function' IPI to one other CPU
- * in the system.
- *
- * cpu is a standard Linux logical CPU number.
- */
-static void
-__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- struct call_data_struct data;
- int cpus = 1;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- wmb();
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (!wait)
- return;
-
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
-}
-
-/*
- * smp_call_function_single - Run a function on another CPU
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Currently unused.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Retrurns 0 on success, else a negative status code.
- *
- * Does not return until the remote CPU is nearly ready to execute <func>
- * or is or has executed.
- */
-
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- /* prevent preemption and reschedule on another processor */
- int me = get_cpu();
- if (cpu == me) {
- WARN_ON(1);
- put_cpu();
- return -EBUSY;
- }
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- spin_lock_bh(&call_lock);
- __smp_call_function_single(cpu, func, info, nonatomic, wait);
- spin_unlock_bh(&call_lock);
- put_cpu();
- return 0;
-}
-EXPORT_SYMBOL(smp_call_function_single);
-
static int convert_apicid_to_cpu(int apic_id)
{
int i;
@@ -730,3 +747,14 @@ int safe_smp_processor_id(void)
return cpuid >= 0 ? cpuid : 0;
}
+
+struct smp_ops smp_ops = {
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .cpu_up = native_cpu_up,
+ .smp_cpus_done = native_smp_cpus_done,
+
+ .smp_send_stop = native_smp_send_stop,
+ .smp_send_reschedule = native_smp_send_reschedule,
+ .smp_call_function_mask = native_smp_call_function_mask,
+};
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4ff55e675576..a4b7ad283f49 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -53,13 +53,12 @@
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/nmi.h>
-#include <asm/pda.h>
-#include <asm/genapic.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
#include <asm/vmi.h>
+#include <asm/mtrr.h>
/* Set if we find a B stepping CPU */
static int __devinitdata smp_b_stepping;
@@ -100,6 +99,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
u8 apicid_2_node[MAX_APICID];
+DEFINE_PER_CPU(unsigned long, this_cpu_off);
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
/*
* Trampoline 80x86 program as an array.
*/
@@ -156,7 +158,7 @@ static void __cpuinit smp_store_cpu_info(int id)
*c = boot_cpu_data;
if (id!=0)
- identify_cpu(c);
+ identify_secondary_cpu(c);
/*
* Mask B, Pentium, but not Pentium MMX
*/
@@ -379,14 +381,14 @@ set_cpu_sibling_map(int cpu)
static void __cpuinit start_secondary(void *unused)
{
/*
- * Don't put *anything* before secondary_cpu_init(), SMP
- * booting is too fragile that we want to limit the
- * things done here to the most necessary things.
+ * Don't put *anything* before cpu_init(), SMP booting is too
+ * fragile that we want to limit the things done here to the
+ * most necessary things.
*/
#ifdef CONFIG_VMI
vmi_bringup();
#endif
- secondary_cpu_init();
+ cpu_init();
preempt_disable();
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -441,12 +443,6 @@ static void __cpuinit start_secondary(void *unused)
void __devinit initialize_secondary(void)
{
/*
- * switch to the per CPU GDT we already set up
- * in do_boot_cpu()
- */
- cpu_set_gdt(current_thread_info()->cpu);
-
- /*
* We don't actually need to load the full TSS,
* basically just the stack pointer and the eip.
*/
@@ -463,7 +459,6 @@ extern struct {
void * esp;
unsigned short ss;
} stack_start;
-extern struct i386_pda *start_pda;
#ifdef CONFIG_NUMA
@@ -521,12 +516,12 @@ static void unmap_cpu_to_logical_apicid(int cpu)
unmap_cpu_to_node(cpu);
}
-#if APIC_DEBUG
static inline void __inquire_remote_apic(int apicid)
{
int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
char *names[] = { "ID", "VERSION", "SPIV" };
- int timeout, status;
+ int timeout;
+ unsigned long status;
printk("Inquiring remote APIC #%d...\n", apicid);
@@ -536,7 +531,9 @@ static inline void __inquire_remote_apic(int apicid)
/*
* Wait for idle.
*/
- apic_wait_icr_idle();
+ status = safe_apic_wait_icr_idle();
+ if (status)
+ printk("a previous APIC delivery may have failed\n");
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -550,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid)
switch (status) {
case APIC_ICR_RR_VALID:
status = apic_read(APIC_RRR);
- printk("%08x\n", status);
+ printk("%lx\n", status);
break;
default:
printk("failed\n");
}
}
}
-#endif
#ifdef WAKE_SECONDARY_VIA_NMI
/*
@@ -568,8 +564,8 @@ static inline void __inquire_remote_apic(int apicid)
static int __devinit
wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
{
- unsigned long send_status = 0, accept_status = 0;
- int timeout, maxlvt;
+ unsigned long send_status, accept_status = 0;
+ int maxlvt;
/* Target chip */
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
@@ -579,12 +575,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
+ send_status = safe_apic_wait_icr_idle();
/*
* Give the other CPU some time to accept the IPI.
@@ -614,8 +605,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
static int __devinit
wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
{
- unsigned long send_status = 0, accept_status = 0;
- int maxlvt, timeout, num_starts, j;
+ unsigned long send_status, accept_status = 0;
+ int maxlvt, num_starts, j;
/*
* Be paranoid about clearing APIC errors.
@@ -640,12 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
| APIC_DM_INIT);
Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
+ send_status = safe_apic_wait_icr_idle();
mdelay(10);
@@ -658,12 +644,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
+ send_status = safe_apic_wait_icr_idle();
atomic_set(&init_deasserted, 1);
@@ -719,12 +700,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
Dprintk("Startup point 1.\n");
Dprintk("Waiting for send to finish...\n");
- timeout = 0;
- do {
- Dprintk("+");
- udelay(100);
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- } while (send_status && (timeout++ < 1000));
+ send_status = safe_apic_wait_icr_idle();
/*
* Give the other CPU some time to accept the IPI.
@@ -788,6 +764,25 @@ static inline struct task_struct * alloc_idle_task(int cpu)
#define alloc_idle_task(cpu) fork_idle(cpu)
#endif
+/* Initialize the CPU's GDT. This is either the boot CPU doing itself
+ (still using the master per-cpu area), or a CPU doing it for a
+ secondary which will soon come up. */
+static __cpuinit void init_gdt(int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+ pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
+ (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
+ __per_cpu_offset[cpu], 0xFFFFF,
+ 0x80 | DESCTYPE_S | 0x2, 0x8);
+
+ per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
+ per_cpu(cpu_number, cpu) = cpu;
+}
+
+/* Defined in head.S */
+extern struct Xgt_desc_struct early_gdt_descr;
+
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -802,6 +797,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
unsigned short nmi_high = 0, nmi_low = 0;
/*
+ * Save current MTRR state in case it was changed since early boot
+ * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+ */
+ mtrr_save_state();
+
+ /*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
*/
@@ -809,13 +810,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
- /* Pre-allocate and initialize the CPU's GDT and PDA so it
- doesn't have to do any memory allocation during the
- delicate CPU-bringup phase. */
- if (!init_gdt(cpu, idle)) {
- printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
- return -1; /* ? */
- }
+ init_gdt(cpu);
+ per_cpu(current_task, cpu) = idle;
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
idle->thread.eip = (unsigned long) start_secondary;
/* start_eip had better be page-aligned! */
@@ -941,7 +938,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
DECLARE_COMPLETION_ONSTACK(done);
struct warm_boot_cpu_info info;
int apicid, ret;
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
apicid = x86_cpu_to_apicid[cpu];
if (apicid == BAD_APICID) {
@@ -949,18 +945,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
goto exit;
}
- /*
- * the CPU isn't initialized at boot time, allocate gdt table here.
- * cpu_init will initialize it
- */
- if (!cpu_gdt_descr->address) {
- cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
- if (!cpu_gdt_descr->address)
- printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
- ret = -ENOMEM;
- goto exit;
- }
-
info.complete = &done;
info.apicid = apicid;
info.cpu = cpu;
@@ -1173,7 +1157,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
/* These are wrappers to interface to the new boot process. Someone
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init smp_prepare_cpus(unsigned int max_cpus)
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
smp_commenced_mask = cpumask_of_cpu(0);
cpu_callin_map = cpumask_of_cpu(0);
@@ -1181,13 +1165,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
smp_boot_cpus(max_cpus);
}
-void __devinit smp_prepare_boot_cpu(void)
+void __init native_smp_prepare_boot_cpu(void)
{
- cpu_set(smp_processor_id(), cpu_online_map);
- cpu_set(smp_processor_id(), cpu_callout_map);
- cpu_set(smp_processor_id(), cpu_present_map);
- cpu_set(smp_processor_id(), cpu_possible_map);
- per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ unsigned int cpu = smp_processor_id();
+
+ init_gdt(cpu);
+ switch_to_new_gdt();
+
+ cpu_set(cpu, cpu_online_map);
+ cpu_set(cpu, cpu_callout_map);
+ cpu_set(cpu, cpu_present_map);
+ cpu_set(cpu, cpu_possible_map);
+ __get_cpu_var(cpu_state) = CPU_ONLINE;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1277,7 +1266,7 @@ void __cpu_die(unsigned int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu)
{
unsigned long flags;
#ifdef CONFIG_HOTPLUG_CPU
@@ -1319,15 +1308,10 @@ int __cpuinit __cpu_up(unsigned int cpu)
touch_nmi_watchdog();
}
-#ifdef CONFIG_X86_GENERICARCH
- if (num_online_cpus() > 8 && genapic == &apic_default)
- panic("Default flat APIC routing can't be used with > 8 cpus\n");
-#endif
-
return 0;
}
-void __init smp_cpus_done(unsigned int max_cpus)
+void __init native_smp_cpus_done(unsigned int max_cpus)
{
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 13ca54a85a1c..ff4ee6f3326b 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -22,16 +22,26 @@
#include <asm/msr.h>
#include <asm/pgtable.h>
#include <asm/unistd.h>
+#include <asm/elf.h>
+#include <asm/tlbflush.h>
+
+enum {
+ VDSO_DISABLED = 0,
+ VDSO_ENABLED = 1,
+ VDSO_COMPAT = 2,
+};
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT VDSO_COMPAT
+#else
+#define VDSO_DEFAULT VDSO_ENABLED
+#endif
/*
* Should the kernel map a VDSO page into processes and pass its
* address down to glibc upon exec()?
*/
-#ifdef CONFIG_PARAVIRT
-unsigned int __read_mostly vdso_enabled = 0;
-#else
-unsigned int __read_mostly vdso_enabled = 1;
-#endif
+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
EXPORT_SYMBOL_GPL(vdso_enabled);
@@ -46,6 +56,123 @@ __setup("vdso=", vdso_setup);
extern asmlinkage void sysenter_entry(void);
+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
+ unsigned offset, unsigned size)
+{
+ Elf32_Sym *sym = (void *)ehdr + offset;
+ unsigned nsym = size / sizeof(*sym);
+ unsigned i;
+
+ for(i = 0; i < nsym; i++, sym++) {
+ if (sym->st_shndx == SHN_UNDEF ||
+ sym->st_shndx == SHN_ABS)
+ continue; /* skip */
+
+ if (sym->st_shndx > SHN_LORESERVE) {
+ printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
+ sym->st_shndx);
+ continue;
+ }
+
+ switch(ELF_ST_TYPE(sym->st_info)) {
+ case STT_OBJECT:
+ case STT_FUNC:
+ case STT_SECTION:
+ case STT_FILE:
+ sym->st_value += VDSO_HIGH_BASE;
+ }
+ }
+}
+
+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
+{
+ Elf32_Dyn *dyn = (void *)ehdr + offset;
+
+ for(; dyn->d_tag != DT_NULL; dyn++)
+ switch(dyn->d_tag) {
+ case DT_PLTGOT:
+ case DT_HASH:
+ case DT_STRTAB:
+ case DT_SYMTAB:
+ case DT_RELA:
+ case DT_INIT:
+ case DT_FINI:
+ case DT_REL:
+ case DT_DEBUG:
+ case DT_JMPREL:
+ case DT_VERSYM:
+ case DT_VERDEF:
+ case DT_VERNEED:
+ case DT_ADDRRNGLO ... DT_ADDRRNGHI:
+ /* definitely pointers needing relocation */
+ dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+ break;
+
+ case DT_ENCODING ... OLD_DT_LOOS-1:
+ case DT_LOOS ... DT_HIOS-1:
+ /* Tags above DT_ENCODING are pointers if
+ they're even */
+ if (dyn->d_tag >= DT_ENCODING &&
+ (dyn->d_tag & 1) == 0)
+ dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+ break;
+
+ case DT_VERDEFNUM:
+ case DT_VERNEEDNUM:
+ case DT_FLAGS_1:
+ case DT_RELACOUNT:
+ case DT_RELCOUNT:
+ case DT_VALRNGLO ... DT_VALRNGHI:
+ /* definitely not pointers */
+ break;
+
+ case OLD_DT_LOOS ... DT_LOOS-1:
+ case DT_HIOS ... DT_VALRNGLO-1:
+ default:
+ if (dyn->d_tag > DT_ENCODING)
+ printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
+ dyn->d_tag);
+ break;
+ }
+}
+
+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+{
+ Elf32_Phdr *phdr;
+ Elf32_Shdr *shdr;
+ int i;
+
+ BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
+ !elf_check_arch(ehdr) ||
+ ehdr->e_type != ET_DYN);
+
+ ehdr->e_entry += VDSO_HIGH_BASE;
+
+ /* rebase phdrs */
+ phdr = (void *)ehdr + ehdr->e_phoff;
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ phdr[i].p_vaddr += VDSO_HIGH_BASE;
+
+ /* relocate dynamic stuff */
+ if (phdr[i].p_type == PT_DYNAMIC)
+ reloc_dyn(ehdr, phdr[i].p_offset);
+ }
+
+ /* rebase sections */
+ shdr = (void *)ehdr + ehdr->e_shoff;
+ for(i = 0; i < ehdr->e_shnum; i++) {
+ if (!(shdr[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ shdr[i].sh_addr += VDSO_HIGH_BASE;
+
+ if (shdr[i].sh_type == SHT_SYMTAB ||
+ shdr[i].sh_type == SHT_DYNSYM)
+ reloc_symtab(ehdr, shdr[i].sh_offset,
+ shdr[i].sh_size);
+ }
+}
+
void enable_sep_cpu(void)
{
int cpu = get_cpu();
@@ -56,14 +183,33 @@ void enable_sep_cpu(void)
return;
}
- tss->ss1 = __KERNEL_CS;
- tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
+ wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
put_cpu();
}
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+ gate_vma.vm_mm = NULL;
+ gate_vma.vm_start = FIXADDR_USER_START;
+ gate_vma.vm_end = FIXADDR_USER_END;
+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ gate_vma.vm_page_prot = __P101;
+ /*
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully interpretable later
+ * without matching up the same kernel and hardware config to see
+ * what PC values meant.
+ */
+ gate_vma.vm_flags |= VM_ALWAYSDUMP;
+ return 0;
+}
+
/*
* These symbols are defined by vsyscall.o to mark the bounds
* of the ELF DSO images included therein.
@@ -72,31 +218,48 @@ extern const char vsyscall_int80_start, vsyscall_int80_end;
extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
static struct page *syscall_pages[1];
+static void map_compat_vdso(int map)
+{
+ static int vdso_mapped;
+
+ if (map == vdso_mapped)
+ return;
+
+ vdso_mapped = map;
+
+ __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
+ map ? PAGE_READONLY_EXEC : PAGE_NONE);
+
+ /* flush stray tlbs */
+ flush_tlb_all();
+}
+
int __init sysenter_setup(void)
{
void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+ const void *vsyscall;
+ size_t vsyscall_len;
+
syscall_pages[0] = virt_to_page(syscall_page);
-#ifdef CONFIG_COMPAT_VDSO
- __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
+ gate_vma_init();
+
printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
-#endif
if (!boot_cpu_has(X86_FEATURE_SEP)) {
- memcpy(syscall_page,
- &vsyscall_int80_start,
- &vsyscall_int80_end - &vsyscall_int80_start);
- return 0;
+ vsyscall = &vsyscall_int80_start;
+ vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
+ } else {
+ vsyscall = &vsyscall_sysenter_start;
+ vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
}
- memcpy(syscall_page,
- &vsyscall_sysenter_start,
- &vsyscall_sysenter_end - &vsyscall_sysenter_start);
+ memcpy(syscall_page, vsyscall, vsyscall_len);
+ relocate_vdso(syscall_page);
return 0;
}
-#ifndef CONFIG_COMPAT_VDSO
/* Defined in vsyscall-sysenter.S */
extern void SYSENTER_RETURN;
@@ -105,36 +268,52 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
- int ret;
+ int ret = 0;
+ bool compat;
down_write(&mm->mmap_sem);
- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
- if (IS_ERR_VALUE(addr)) {
- ret = addr;
- goto up_fail;
- }
- /*
- * MAYWRITE to allow gdb to COW and set breakpoints
- *
- * Make sure the vDSO gets into every core dump.
- * Dumping its contents makes post-mortem fully interpretable later
- * without matching up the same kernel and hardware config to see
- * what PC values meant.
- */
- ret = install_special_mapping(mm, addr, PAGE_SIZE,
- VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
- VM_ALWAYSDUMP,
- syscall_pages);
- if (ret)
- goto up_fail;
+ /* Test compat mode once here, in case someone
+ changes it via sysctl */
+ compat = (vdso_enabled == VDSO_COMPAT);
+
+ map_compat_vdso(compat);
+
+ if (compat)
+ addr = VDSO_HIGH_BASE;
+ else {
+ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ *
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully
+ * interpretable later without matching up the same
+ * kernel and hardware config to see what PC values
+ * meant.
+ */
+ ret = install_special_mapping(mm, addr, PAGE_SIZE,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+ VM_ALWAYSDUMP,
+ syscall_pages);
+
+ if (ret)
+ goto up_fail;
+ }
current->mm->context.vdso = (void *)addr;
current_thread_info()->sysenter_return =
- (void *)VDSO_SYM(&SYSENTER_RETURN);
-up_fail:
+ (void *)VDSO_SYM(&SYSENTER_RETURN);
+
+ up_fail:
up_write(&mm->mmap_sem);
+
return ret;
}
@@ -147,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma)
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
+ struct mm_struct *mm = tsk->mm;
+
+ /* Check to see if this task was created in compat vdso mode */
+ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
+ return &gate_vma;
return NULL;
}
@@ -159,4 +343,3 @@ int in_gate_area_no_task(unsigned long addr)
{
return 0;
}
-#endif
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 94e5cb091104..a665df61f08c 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -70,8 +70,6 @@
#include <asm/i8259.h>
-int pit_latch_buggy; /* extern */
-
#include "do_timer.h"
unsigned int cpu_khz; /* Detected as we calibrate the TSC */
diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S
index 2f1814c5cfd7..f62815f8d06a 100644
--- a/arch/i386/kernel/trampoline.S
+++ b/arch/i386/kernel/trampoline.S
@@ -29,7 +29,7 @@
*
* TYPE VALUE
* R_386_32 startup_32_smp
- * R_386_32 boot_gdt_table
+ * R_386_32 boot_gdt
*/
#include <linux/linkage.h>
@@ -62,8 +62,8 @@ r_base = .
* to 32 bit.
*/
- lidtl boot_idt - r_base # load idt with 0, 0
- lgdtl boot_gdt - r_base # load gdt with whatever is appropriate
+ lidtl boot_idt_descr - r_base # load idt with 0, 0
+ lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
xor %ax, %ax
inc %ax # protected mode (PE) bit
@@ -73,11 +73,11 @@ r_base = .
# These need to be in the same 64K segment as the above;
# hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt:
+boot_gdt_descr:
.word __BOOT_DS + 7 # gdt limit
- .long boot_gdt_table-__PAGE_OFFSET # gdt base
+ .long boot_gdt - __PAGE_OFFSET # gdt base
-boot_idt:
+boot_idt_descr:
.word 0 # idt limit = 0
.long 0 # idt base = 0L
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index af0d3f70a817..f21b41e7770c 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
siginfo_t *info)
{
struct task_struct *tsk = current;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
if (regs->eflags & VM_MASK) {
if (vm86)
@@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
goto kernel_trap;
trap_signal: {
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+
if (info)
force_sig_info(signr, info, tsk);
else
@@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
}
kernel_trap: {
- if (!fixup_exception(regs))
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
die(str, regs, error_code);
+ }
return;
}
@@ -583,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
* and we set the offset field correctly. Then we let the CPU to
* restart the faulting instruction.
*/
- if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
+ if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
thread->io_bitmap_ptr) {
memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
thread->io_bitmap_max);
@@ -596,16 +609,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
thread->io_bitmap_max, 0xff,
tss->io_bitmap_max - thread->io_bitmap_max);
tss->io_bitmap_max = thread->io_bitmap_max;
- tss->io_bitmap_base = IO_BITMAP_OFFSET;
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
tss->io_bitmap_owner = thread;
put_cpu();
return;
}
put_cpu();
- current->thread.error_code = error_code;
- current->thread.trap_no = 13;
-
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
@@ -624,6 +634,8 @@ gp_in_vm86:
gp_in_kernel:
if (!fixup_exception(regs)) {
+ current->thread.error_code = error_code;
+ current->thread.trap_no = 13;
if (notify_die(DIE_GPF, "general protection fault", regs,
error_code, 13, SIGSEGV) == NOTIFY_STOP)
return;
@@ -1018,9 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
fastcall unsigned long patch_espfix_desc(unsigned long uesp,
unsigned long kesp)
{
- int cpu = smp_processor_id();
- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
- struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
+ struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
unsigned long base = (kesp - uesp) & -THREAD_SIZE;
unsigned long new_kesp = kesp - base;
unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 6cb8f5336732..f64b81f3033b 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -200,13 +200,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
{
struct cpufreq_freqs *freq = data;
- if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
- write_seqlock_irq(&xtime_lock);
-
if (!ref_freq) {
if (!freq->old){
ref_freq = freq->new;
- goto end;
+ return 0;
}
ref_freq = freq->old;
loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
@@ -233,13 +230,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
* TSC based sched_clock turns
* to junk w/ cpufreq
*/
- mark_tsc_unstable();
+ mark_tsc_unstable("cpufreq changes");
}
}
}
-end:
- if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
- write_sequnlock_irq(&xtime_lock);
return 0;
}
@@ -281,11 +275,12 @@ static struct clocksource clocksource_tsc = {
CLOCK_SOURCE_MUST_VERIFY,
};
-void mark_tsc_unstable(void)
+void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
tsc_enabled = 0;
+ printk("Marking TSC unstable due to: %s.\n", reason);
/* Can be called before registration */
if (clocksource_tsc.mult)
clocksource_change_rating(&clocksource_tsc, 0);
diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S
new file mode 100644
index 000000000000..e51a8695d54e
--- /dev/null
+++ b/arch/i386/kernel/verify_cpu.S
@@ -0,0 +1,65 @@
+/* Check if CPU has some minimum CPUID bits
+ This runs in 16bit mode so that the caller can still use the BIOS
+ to output errors on the screen */
+#include <asm/cpufeature.h>
+
+verify_cpu:
+ pushfl # Save caller passed flags
+ pushl $0 # Kill any dangerous flags
+ popfl
+
+#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4
+ pushfl
+ orl $(1<<18),(%esp) # try setting AC
+ popfl
+ pushfl
+ popl %eax
+ testl $(1<<18),%eax
+ jz bad
+#endif
+#if REQUIRED_MASK1 != 0
+ pushfl # standard way to check for cpuid
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ pushfl # standard way to check for cpuid
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ jz bad # REQUIRED_MASK1 != 0 requires CPUID
+
+ movl $0x0,%eax # See if cpuid 1 is implemented
+ cpuid
+ cmpl $0x1,%eax
+ jb bad # no cpuid 1
+
+ movl $0x1,%eax # Does the cpu have what it takes
+ cpuid
+
+#if CONFIG_X86_MINIMUM_CPU_MODEL > 4
+#error add proper model checking here
+#endif
+
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
+ jnz bad
+#endif /* REQUIRED_MASK1 */
+
+ popfl
+ xor %eax,%eax
+ ret
+
+bad:
+ popfl
+ movl $1,%eax
+ ret
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index 697a70e8c0c9..c8726c424b35 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -26,6 +26,7 @@
#include <linux/cpu.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <asm/vmi.h>
#include <asm/io.h>
#include <asm/fixmap.h>
@@ -56,7 +57,7 @@ static int disable_noidle;
static int disable_vmi_timer;
/* Cached VMI operations */
-struct {
+static struct {
void (*cpuid)(void /* non-c */);
void (*_set_ldt)(u32 selector);
void (*set_tr)(u32 selector);
@@ -65,16 +66,15 @@ struct {
void (*release_page)(u32, u32);
void (*set_pte)(pte_t, pte_t *, unsigned);
void (*update_pte)(pte_t *, unsigned);
- void (*set_linear_mapping)(int, u32, u32, u32);
- void (*flush_tlb)(int);
+ void (*set_linear_mapping)(int, void *, u32, u32);
+ void (*_flush_tlb)(int);
void (*set_initial_ap_state)(int, int);
void (*halt)(void);
void (*set_lazy_mode)(int mode);
} vmi_ops;
-/* XXX move this to alternative.h */
-extern struct paravirt_patch __start_parainstructions[],
- __stop_parainstructions[];
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
/*
* VMI patching routines.
@@ -83,11 +83,6 @@ extern struct paravirt_patch __start_parainstructions[],
#define MNEM_JMP 0xe9
#define MNEM_RET 0xc3
-static char irq_save_disable_callout[] = {
- MNEM_CALL, 0, 0, 0, 0,
- MNEM_CALL, 0, 0, 0, 0,
- MNEM_RET
-};
#define IRQ_PATCH_INT_MASK 0
#define IRQ_PATCH_DISABLE 5
@@ -135,33 +130,17 @@ static unsigned patch_internal(int call, unsigned len, void *insns)
static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
{
switch (type) {
- case PARAVIRT_IRQ_DISABLE:
+ case PARAVIRT_PATCH(irq_disable):
return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
- case PARAVIRT_IRQ_ENABLE:
+ case PARAVIRT_PATCH(irq_enable):
return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
- case PARAVIRT_RESTORE_FLAGS:
+ case PARAVIRT_PATCH(restore_fl):
return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
- case PARAVIRT_SAVE_FLAGS:
+ case PARAVIRT_PATCH(save_fl):
return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
- case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
- if (len >= 10) {
- patch_internal(VMI_CALL_GetInterruptMask, len, insns);
- patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
- return 10;
- } else {
- /*
- * You bastards didn't leave enough room to
- * patch save_flags_irq_disable inline. Patch
- * to a helper
- */
- BUG_ON(len < 5);
- *(char *)insns = MNEM_CALL;
- patch_offset(insns, irq_save_disable_callout);
- return 5;
- }
- case PARAVIRT_INTERRUPT_RETURN:
+ case PARAVIRT_PATCH(iret):
return patch_internal(VMI_CALL_IRET, len, insns);
- case PARAVIRT_STI_SYSEXIT:
+ case PARAVIRT_PATCH(irq_enable_sysexit):
return patch_internal(VMI_CALL_SYSEXIT, len, insns);
default:
break;
@@ -230,24 +209,24 @@ static void vmi_set_tr(void)
static void vmi_load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
{
- tss->esp0 = thread->esp0;
+ tss->x86_tss.esp0 = thread->esp0;
/* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->ss1 != thread->sysenter_cs)) {
- tss->ss1 = thread->sysenter_cs;
+ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+ tss->x86_tss.ss1 = thread->sysenter_cs;
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
- vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
+ vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
}
static void vmi_flush_tlb_user(void)
{
- vmi_ops.flush_tlb(VMI_FLUSH_TLB);
+ vmi_ops._flush_tlb(VMI_FLUSH_TLB);
}
static void vmi_flush_tlb_kernel(void)
{
- vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
+ vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
}
/* Stub to do nothing at all; used for delays and unimplemented calls */
@@ -255,18 +234,6 @@ static void vmi_nop(void)
{
}
-/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
-static fastcall void vmi_safe_halt(void)
-{
- int idle = vmi_stop_hz_timer();
- vmi_ops.halt();
- if (idle) {
- local_irq_disable();
- vmi_account_time_restart_hz_timer();
- local_irq_enable();
- }
-}
-
#ifdef CONFIG_DEBUG_PAGE_TYPE
#ifdef CONFIG_X86_PAE
@@ -370,8 +337,11 @@ static void vmi_check_page_type(u32 pfn, int type)
#define vmi_check_page_type(p,t) do { } while (0)
#endif
-static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn)
+#ifdef CONFIG_HIGHPTE
+static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
{
+ void *va = kmap_atomic(page, type);
+
/*
* Internally, the VMI ROM must map virtual addresses to physical
* addresses for processing MMU updates. By the time MMU updates
@@ -385,8 +355,11 @@ static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn)
* args: SLOT VA COUNT PFN
*/
BUG_ON(type != KM_PTE0 && type != KM_PTE1);
- vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn);
+ vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
+
+ return va;
}
+#endif
static void vmi_allocate_pt(u32 pfn)
{
@@ -443,13 +416,13 @@ static void vmi_release_pd(u32 pfn)
((level) | (is_current_as(mm, user) ? \
(VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
+static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
-static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
@@ -462,7 +435,7 @@ static void vmi_set_pte(pte_t *ptep, pte_t pte)
vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
}
-static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
@@ -516,7 +489,7 @@ static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
}
-void vmi_pmd_clear(pmd_t *pmd)
+static void vmi_pmd_clear(pmd_t *pmd)
{
const pte_t pte = { 0 };
vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
@@ -525,8 +498,6 @@ void vmi_pmd_clear(pmd_t *pmd)
#endif
#ifdef CONFIG_SMP
-extern void setup_pda(void);
-
static void __devinit
vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
unsigned long start_esp)
@@ -551,13 +522,11 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
ap.ds = __USER_DS;
ap.es = __USER_DS;
- ap.fs = __KERNEL_PDA;
+ ap.fs = __KERNEL_PERCPU;
ap.gs = 0;
ap.eflags = 0;
- setup_pda();
-
#ifdef CONFIG_X86_PAE
/* efer should match BSP efer. */
if (cpu_has_nx) {
@@ -575,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
}
#endif
-static void vmi_set_lazy_mode(int mode)
+static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
{
- static DEFINE_PER_CPU(int, lazy_mode);
+ static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
if (!vmi_ops.set_lazy_mode)
return;
@@ -685,7 +654,7 @@ void vmi_bringup(void)
{
/* We must establish the lowmem mapping for MMU ops to work */
if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
+ vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
}
/*
@@ -740,7 +709,6 @@ do { \
} \
} while (0)
-
/*
* Activate the VMI interface and switch into paravirtualized mode
*/
@@ -796,12 +764,6 @@ static inline int __init activate_vmi(void)
para_fill(irq_disable, DisableInterrupts);
para_fill(irq_enable, EnableInterrupts);
- /* irq_save_disable !!! sheer pain */
- patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
- (char *)paravirt_ops.save_fl);
- patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
- (char *)paravirt_ops.irq_disable);
-
para_fill(wbinvd, WBINVD);
para_fill(read_tsc, RDTSC);
@@ -831,8 +793,8 @@ static inline int __init activate_vmi(void)
para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
/* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB);
- para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB);
+ para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
+ para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
para_fill(flush_tlb_single, InvalPage);
/*
@@ -878,8 +840,13 @@ static inline int __init activate_vmi(void)
paravirt_ops.release_pt = vmi_release_pt;
paravirt_ops.release_pd = vmi_release_pd;
}
- para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping,
- SetLinearMapping);
+
+ /* Set linear is needed in all cases */
+ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
+#ifdef CONFIG_HIGHPTE
+ if (vmi_ops.set_linear_mapping)
+ paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
+#endif
/*
* These MUST always be patched. Don't support indirect jumps
@@ -920,8 +887,8 @@ static inline int __init activate_vmi(void)
paravirt_ops.get_wallclock = vmi_get_wallclock;
paravirt_ops.set_wallclock = vmi_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
- paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
+ paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
+ paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
#endif
paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
paravirt_ops.get_cpu_khz = vmi_cpu_khz;
@@ -933,11 +900,7 @@ static inline int __init activate_vmi(void)
disable_vmi_timer = 1;
}
- /* No idle HZ mode only works if VMI timer and no idle is enabled */
- if (disable_noidle || disable_vmi_timer)
- para_fill(safe_halt, Halt);
- else
- para_wrap(safe_halt, vmi_safe_halt, halt, Halt);
+ para_fill(safe_halt, Halt);
/*
* Alternative instruction rewriting doesn't happen soon enough
@@ -945,7 +908,7 @@ static inline int __init activate_vmi(void)
* to do this before IRQs get reenabled. Fortunately, it is
* idempotent.
*/
- apply_paravirt(__start_parainstructions, __stop_parainstructions);
+ apply_paravirt(__parainstructions, __parainstructions_end);
vmi_bringup();
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
new file mode 100644
index 000000000000..26a37f8a8762
--- /dev/null
+++ b/arch/i386/kernel/vmiclock.c
@@ -0,0 +1,318 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2007, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/cpumask.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+#include <asm/arch_hooks.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+
+#include <irq_vectors.h>
+#include "io_ports.h"
+
+#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+
+static DEFINE_PER_CPU(struct clock_event_device, local_events);
+
+static inline u32 vmi_counter(u32 flags)
+{
+ /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
+ * cycle counter. */
+ return flags & VMI_ALARM_COUNTER_MASK;
+}
+
+/* paravirt_ops.get_wallclock = vmi_get_wallclock */
+unsigned long vmi_get_wallclock(void)
+{
+ unsigned long long wallclock;
+ wallclock = vmi_timer_ops.get_wallclock(); // nsec
+ (void)do_div(wallclock, 1000000000); // sec
+
+ return wallclock;
+}
+
+/* paravirt_ops.set_wallclock = vmi_set_wallclock */
+int vmi_set_wallclock(unsigned long now)
+{
+ return 0;
+}
+
+/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
+unsigned long long vmi_get_sched_cycles(void)
+{
+ return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+}
+
+/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
+unsigned long vmi_cpu_khz(void)
+{
+ unsigned long long khz;
+ khz = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(khz, 1000);
+ return khz;
+}
+
+static inline unsigned int vmi_get_timer_vector(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ return FIRST_DEVICE_VECTOR;
+#else
+ return FIRST_EXTERNAL_VECTOR;
+#endif
+}
+
+/** vmi clockchip */
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int startup_timer_irq(unsigned int irq)
+{
+ unsigned long val = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, vmi_get_timer_vector());
+
+ return (val & APIC_SEND_PENDING);
+}
+
+static void mask_timer_irq(unsigned int irq)
+{
+ unsigned long val = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
+}
+
+static void unmask_timer_irq(unsigned int irq)
+{
+ unsigned long val = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
+}
+
+static void ack_timer_irq(unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+static struct irq_chip vmi_chip __read_mostly = {
+ .name = "VMI-LOCAL",
+ .startup = startup_timer_irq,
+ .mask = mask_timer_irq,
+ .unmask = unmask_timer_irq,
+ .ack = ack_timer_irq
+};
+#endif
+
+/** vmi clockevent */
+#define VMI_ALARM_WIRED_IRQ0 0x00000000
+#define VMI_ALARM_WIRED_LVTT 0x00010000
+static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
+
+static inline int vmi_get_alarm_wiring(void)
+{
+ return vmi_wiring;
+}
+
+static void vmi_timer_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ cycle_t now, cycles_per_hz;
+ BUG_ON(!irqs_disabled());
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ break;
+ case CLOCK_EVT_MODE_PERIODIC:
+ cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(cycles_per_hz, HZ);
+ now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
+ vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ switch (evt->mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
+ break;
+ case CLOCK_EVT_MODE_PERIODIC:
+ vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static int vmi_timer_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ /* Unfortunately, set_next_event interface only passes relative
+ * expiry, but we want absolute expiry. It'd be better if were
+ * were passed an aboslute expiry, since a bunch of time may
+ * have been stolen between the time the delta is computed and
+ * when we set the alarm below. */
+ cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
+
+ BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+ vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
+ return 0;
+}
+
+static struct clock_event_device vmi_clockevent = {
+ .name = "vmi-timer",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+ .shift = 22,
+ .set_mode = vmi_timer_set_mode,
+ .set_next_event = vmi_timer_next_event,
+ .rating = 1000,
+ .irq = 0,
+};
+
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+ struct clock_event_device *evt = &__get_cpu_var(local_events);
+ evt->event_handler(evt);
+ return IRQ_HANDLED;
+}
+
+static struct irqaction vmi_clock_action = {
+ .name = "vmi-timer",
+ .handler = vmi_timer_interrupt,
+ .flags = IRQF_DISABLED | IRQF_NOBALANCING,
+ .mask = CPU_MASK_ALL,
+};
+
+static void __devinit vmi_time_init_clockevent(void)
+{
+ cycle_t cycles_per_msec;
+ struct clock_event_device *evt;
+
+ int cpu = smp_processor_id();
+ evt = &__get_cpu_var(local_events);
+
+ /* Use cycles_per_msec since div_sc params are 32-bits. */
+ cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(cycles_per_msec, 1000);
+
+ memcpy(evt, &vmi_clockevent, sizeof(*evt));
+ /* Must pick .shift such that .mult fits in 32-bits. Choosing
+ * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
+ * before overflow. */
+ evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
+ /* Upper bound is clockevent's use of ulong for cycle deltas. */
+ evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
+ evt->min_delta_ns = clockevent_delta2ns(1, evt);
+ evt->cpumask = cpumask_of_cpu(cpu);
+
+ printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
+ evt->name, evt->mult, evt->shift);
+ clockevents_register_device(evt);
+}
+
+void __init vmi_time_init(void)
+{
+ /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
+ outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+ vmi_time_init_clockevent();
+ setup_irq(0, &vmi_clock_action);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void __devinit vmi_time_bsp_init(void)
+{
+ /*
+ * On APIC systems, we want local timers to fire on each cpu. We do
+ * this by programming LVTT to deliver timer events to the IRQ handler
+ * for IRQ-0, since we can't re-use the APIC local timer handler
+ * without interfering with that code.
+ */
+ clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+ local_irq_disable();
+#ifdef CONFIG_X86_SMP
+ /*
+ * XXX handle_percpu_irq only defined for SMP; we need to switch over
+ * to using it, since this is a local interrupt, which each CPU must
+ * handle individually without locking out or dropping simultaneous
+ * local timers on other CPUs. We also don't want to trigger the
+ * quirk workaround code for interrupts which gets invoked from
+ * handle_percpu_irq via eoi, so we use our own IRQ chip.
+ */
+ set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
+#else
+ set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
+#endif
+ vmi_wiring = VMI_ALARM_WIRED_LVTT;
+ apic_write(APIC_LVTT, vmi_get_timer_vector());
+ local_irq_enable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+}
+
+void __devinit vmi_time_ap_init(void)
+{
+ vmi_time_init_clockevent();
+ apic_write(APIC_LVTT, vmi_get_timer_vector());
+}
+#endif
+
+/** vmi clocksource */
+
+static cycle_t read_real_cycles(void)
+{
+ return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static struct clocksource clocksource_vmi = {
+ .name = "vmi-timer",
+ .rating = 450,
+ .read = read_real_cycles,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int __init init_vmi_clocksource(void)
+{
+ cycle_t cycles_per_msec;
+
+ if (!vmi_timer_ops.get_cycle_frequency)
+ return 0;
+ /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
+ cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+ (void)do_div(cycles_per_msec, 1000);
+
+ /* Note that clocksource.{mult, shift} converts in the opposite direction
+ * as clockevents. */
+ clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+ clocksource_vmi.shift);
+
+ printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
+ return clocksource_register(&clocksource_vmi);
+
+}
+module_init(init_vmi_clocksource);
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
deleted file mode 100644
index 9dfb17739b67..000000000000
--- a/arch/i386/kernel/vmitime.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-/*
- * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
- * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
- * See comments there for proper credits.
- */
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/jiffies.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-#include <linux/rcupdate.h>
-#include <linux/clocksource.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/div64.h>
-#include <asm/timer.h>
-#include <asm/desc.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-
-#include <mach_timer.h>
-#include <io_ports.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
-#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
-#else
-#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
-#endif
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-#ifdef CONFIG_NO_IDLE_HZ
-
-/* /proc/sys/kernel/hz_timer state. */
-int sysctl_hz_timer;
-
-/* Some stats */
-static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
-static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
-static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
-
-#endif /* CONFIG_NO_IDLE_HZ */
-
-/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
-static int alarm_hz = CONFIG_VMI_ALARM_HZ;
-
-/* Cache of the value get_cycle_frequency / HZ. */
-static signed long long cycles_per_jiffy;
-
-/* Cache of the value get_cycle_frequency / alarm_hz. */
-static signed long long cycles_per_alarm;
-
-/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
- * Protected by xtime_lock. */
-static unsigned long long real_cycles_accounted_system;
-
-/* The number of cycles accounted for by update_process_times(), per cpu. */
-static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
-
-/* The number of stolen cycles accounted, per cpu. */
-static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
-
-/* Clock source. */
-static cycle_t read_real_cycles(void)
-{
- return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
-}
-
-static cycle_t read_available_cycles(void)
-{
- return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
-}
-
-#if 0
-static cycle_t read_stolen_cycles(void)
-{
- return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
-}
-#endif /* 0 */
-
-static struct clocksource clocksource_vmi = {
- .name = "vmi-timer",
- .rating = 450,
- .read = read_real_cycles,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-
-/* Timer interrupt handler. */
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
-
-static struct irqaction vmi_timer_irq = {
- .handler = vmi_timer_interrupt,
- .flags = IRQF_DISABLED,
- .mask = CPU_MASK_NONE,
- .name = "VMI-alarm",
-};
-
-/* Alarm rate */
-static int __init vmi_timer_alarm_rate_setup(char* str)
-{
- int alarm_rate;
- if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
- alarm_hz = alarm_rate;
- printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
- }
- return 1;
-}
-__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
-
-
-/* Initialization */
-static void vmi_get_wallclock_ts(struct timespec *ts)
-{
- unsigned long long wallclock;
- wallclock = vmi_timer_ops.get_wallclock(); // nsec units
- ts->tv_nsec = do_div(wallclock, 1000000000);
- ts->tv_sec = wallclock;
-}
-
-unsigned long vmi_get_wallclock(void)
-{
- struct timespec ts;
- vmi_get_wallclock_ts(&ts);
- return ts.tv_sec;
-}
-
-int vmi_set_wallclock(unsigned long now)
-{
- return -1;
-}
-
-unsigned long long vmi_get_sched_cycles(void)
-{
- return read_available_cycles();
-}
-
-unsigned long vmi_cpu_khz(void)
-{
- unsigned long long khz;
-
- khz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(khz, 1000);
- return khz;
-}
-
-void __init vmi_time_init(void)
-{
- unsigned long long cycles_per_sec, cycles_per_msec;
- unsigned long flags;
-
- local_irq_save(flags);
- setup_irq(0, &vmi_timer_irq);
-#ifdef CONFIG_X86_LOCAL_APIC
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
-#endif
-
- real_cycles_accounted_system = read_real_cycles();
- per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
-
- cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
- cycles_per_jiffy = cycles_per_sec;
- (void)do_div(cycles_per_jiffy, HZ);
- cycles_per_alarm = cycles_per_sec;
- (void)do_div(cycles_per_alarm, alarm_hz);
- cycles_per_msec = cycles_per_sec;
- (void)do_div(cycles_per_msec, 1000);
-
- printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
- "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
- cycles_per_alarm);
-
- clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
- clocksource_vmi.shift);
- if (clocksource_register(&clocksource_vmi))
- printk(KERN_WARNING "Error registering VMITIME clocksource.");
-
- /* Disable PIT. */
- outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
- /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu
- * reduce the latency calling update_process_times. */
- vmi_timer_ops.set_alarm(
- VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
- per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
- cycles_per_alarm);
-
- local_irq_restore(flags);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-void __init vmi_timer_setup_boot_alarm(void)
-{
- local_irq_disable();
-
- /* Route the interrupt to the correct vector. */
- apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
-
- /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
- vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
- vmi_timer_ops.set_alarm(
- VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
- per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
- cycles_per_alarm);
- local_irq_enable();
-}
-
-/* Initialize the time accounting variables for an AP on an SMP system.
- * Also, set the local alarm for the AP. */
-void __devinit vmi_timer_setup_secondary_alarm(void)
-{
- int cpu = smp_processor_id();
-
- /* Route the interrupt to the correct vector. */
- apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
-
- per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
-
- vmi_timer_ops.set_alarm(
- VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
- per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
- cycles_per_alarm);
-}
-
-#endif
-
-/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
-static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
-{
- long long cycles_not_accounted;
-
- write_seqlock(&xtime_lock);
-
- cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
- while (cycles_not_accounted >= cycles_per_jiffy) {
- /* systems wide jiffies. */
- do_timer(1);
-
- cycles_not_accounted -= cycles_per_jiffy;
- real_cycles_accounted_system += cycles_per_jiffy;
- }
-
- write_sequnlock(&xtime_lock);
-}
-
-/* Update per-cpu process times. */
-static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
- unsigned long long cur_process_times_cycles)
-{
- long long cycles_not_accounted;
- cycles_not_accounted = cur_process_times_cycles -
- per_cpu(process_times_cycles_accounted_cpu, cpu);
-
- while (cycles_not_accounted >= cycles_per_jiffy) {
- /* Account time to the current process. This includes
- * calling into the scheduler to decrement the timeslice
- * and possibly reschedule.*/
- update_process_times(user_mode(regs));
- /* XXX handle /proc/profile multiplier. */
- profile_tick(CPU_PROFILING);
-
- cycles_not_accounted -= cycles_per_jiffy;
- per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
- }
-}
-
-#ifdef CONFIG_NO_IDLE_HZ
-/* Update per-cpu idle times. Used when a no-hz halt is ended. */
-static void vmi_account_no_hz_idle_cycles(int cpu,
- unsigned long long cur_process_times_cycles)
-{
- long long cycles_not_accounted;
- unsigned long no_idle_hz_jiffies = 0;
-
- cycles_not_accounted = cur_process_times_cycles -
- per_cpu(process_times_cycles_accounted_cpu, cpu);
-
- while (cycles_not_accounted >= cycles_per_jiffy) {
- no_idle_hz_jiffies++;
- cycles_not_accounted -= cycles_per_jiffy;
- per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
- }
- /* Account time to the idle process. */
- account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
-}
-#endif
-
-/* Update per-cpu stolen time. */
-static void vmi_account_stolen_cycles(int cpu,
- unsigned long long cur_real_cycles,
- unsigned long long cur_avail_cycles)
-{
- long long stolen_cycles_not_accounted;
- unsigned long stolen_jiffies = 0;
-
- if (cur_real_cycles < cur_avail_cycles)
- return;
-
- stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
- per_cpu(stolen_cycles_accounted_cpu, cpu);
-
- while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
- stolen_jiffies++;
- stolen_cycles_not_accounted -= cycles_per_jiffy;
- per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
- }
- /* HACK: pass NULL to force time onto cpustat->steal. */
- account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
-}
-
-/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
- * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
-static void vmi_local_timer_interrupt(int cpu)
-{
- unsigned long long cur_real_cycles, cur_process_times_cycles;
-
- cur_real_cycles = read_real_cycles();
- cur_process_times_cycles = read_available_cycles();
- /* Update system wide (real) time state (xtime, jiffies). */
- vmi_account_real_cycles(cur_real_cycles);
- /* Update per-cpu process times. */
- vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
- /* Update time stolen from this cpu by the hypervisor. */
- vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
-}
-
-#ifdef CONFIG_NO_IDLE_HZ
-
-/* Must be called only from idle loop, with interrupts disabled. */
-int vmi_stop_hz_timer(void)
-{
- /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
-
- unsigned long seq, next;
- unsigned long long real_cycles_expiry;
- int cpu = smp_processor_id();
-
- BUG_ON(!irqs_disabled());
- if (sysctl_hz_timer != 0)
- return 0;
-
- cpu_set(cpu, nohz_cpu_mask);
- smp_mb();
-
- if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
- (next = next_timer_interrupt(),
- time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) {
- cpu_clear(cpu, nohz_cpu_mask);
- return 0;
- }
-
- /* Convert jiffies to the real cycle counter. */
- do {
- seq = read_seqbegin(&xtime_lock);
- real_cycles_expiry = real_cycles_accounted_system +
- (long)(next - jiffies) * cycles_per_jiffy;
- } while (read_seqretry(&xtime_lock, seq));
-
- /* This cpu is going idle. Disable the periodic alarm. */
- vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
- per_cpu(idle_start_jiffies, cpu) = jiffies;
- /* Set the real time alarm to expire at the next event. */
- vmi_timer_ops.set_alarm(
- VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
- real_cycles_expiry, 0);
- return 1;
-}
-
-static void vmi_reenable_hz_timer(int cpu)
-{
- /* For /proc/vmi/info idle_hz stat. */
- per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
- per_cpu(vmi_idle_no_hz_irqs, cpu)++;
-
- /* Don't bother explicitly cancelling the one-shot alarm -- at
- * worse we will receive a spurious timer interrupt. */
- vmi_timer_ops.set_alarm(
- VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
- per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
- cycles_per_alarm);
- /* Indicate this cpu is no longer nohz idle. */
- cpu_clear(cpu, nohz_cpu_mask);
-}
-
-/* Called from interrupt handlers when (local) HZ timer is disabled. */
-void vmi_account_time_restart_hz_timer(void)
-{
- unsigned long long cur_real_cycles, cur_process_times_cycles;
- int cpu = smp_processor_id();
-
- BUG_ON(!irqs_disabled());
- /* Account the time during which the HZ timer was disabled. */
- cur_real_cycles = read_real_cycles();
- cur_process_times_cycles = read_available_cycles();
- /* Update system wide (real) time state (xtime, jiffies). */
- vmi_account_real_cycles(cur_real_cycles);
- /* Update per-cpu idle times. */
- vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
- /* Update time stolen from this cpu by the hypervisor. */
- vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
- /* Reenable the hz timer. */
- vmi_reenable_hz_timer(cpu);
-}
-
-#endif /* CONFIG_NO_IDLE_HZ */
-
-/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
- * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after
- * APIC setup and setup_boot_vmi_alarm() is called. */
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
- vmi_local_timer_interrupt(smp_processor_id());
- return IRQ_HANDLED;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
- * Also used in UP when CONFIG_X86_LOCAL_APIC.
- * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
-void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
- int cpu = smp_processor_id();
-
- /*
- * the NMI deadlock-detector uses this.
- */
- per_cpu(irq_stat,cpu).apic_timer_irqs++;
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
-
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- irq_enter();
- vmi_local_timer_interrupt(cpu);
- irq_exit();
- set_irq_regs(old_regs);
-}
-
-#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 6f38f818380b..23e8614edeee 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -26,12 +26,11 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
jiffies = jiffies_64;
-_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(4); /* R__ */
+ note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
{
@@ -61,8 +60,6 @@ SECTIONS
__stop___ex_table = .;
}
- RODATA
-
BUG_TABLE
. = ALIGN(4);
@@ -72,6 +69,8 @@ SECTIONS
__tracedata_end = .;
}
+ RODATA
+
/* writeable */
. = ALIGN(4096);
.data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
@@ -117,22 +116,11 @@ SECTIONS
/* might get freed after init */
. = ALIGN(4096);
- .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
- __smp_alt_begin = .;
- __smp_alt_instructions = .;
- *(.smp_altinstructions)
- __smp_alt_instructions_end = .;
- }
- . = ALIGN(4);
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
__smp_locks = .;
*(.smp_locks)
__smp_locks_end = .;
}
- .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
- *(.smp_altinstr_replacement)
- __smp_alt_end = .;
- }
/* will be freed after init
* Following ALIGN() is required to make sure no other data falls on the
* same page where __smp_alt_end is pointing as that page might be freed
@@ -178,9 +166,9 @@ SECTIONS
}
. = ALIGN(4);
.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __start_parainstructions = .;
+ __parainstructions = .;
*(.parainstructions)
- __stop_parainstructions = .;
+ __parainstructions_end = .;
}
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
@@ -194,7 +182,7 @@ SECTIONS
__initramfs_end = .;
}
#endif
- . = ALIGN(L1_CACHE_BYTES);
+ . = ALIGN(4096);
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
__per_cpu_start = .;
*(.data.percpu)
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index f66cd11adb72..4a8b0ed9b8fb 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,7 +7,7 @@
SECTIONS
{
- . = VDSO_PRELINK + SIZEOF_HEADERS;
+ . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
@@ -21,7 +21,7 @@ SECTIONS
For the layouts to match, we need to skip more than enough
space for the dynamic symbol table et al. If this amount
is insufficient, ld -shared will barf. Just increase it here. */
- . = VDSO_PRELINK + 0x400;
+ . = VDSO_PRELINK_asm + 0x400;
.text : { *(.text) } :text =0x90909090
.note : { *(.note.*) } :text :note