54 files changed, 2406 insertions, 2696 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 4ae3dcf1d2f0..4f98516b9f94 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,12 +39,10 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
-obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
+obj-$(CONFIG_VMI)		+= vmi.o vmiclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 obj-y				+= pcspeaker.o
 
-EXTRA_AFLAGS   := -traditional
-
 obj-$(CONFIG_SCx200)		+= scx200.o
 
 # vsyscall.o contains the vsyscall DSO images as __initdata.
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 9ea5b8ecc7e1..280898b045b2 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -874,7 +874,7 @@ static void __init acpi_process_madt(void)
 				acpi_ioapic = 1;
 
 				smp_found_config = 1;
-				clustered_apic_check();
+				setup_apic_routing();
 			}
 		}
 		if (error == -EINVAL) {
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 8f7efd38254d..23f78efc577d 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,7 +10,6 @@
 #include <asm/pci-direct.h>
 #include <asm/acpi.h>
 #include <asm/apic.h>
-#include <asm/irq.h>
 
 #ifdef CONFIG_ACPI
 
@@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device)
 	return 0;
 }
 
-static void check_intel(void)
-{
-	u16 vendor, device;
-
-	vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
-
-	if (vendor != PCI_VENDOR_ID_INTEL)
-		return;
-
-	device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
-#ifdef CONFIG_SMP
-	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
-	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
-		quirk_intel_irqbalance();
-#endif
-}
-
 void __init check_acpi_pci(void)
 {
 	int num, slot, func;
@@ -77,8 +58,6 @@ void __init check_acpi_pci(void)
 	if (!early_pci_allowed())
 		return;
 
-	check_intel();
-
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 426f59b0106b..e5cec6685cc5 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -5,6 +5,7 @@
 #include <asm/alternative.h>
 #include <asm/sections.h>
 
+static int noreplace_smp     = 0;
 static int smp_alt_once      = 0;
 static int debug_alternative = 0;
 
@@ -13,15 +14,33 @@ static int __init bootonly(char *str)
 	smp_alt_once = 1;
 	return 1;
 }
+__setup("smp-alt-boot", bootonly);
+
 static int __init debug_alt(char *str)
 {
 	debug_alternative = 1;
 	return 1;
 }
-
-__setup("smp-alt-boot", bootonly);
 __setup("debug-alternative", debug_alt);
 
+static int __init setup_noreplace_smp(char *str)
+{
+	noreplace_smp = 1;
+	return 1;
+}
+__setup("noreplace-smp", setup_noreplace_smp);
+
+#ifdef CONFIG_PARAVIRT
+static int noreplace_paravirt = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+	noreplace_paravirt = 1;
+	return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+#endif
+
 #define DPRINTK(fmt, args...) if (debug_alternative) \
 	printk(KERN_DEBUG fmt, args)
 
@@ -132,11 +151,8 @@ static void nop_out(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
 extern u8 *__smp_locks[], *__smp_locks_end[];
 
-extern u8 __smp_alt_begin[], __smp_alt_end[];
-
 /* Replace instructions with better alternatives for this CPU type.
    This runs before SMP is initialized to avoid SMP problems with
    self modifying code. This implies that assymetric systems where
@@ -171,29 +187,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end)
-{
-	struct alt_instr *a;
-
-	DPRINTK("%s: alt table %p-%p\n", __FUNCTION__, start, end);
-	for (a = start; a < end; a++) {
-		memcpy(a->replacement + a->replacementlen,
-		       a->instr,
-		       a->instrlen);
-	}
-}
-
-static void alternatives_smp_apply(struct alt_instr *start, struct alt_instr *end)
-{
-	struct alt_instr *a;
-
-	for (a = start; a < end; a++) {
-		memcpy(a->instr,
-		       a->replacement + a->replacementlen,
-		       a->instrlen);
-	}
-}
-
 static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
 	u8 **ptr;
@@ -211,6 +204,9 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 {
 	u8 **ptr;
 
+	if (noreplace_smp)
+		return;
+
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr < text)
 			continue;
@@ -245,6 +241,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,
 	struct smp_alt_module *smp;
 	unsigned long flags;
 
+	if (noreplace_smp)
+		return;
+
 	if (smp_alt_once) {
 		if (boot_cpu_has(X86_FEATURE_UP))
 			alternatives_smp_unlock(locks, locks_end,
@@ -279,7 +278,7 @@ void alternatives_smp_module_del(struct module *mod)
 	struct smp_alt_module *item;
 	unsigned long flags;
 
-	if (smp_alt_once)
+	if (smp_alt_once || noreplace_smp)
 		return;
 
 	spin_lock_irqsave(&smp_alt, flags);
@@ -310,7 +309,7 @@ void alternatives_smp_switch(int smp)
 	return;
 #endif
 
-	if (smp_alt_once)
+	if (noreplace_smp || smp_alt_once)
 		return;
 	BUG_ON(!smp && (num_online_cpus() > 1));
 
@@ -319,8 +318,6 @@ void alternatives_smp_switch(int smp)
 		printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
 		clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
 		clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
-		alternatives_smp_apply(__smp_alt_instructions,
-				       __smp_alt_instructions_end);
 		list_for_each_entry(mod, &smp_alt_modules, next)
 			alternatives_smp_lock(mod->locks, mod->locks_end,
 					      mod->text, mod->text_end);
@@ -328,8 +325,6 @@ void alternatives_smp_switch(int smp)
 		printk(KERN_INFO "SMP alternatives: switching to UP code\n");
 		set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
 		set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
-		apply_alternatives(__smp_alt_instructions,
-				   __smp_alt_instructions_end);
 		list_for_each_entry(mod, &smp_alt_modules, next)
 			alternatives_smp_unlock(mod->locks, mod->locks_end,
 						mod->text, mod->text_end);
@@ -340,36 +335,31 @@ void alternatives_smp_switch(int smp)
 #endif
 
 #ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+void apply_paravirt(struct paravirt_patch_site *start,
+		    struct paravirt_patch_site *end)
 {
-	struct paravirt_patch *p;
+	struct paravirt_patch_site *p;
+
+	if (noreplace_paravirt)
+		return;
 
 	for (p = start; p < end; p++) {
 		unsigned int used;
 
 		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
 					  p->len);
-#ifdef CONFIG_DEBUG_PARAVIRT
-		{
-		int i;
-		/* Deliberately clobber regs using "not %reg" to find bugs. */
-		for (i = 0; i < 3; i++) {
-			if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
-				memcpy(p->instr + used, "\xf7\xd0", 2);
-				p->instr[used+1] |= i;
-				used += 2;
-			}
-		}
-		}
-#endif
+
+		BUG_ON(used > p->len);
+
 		/* Pad the rest with nops */
 		nop_out(p->instr + used, p->len - used);
 	}
 
-	/* Sync to be conservative, in case we patched following instructions */
+	/* Sync to be conservative, in case we patched following
+	 * instructions */
 	sync_core();
 }
-extern struct paravirt_patch __start_parainstructions[],
+extern struct paravirt_patch_site __start_parainstructions[],
 	__stop_parainstructions[];
 #endif	/* CONFIG_PARAVIRT */
 
@@ -396,23 +386,19 @@ void __init alternative_instructions(void)
 			printk(KERN_INFO "SMP alternatives: switching to UP code\n");
 			set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability);
 			set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability);
-			apply_alternatives(__smp_alt_instructions,
-					   __smp_alt_instructions_end);
 			alternatives_smp_unlock(__smp_locks, __smp_locks_end,
 						_text, _etext);
 		}
 		free_init_pages("SMP alternatives",
-				(unsigned long)__smp_alt_begin,
-				(unsigned long)__smp_alt_end);
+				__pa_symbol(&__smp_locks),
+				__pa_symbol(&__smp_locks_end));
 	} else {
-		alternatives_smp_save(__smp_alt_instructions,
-				      __smp_alt_instructions_end);
 		alternatives_smp_module_add(NULL, "core kernel",
 					    __smp_locks, __smp_locks_end,
 					    _text, _etext);
 		alternatives_smp_switch(0);
 	}
 #endif
- 	apply_paravirt(__start_parainstructions, __stop_parainstructions);
+ 	apply_paravirt(__parainstructions, __parainstructions_end);
 	local_irq_restore(flags);
 }
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 93aa911646ad..aca054cc0552 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -129,6 +129,28 @@ static int modern_apic(void)
 	return lapic_get_version() >= 0x14;
 }
 
+void apic_wait_icr_idle(void)
+{
+	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
+		cpu_relax();
+}
+
+unsigned long safe_apic_wait_icr_idle(void)
+{
+	unsigned long send_status;
+	int timeout;
+
+	timeout = 0;
+	do {
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+		if (!send_status)
+			break;
+		udelay(100);
+	} while (timeout++ < 1000);
+
+	return send_status;
+}
+
 /**
  * enable_NMI_through_LVT0 - enable NMI through local vector table 0
  */
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 064bbf2861f4..367ff1d930cb 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -233,11 +233,10 @@
 #include <asm/desc.h>
 #include <asm/i8253.h>
 #include <asm/paravirt.h>
+#include <asm/reboot.h>
 
 #include "io_ports.h"
 
-extern void machine_real_restart(unsigned char *, int);
-
 #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
 extern int (*console_blank_hook)(int);
 #endif
@@ -384,13 +383,6 @@ static int			ignore_sys_suspend;
 static int			ignore_normal_resume;
 static int			bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
 
-#ifdef CONFIG_APM_RTC_IS_GMT
-#	define	clock_cmos_diff	0
-#	define	got_clock_diff	1
-#else
-static long			clock_cmos_diff;
-static int			got_clock_diff;
-#endif
 static int			debug __read_mostly;
 static int			smp __read_mostly;
 static int			apm_disabled = -1;
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c37535163bfc..27a776c9044d 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -11,11 +11,11 @@
 #include <linux/suspend.h>
 #include <asm/ucontext.h>
 #include "sigframe.h"
+#include <asm/pgtable.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/elf.h>
-#include <asm/pda.h>
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -25,6 +25,9 @@
 #define OFFSET(sym, str, mem) \
 	DEFINE(sym, offsetof(struct str, mem));
 
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
 void foo(void)
 {
 	OFFSET(SIGCONTEXT_eax, sigcontext, eax);
@@ -90,17 +93,18 @@ void foo(void)
 	OFFSET(pbe_next, pbe, next);
 
 	/* Offset from the sysenter stack to tss.esp0 */
-	DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
+	DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) -
 		 sizeof(struct tss_struct));
 
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-	DEFINE(VDSO_PRELINK, VDSO_PRELINK);
+	DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
+	DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
+	DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
+	DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
 
-	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+	DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
 
-	BLANK();
- 	OFFSET(PDA_cpu, i386_pda, cpu_number);
-	OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 
 #ifdef CONFIG_PARAVIRT
 	BLANK();
diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile
index 010aecfffbc1..74f27a463db0 100644
--- a/arch/i386/kernel/cpu/Makefile
+++ b/arch/i386/kernel/cpu/Makefile
@@ -2,7 +2,7 @@
 # Makefile for x86-compatible CPU details and quirks
 #
 
-obj-y	:=	common.o proc.o
+obj-y	:=	common.o proc.o bugs.o
 
 obj-y	+=	amd.o
 obj-y	+=	cyrix.o
@@ -17,3 +17,5 @@ obj-$(CONFIG_X86_MCE)	+=	mcheck/
 
 obj-$(CONFIG_MTRR)	+= 	mtrr/
 obj-$(CONFIG_CPU_FREQ)	+=	cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 2d47db482972..4fec702afd7e 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void)
 	return 0;
 }
 
+int force_mwait __cpuinitdata;
+
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	if (amd_apic_timer_broken())
 		set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability);
+
+	if (c->x86 == 0x10 && !force_mwait)
+		clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
 }
 
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -314,13 +319,3 @@ int __init amd_init_cpu(void)
 	cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
 	return 0;
 }
-
-//early_arch_initcall(amd_init_cpu);
-
-static int __init amd_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_AMD] = NULL;
-	return 0;
-}
-
-late_initcall(amd_exit_cpu);
diff --git a/arch/i386/kernel/cpu/bugs.c b/arch/i386/kernel/cpu/bugs.c
new file mode 100644
index 000000000000..54428a2500f3
--- /dev/null
+++ b/arch/i386/kernel/cpu/bugs.c
@@ -0,0 +1,191 @@
+/*
+ *  arch/i386/cpu/bugs.c
+ *
+ *  Copyright (C) 1994  Linus Torvalds
+ *
+ *  Cyrix stuff, June 1998 by:
+ *	- Rafael R. Reilova (moved everything from head.S),
+ *        <rreilova@ececs.uc.edu>
+ *	- Channing Corn (tests & fixes),
+ *	- Andrew D. Balsa (code cleanup).
+ */
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/paravirt.h>
+#include <asm/alternative.h>
+
+static int __init no_halt(char *s)
+{
+	boot_cpu_data.hlt_works_ok = 0;
+	return 1;
+}
+
+__setup("no-hlt", no_halt);
+
+static int __init mca_pentium(char *s)
+{
+	mca_pentium_flag = 1;
+	return 1;
+}
+
+__setup("mca-pentium", mca_pentium);
+
+static int __init no_387(char *s)
+{
+	boot_cpu_data.hard_math = 0;
+	write_cr0(0xE | read_cr0());
+	return 1;
+}
+
+__setup("no387", no_387);
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+static void __init check_fpu(void)
+{
+	if (!boot_cpu_data.hard_math) {
+#ifndef CONFIG_MATH_EMULATION
+		printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
+		printk(KERN_EMERG "Giving up.\n");
+		for (;;) ;
+#endif
+		return;
+	}
+
+/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
+	/* Test for the divl bug.. */
+	__asm__("fninit\n\t"
+		"fldl %1\n\t"
+		"fdivl %2\n\t"
+		"fmull %2\n\t"
+		"fldl %1\n\t"
+		"fsubp %%st,%%st(1)\n\t"
+		"fistpl %0\n\t"
+		"fwait\n\t"
+		"fninit"
+		: "=m" (*&boot_cpu_data.fdiv_bug)
+		: "m" (*&x), "m" (*&y));
+	if (boot_cpu_data.fdiv_bug)
+		printk("Hmm, FPU with FDIV bug.\n");
+}
+
+static void __init check_hlt(void)
+{
+	if (paravirt_enabled())
+		return;
+
+	printk(KERN_INFO "Checking 'hlt' instruction... ");
+	if (!boot_cpu_data.hlt_works_ok) {
+		printk("disabled\n");
+		return;
+	}
+	halt();
+	halt();
+	halt();
+	halt();
+	printk("OK.\n");
+}
+
+/*
+ *	Most 386 processors have a bug where a POPAD can lock the
+ *	machine even from user space.
+ */
+
+static void __init check_popad(void)
+{
+#ifndef CONFIG_X86_POPAD_OK
+	int res, inp = (int) &res;
+
+	printk(KERN_INFO "Checking for popad bug... ");
+	__asm__ __volatile__(
+	  "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
+	  : "=&a" (res)
+	  : "d" (inp)
+	  : "ecx", "edi" );
+	/* If this fails, it means that any user program may lock the CPU hard. Too bad. */
+	if (res != 12345678) printk( "Buggy.\n" );
+		        else printk( "OK.\n" );
+#endif
+}
+
+/*
+ * Check whether we are able to run this kernel safely on SMP.
+ *
+ * - In order to run on a i386, we need to be compiled for i386
+ *   (for due to lack of "invlpg" and working WP on a i386)
+ * - In order to run on anything without a TSC, we need to be
+ *   compiled for a i486.
+ * - In order to support the local APIC on a buggy Pentium machine,
+ *   we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
+ *   which happens implicitly if compiled for a Pentium or lower
+ *   (unless an advanced selection of CPU features is used) as an
+ *   otherwise config implies a properly working local APIC without
+ *   the need to do extra reads from the APIC.
+*/
+
+static void __init check_config(void)
+{
+/*
+ * We'd better not be a i386 if we're configured to use some
+ * i486+ only features! (WP works in supervisor mode and the
+ * new "invlpg" and "bswap" instructions)
+ */
+#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
+	if (boot_cpu_data.x86 == 3)
+		panic("Kernel requires i486+ for 'invlpg' and other features");
+#endif
+
+/*
+ * If we configured ourselves for a TSC, we'd better have one!
+ */
+#ifdef CONFIG_X86_TSC
+	if (!cpu_has_tsc && !tsc_disable)
+		panic("Kernel compiled for Pentium+, requires TSC feature!");
+#endif
+
+/*
+ * If we were told we had a good local APIC, check for buggy Pentia,
+ * i.e. all B steppings and the C2 stepping of P54C when using their
+ * integrated APIC (see 11AP erratum in "Pentium Processor
+ * Specification Update").
+ */
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
+	    && cpu_has_apic
+	    && boot_cpu_data.x86 == 5
+	    && boot_cpu_data.x86_model == 2
+	    && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
+		panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
+#endif
+}
+
+
+void __init check_bugs(void)
+{
+	identify_boot_cpu();
+#ifndef CONFIG_SMP
+	printk("CPU: ");
+	print_cpu_info(&boot_cpu_data);
+#endif
+	check_config();
+	check_fpu();
+	check_hlt();
+	check_popad();
+	init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+	alternative_instructions();
+}
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
index 8c25047975c0..473eac883c7b 100644
--- a/arch/i386/kernel/cpu/centaur.c
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -469,13 +469,3 @@ int __init centaur_init_cpu(void)
 	cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
 	return 0;
 }
-
-//early_arch_initcall(centaur_init_cpu);
-
-static int __init centaur_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_CENTAUR] = NULL;
-	return 0;
-}
-
-late_initcall(centaur_exit_cpu);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index dcbbd0a8bfc2..794d593c47eb 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,15 +18,37 @@
 #include <asm/apic.h>
 #include <mach_apic.h>
 #endif
-#include <asm/pda.h>
 
 #include "cpu.h"
 
-DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
-EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
+DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	[GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
+	[GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+	/*
+	 * Segments used for calling PnP BIOS have byte granularity.
+	 * They code segments and data segments have fixed 64k limits,
+	 * the transfer segment sizes are set at run time.
+	 */
+	[GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	[GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
+	[GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	/*
+	 * The APM segments have byte granularity and their bases
+	 * are set at run time.  All have 64k limits.
+	 */
+	[GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	/* 16-bit code */
+	[GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
+	[GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
 
-struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+	[GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
+	[GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_fxsr __cpuinitdata;
@@ -368,7 +390,7 @@ __setup("serialnumber", x86_serial_nr_setup);
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 {
 	int i;
 
@@ -479,15 +501,22 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
+}
 
-	if (c == &boot_cpu_data)
-		sysenter_setup();
+void __init identify_boot_cpu(void)
+{
+	identify_cpu(&boot_cpu_data);
+	sysenter_setup();
 	enable_sep_cpu();
+	mtrr_bp_init();
+}
 
-	if (c == &boot_cpu_data)
-		mtrr_bp_init();
-	else
-		mtrr_ap_init();
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+	BUG_ON(c == &boot_cpu_data);
+	identify_cpu(c);
+	enable_sep_cpu();
+	mtrr_ap_init();
 }
 
 #ifdef CONFIG_X86_HT
@@ -601,129 +630,36 @@ void __init early_cpu_init(void)
 #endif
 }
 
-/* Make sure %gs is initialized properly in idle threads */
+/* Make sure %fs is initialized properly in idle threads */
 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
 {
 	memset(regs, 0, sizeof(struct pt_regs));
-	regs->xfs = __KERNEL_PDA;
+	regs->xfs = __KERNEL_PERCPU;
 	return regs;
 }
 
-static __cpuinit int alloc_gdt(int cpu)
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
 {
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt;
-	struct i386_pda *pda;
-
-	gdt = (struct desc_struct *)cpu_gdt_descr->address;
-	pda = cpu_pda(cpu);
-
-	/*
-	 * This is a horrible hack to allocate the GDT.  The problem
-	 * is that cpu_init() is called really early for the boot CPU
-	 * (and hence needs bootmem) but much later for the secondary
-	 * CPUs, when bootmem will have gone away
-	 */
-	if (NODE_DATA(0)->bdata->node_bootmem_map) {
-		BUG_ON(gdt != NULL || pda != NULL);
-
-		gdt = alloc_bootmem_pages(PAGE_SIZE);
-		pda = alloc_bootmem(sizeof(*pda));
-		/* alloc_bootmem(_pages) panics on failure, so no check */
-
-		memset(gdt, 0, PAGE_SIZE);
-		memset(pda, 0, sizeof(*pda));
-	} else {
-		/* GDT and PDA might already have been allocated if
-		   this is a CPU hotplug re-insertion. */
-		if (gdt == NULL)
-			gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
-
-		if (pda == NULL)
-			pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
-
-		if (unlikely(!gdt || !pda)) {
-			free_pages((unsigned long)gdt, 0);
-			kfree(pda);
-			return 0;
-		}
-	}
-
- 	cpu_gdt_descr->address = (unsigned long)gdt;
-	cpu_pda(cpu) = pda;
-
-	return 1;
-}
+	struct Xgt_desc_struct gdt_descr;
 
-/* Initial PDA used by boot CPU */
-struct i386_pda boot_pda = {
-	._pda = &boot_pda,
-	.cpu_number = 0,
-	.pcurrent = &init_task,
-};
-
-static inline void set_kernel_fs(void)
-{
-	/* Set %fs for this CPU's PDA.  Memory clobber is to create a
-	   barrier with respect to any PDA operations, so the compiler
-	   doesn't move any before here. */
-	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
+	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
+	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
 }
 
-/* Initialize the CPU's GDT and PDA.  The boot CPU does this for
-   itself, but secondaries find this done for them. */
-__cpuinit int init_gdt(int cpu, struct task_struct *idle)
-{
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt;
-	struct i386_pda *pda;
-
-	/* For non-boot CPUs, the GDT and PDA should already have been
-	   allocated. */
-	if (!alloc_gdt(cpu)) {
-		printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
-		return 0;
-	}
-
-	gdt = (struct desc_struct *)cpu_gdt_descr->address;
-	pda = cpu_pda(cpu);
-
-	BUG_ON(gdt == NULL || pda == NULL);
-
-	/*
-	 * Initialize the per-CPU GDT with the boot GDT,
-	 * and set up the GDT descriptor:
-	 */
- 	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
-	cpu_gdt_descr->size = GDT_SIZE - 1;
-
-	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
-			(u32 *)&gdt[GDT_ENTRY_PDA].b,
-			(unsigned long)pda, sizeof(*pda) - 1,
-			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
-
-	memset(pda, 0, sizeof(*pda));
-	pda->_pda = pda;
-	pda->cpu_number = cpu;
-	pda->pcurrent = idle;
-
-	return 1;
-}
-
-void __cpuinit cpu_set_gdt(int cpu)
-{
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-
-	/* Reinit these anyway, even if they've already been done (on
-	   the boot CPU, this will transition from the boot gdt+pda to
-	   the real ones). */
-	load_gdt(cpu_gdt_descr);
-	set_kernel_fs();
-}
-
-/* Common CPU init for both boot and secondary CPUs */
-static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
 {
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
 	struct tss_struct * t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &curr->thread;
 
@@ -744,6 +680,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	}
 
 	load_idt(&idt_descr);
+	switch_to_new_gdt();
 
 	/*
 	 * Set up and load the per-CPU TSS and LDT
@@ -783,38 +720,6 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 	mxcsr_feature_mask_init();
 }
 
-/* Entrypoint to initialize secondary CPU */
-void __cpuinit secondary_cpu_init(void)
-{
-	int cpu = smp_processor_id();
-	struct task_struct *curr = current;
-
-	_cpu_init(cpu, curr);
-}
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- */
-void __cpuinit cpu_init(void)
-{
-	int cpu = smp_processor_id();
-	struct task_struct *curr = current;
-
-	/* Set up the real GDT and PDA, so we can transition from the
-	   boot versions. */
-	if (!init_gdt(cpu, curr)) {
-		/* failed to allocate something; not much we can do... */
-		for (;;)
-			local_irq_enable();
-	}
-
-	cpu_set_gdt(cpu);
-	_cpu_init(cpu, curr);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 void __cpuinit cpu_uninit(void)
 {
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index de27bd07bc9c..0b8411a864fb 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 		 */  
 		if (vendor == PCI_VENDOR_ID_CYRIX &&
 	 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
-			pit_latch_buggy = 1;
+			mark_tsc_unstable("cyrix 5510/5520 detected");
 	}
 #endif
 		c->x86_cache_size=16;	/* Yep 16K integrated cache thats it */
@@ -448,16 +448,6 @@ int __init cyrix_init_cpu(void)
 	return 0;
 }
 
-//early_arch_initcall(cyrix_init_cpu);
-
-static int __init cyrix_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_CYRIX] = NULL;
-	return 0;
-}
-
-late_initcall(cyrix_exit_cpu);
-
 static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
 	.c_vendor	= "NSC",
 	.c_ident 	= { "Geode by NSC" },
@@ -470,12 +460,3 @@ int __init nsc_init_cpu(void)
 	return 0;
 }
 
-//early_arch_initcall(nsc_init_cpu);
-
-static int __init nsc_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_NSC] = NULL;
-	return 0;
-}
-
-late_initcall(nsc_exit_cpu);
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 56fe26584957..dc4e08147b1f 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -188,8 +188,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	}
 #endif
 
-	if (c->x86 == 15)
+	if (c->x86 == 15) {
 		set_bit(X86_FEATURE_P4, c->x86_capability);
+		set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
+	}
 	if (c->x86 == 6) 
 		set_bit(X86_FEATURE_P3, c->x86_capability);
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
index b0862af595aa..f9fa4142551e 100644
--- a/arch/i386/kernel/cpu/mcheck/k7.c
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -75,6 +75,9 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 	machine_check_vector = k7_machine_check;
 	wmb();
 
+	if (!cpu_has(c, X86_FEATURE_MCE))
+		return;
+
 	printk (KERN_INFO "Intel machine check architecture supported.\n");
 	rdmsr (MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
@@ -82,9 +85,13 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 	nr_mce_banks = l & 0xff;
 
 	/* Clear status for MC index 0 separately, we don't touch CTL,
-	 * as some Athlons cause spurious MCEs when its enabled. */
-	wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
-	for (i=1; i<nr_mce_banks; i++) {
+	 * as some K7 Athlons cause spurious MCEs when its enabled. */
+	if (boot_cpu_data.x86 == 6) {
+		wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+		i = 1;
+	} else
+		i = 0;
+	for (; i<nr_mce_banks; i++) {
 		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
 		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
 	}
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 4f10c62d180c..56cd485b127c 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -38,8 +38,7 @@ void mcheck_init(struct cpuinfo_x86 *c)
 
 	switch (c->x86_vendor) {
 		case X86_VENDOR_AMD:
-			if (c->x86==6 || c->x86==15)
-				amd_mcheck_init(c);
+			amd_mcheck_init(c);
 			break;
 
 		case X86_VENDOR_INTEL:
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a46011..1509edfb2313 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -124,13 +124,10 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 
 
 /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
-static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
+static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 {
 	u32 h;
 
-	if (mce_num_extended_msrs == 0)
-		goto done;
-
 	rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
 	rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
 	rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
@@ -141,12 +138,6 @@ static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 	rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
 	rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
 	rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
-
-	/* can we rely on kmalloc to do a dynamic
-	 * allocation for the reserved registers?
-	 */
-done:
-	return mce_num_extended_msrs;
 }
 
 static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
@@ -155,7 +146,6 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
 	int i;
-	struct intel_mce_extended_msrs dbg;
 
 	rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 	if (mcgstl & (1<<0))	/* Recoverable ? */
@@ -164,7 +154,9 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
 	printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
 		smp_processor_id(), mcgsth, mcgstl);
 
-	if (intel_get_extended_msrs(&dbg)) {
+	if (mce_num_extended_msrs > 0) {
+		struct intel_mce_extended_msrs dbg;
+		intel_get_extended_msrs(&dbg);
 		printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
 			smp_processor_id(), dbg.eip, dbg.eflags);
 		printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index f77fc53db654..5367e32e0403 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -20,13 +20,25 @@ struct mtrr_state {
 	mtrr_type def_type;
 };
 
+struct fixed_range_block {
+	int base_msr; /* start address of an MTRR block */
+	int ranges;   /* number of MTRRs in this block  */
+};
+
+static struct fixed_range_block fixed_range_blocks[] = {
+	{ MTRRfix64K_00000_MSR, 1 }, /* one  64k MTRR  */
+	{ MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */
+	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
+	{}
+};
+
 static unsigned long smp_changes_mask;
 static struct mtrr_state mtrr_state = {};
 
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
 
-static __initdata int mtrr_show;
+static int mtrr_show;
 module_param_named(show, mtrr_show, bool, 0);
 
 /*  Get the MSR pair relating to a var range  */
@@ -37,7 +49,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
 	rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
 }
 
-static void __init
+static void
 get_fixed_ranges(mtrr_type * frs)
 {
 	unsigned int *p = (unsigned int *) frs;
@@ -51,12 +63,18 @@ get_fixed_ranges(mtrr_type * frs)
 		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
 }
 
-static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+void mtrr_save_fixed_ranges(void *info)
+{
+	get_fixed_ranges(mtrr_state.fixed_ranges);
+}
+
+static void __cpuinit print_fixed(unsigned base, unsigned step, const mtrr_type*types)
 {
 	unsigned i;
 
 	for (i = 0; i < 8; ++i, ++types, base += step)
-		printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
+		printk(KERN_INFO "MTRR %05X-%05X %s\n",
+			base, base + step - 1, mtrr_attrib_to_str(*types));
 }
 
 /*  Grab all of the MTRR state for this CPU into *state  */
@@ -147,6 +165,44 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
 			smp_processor_id(), msr, a, b);
 }
 
+/**
+ * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
+ * see AMD publication no. 24593, chapter 3.2.1 for more information
+ */
+static inline void k8_enable_fixed_iorrs(void)
+{
+	unsigned lo, hi;
+
+	rdmsr(MSR_K8_SYSCFG, lo, hi);
+	mtrr_wrmsr(MSR_K8_SYSCFG, lo
+				| K8_MTRRFIXRANGE_DRAM_ENABLE
+				| K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
+}
+
+/**
+ * Checks and updates an fixed-range MTRR if it differs from the value it
+ * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also.
+ * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information
+ * \param msr MSR address of the MTTR which should be checked and updated
+ * \param changed pointer which indicates whether the MTRR needed to be changed
+ * \param msrwords pointer to the MSR values which the MSR should have
+ */
+static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
+{
+	unsigned lo, hi;
+
+	rdmsr(msr, lo, hi);
+
+	if (lo != msrwords[0] || hi != msrwords[1]) {
+		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+		    boot_cpu_data.x86 == 15 &&
+		    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
+			k8_enable_fixed_iorrs();
+		mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
+		*changed = TRUE;
+	}
+}
+
 int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
 /*  [SUMMARY] Get a free MTRR.
     <base> The starting (base) address of the region.
@@ -196,36 +252,21 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 	*type = base_lo & 0xff;
 }
 
+/**
+ * Checks and updates the fixed-range MTRRs if they differ from the saved set
+ * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
+ */
 static int set_fixed_ranges(mtrr_type * frs)
 {
-	unsigned int *p = (unsigned int *) frs;
+	unsigned long long *saved = (unsigned long long *) frs;
 	int changed = FALSE;
-	int i;
-	unsigned int lo, hi;
+	int block=-1, range;
 
-	rdmsr(MTRRfix64K_00000_MSR, lo, hi);
-	if (p[0] != lo || p[1] != hi) {
-		mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
-		changed = TRUE;
-	}
+	while (fixed_range_blocks[++block].ranges)
+	    for (range=0; range < fixed_range_blocks[block].ranges; range++)
+		set_fixed_range(fixed_range_blocks[block].base_msr + range,
+		    &changed, (unsigned int *) saved++);
 
-	for (i = 0; i < 2; i++) {
-		rdmsr(MTRRfix16K_80000_MSR + i, lo, hi);
-		if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) {
-			mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2],
-			      p[3 + i * 2]);
-			changed = TRUE;
-		}
-	}
-
-	for (i = 0; i < 8; i++) {
-		rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi);
-		if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) {
-			mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2],
-			      p[7 + i * 2]);
-			changed = TRUE;
-		}
-	}
 	return changed;
 }
 
@@ -428,7 +469,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
 		}
 	}
 
-	if (base + size < 0x100) {
+	if (base < 0x100) {
 		printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
 		       base, size);
 		return -EINVAL;
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 0acfb6a5a220..02a2f39e5e0a 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -729,6 +729,17 @@ void mtrr_ap_init(void)
 	local_irq_restore(flags);
 }
 
+/**
+ * Save current fixed-range MTRR state of the BSP
+ */
+void mtrr_save_state(void)
+{
+	if (smp_processor_id() == 0)
+		mtrr_save_fixed_ranges(NULL);
+	else
+		smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1);
+}
+
 static int __init mtrr_init_finialize(void)
 {
 	if (!mtrr_if)
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
index 8bf23cc80c63..961fbe1a748f 100644
--- a/arch/i386/kernel/cpu/nexgen.c
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -58,13 +58,3 @@ int __init nexgen_init_cpu(void)
 	cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
 	return 0;
 }
-
-//early_arch_initcall(nexgen_init_cpu);
-
-static int __init nexgen_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_NEXGEN] = NULL;
-	return 0;
-}
-
-late_initcall(nexgen_exit_cpu);
diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 000000000000..2b04c8f1db62
--- /dev/null
+++ b/arch/i386/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,658 @@
+/* local apic based NMI watchdog for various CPUs.
+   This file also handles reservation of performance counters for coordination
+   with other users (like oprofile).
+
+   Note that these events normally don't tick when the CPU idles. This means
+   the frequency varies with CPU load.
+
+   Original code for K7/P6 written by Keith Owens */
+
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/apic.h>
+#include <asm/intel_arch_perfmon.h>
+
+struct nmi_watchdog_ctlblk {
+	unsigned int cccr_msr;
+	unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
+	unsigned int evntsel_msr;  /* the MSR to select the events to handle */
+};
+
+/* Interface defining a CPU specific perfctr watchdog */
+struct wd_ops {
+	int (*reserve)(void);
+	void (*unreserve)(void);
+	int (*setup)(unsigned nmi_hz);
+	void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
+	void (*stop)(void *);
+	unsigned perfctr;
+	unsigned evntsel;
+	u64 checkbit;
+};
+
+static struct wd_ops *wd_ops;
+
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
+
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ *   different subsystems this reservation system just tries to coordinate
+ *   things a little
+ */
+static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
+static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
+
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+	return wd_ops ? msr - wd_ops->perfctr : 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+/* returns the bit offset of the event selection register */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+	return wd_ops ? msr - wd_ops->evntsel : 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, perfctr_nmi_owner));
+}
+
+/* checks the an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, perfctr_nmi_owner));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	if (!test_and_set_bit(counter, perfctr_nmi_owner))
+		return 1;
+	return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	clear_bit(counter, perfctr_nmi_owner);
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	if (!test_and_set_bit(counter, evntsel_nmi_owner))
+		return 1;
+	return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	clear_bit(counter, evntsel_nmi_owner);
+}
+
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
+
+void disable_lapic_nmi_watchdog(void)
+{
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+	if (atomic_read(&nmi_active) <= 0)
+		return;
+
+	on_each_cpu(wd_ops->stop, NULL, 0, 1);
+	wd_ops->unreserve();
+
+	BUG_ON(atomic_read(&nmi_active) != 0);
+}
+
+void enable_lapic_nmi_watchdog(void)
+{
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+	/* are we already enabled */
+	if (atomic_read(&nmi_active) != 0)
+		return;
+
+	/* are we lapic aware */
+	if (!wd_ops)
+		return;
+	if (!wd_ops->reserve()) {
+		printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
+		return;
+	}
+
+	on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+	touch_nmi_watchdog();
+}
+
+/*
+ * Activate the NMI watchdog via the local APIC.
+ */
+
+static unsigned int adjust_for_32bit_ctr(unsigned int hz)
+{
+	u64 counter_val;
+	unsigned int retval = hz;
+
+	/*
+	 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
+	 * are writable, with higher bits sign extending from bit 31.
+	 * So, we can only program the counter with 31 bit values and
+	 * 32nd bit should be 1, for 33.. to be 1.
+	 * Find the appropriate nmi_hz
+	 */
+	counter_val = (u64)cpu_khz * 1000;
+	do_div(counter_val, retval);
+ 	if (counter_val > 0x7fffffffULL) {
+		u64 count = (u64)cpu_khz * 1000;
+		do_div(count, 0x7fffffffUL);
+		retval = count + 1;
+	}
+	return retval;
+}
+
+static void
+write_watchdog_counter(unsigned int perfctr_msr, const char *descr, unsigned nmi_hz)
+{
+	u64 count = (u64)cpu_khz * 1000;
+
+	do_div(count, nmi_hz);
+	if(descr)
+		Dprintk("setting %s to -0x%08Lx\n", descr, count);
+	wrmsrl(perfctr_msr, 0 - count);
+}
+
+static void write_watchdog_counter32(unsigned int perfctr_msr,
+		const char *descr, unsigned nmi_hz)
+{
+	u64 count = (u64)cpu_khz * 1000;
+
+	do_div(count, nmi_hz);
+	if(descr)
+		Dprintk("setting %s to -0x%08Lx\n", descr, count);
+	wrmsr(perfctr_msr, (u32)(-count), 0);
+}
+
+/* AMD K7/K8/Family10h/Family11h support. AMD keeps this interface
+   nicely stable so there is not much variety */
+
+#define K7_EVNTSEL_ENABLE	(1 << 22)
+#define K7_EVNTSEL_INT		(1 << 20)
+#define K7_EVNTSEL_OS		(1 << 17)
+#define K7_EVNTSEL_USR		(1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
+#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+static int setup_k7_watchdog(unsigned nmi_hz)
+{
+	unsigned int perfctr_msr, evntsel_msr;
+	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	perfctr_msr = MSR_K7_PERFCTR0;
+	evntsel_msr = MSR_K7_EVNTSEL0;
+
+	wrmsrl(perfctr_msr, 0UL);
+
+	evntsel = K7_EVNTSEL_INT
+		| K7_EVNTSEL_OS
+		| K7_EVNTSEL_USR
+		| K7_NMI_EVENT;
+
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= K7_EVNTSEL_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	return 1;
+}
+
+static void single_msr_stop_watchdog(void *arg)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	wrmsr(wd->evntsel_msr, 0, 0);
+}
+
+static int single_msr_reserve(void)
+{
+	if (!reserve_perfctr_nmi(wd_ops->perfctr))
+		return 0;
+
+	if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
+		release_perfctr_nmi(wd_ops->perfctr);
+		return 0;
+	}
+	return 1;
+}
+
+static void single_msr_unreserve(void)
+{
+	release_evntsel_nmi(wd_ops->perfctr);
+	release_perfctr_nmi(wd_ops->evntsel);
+}
+
+static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+	/* start the cycle over again */
+	write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
+}
+
+static struct wd_ops k7_wd_ops = {
+	.reserve = single_msr_reserve,
+	.unreserve = single_msr_unreserve,
+	.setup = setup_k7_watchdog,
+	.rearm = single_msr_rearm,
+	.stop = single_msr_stop_watchdog,
+	.perfctr = MSR_K7_PERFCTR0,
+	.evntsel = MSR_K7_EVNTSEL0,
+	.checkbit = 1ULL<<63,
+};
+
+/* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */
+
+#define P6_EVNTSEL0_ENABLE	(1 << 22)
+#define P6_EVNTSEL_INT		(1 << 20)
+#define P6_EVNTSEL_OS		(1 << 17)
+#define P6_EVNTSEL_USR		(1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
+#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+static int setup_p6_watchdog(unsigned nmi_hz)
+{
+	unsigned int perfctr_msr, evntsel_msr;
+	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	perfctr_msr = MSR_P6_PERFCTR0;
+	evntsel_msr = MSR_P6_EVNTSEL0;
+
+	wrmsrl(perfctr_msr, 0UL);
+
+	evntsel = P6_EVNTSEL_INT
+		| P6_EVNTSEL_OS
+		| P6_EVNTSEL_USR
+		| P6_NMI_EVENT;
+
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+	write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= P6_EVNTSEL0_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	return 1;
+}
+
+static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+	/* P6 based Pentium M need to re-unmask
+	 * the apic vector but it doesn't hurt
+	 * other P6 variant.
+	 * ArchPerfom/Core Duo also needs this */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	/* P6/ARCH_PERFMON has 32 bit counter write */
+	write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz);
+}
+
+static struct wd_ops p6_wd_ops = {
+	.reserve = single_msr_reserve,
+	.unreserve = single_msr_unreserve,
+	.setup = setup_p6_watchdog,
+	.rearm = p6_rearm,
+	.stop = single_msr_stop_watchdog,
+	.perfctr = MSR_P6_PERFCTR0,
+	.evntsel = MSR_P6_EVNTSEL0,
+	.checkbit = 1ULL<<39,
+};
+
+/* Intel P4 performance counters. By far the most complicated of all. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
+#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
+#define P4_ESCR_OS		(1<<3)
+#define P4_ESCR_USR		(1<<2)
+#define P4_CCCR_OVF_PMI0	(1<<26)
+#define P4_CCCR_OVF_PMI1	(1<<27)
+#define P4_CCCR_THRESHOLD(N)	((N)<<20)
+#define P4_CCCR_COMPLEMENT	(1<<19)
+#define P4_CCCR_COMPARE		(1<<18)
+#define P4_CCCR_REQUIRED	(3<<16)
+#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
+#define P4_CCCR_ENABLE		(1<<12)
+#define P4_CCCR_OVF 		(1<<31)
+
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+   CRU_ESCR0 (with any non-null event selector) through a complemented
+   max threshold. [IA32-Vol3, Section 14.9.9] */
+
+static int setup_p4_watchdog(unsigned nmi_hz)
+{
+	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+	unsigned int evntsel, cccr_val;
+	unsigned int misc_enable, dummy;
+	unsigned int ht_num;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
+	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+		return 0;
+
+#ifdef CONFIG_SMP
+	/* detect which hyperthread we are on */
+	if (smp_num_siblings == 2) {
+		unsigned int ebx, apicid;
+
+        	ebx = cpuid_ebx(1);
+	        apicid = (ebx >> 24) & 0xff;
+        	ht_num = apicid & 1;
+	} else
+#endif
+		ht_num = 0;
+
+	/* performance counters are shared resources
+	 * assign each hyperthread its own set
+	 * (re-use the ESCR0 register, seems safe
+	 * and keeps the cccr_val the same)
+	 */
+	if (!ht_num) {
+		/* logical cpu 0 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR0;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR0;
+		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+	} else {
+		/* logical cpu 1 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR1;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR1;
+		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+	}
+
+	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+	 	| P4_ESCR_OS
+		| P4_ESCR_USR;
+
+	cccr_val |= P4_CCCR_THRESHOLD(15)
+		 | P4_CCCR_COMPLEMENT
+		 | P4_CCCR_COMPARE
+		 | P4_CCCR_REQUIRED;
+
+	wrmsr(evntsel_msr, evntsel, 0);
+	wrmsr(cccr_msr, cccr_val, 0);
+	write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	cccr_val |= P4_CCCR_ENABLE;
+	wrmsr(cccr_msr, cccr_val, 0);
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = cccr_msr;
+	return 1;
+}
+
+static void stop_p4_watchdog(void *arg)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	wrmsr(wd->cccr_msr, 0, 0);
+	wrmsr(wd->evntsel_msr, 0, 0);
+}
+
+static int p4_reserve(void)
+{
+	if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
+		return 0;
+#ifdef CONFIG_SMP
+	if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
+		goto fail1;
+#endif
+	if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
+		goto fail2;
+	/* RED-PEN why is ESCR1 not reserved here? */
+	return 1;
+ fail2:
+#ifdef CONFIG_SMP
+	if (smp_num_siblings > 1)
+		release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
+ fail1:
+#endif
+	release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
+	return 0;
+}
+
+static void p4_unreserve(void)
+{
+#ifdef CONFIG_SMP
+	if (smp_num_siblings > 1)
+		release_evntsel_nmi(MSR_P4_IQ_PERFCTR1);
+#endif
+	release_evntsel_nmi(MSR_P4_IQ_PERFCTR0);
+	release_perfctr_nmi(MSR_P4_CRU_ESCR0);
+}
+
+static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+{
+	unsigned dummy;
+	/*
+ 	 * P4 quirks:
+	 * - An overflown perfctr will assert its interrupt
+	 *   until the OVF flag in its CCCR is cleared.
+	 * - LVTPC is masked on interrupt and must be
+	 *   unmasked by the LVTPC handler.
+	 */
+	rdmsrl(wd->cccr_msr, dummy);
+	dummy &= ~P4_CCCR_OVF;
+	wrmsrl(wd->cccr_msr, dummy);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	/* start the cycle over again */
+	write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
+}
+
+static struct wd_ops p4_wd_ops = {
+	.reserve = p4_reserve,
+	.unreserve = p4_unreserve,
+	.setup = setup_p4_watchdog,
+	.rearm = p4_rearm,
+	.stop = stop_p4_watchdog,
+	/* RED-PEN this is wrong for the other sibling */
+	.perfctr = MSR_P4_BPU_PERFCTR0,
+	.evntsel = MSR_P4_BSU_ESCR0,
+	.checkbit = 1ULL<<39,
+};
+
+/* Watchdog using the Intel architected PerfMon. Used for Core2 and hopefully
+   all future Intel CPUs. */
+
+#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
+static int setup_intel_arch_watchdog(unsigned nmi_hz)
+{
+	unsigned int ebx;
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int perfctr_msr, evntsel_msr;
+	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
+	 */
+	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		return 0;
+
+	perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1;
+	evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1;
+
+	wrmsrl(perfctr_msr, 0UL);
+
+	evntsel = ARCH_PERFMON_EVENTSEL_INT
+		| ARCH_PERFMON_EVENTSEL_OS
+		| ARCH_PERFMON_EVENTSEL_USR
+		| ARCH_PERFMON_NMI_EVENT_SEL
+		| ARCH_PERFMON_NMI_EVENT_UMASK;
+
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
+	write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1);
+	return 1;
+}
+
+static struct wd_ops intel_arch_wd_ops = {
+	.reserve = single_msr_reserve,
+	.unreserve = single_msr_unreserve,
+	.setup = setup_intel_arch_watchdog,
+	.rearm = p6_rearm,
+	.stop = single_msr_stop_watchdog,
+	.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+	.evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
+};
+
+static void probe_nmi_watchdog(void)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
+		    boot_cpu_data.x86 != 16)
+			return;
+		wd_ops = &k7_wd_ops;
+		break;
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			wd_ops = &intel_arch_wd_ops;
+			break;
+		}
+		switch (boot_cpu_data.x86) {
+		case 6:
+			if (boot_cpu_data.x86_model > 0xd)
+				return;
+
+			wd_ops = &p6_wd_ops;
+			break;
+		case 15:
+			if (boot_cpu_data.x86_model > 0x4)
+				return;
+
+			wd_ops = &p4_wd_ops;
+			break;
+		default:
+			return;
+		}
+		break;
+	}
+}
+
+/* Interface to nmi.c */
+
+int lapic_watchdog_init(unsigned nmi_hz)
+{
+	if (!wd_ops) {
+		probe_nmi_watchdog();
+		if (!wd_ops)
+			return -1;
+	}
+
+	if (!(wd_ops->setup(nmi_hz))) {
+		printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
+		       raw_smp_processor_id());
+		return -1;
+	}
+
+	return 0;
+}
+
+void lapic_watchdog_stop(void)
+{
+	if (wd_ops)
+		wd_ops->stop(NULL);
+}
+
+unsigned lapic_adjust_nmi_hz(unsigned hz)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+	    wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
+		hz = adjust_for_32bit_ctr(hz);
+	return hz;
+}
+
+int lapic_wd_event(unsigned nmi_hz)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	u64 ctr;
+	rdmsrl(wd->perfctr_msr, ctr);
+	if (ctr & wd_ops->checkbit) { /* perfctr still running? */
+		return 0;
+	}
+	wd_ops->rearm(wd, nmi_hz);
+	return 1;
+}
+
+int lapic_watchdog_ok(void)
+{
+	return wd_ops != NULL;
+}
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 47e3ebbfb28d..89d91e6cc972 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		"stc",
 		"100mhzsteps",
 		"hwpstate",
-		NULL,
-		NULL,	/* constant_tsc - moved to flags */
+		"",	/* constant_tsc - moved to flags */
 		/* nothing */
 	};
 	struct cpuinfo_x86 *c = v;
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
index 9317f7414989..50076f22e90f 100644
--- a/arch/i386/kernel/cpu/rise.c
+++ b/arch/i386/kernel/cpu/rise.c
@@ -50,12 +50,3 @@ int __init rise_init_cpu(void)
 	return 0;
 }
 
-//early_arch_initcall(rise_init_cpu);
-
-static int __init rise_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_RISE] = NULL;
-	return 0;
-}
-
-late_initcall(rise_exit_cpu);
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 5678d46863c6..6471a5a13202 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -112,13 +112,3 @@ int __init transmeta_init_cpu(void)
 	cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
 	return 0;
 }
-
-//early_arch_initcall(transmeta_init_cpu);
-
-static int __init transmeta_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_TRANSMETA] = NULL;
-	return 0;
-}
-
-late_initcall(transmeta_exit_cpu);
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
index 1bf3f87e9c5b..a7a4e75bdcd7 100644
--- a/arch/i386/kernel/cpu/umc.c
+++ b/arch/i386/kernel/cpu/umc.c
@@ -24,13 +24,3 @@ int __init umc_init_cpu(void)
 	cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
 	return 0;
 }
-
-//early_arch_initcall(umc_init_cpu);
-
-static int __init umc_exit_cpu(void)
-{
-	cpu_devs[X86_VENDOR_UMC] = NULL;
-	return 0;
-}
-
-late_initcall(umc_exit_cpu);
diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c
index b4d14c2eb345..265c5597efb0 100644
--- a/arch/i386/kernel/doublefault.c
+++ b/arch/i386/kernel/doublefault.c
@@ -33,7 +33,7 @@ static void doublefault_fn(void)
 		printk("double fault, tss at %08lx\n", tss);
 
 		if (ptr_ok(tss)) {
-			struct tss_struct *t = (struct tss_struct *)tss;
+			struct i386_hw_tss *t = (struct i386_hw_tss *)tss;
 
 			printk("eip = %08lx, esp = %08lx\n", t->eip, t->esp);
 
@@ -49,18 +49,21 @@ static void doublefault_fn(void)
 }
 
 struct tss_struct doublefault_tss __cacheline_aligned = {
-	.esp0		= STACK_START,
-	.ss0		= __KERNEL_DS,
-	.ldt		= 0,
-	.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
+	.x86_tss = {
+		.esp0		= STACK_START,
+		.ss0		= __KERNEL_DS,
+		.ldt		= 0,
+		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
 
-	.eip		= (unsigned long) doublefault_fn,
-	.eflags		= X86_EFLAGS_SF | 0x2,	/* 0x2 bit is always set */
-	.esp		= STACK_START,
-	.es		= __USER_DS,
-	.cs		= __KERNEL_CS,
-	.ss		= __KERNEL_DS,
-	.ds		= __USER_DS,
+		.eip		= (unsigned long) doublefault_fn,
+		/* 0x2 bit is always set */
+		.eflags		= X86_EFLAGS_SF | 0x2,
+		.esp		= STACK_START,
+		.es		= __USER_DS,
+		.cs		= __KERNEL_CS,
+		.ss		= __KERNEL_DS,
+		.ds		= __USER_DS,
 
-	.__cr3		= __pa(swapper_pg_dir)
+		.__cr3		= __pa(swapper_pg_dir)
+	}
 };
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index 70f39560846a..9645bb51f76a 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -161,26 +161,27 @@ static struct resource standard_io_resources[] = { {
 
 static int __init romsignature(const unsigned char *rom)
 {
+	const unsigned short * const ptr = (const unsigned short *)rom;
 	unsigned short sig;
 
-	return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
-	       sig == ROMSIGNATURE;
+	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
 }
 
-static int __init romchecksum(unsigned char *rom, unsigned long length)
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
 {
-	unsigned char sum;
+	unsigned char sum, c;
 
-	for (sum = 0; length; length--)
-		sum += *rom++;
-	return sum == 0;
+	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+		sum += c;
+	return !length && !sum;
 }
 
 static void __init probe_roms(void)
 {
+	const unsigned char *rom;
 	unsigned long start, length, upper;
-	unsigned char *rom;
-	int	      i;
+	unsigned char c;
+	int i;
 
 	/* video rom */
 	upper = adapter_rom_resources[0].start;
@@ -191,8 +192,11 @@ static void __init probe_roms(void)
 
 		video_rom_resource.start = start;
 
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
 		/* 0 < length <= 0x7f * 512, historically */
-		length = rom[2] * 512;
+		length = c * 512;
 
 		/* if checksum okay, trust length byte */
 		if (length && romchecksum(rom, length))
@@ -226,8 +230,11 @@ static void __init probe_roms(void)
 		if (!romsignature(rom))
 			continue;
 
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
 		/* 0 < length <= 0x7f * 512, historically */
-		length = rom[2] * 512;
+		length = c * 512;
 
 		/* but accept any length that fits if checksum okay */
 		if (!length || start + length > upper || !romchecksum(rom, length))
@@ -386,10 +393,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
 		   ____________________33__
 		   ______________________4_
 	*/
-	printk("sanitize start\n");
 	/* if there's only one memory region, don't bother */
 	if (*pnr_map < 2) {
-		printk("sanitize bail 0\n");
 		return -1;
 	}
 
@@ -398,7 +403,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
 	/* bail out if we find any unreasonable addresses in bios map */
 	for (i=0; i<old_nr; i++)
 		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
-			printk("sanitize bail 1\n");
 			return -1;
 		}
 
@@ -494,7 +498,6 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
 	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
 	*pnr_map = new_nr;
 
-	printk("sanitize end\n");
 	return 0;
 }
 
@@ -525,7 +528,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
 		unsigned long long size = biosmap->size;
 		unsigned long long end = start + size;
 		unsigned long type = biosmap->type;
-		printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
 
 		/* Overflow in 64 bits? Ignore the memory map. */
 		if (start > end)
@@ -536,17 +538,11 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
 		 * Not right. Fix it up.
 		 */
 		if (type == E820_RAM) {
-			printk("copy_e820_map() type is E820_RAM\n");
 			if (start < 0x100000ULL && end > 0xA0000ULL) {
-				printk("copy_e820_map() lies in range...\n");
-				if (start < 0xA0000ULL) {
-					printk("copy_e820_map() start < 0xA0000ULL\n");
+				if (start < 0xA0000ULL)
 					add_memory_region(start, 0xA0000ULL-start, type);
-				}
-				if (end <= 0x100000ULL) {
-					printk("copy_e820_map() end <= 0x100000ULL\n");
+				if (end <= 0x100000ULL)
 					continue;
-				}
 				start = 0x100000ULL;
 				size = end - start;
 			}
@@ -818,6 +814,26 @@ void __init limit_regions(unsigned long long size)
 	print_memory_map("limit_regions endfunc");
 }
 
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+	int i;
+	for (i = 0; i < e820.nr_map; i++) {
+		const struct e820entry *ei = &e820.map[i];
+		if (type && ei->type != type)
+			continue;
+		if (ei->addr >= end || ei->addr + ei->size <= start)
+			continue;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
  /*
   * This function checks if the entire range <start,end> is mapped with type.
   *
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8f9c624ace6f..dd9e7faafa7c 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -69,13 +69,11 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
 {
 	unsigned long cr4;
 	unsigned long temp;
-	struct Xgt_desc_struct *cpu_gdt_descr;
+	struct Xgt_desc_struct gdt_descr;
 
 	spin_lock(&efi_rt_lock);
 	local_irq_save(efi_rt_eflags);
 
-	cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
-
 	/*
 	 * If I don't have PSE, I should just duplicate two entries in page
 	 * directory. If I have PSE, I just need to duplicate one entry in
@@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
 	 */
 	local_flush_tlb();
 
-	cpu_gdt_descr->address = __pa(cpu_gdt_descr->address);
-	load_gdt(cpu_gdt_descr);
+	gdt_descr.address = __pa(get_cpu_gdt_table(0));
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
 }
 
 static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
 {
 	unsigned long cr4;
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
+	struct Xgt_desc_struct gdt_descr;
 
-	cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address);
-	load_gdt(cpu_gdt_descr);
+	gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
 
 	cr4 = read_cr4();
 
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 18bddcb8e9e8..b1f16ee65e4d 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -15,7 +15,7 @@
  * I changed all the .align's to 4 (16 byte alignment), as that's faster
  * on a 486.
  *
- * Stack layout in 'ret_from_system_call':
+ * Stack layout in 'syscall_exit':
  * 	ptrace needs to have all regs on the stack.
  *	if the order here is changed, it needs to be
  *	updated in fork.c:copy_process, signal.c:do_signal,
@@ -132,7 +132,7 @@ VM_MASK		= 0x00020000
 	movl $(__USER_DS), %edx; \
 	movl %edx, %ds; \
 	movl %edx, %es; \
-	movl $(__KERNEL_PDA), %edx; \
+	movl $(__KERNEL_PERCPU), %edx; \
 	movl %edx, %fs
 
 #define RESTORE_INT_REGS \
@@ -305,16 +305,12 @@ sysenter_past_esp:
 	pushl $(__USER_CS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET cs, 0*/
-#ifndef CONFIG_COMPAT_VDSO
 	/*
 	 * Push current_thread_info()->sysenter_return to the stack.
 	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
 	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
 	 */
 	pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
-#else
-	pushl $SYSENTER_RETURN
-#endif
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET eip, 0
 
@@ -342,7 +338,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
-	DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
@@ -560,9 +556,7 @@ END(syscall_badsys)
 
 #define FIXUP_ESPFIX_STACK \
 	/* since we are on a wrong stack, we cant make it a C code :( */ \
-	movl %fs:PDA_cpu, %ebx; \
-	PER_CPU(cpu_gdt_descr, %ebx); \
-	movl GDS_address(%ebx), %ebx; \
+	PER_CPU(gdt_page, %ebx); \
 	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
 	addl %esp, %eax; \
 	pushl $__KERNEL_DS; \
@@ -635,7 +629,7 @@ ENTRY(name)				\
 	SAVE_ALL;			\
 	TRACE_IRQS_OFF			\
 	movl %esp,%eax;			\
-	call smp_/**/name;		\
+	call smp_##name;		\
 	jmp ret_from_intr;		\
 	CFI_ENDPROC;			\
 ENDPROC(name)
@@ -643,11 +637,6 @@ ENDPROC(name)
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
-/* This alternate entry is needed because we hijack the apic LVTT */
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
-BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
-#endif
-
 KPROBE_ENTRY(page_fault)
 	RING0_EC_FRAME
 	pushl $do_page_fault
@@ -686,7 +675,7 @@ error_code:
 	pushl %fs
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET fs, 0*/
-	movl $(__KERNEL_PDA), %ecx
+	movl $(__KERNEL_PERCPU), %ecx
 	movl %ecx, %fs
 	UNWIND_ESPFIX_STACK
 	popl %ecx
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 3fa7f9389afe..9b10af65faaa 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -34,17 +34,32 @@
 
 /*
  * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.  We need one bit for
- * each possible page, but only in low memory, which means
- * 2^32/4096/8 = 128K worst case (4G/4G split.)
+ * and including _end* we need mapped initially.
+ * We need:
+ *  - one bit for each possible page, but only in low memory, which means
+ *     2^32/4096/8 = 128K worst case (4G/4G split.)
+ *  - enough space to map all low memory, which means
+ *     (2^32/4096) / 1024 pages (worst case, non PAE)
+ *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
+ *  - a few pages for allocator use before the kernel pagetable has
+ *     been set up
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
  */
-#define INIT_MAP_BEYOND_END	(128*1024)
+LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
 
+#if PTRS_PER_PMD > 1
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#else
+PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#endif
+BOOTBITMAP_SIZE = LOW_PAGES / 8
+ALLOCATOR_SLOP = 4
+
+INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -147,8 +162,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 /*
  * Non-boot CPU entry point; entered from trampoline.S
  * We can't lgdt here, because lgdt itself uses a data segment, but
- * we know the trampoline has already loaded the boot_gdt_table GDT
- * for us.
+ * we know the trampoline has already loaded the boot_gdt for us.
  *
  * If cpu hotplug is not supported then this code can go in init section
  * which will be freed later
@@ -318,12 +332,12 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%cr0
 
 	call check_x87
-	call setup_pda
 	lgdt early_gdt_descr
 	lidt idt_descr
 	ljmp $(__KERNEL_CS),$1f
 1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
 	movl %eax,%ss			# after changing gdt.
+	movl %eax,%fs			# gets reset once there's real percpu
 
 	movl $(__USER_DS),%eax		# DS/ES contains default USER segment
 	movl %eax,%ds
@@ -333,16 +347,17 @@ is386:	movl $2,%ecx		# set MP
 	movl %eax,%gs
 	lldt %ax
 
-	movl $(__KERNEL_PDA),%eax
-	mov  %eax,%fs
-
 	cld			# gcc2 wants the direction flag cleared at all times
 	pushl $0		# fake return address for unwinder
 #ifdef CONFIG_SMP
 	movb ready, %cl
 	movb $1, ready
 	cmpb $0,%cl		# the first CPU calls start_kernel
-	jne initialize_secondary # all other CPUs call initialize_secondary
+	je   1f
+	movl $(__KERNEL_PERCPU), %eax
+	movl %eax,%fs		# set this cpu's percpu
+	jmp initialize_secondary # all other CPUs call initialize_secondary
+1:
 #endif /* CONFIG_SMP */
 	jmp start_kernel
 
@@ -366,23 +381,6 @@ check_x87:
 	ret
 
 /*
- * Point the GDT at this CPU's PDA.  On boot this will be
- * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
- * that CPU's GDT and PDA.
- */
-ENTRY(setup_pda)
-	/* get the PDA pointer */
-	movl start_pda, %eax
-
-	/* slot the PDA address into the GDT */
-	mov early_gdt_descr+2, %ecx
-	mov %ax, (__KERNEL_PDA+0+2)(%ecx)		/* base & 0x0000ffff */
-	shr $16, %eax
-	mov %al, (__KERNEL_PDA+4+0)(%ecx)		/* base & 0x00ff0000 */
-	mov %ah, (__KERNEL_PDA+4+3)(%ecx)		/* base & 0xff000000 */
-	ret
-
-/*
  *  setup_idt
  *
  *  sets up a idt with 256 entries pointing to
@@ -554,9 +552,6 @@ ENTRY(empty_zero_page)
  * This starts the data section.
  */
 .data
-ENTRY(start_pda)
-	.long boot_pda
-
 ENTRY(stack_start)
 	.long init_thread_union+THREAD_SIZE
 	.long __BOOT_DS
@@ -588,7 +583,7 @@ fault_msg:
 	.word 0				# 32 bit align gdt_desc.address
 boot_gdt_descr:
 	.word __BOOT_DS+7
-	.long boot_gdt_table - __PAGE_OFFSET
+	.long boot_gdt - __PAGE_OFFSET
 
 	.word 0				# 32-bit align idt_desc.address
 idt_descr:
@@ -599,67 +594,14 @@ idt_descr:
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
-	.long cpu_gdt_table
+	.long per_cpu__gdt_page		/* Overwritten for secondary CPUs */
 
 /*
- * The boot_gdt_table must mirror the equivalent in setup.S and is
+ * The boot_gdt must mirror the equivalent in setup.S and is
  * used only for booting.
  */
 	.align L1_CACHE_BYTES
-ENTRY(boot_gdt_table)
+ENTRY(boot_gdt)
 	.fill GDT_ENTRY_BOOT_CS,8,0
 	.quad 0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
 	.quad 0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
-
-/*
- * The Global Descriptor Table contains 28 quadwords, per-CPU.
- */
-	.align L1_CACHE_BYTES
-ENTRY(cpu_gdt_table)
-	.quad 0x0000000000000000	/* NULL descriptor */
-	.quad 0x0000000000000000	/* 0x0b reserved */
-	.quad 0x0000000000000000	/* 0x13 reserved */
-	.quad 0x0000000000000000	/* 0x1b reserved */
-	.quad 0x0000000000000000	/* 0x20 unused */
-	.quad 0x0000000000000000	/* 0x28 unused */
-	.quad 0x0000000000000000	/* 0x33 TLS entry 1 */
-	.quad 0x0000000000000000	/* 0x3b TLS entry 2 */
-	.quad 0x0000000000000000	/* 0x43 TLS entry 3 */
-	.quad 0x0000000000000000	/* 0x4b reserved */
-	.quad 0x0000000000000000	/* 0x53 reserved */
-	.quad 0x0000000000000000	/* 0x5b reserved */
-
-	.quad 0x00cf9a000000ffff	/* 0x60 kernel 4GB code at 0x00000000 */
-	.quad 0x00cf92000000ffff	/* 0x68 kernel 4GB data at 0x00000000 */
-	.quad 0x00cffa000000ffff	/* 0x73 user 4GB code at 0x00000000 */
-	.quad 0x00cff2000000ffff	/* 0x7b user 4GB data at 0x00000000 */
-
-	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
-	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
-
-	/*
-	 * Segments used for calling PnP BIOS have byte granularity.
-	 * They code segments and data segments have fixed 64k limits,
-	 * the transfer segment sizes are set at run time.
-	 */
-	.quad 0x00409a000000ffff	/* 0x90 32-bit code */
-	.quad 0x00009a000000ffff	/* 0x98 16-bit code */
-	.quad 0x000092000000ffff	/* 0xa0 16-bit data */
-	.quad 0x0000920000000000	/* 0xa8 16-bit data */
-	.quad 0x0000920000000000	/* 0xb0 16-bit data */
-
-	/*
-	 * The APM segments have byte granularity and their bases
-	 * are set at run time.  All have 64k limits.
-	 */
-	.quad 0x00409a000000ffff	/* 0xb8 APM CS    code */
-	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
-	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
-
-	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
-	.quad 0x00cf92000000ffff	/* 0xd8 - PDA */
-	.quad 0x0000000000000000	/* 0xe0 - unused */
-	.quad 0x0000000000000000	/* 0xe8 - unused */
-	.quad 0x0000000000000000	/* 0xf0 - unused */
-	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
-
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 4afe26e86260..e3d4b73bfdb0 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed);
 #endif
 
 EXPORT_SYMBOL(csum_partial);
-
-EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 89d85d244926..1b623cda3a64 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -35,6 +35,7 @@
 #include <linux/msi.h>
 #include <linux/htirq.h>
 #include <linux/freezer.h>
+#include <linux/kthread.h>
 
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -661,8 +662,6 @@ static int balanced_irq(void *unused)
 	unsigned long prev_balance_time = jiffies;
 	long time_remaining = balanced_irq_interval;
 
-	daemonize("kirqd");
-	
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < NR_IRQS ; i++) {
 		irq_desc[i].pending_mask = cpumask_of_cpu(0);
@@ -722,10 +721,9 @@ static int __init balanced_irq_init(void)
 	}
 	
 	printk(KERN_INFO "Starting balanced_irq\n");
-	if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 
+	if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
 		return 0;
-	else 
-		printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
+	printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
 failed:
 	for_each_possible_cpu(i) {
 		kfree(irq_cpu_data[i].irq_delta);
@@ -1403,10 +1401,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	enable_8259A_irq(0);
 }
 
-static inline void UNEXPECTED_IO_APIC(void)
-{
-}
-
 void __init print_IO_APIC(void)
 {
 	int apic, i;
@@ -1446,34 +1440,12 @@ void __init print_IO_APIC(void)
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
 	printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
-	if (reg_00.bits.ID >= get_physical_broadcast())
-		UNEXPECTED_IO_APIC();
-	if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
-		UNEXPECTED_IO_APIC();
 
 	printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
 	printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
-	if (	(reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
-		(reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
-		(reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
-		(reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
-		(reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
-		(reg_01.bits.entries != 0x2E) &&
-		(reg_01.bits.entries != 0x3F)
-	)
-		UNEXPECTED_IO_APIC();
 
 	printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
 	printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
-	if (	(reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
-		(reg_01.bits.version != 0x10) && /* oldest IO-APICs */
-		(reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
-		(reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
-		(reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
-	)
-		UNEXPECTED_IO_APIC();
-	if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
-		UNEXPECTED_IO_APIC();
 
 	/*
 	 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1483,8 +1455,6 @@ void __init print_IO_APIC(void)
 	if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
 		printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
 		printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
-		if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
-			UNEXPECTED_IO_APIC();
 	}
 
 	/*
@@ -1496,8 +1466,6 @@ void __init print_IO_APIC(void)
 	    reg_03.raw != reg_01.raw) {
 		printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
 		printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
-		if (reg_03.bits.__reserved_1)
-			UNEXPECTED_IO_APIC();
 	}
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
index 498e8bc197d5..d1e42e0dbe67 100644
--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -16,6 +16,7 @@
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <linux/thread_info.h>
+#include <linux/syscalls.h>
 
 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
 static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
@@ -113,7 +114,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 	 * Reset the owner so that a process switch will not set
 	 * tss->io_bitmap_base to IO_BITMAP_OFFSET.
 	 */
-	tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
 	tss->io_bitmap_owner = NULL;
 
 	put_cpu();
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 8db8d514c9c0..d2daf672f4a2 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -24,6 +24,9 @@
 DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 4f5983c98669..0952eccd8f28 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
 		}
 		++mpc_record;
 	}
-	clustered_apic_check();
+	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "SMP mptable: no processors registered!\n");
 	return num_processors;
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 84c3497efb60..33cf2f3c444f 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -20,7 +20,6 @@
 #include <linux/sysdev.h>
 #include <linux/sysctl.h>
 #include <linux/percpu.h>
-#include <linux/dmi.h>
 #include <linux/kprobes.h>
 #include <linux/cpumask.h>
 #include <linux/kernel_stat.h>
@@ -28,30 +27,14 @@
 #include <asm/smp.h>
 #include <asm/nmi.h>
 #include <asm/kdebug.h>
-#include <asm/intel_arch_perfmon.h>
 
 #include "mach_traps.h"
 
 int unknown_nmi_panic;
 int nmi_watchdog_enabled;
 
-/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
- * evtsel_nmi_owner tracks the ownership of the event selection
- * - different performance counters/ event selection may be reserved for
- *   different subsystems this reservation system just tries to coordinate
- *   things a little
- */
-
-/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
- * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
- */
-#define NMI_MAX_COUNTER_BITS 66
-#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)
-
-static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);
-static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]);
-
 static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
 /* nmi_active:
  * >0: the lapic NMI watchdog is active, but can be disabled
  * <0: the lapic NMI watchdog has not been set up, and cannot
@@ -63,206 +46,11 @@ atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
 unsigned int nmi_watchdog = NMI_DEFAULT;
 static unsigned int nmi_hz = HZ;
 
-struct nmi_watchdog_ctlblk {
-	int enabled;
-	u64 check_bit;
-	unsigned int cccr_msr;
-	unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
-	unsigned int evntsel_msr;  /* the MSR to select the events to handle */
-};
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
+static DEFINE_PER_CPU(short, wd_enabled);
 
 /* local prototypes */
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
 
-extern void show_registers(struct pt_regs *regs);
-extern int unknown_nmi_panic;
-
-/* converts an msr to an appropriate reservation bit */
-static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
-{
-	/* returns the bit offset of the performance counter register */
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		return (msr - MSR_K7_PERFCTR0);
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-			return (msr - MSR_ARCH_PERFMON_PERFCTR0);
-
-		switch (boot_cpu_data.x86) {
-		case 6:
-			return (msr - MSR_P6_PERFCTR0);
-		case 15:
-			return (msr - MSR_P4_BPU_PERFCTR0);
-		}
-	}
-	return 0;
-}
-
-/* converts an msr to an appropriate reservation bit */
-static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
-{
-	/* returns the bit offset of the event selection register */
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		return (msr - MSR_K7_EVNTSEL0);
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-			return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
-
-		switch (boot_cpu_data.x86) {
-		case 6:
-			return (msr - MSR_P6_EVNTSEL0);
-		case 15:
-			return (msr - MSR_P4_BSU_ESCR0);
-		}
-	}
-	return 0;
-}
-
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
-	int cpu;
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-	for_each_possible_cpu (cpu) {
-		if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
-			return 0;
-	}
-	return 1;
-}
-
-/* checks the an msr for availability */
-int avail_to_resrv_perfctr_nmi(unsigned int msr)
-{
-	unsigned int counter;
-	int cpu;
-
-	counter = nmi_perfctr_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	for_each_possible_cpu (cpu) {
-		if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
-			return 0;
-	}
-	return 1;
-}
-
-static int __reserve_perfctr_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_perfctr_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]))
-		return 1;
-	return 0;
-}
-
-static void __release_perfctr_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_perfctr_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)[0]);
-}
-
-int reserve_perfctr_nmi(unsigned int msr)
-{
-	int cpu, i;
-	for_each_possible_cpu (cpu) {
-		if (!__reserve_perfctr_nmi(cpu, msr)) {
-			for_each_possible_cpu (i) {
-				if (i >= cpu)
-					break;
-				__release_perfctr_nmi(i, msr);
-			}
-			return 0;
-		}
-	}
-	return 1;
-}
-
-void release_perfctr_nmi(unsigned int msr)
-{
-	int cpu;
-	for_each_possible_cpu (cpu) {
-		__release_perfctr_nmi(cpu, msr);
-	}
-}
-
-int __reserve_evntsel_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_evntsel_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]))
-		return 1;
-	return 0;
-}
-
-static void __release_evntsel_nmi(int cpu, unsigned int msr)
-{
-	unsigned int counter;
-	if (cpu < 0)
-		cpu = smp_processor_id();
-
-	counter = nmi_evntsel_msr_to_bit(msr);
-	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
-	clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]);
-}
-
-int reserve_evntsel_nmi(unsigned int msr)
-{
-	int cpu, i;
-	for_each_possible_cpu (cpu) {
-		if (!__reserve_evntsel_nmi(cpu, msr)) {
-			for_each_possible_cpu (i) {
-				if (i >= cpu)
-					break;
-				__release_evntsel_nmi(i, msr);
-			}
-			return 0;
-		}
-	}
-	return 1;
-}
-
-void release_evntsel_nmi(unsigned int msr)
-{
-	int cpu;
-	for_each_possible_cpu (cpu) {
-		__release_evntsel_nmi(cpu, msr);
-	}
-}
-
-static __cpuinit inline int nmi_known_cpu(void)
-{
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
-			|| (boot_cpu_data.x86 == 16));
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-			return 1;
-		else
-			return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
-	}
-	return 0;
-}
-
 static int endflag __initdata = 0;
 
 #ifdef CONFIG_SMP
@@ -284,28 +72,6 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
-	u64 counter_val;
-	unsigned int retval = hz;
-
-	/*
-	 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
-	 * are writable, with higher bits sign extending from bit 31.
-	 * So, we can only program the counter with 31 bit values and
-	 * 32nd bit should be 1, for 33.. to be 1.
-	 * Find the appropriate nmi_hz
-	 */
-	counter_val = (u64)cpu_khz * 1000;
-	do_div(counter_val, retval);
- 	if (counter_val > 0x7fffffffULL) {
-		u64 count = (u64)cpu_khz * 1000;
-		do_div(count, 0x7fffffffUL);
-		retval = count + 1;
-	}
-	return retval;
-}
-
 static int __init check_nmi_watchdog(void)
 {
 	unsigned int *prev_nmi_count;
@@ -338,14 +104,14 @@ static int __init check_nmi_watchdog(void)
 		if (!cpu_isset(cpu, cpu_callin_map))
 			continue;
 #endif
-		if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+		if (!per_cpu(wd_enabled, cpu))
 			continue;
 		if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
 			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
 				cpu,
 				prev_nmi_count[cpu],
 				nmi_count(cpu));
-			per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+			per_cpu(wd_enabled, cpu) = 0;
 			atomic_dec(&nmi_active);
 		}
 	}
@@ -359,16 +125,8 @@ static int __init check_nmi_watchdog(void)
 
 	/* now that we know it works we can reduce NMI frequency to
 	   something more reasonable; makes a difference in some configs */
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-		nmi_hz = 1;
-
-		if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
-		    wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
-			nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-		}
-	}
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		nmi_hz = lapic_adjust_nmi_hz(1);
 
 	kfree(prev_nmi_count);
 	return 0;
@@ -391,85 +149,8 @@ static int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
-static void disable_lapic_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-	if (atomic_read(&nmi_active) <= 0)
-		return;
-
-	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
-
-	BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-static void enable_lapic_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-	/* are we already enabled */
-	if (atomic_read(&nmi_active) != 0)
-		return;
-
-	/* are we lapic aware */
-	if (nmi_known_cpu() <= 0)
-		return;
 
-	on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
-	touch_nmi_watchdog();
-}
-
-void disable_timer_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_IO_APIC);
-
-	if (atomic_read(&nmi_active) <= 0)
-		return;
-
-	disable_irq(0);
-	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
-
-	BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_timer_nmi_watchdog(void)
-{
-	BUG_ON(nmi_watchdog != NMI_IO_APIC);
-
-	if (atomic_read(&nmi_active) == 0) {
-		touch_nmi_watchdog();
-		on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
-		enable_irq(0);
-	}
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
-	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_enable(void *__unused)
-{
-	apic_write_around(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
+/* Suspend/resume support */
 
 #ifdef CONFIG_PM
 
@@ -516,7 +197,7 @@ static int __init init_lapic_nmi_sysfs(void)
 	if (nmi_watchdog != NMI_LOCAL_APIC)
 		return 0;
 
-	if ( atomic_read(&nmi_active) < 0 )
+	if (atomic_read(&nmi_active) < 0)
 		return 0;
 
 	error = sysdev_class_register(&nmi_sysclass);
@@ -529,433 +210,69 @@ late_initcall(init_lapic_nmi_sysfs);
 
 #endif	/* CONFIG_PM */
 
-/*
- * Activate the NMI watchdog via the local APIC.
- * Original code written by Keith Owens.
- */
-
-static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
-{
-	u64 count = (u64)cpu_khz * 1000;
-
-	do_div(count, nmi_hz);
-	if(descr)
-		Dprintk("setting %s to -0x%08Lx\n", descr, count);
-	wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
-		const char *descr)
-{
-	u64 count = (u64)cpu_khz * 1000;
-
-	do_div(count, nmi_hz);
-	if(descr)
-		Dprintk("setting %s to -0x%08Lx\n", descr, count);
-	wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/* Note that these events don't tick when the CPU idles. This means
-   the frequency varies with CPU load. */
-
-#define K7_EVNTSEL_ENABLE	(1 << 22)
-#define K7_EVNTSEL_INT		(1 << 20)
-#define K7_EVNTSEL_OS		(1 << 17)
-#define K7_EVNTSEL_USR		(1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
-#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(void)
-{
-	unsigned int perfctr_msr, evntsel_msr;
-	unsigned int evntsel;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	perfctr_msr = MSR_K7_PERFCTR0;
-	evntsel_msr = MSR_K7_EVNTSEL0;
-	if (!__reserve_perfctr_nmi(-1, perfctr_msr))
-		goto fail;
-
-	if (!__reserve_evntsel_nmi(-1, evntsel_msr))
-		goto fail1;
-
-	wrmsrl(perfctr_msr, 0UL);
-
-	evntsel = K7_EVNTSEL_INT
-		| K7_EVNTSEL_OS
-		| K7_EVNTSEL_USR
-		| K7_NMI_EVENT;
-
-	/* setup the timer */
-	wrmsr(evntsel_msr, evntsel, 0);
-	write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= K7_EVNTSEL_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
-
-	wd->perfctr_msr = perfctr_msr;
-	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
-	wd->check_bit = 1ULL<<63;
-	return 1;
-fail1:
-	__release_perfctr_nmi(-1, perfctr_msr);
-fail:
-	return 0;
-}
-
-static void stop_k7_watchdog(void)
-{
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	wrmsr(wd->evntsel_msr, 0, 0);
-
-	__release_evntsel_nmi(-1, wd->evntsel_msr);
-	__release_perfctr_nmi(-1, wd->perfctr_msr);
-}
-
-#define P6_EVNTSEL0_ENABLE	(1 << 22)
-#define P6_EVNTSEL_INT		(1 << 20)
-#define P6_EVNTSEL_OS		(1 << 17)
-#define P6_EVNTSEL_USR		(1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
-#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(void)
-{
-	unsigned int perfctr_msr, evntsel_msr;
-	unsigned int evntsel;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	perfctr_msr = MSR_P6_PERFCTR0;
-	evntsel_msr = MSR_P6_EVNTSEL0;
-	if (!__reserve_perfctr_nmi(-1, perfctr_msr))
-		goto fail;
-
-	if (!__reserve_evntsel_nmi(-1, evntsel_msr))
-		goto fail1;
-
-	wrmsrl(perfctr_msr, 0UL);
-
-	evntsel = P6_EVNTSEL_INT
-		| P6_EVNTSEL_OS
-		| P6_EVNTSEL_USR
-		| P6_NMI_EVENT;
-
-	/* setup the timer */
-	wrmsr(evntsel_msr, evntsel, 0);
-	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-	write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= P6_EVNTSEL0_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
-
-	wd->perfctr_msr = perfctr_msr;
-	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
-	wd->check_bit = 1ULL<<39;
-	return 1;
-fail1:
-	__release_perfctr_nmi(-1, perfctr_msr);
-fail:
-	return 0;
-}
-
-static void stop_p6_watchdog(void)
-{
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	wrmsr(wd->evntsel_msr, 0, 0);
-
-	__release_evntsel_nmi(-1, wd->evntsel_msr);
-	__release_perfctr_nmi(-1, wd->perfctr_msr);
-}
-
-/* Note that these events don't tick when the CPU idles. This means
-   the frequency varies with CPU load. */
-
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
-#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
-#define P4_ESCR_OS		(1<<3)
-#define P4_ESCR_USR		(1<<2)
-#define P4_CCCR_OVF_PMI0	(1<<26)
-#define P4_CCCR_OVF_PMI1	(1<<27)
-#define P4_CCCR_THRESHOLD(N)	((N)<<20)
-#define P4_CCCR_COMPLEMENT	(1<<19)
-#define P4_CCCR_COMPARE		(1<<18)
-#define P4_CCCR_REQUIRED	(3<<16)
-#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
-#define P4_CCCR_ENABLE		(1<<12)
-#define P4_CCCR_OVF 		(1<<31)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
-   CRU_ESCR0 (with any non-null event selector) through a complemented
-   max threshold. [IA32-Vol3, Section 14.9.9] */
-
-static int setup_p4_watchdog(void)
+static void __acpi_nmi_enable(void *__unused)
 {
-	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
-	unsigned int evntsel, cccr_val;
-	unsigned int misc_enable, dummy;
-	unsigned int ht_num;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
-	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
-		return 0;
-
-#ifdef CONFIG_SMP
-	/* detect which hyperthread we are on */
-	if (smp_num_siblings == 2) {
-		unsigned int ebx, apicid;
-
-        	ebx = cpuid_ebx(1);
-	        apicid = (ebx >> 24) & 0xff;
-        	ht_num = apicid & 1;
-	} else
-#endif
-		ht_num = 0;
-
-	/* performance counters are shared resources
-	 * assign each hyperthread its own set
-	 * (re-use the ESCR0 register, seems safe
-	 * and keeps the cccr_val the same)
-	 */
-	if (!ht_num) {
-		/* logical cpu 0 */
-		perfctr_msr = MSR_P4_IQ_PERFCTR0;
-		evntsel_msr = MSR_P4_CRU_ESCR0;
-		cccr_msr = MSR_P4_IQ_CCCR0;
-		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-	} else {
-		/* logical cpu 1 */
-		perfctr_msr = MSR_P4_IQ_PERFCTR1;
-		evntsel_msr = MSR_P4_CRU_ESCR0;
-		cccr_msr = MSR_P4_IQ_CCCR1;
-		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
-	}
-
-	if (!__reserve_perfctr_nmi(-1, perfctr_msr))
-		goto fail;
-
-	if (!__reserve_evntsel_nmi(-1, evntsel_msr))
-		goto fail1;
-
-	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
-	 	| P4_ESCR_OS
-		| P4_ESCR_USR;
-
-	cccr_val |= P4_CCCR_THRESHOLD(15)
-		 | P4_CCCR_COMPLEMENT
-		 | P4_CCCR_COMPARE
-		 | P4_CCCR_REQUIRED;
-
-	wrmsr(evntsel_msr, evntsel, 0);
-	wrmsr(cccr_msr, cccr_val, 0);
-	write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	cccr_val |= P4_CCCR_ENABLE;
-	wrmsr(cccr_msr, cccr_val, 0);
-	wd->perfctr_msr = perfctr_msr;
-	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = cccr_msr;
-	wd->check_bit = 1ULL<<39;
-	return 1;
-fail1:
-	__release_perfctr_nmi(-1, perfctr_msr);
-fail:
-	return 0;
+	apic_write_around(APIC_LVT0, APIC_DM_NMI);
 }
 
-static void stop_p4_watchdog(void)
+/*
+ * Enable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_enable(void)
 {
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	wrmsr(wd->cccr_msr, 0, 0);
-	wrmsr(wd->evntsel_msr, 0, 0);
-
-	__release_evntsel_nmi(-1, wd->evntsel_msr);
-	__release_perfctr_nmi(-1, wd->perfctr_msr);
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
 }
 
-#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static int setup_intel_arch_watchdog(void)
+static void __acpi_nmi_disable(void *__unused)
 {
-	unsigned int ebx;
-	union cpuid10_eax eax;
-	unsigned int unused;
-	unsigned int perfctr_msr, evntsel_msr;
-	unsigned int evntsel;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
-	 */
-	cpuid(10, &(eax.full), &ebx, &unused, &unused);
-	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
-	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		goto fail;
-
-	perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
-	evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
-
-	if (!__reserve_perfctr_nmi(-1, perfctr_msr))
-		goto fail;
-
-	if (!__reserve_evntsel_nmi(-1, evntsel_msr))
-		goto fail1;
-
-	wrmsrl(perfctr_msr, 0UL);
-
-	evntsel = ARCH_PERFMON_EVENTSEL_INT
-		| ARCH_PERFMON_EVENTSEL_OS
-		| ARCH_PERFMON_EVENTSEL_USR
-		| ARCH_PERFMON_NMI_EVENT_SEL
-		| ARCH_PERFMON_NMI_EVENT_UMASK;
-
-	/* setup the timer */
-	wrmsr(evntsel_msr, evntsel, 0);
-	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-	write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
-
-	wd->perfctr_msr = perfctr_msr;
-	wd->evntsel_msr = evntsel_msr;
-	wd->cccr_msr = 0;  //unused
-	wd->check_bit = 1ULL << (eax.split.bit_width - 1);
-	return 1;
-fail1:
-	__release_perfctr_nmi(-1, perfctr_msr);
-fail:
-	return 0;
+	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
 }
 
-static void stop_intel_arch_watchdog(void)
+/*
+ * Disable timer based NMIs on all CPUs:
+ */
+void acpi_nmi_disable(void)
 {
-	unsigned int ebx;
-	union cpuid10_eax eax;
-	unsigned int unused;
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
-	 */
-	cpuid(10, &(eax.full), &ebx, &unused, &unused);
-	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
-	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		return;
-
-	wrmsr(wd->evntsel_msr, 0, 0);
-	__release_evntsel_nmi(-1, wd->evntsel_msr);
-	__release_perfctr_nmi(-1, wd->perfctr_msr);
+	if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
+		on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
 }
 
 void setup_apic_nmi_watchdog (void *unused)
 {
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-	/* only support LOCAL and IO APICs for now */
-	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
-	    (nmi_watchdog != NMI_IO_APIC))
-	    	return;
-
-	if (wd->enabled == 1)
-		return;
+	if (__get_cpu_var(wd_enabled))
+ 		return;
 
 	/* cheap hack to support suspend/resume */
 	/* if cpu0 is not active neither should the other cpus */
 	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
 		return;
 
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		switch (boot_cpu_data.x86_vendor) {
-		case X86_VENDOR_AMD:
-			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
-				boot_cpu_data.x86 != 16)
-				return;
-			if (!setup_k7_watchdog())
-				return;
-			break;
-		case X86_VENDOR_INTEL:
-			if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-				if (!setup_intel_arch_watchdog())
-					return;
-				break;
-			}
-			switch (boot_cpu_data.x86) {
-			case 6:
-				if (boot_cpu_data.x86_model > 0xd)
-					return;
-
-				if (!setup_p6_watchdog())
-					return;
-				break;
-			case 15:
-				if (boot_cpu_data.x86_model > 0x4)
-					return;
-
-				if (!setup_p4_watchdog())
-					return;
-				break;
-			default:
-				return;
-			}
-			break;
-		default:
+	switch (nmi_watchdog) {
+	case NMI_LOCAL_APIC:
+		__get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */
+		if (lapic_watchdog_init(nmi_hz) < 0) {
+			__get_cpu_var(wd_enabled) = 0;
 			return;
 		}
+		/* FALL THROUGH */
+	case NMI_IO_APIC:
+		__get_cpu_var(wd_enabled) = 1;
+		atomic_inc(&nmi_active);
 	}
-	wd->enabled = 1;
-	atomic_inc(&nmi_active);
 }
 
 void stop_apic_nmi_watchdog(void *unused)
 {
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
 	/* only support LOCAL and IO APICs for now */
 	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
 	    (nmi_watchdog != NMI_IO_APIC))
 	    	return;
-
-	if (wd->enabled == 0)
+	if (__get_cpu_var(wd_enabled) == 0)
 		return;
-
-	if (nmi_watchdog == NMI_LOCAL_APIC) {
-		switch (boot_cpu_data.x86_vendor) {
-		case X86_VENDOR_AMD:
-			stop_k7_watchdog();
-			break;
-		case X86_VENDOR_INTEL:
-			if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-				stop_intel_arch_watchdog();
-				break;
-			}
-			switch (boot_cpu_data.x86) {
-			case 6:
-				if (boot_cpu_data.x86_model > 0xd)
-					break;
-				stop_p6_watchdog();
-				break;
-			case 15:
-				if (boot_cpu_data.x86_model > 0x4)
-					break;
-				stop_p4_watchdog();
-				break;
-			}
-			break;
-		default:
-			return;
-		}
-	}
-	wd->enabled = 0;
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		lapic_watchdog_stop();
+	__get_cpu_var(wd_enabled) = 0;
 	atomic_dec(&nmi_active);
 }
 
@@ -1011,8 +328,6 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 	unsigned int sum;
 	int touched = 0;
 	int cpu = smp_processor_id();
-	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-	u64 dummy;
 	int rc=0;
 
 	/* check for other users first */
@@ -1055,53 +370,20 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 		alert_counter[cpu] = 0;
 	}
 	/* see if the nmi watchdog went off */
-	if (wd->enabled) {
-		if (nmi_watchdog == NMI_LOCAL_APIC) {
-			rdmsrl(wd->perfctr_msr, dummy);
-			if (dummy & wd->check_bit){
-				/* this wasn't a watchdog timer interrupt */
-				goto done;
-			}
-
-			/* only Intel P4 uses the cccr msr */
-	 		if (wd->cccr_msr != 0) {
-	 			/*
-	 			 * P4 quirks:
-	 			 * - An overflown perfctr will assert its interrupt
-	 			 *   until the OVF flag in its CCCR is cleared.
-	 			 * - LVTPC is masked on interrupt and must be
-	 			 *   unmasked by the LVTPC handler.
-	 			 */
-				rdmsrl(wd->cccr_msr, dummy);
-				dummy &= ~P4_CCCR_OVF;
-	 			wrmsrl(wd->cccr_msr, dummy);
-	 			apic_write(APIC_LVTPC, APIC_DM_NMI);
-				/* start the cycle over again */
-				write_watchdog_counter(wd->perfctr_msr, NULL);
-	 		}
-			else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
-				 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
-				/* P6 based Pentium M need to re-unmask
-				 * the apic vector but it doesn't hurt
-				 * other P6 variant.
-				 * ArchPerfom/Core Duo also needs this */
-				apic_write(APIC_LVTPC, APIC_DM_NMI);
-				/* P6/ARCH_PERFMON has 32 bit counter write */
-				write_watchdog_counter32(wd->perfctr_msr, NULL);
-			} else {
-				/* start the cycle over again */
-				write_watchdog_counter(wd->perfctr_msr, NULL);
-			}
-			rc = 1;
-		} else if (nmi_watchdog == NMI_IO_APIC) {
-			/* don't know how to accurately check for this.
-			 * just assume it was a watchdog timer interrupt
-			 * This matches the old behaviour.
-			 */
-			rc = 1;
-		}
+	if (!__get_cpu_var(wd_enabled))
+		return rc;
+	switch (nmi_watchdog) {
+	case NMI_LOCAL_APIC:
+		rc |= lapic_wd_event(nmi_hz);
+		break;
+	case NMI_IO_APIC:
+		/* don't know how to accurately check for this.
+		 * just assume it was a watchdog timer interrupt
+		 * This matches the old behaviour.
+		 */
+		rc = 1;
+		break;
 	}
-done:
 	return rc;
 }
 
@@ -1146,7 +428,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 	}
 
 	if (nmi_watchdog == NMI_DEFAULT) {
-		if (nmi_known_cpu() > 0)
+		if (lapic_watchdog_ok())
 			nmi_watchdog = NMI_LOCAL_APIC;
 		else
 			nmi_watchdog = NMI_IO_APIC;
@@ -1182,11 +464,3 @@ void __trigger_all_cpu_backtrace(void)
 
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-EXPORT_SYMBOL(reserve_perfctr_nmi);
-EXPORT_SYMBOL(release_perfctr_nmi);
-EXPORT_SYMBOL(reserve_evntsel_nmi);
-EXPORT_SYMBOL(release_evntsel_nmi);
-EXPORT_SYMBOL(disable_timer_nmi_watchdog);
-EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 2ec331e03fa9..5c10f376bce1 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -20,6 +20,7 @@
 #include <linux/efi.h>
 #include <linux/bcd.h>
 #include <linux/start_kernel.h>
+#include <linux/highmem.h>
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
@@ -35,7 +36,7 @@
 #include <asm/timer.h>
 
 /* nop stub */
-static void native_nop(void)
+void _paravirt_nop(void)
 {
 }
 
@@ -54,331 +55,148 @@ char *memory_setup(void)
 #define DEF_NATIVE(name, code)					\
 	extern const char start_##name[], end_##name[];		\
 	asm("start_" #name ": " code "; end_" #name ":")
-DEF_NATIVE(cli, "cli");
-DEF_NATIVE(sti, "sti");
-DEF_NATIVE(popf, "push %eax; popf");
-DEF_NATIVE(pushf, "pushf; pop %eax");
-DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+
+DEF_NATIVE(irq_disable, "cli");
+DEF_NATIVE(irq_enable, "sti");
+DEF_NATIVE(restore_fl, "push %eax; popf");
+DEF_NATIVE(save_fl, "pushf; pop %eax");
 DEF_NATIVE(iret, "iret");
-DEF_NATIVE(sti_sysexit, "sti; sysexit");
+DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(clts, "clts");
+DEF_NATIVE(read_tsc, "rdtsc");
 
-static const struct native_insns
-{
-	const char *start, *end;
-} native_insns[] = {
-	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
-	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
-	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
-	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
-	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
-	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
-	[PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
-};
+DEF_NATIVE(ud2a, "ud2a");
 
 static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
 {
-	unsigned int insn_len;
-
-	/* Don't touch it if we don't have a replacement */
-	if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
-		return len;
-
-	insn_len = native_insns[type].end - native_insns[type].start;
-
-	/* Similarly if we can't fit replacement. */
-	if (len < insn_len)
-		return len;
+	const unsigned char *start, *end;
+	unsigned ret;
+
+	switch(type) {
+#define SITE(x)	case PARAVIRT_PATCH(x):	start = start_##x; end = end_##x; goto patch_site
+		SITE(irq_disable);
+		SITE(irq_enable);
+		SITE(restore_fl);
+		SITE(save_fl);
+		SITE(iret);
+		SITE(irq_enable_sysexit);
+		SITE(read_cr2);
+		SITE(read_cr3);
+		SITE(write_cr3);
+		SITE(clts);
+		SITE(read_tsc);
+#undef SITE
+
+	patch_site:
+		ret = paravirt_patch_insns(insns, len, start, end);
+		break;
 
-	memcpy(insns, native_insns[type].start, insn_len);
-	return insn_len;
-}
+	case PARAVIRT_PATCH(make_pgd):
+	case PARAVIRT_PATCH(make_pte):
+	case PARAVIRT_PATCH(pgd_val):
+	case PARAVIRT_PATCH(pte_val):
+#ifdef CONFIG_X86_PAE
+	case PARAVIRT_PATCH(make_pmd):
+	case PARAVIRT_PATCH(pmd_val):
+#endif
+		/* These functions end up returning exactly what
+		   they're passed, in the same registers. */
+		ret = paravirt_patch_nop();
+		break;
 
-static unsigned long native_get_debugreg(int regno)
-{
-	unsigned long val = 0; 	/* Damn you, gcc! */
-
-	switch (regno) {
-	case 0:
-		asm("movl %%db0, %0" :"=r" (val)); break;
-	case 1:
-		asm("movl %%db1, %0" :"=r" (val)); break;
-	case 2:
-		asm("movl %%db2, %0" :"=r" (val)); break;
-	case 3:
-		asm("movl %%db3, %0" :"=r" (val)); break;
-	case 6:
-		asm("movl %%db6, %0" :"=r" (val)); break;
-	case 7:
-		asm("movl %%db7, %0" :"=r" (val)); break;
 	default:
-		BUG();
-	}
-	return val;
-}
-
-static void native_set_debugreg(int regno, unsigned long value)
-{
-	switch (regno) {
-	case 0:
-		asm("movl %0,%%db0"	: /* no output */ :"r" (value));
-		break;
-	case 1:
-		asm("movl %0,%%db1"	: /* no output */ :"r" (value));
-		break;
-	case 2:
-		asm("movl %0,%%db2"	: /* no output */ :"r" (value));
+		ret = paravirt_patch_default(type, clobbers, insns, len);
 		break;
-	case 3:
-		asm("movl %0,%%db3"	: /* no output */ :"r" (value));
-		break;
-	case 6:
-		asm("movl %0,%%db6"	: /* no output */ :"r" (value));
-		break;
-	case 7:
-		asm("movl %0,%%db7"	: /* no output */ :"r" (value));
-		break;
-	default:
-		BUG();
 	}
-}
-
-void init_IRQ(void)
-{
-	paravirt_ops.init_IRQ();
-}
-
-static void native_clts(void)
-{
-	asm volatile ("clts");
-}
-
-static unsigned long native_read_cr0(void)
-{
-	unsigned long val;
-	asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
-	return val;
-}
-
-static void native_write_cr0(unsigned long val)
-{
-	asm volatile("movl %0,%%cr0": :"r" (val));
-}
-
-static unsigned long native_read_cr2(void)
-{
-	unsigned long val;
-	asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
-	return val;
-}
-
-static void native_write_cr2(unsigned long val)
-{
-	asm volatile("movl %0,%%cr2": :"r" (val));
-}
-
-static unsigned long native_read_cr3(void)
-{
-	unsigned long val;
-	asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
-	return val;
-}
-
-static void native_write_cr3(unsigned long val)
-{
-	asm volatile("movl %0,%%cr3": :"r" (val));
-}
-
-static unsigned long native_read_cr4(void)
-{
-	unsigned long val;
-	asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
-	return val;
-}
-
-static unsigned long native_read_cr4_safe(void)
-{
-	unsigned long val;
-	/* This could fault if %cr4 does not exist */
-	asm("1: movl %%cr4, %0		\n"
-		"2:				\n"
-		".section __ex_table,\"a\"	\n"
-		".long 1b,2b			\n"
-		".previous			\n"
-		: "=r" (val): "0" (0));
-	return val;
-}
-
-static void native_write_cr4(unsigned long val)
-{
-	asm volatile("movl %0,%%cr4": :"r" (val));
-}
-
-static unsigned long native_save_fl(void)
-{
-	unsigned long f;
-	asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
-	return f;
-}
-
-static void native_restore_fl(unsigned long f)
-{
-	asm volatile("pushl %0 ; popfl": /* no output */
-			     :"g" (f)
-			     :"memory", "cc");
-}
-
-static void native_irq_disable(void)
-{
-	asm volatile("cli": : :"memory");
-}
-
-static void native_irq_enable(void)
-{
-	asm volatile("sti": : :"memory");
-}
-
-static void native_safe_halt(void)
-{
-	asm volatile("sti; hlt": : :"memory");
-}
 
-static void native_halt(void)
-{
-	asm volatile("hlt": : :"memory");
+	return ret;
 }
 
-static void native_wbinvd(void)
+unsigned paravirt_patch_nop(void)
 {
-	asm volatile("wbinvd": : :"memory");
+	return 0;
 }
 
-static unsigned long long native_read_msr(unsigned int msr, int *err)
+unsigned paravirt_patch_ignore(unsigned len)
 {
-	unsigned long long val;
-
-	asm volatile("2: rdmsr ; xorl %0,%0\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3:  movl %3,%0 ; jmp 1b\n\t"
-		     ".previous\n\t"
- 		     ".section __ex_table,\"a\"\n"
-		     "   .align 4\n\t"
-		     "   .long 	2b,3b\n\t"
-		     ".previous"
-		     : "=r" (*err), "=A" (val)
-		     : "c" (msr), "i" (-EFAULT));
-
-	return val;
+	return len;
 }
 
-static int native_write_msr(unsigned int msr, unsigned long long val)
+unsigned paravirt_patch_call(void *target, u16 tgt_clobbers,
+			     void *site, u16 site_clobbers,
+			     unsigned len)
 {
-	int err;
-	asm volatile("2: wrmsr ; xorl %0,%0\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3:  movl %4,%0 ; jmp 1b\n\t"
-		     ".previous\n\t"
- 		     ".section __ex_table,\"a\"\n"
-		     "   .align 4\n\t"
-		     "   .long 	2b,3b\n\t"
-		     ".previous"
-		     : "=a" (err)
-		     : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
-		       "i" (-EFAULT));
-	return err;
-}
+	unsigned char *call = site;
+	unsigned long delta = (unsigned long)target - (unsigned long)(call+5);
 
-static unsigned long long native_read_tsc(void)
-{
-	unsigned long long val;
-	asm volatile("rdtsc" : "=A" (val));
-	return val;
-}
+	if (tgt_clobbers & ~site_clobbers)
+		return len;	/* target would clobber too much for this site */
+	if (len < 5)
+		return len;	/* call too long for patch site */
 
-static unsigned long long native_read_pmc(void)
-{
-	unsigned long long val;
-	asm volatile("rdpmc" : "=A" (val));
-	return val;
-}
+	*call++ = 0xe8;		/* call */
+	*(unsigned long *)call = delta;
 
-static void native_load_tr_desc(void)
-{
-	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+	return 5;
 }
 
-static void native_load_gdt(const struct Xgt_desc_struct *dtr)
+unsigned paravirt_patch_jmp(void *target, void *site, unsigned len)
 {
-	asm volatile("lgdt %0"::"m" (*dtr));
-}
+	unsigned char *jmp = site;
+	unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5);
 
-static void native_load_idt(const struct Xgt_desc_struct *dtr)
-{
-	asm volatile("lidt %0"::"m" (*dtr));
-}
+	if (len < 5)
+		return len;	/* call too long for patch site */
 
-static void native_store_gdt(struct Xgt_desc_struct *dtr)
-{
-	asm ("sgdt %0":"=m" (*dtr));
-}
+	*jmp++ = 0xe9;		/* jmp */
+	*(unsigned long *)jmp = delta;
 
-static void native_store_idt(struct Xgt_desc_struct *dtr)
-{
-	asm ("sidt %0":"=m" (*dtr));
+	return 5;
 }
 
-static unsigned long native_store_tr(void)
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
 {
-	unsigned long tr;
-	asm ("str %0":"=r" (tr));
-	return tr;
-}
+	void *opfunc = *((void **)&paravirt_ops + type);
+	unsigned ret;
 
-static void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
-	C(0); C(1); C(2);
-#undef C
-}
+	if (opfunc == NULL)
+		/* If there's no function, patch it with a ud2a (BUG) */
+		ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a);
+	else if (opfunc == paravirt_nop)
+		/* If the operation is a nop, then nop the callsite */
+		ret = paravirt_patch_nop();
+	else if (type == PARAVIRT_PATCH(iret) ||
+		 type == PARAVIRT_PATCH(irq_enable_sysexit))
+		/* If operation requires a jmp, then jmp */
+		ret = paravirt_patch_jmp(opfunc, site, len);
+	else
+		/* Otherwise call the function; assume target could
+		   clobber any caller-save reg */
+		ret = paravirt_patch_call(opfunc, CLBR_ANY,
+					  site, clobbers, len);
 
-static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
-{
-	u32 *lp = (u32 *)((char *)dt + entry*8);
-	lp[0] = entry_low;
-	lp[1] = entry_high;
+	return ret;
 }
 
-static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+unsigned paravirt_patch_insns(void *site, unsigned len,
+			      const char *start, const char *end)
 {
-	native_write_dt_entry(dt, entrynum, low, high);
-}
+	unsigned insn_len = end - start;
 
-static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
-{
-	native_write_dt_entry(dt, entrynum, low, high);
-}
-
-static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
-{
-	native_write_dt_entry(dt, entrynum, low, high);
-}
+	if (insn_len > len || start == NULL)
+		insn_len = len;
+	else
+		memcpy(site, start, insn_len);
 
-static void native_load_esp0(struct tss_struct *tss,
-				      struct thread_struct *thread)
-{
-	tss->esp0 = thread->esp0;
-
-	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
-		tss->ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
+	return insn_len;
 }
 
-static void native_io_delay(void)
+void init_IRQ(void)
 {
-	asm volatile("outb %al,$0x80");
+	paravirt_ops.init_IRQ();
 }
 
 static void native_flush_tlb(void)
@@ -395,83 +213,11 @@ static void native_flush_tlb_global(void)
 	__native_flush_tlb_global();
 }
 
-static void native_flush_tlb_single(u32 addr)
+static void native_flush_tlb_single(unsigned long addr)
 {
 	__native_flush_tlb_single(addr);
 }
 
-#ifndef CONFIG_X86_PAE
-static void native_set_pte(pte_t *ptep, pte_t pteval)
-{
-	*ptep = pteval;
-}
-
-static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
-{
-	*ptep = pteval;
-}
-
-static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	*pmdp = pmdval;
-}
-
-#else /* CONFIG_X86_PAE */
-
-static void native_set_pte(pte_t *ptep, pte_t pte)
-{
-	ptep->pte_high = pte.pte_high;
-	smp_wmb();
-	ptep->pte_low = pte.pte_low;
-}
-
-static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
-{
-	ptep->pte_high = pte.pte_high;
-	smp_wmb();
-	ptep->pte_low = pte.pte_low;
-}
-
-static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-	ptep->pte_low = 0;
-	smp_wmb();
-	ptep->pte_high = pte.pte_high;
-	smp_wmb();
-	ptep->pte_low = pte.pte_low;
-}
-
-static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
-	set_64bit((unsigned long long *)ptep,pte_val(pteval));
-}
-
-static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
-}
-
-static void native_set_pud(pud_t *pudp, pud_t pudval)
-{
-	*pudp = pudval;
-}
-
-static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	ptep->pte_low = 0;
-	smp_wmb();
-	ptep->pte_high = 0;
-}
-
-static void native_pmd_clear(pmd_t *pmd)
-{
-	u32 *tmp = (u32 *)pmd;
-	*tmp = 0;
-	smp_wmb();
-	*(tmp + 1) = 0;
-}
-#endif /* CONFIG_X86_PAE */
-
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -487,10 +233,11 @@ struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
+	.shared_kernel_pmd = 1,	/* Only used when CONFIG_X86_PAE is set */
 
  	.patch = native_patch,
 	.banner = default_banner,
-	.arch_setup = native_nop,
+	.arch_setup = paravirt_nop,
 	.memory_setup = machine_specific_memory_setup,
 	.get_wallclock = native_get_wallclock,
 	.set_wallclock = native_set_wallclock,
@@ -517,8 +264,8 @@ struct paravirt_ops paravirt_ops = {
 	.safe_halt = native_safe_halt,
 	.halt = native_halt,
 	.wbinvd = native_wbinvd,
-	.read_msr = native_read_msr,
-	.write_msr = native_write_msr,
+	.read_msr = native_read_msr_safe,
+	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
 	.get_scheduled_cycles = native_read_tsc,
@@ -531,9 +278,9 @@ struct paravirt_ops paravirt_ops = {
 	.store_idt = native_store_idt,
 	.store_tr = native_store_tr,
 	.load_tls = native_load_tls,
-	.write_ldt_entry = native_write_ldt_entry,
-	.write_gdt_entry = native_write_gdt_entry,
-	.write_idt_entry = native_write_idt_entry,
+	.write_ldt_entry = write_dt_entry,
+	.write_gdt_entry = write_dt_entry,
+	.write_idt_entry = write_dt_entry,
 	.load_esp0 = native_load_esp0,
 
 	.set_iopl_mask = native_set_iopl_mask,
@@ -545,44 +292,57 @@ struct paravirt_ops paravirt_ops = {
 	.apic_read = native_apic_read,
 	.setup_boot_clock = setup_boot_APIC_clock,
 	.setup_secondary_clock = setup_secondary_APIC_clock,
+	.startup_ipi_hook = paravirt_nop,
 #endif
-	.set_lazy_mode = (void *)native_nop,
+	.set_lazy_mode = paravirt_nop,
+
+	.pagetable_setup_start = native_pagetable_setup_start,
+	.pagetable_setup_done = native_pagetable_setup_done,
 
 	.flush_tlb_user = native_flush_tlb,
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_single = native_flush_tlb_single,
+	.flush_tlb_others = native_flush_tlb_others,
 
-	.map_pt_hook = (void *)native_nop,
-
-	.alloc_pt = (void *)native_nop,
-	.alloc_pd = (void *)native_nop,
-	.alloc_pd_clone = (void *)native_nop,
-	.release_pt = (void *)native_nop,
-	.release_pd = (void *)native_nop,
+	.alloc_pt = paravirt_nop,
+	.alloc_pd = paravirt_nop,
+	.alloc_pd_clone = paravirt_nop,
+	.release_pt = paravirt_nop,
+	.release_pd = paravirt_nop,
 
 	.set_pte = native_set_pte,
 	.set_pte_at = native_set_pte_at,
 	.set_pmd = native_set_pmd,
-	.pte_update = (void *)native_nop,
-	.pte_update_defer = (void *)native_nop,
+	.pte_update = paravirt_nop,
+	.pte_update_defer = paravirt_nop,
+
+#ifdef CONFIG_HIGHPTE
+	.kmap_atomic_pte = kmap_atomic,
+#endif
+
 #ifdef CONFIG_X86_PAE
 	.set_pte_atomic = native_set_pte_atomic,
 	.set_pte_present = native_set_pte_present,
 	.set_pud = native_set_pud,
 	.pte_clear = native_pte_clear,
 	.pmd_clear = native_pmd_clear,
+
+	.pmd_val = native_pmd_val,
+	.make_pmd = native_make_pmd,
 #endif
 
+	.pte_val = native_pte_val,
+	.pgd_val = native_pgd_val,
+
+	.make_pte = native_make_pte,
+	.make_pgd = native_make_pgd,
+
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 
-	.startup_ipi_hook = (void *)native_nop,
+	.dup_mmap = paravirt_nop,
+	.exit_mmap = paravirt_nop,
+	.activate_mm = paravirt_nop,
 };
 
-/*
- * NOTE: CONFIG_PARAVIRT is experimental and the paravirt_ops
- * semantics are subject to change. Hence we only do this
- * internal-only export of this, until it gets sorted out and
- * all lowlevel CPU ops used by modules are separately exported.
- */
-EXPORT_SYMBOL_GPL(paravirt_ops);
+EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 393a67d5d943..61999479b7a4 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -39,6 +39,7 @@
 #include <linux/random.h>
 #include <linux/personality.h>
 #include <linux/tick.h>
+#include <linux/percpu.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -57,7 +58,6 @@
 
 #include <asm/tlbflush.h>
 #include <asm/cpu.h>
-#include <asm/pda.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -66,6 +66,12 @@ static int hlt_counter;
 unsigned long boot_option_idle_override = 0;
 EXPORT_SYMBOL(boot_option_idle_override);
 
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
 /*
  * Return saved PC of a blocked thread.
  */
@@ -272,25 +278,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
 	}
 }
 
-static int __init idle_setup (char *str)
+static int __init idle_setup(char *str)
 {
-	if (!strncmp(str, "poll", 4)) {
+	if (!strcmp(str, "poll")) {
 		printk("using polling idle threads.\n");
 		pm_idle = poll_idle;
 #ifdef CONFIG_X86_SMP
 		if (smp_num_siblings > 1)
 			printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
 #endif
-	} else if (!strncmp(str, "halt", 4)) {
-		printk("using halt in idle threads.\n");
-		pm_idle = default_idle;
-	}
+	} else if (!strcmp(str, "mwait"))
+		force_mwait = 1;
+	else
+		return -1;
 
 	boot_option_idle_override = 1;
-	return 1;
+	return 0;
 }
-
-__setup("idle=", idle_setup);
+early_param("idle", idle_setup);
 
 void show_regs(struct pt_regs * regs)
 {
@@ -343,7 +348,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 
 	regs.xds = __USER_DS;
 	regs.xes = __USER_DS;
-	regs.xfs = __KERNEL_PDA;
+	regs.xfs = __KERNEL_PERCPU;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
 	regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -376,7 +381,7 @@ void exit_thread(void)
 		t->io_bitmap_max = 0;
 		tss->io_bitmap_owner = NULL;
 		tss->io_bitmap_max = 0;
-		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
 }
@@ -555,7 +560,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
 		 * Disable the bitmap via an invalid offset. We still cache
 		 * the previous bitmap owner and the IO bitmap contents:
 		 */
-		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
+		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		return;
 	}
 
@@ -565,7 +570,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
 		 * matches the next task, we dont have to do anything but
 		 * to set a valid offset in the TSS:
 		 */
-		tss->io_bitmap_base = IO_BITMAP_OFFSET;
+		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 		return;
 	}
 	/*
@@ -577,7 +582,7 @@ static noinline void __switch_to_xtra(struct task_struct *next_p,
 	 * redundant copies when the currently switched task does not
 	 * perform any I/O during its timeslice.
 	 */
-	tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
+	tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
 }
 
 /*
@@ -712,7 +717,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	if (prev->gs | next->gs)
 		loadsegment(gs, next->gs);
 
-	write_pda(pcurrent, next_p);
+	x86_write_percpu(current_task, next_p);
 
 	return prev_p;
 }
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 34874c398b44..9f6ab1789bb0 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,12 +3,10 @@
  */
 #include <linux/pci.h>
 #include <linux/irq.h>
-#include <asm/pci-direct.h>
-#include <asm/genapic.h>
-#include <asm/cpu.h>
 
 #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
+
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
 	u8 config, rev;
 	u32 word;
@@ -16,12 +14,14 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
 	/* BIOS may enable hardware IRQ balancing for
 	 * E7520/E7320/E7525(revision ID 0x9 and below)
 	 * based platforms.
-	 * For those platforms, make sure that the genapic is set to 'flat'
+	 * Disable SW irqbalance/affinity on those platforms.
 	 */
 	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
 	if (rev > 0x9)
 		return;
 
+	printk(KERN_INFO "Intel E7520/7320/7525 detected.");
+
 	/* enable access to config space*/
 	pci_read_config_byte(dev, 0xf4, &config);
 	pci_write_config_byte(dev, 0xf4, config|0x2);
@@ -30,44 +30,6 @@ static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
 	raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
 
 	if (!(word & (1 << 13))) {
-#ifdef CONFIG_X86_64
-		if (genapic !=  &apic_flat)
-			panic("APIC mode must be flat on this system\n");
-#elif defined(CONFIG_X86_GENERICARCH)
-		if (genapic != &apic_default)
-			panic("APIC mode must be default(flat) on this system. Use apic=default\n");
-#endif
-	}
-
-	/* put back the original value for config space*/
-	if (!(config & 0x2))
-		pci_write_config_byte(dev, 0xf4, config);
-}
-
-void __init quirk_intel_irqbalance(void)
-{
-	u8 config, rev;
-	u32 word;
-
-	/* BIOS may enable hardware IRQ balancing for
-	 * E7520/E7320/E7525(revision ID 0x9 and below)
-	 * based platforms.
-	 * Disable SW irqbalance/affinity on those platforms.
-	 */
-	rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
-	if (rev > 0x9)
-		return;
-
-	printk(KERN_INFO "Intel E7520/7320/7525 detected.");
-
-	/* enable access to config space */
-	config = read_pci_config_byte(0, 0, 0, 0xf4);
-	write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
-
-	/* read xTPR register */
-	word = read_pci_config_16(0, 0, 0x40, 0x4c);
-
-	if (!(word & (1 << 13))) {
 		printk(KERN_INFO "Disabling irq balancing and affinity\n");
 #ifdef CONFIG_IRQBALANCE
 		irqbalance_disable("");
@@ -76,24 +38,13 @@ void __init quirk_intel_irqbalance(void)
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
 #endif
-#ifdef CONFIG_HOTPLUG_CPU
-		printk(KERN_INFO "Disabling cpu hotplug control\n");
-		enable_cpu_hotplug = 0;
-#endif
-#ifdef CONFIG_X86_64
-		/* force the genapic selection to flat mode so that
-		 * interrupts can be redirected to more than one CPU.
-		 */
-		genapic_force = &apic_flat;
-#endif
 	}
 
-	/* put back the original value for config space */
+	/* put back the original value for config space*/
 	if (!(config & 0x2))
-		write_pci_config_byte(0, 0, 0, 0xf4, config);
+		pci_write_config_byte(dev, 0xf4, config);
 }
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  verify_quirk_intel_irqbalance);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  verify_quirk_intel_irqbalance);
-
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7320_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7525_MCH,	quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7520_MCH,	quirk_intel_irqbalance);
 #endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 3514b4153f7f..50dfc65319cd 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -17,7 +17,8 @@
 #include <asm/apic.h>
 #include <asm/desc.h>
 #include "mach_reboot.h"
-#include <linux/reboot_fixups.h>
+#include <asm/reboot_fixups.h>
+#include <asm/reboot.h>
 
 /*
  * Power off function, if any
@@ -197,8 +198,6 @@ static unsigned char jump_to_bios [] =
  */
 void machine_real_restart(unsigned char *code, int length)
 {
-	unsigned long flags;
-
 	local_irq_disable();
 
 	/* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -211,9 +210,9 @@ void machine_real_restart(unsigned char *code, int length)
 	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
 	 */
 
-	spin_lock_irqsave(&rtc_lock, flags);
+	spin_lock(&rtc_lock);
 	CMOS_WRITE(0x00, 0x8f);
-	spin_unlock_irqrestore(&rtc_lock, flags);
+	spin_unlock(&rtc_lock);
 
 	/* Remap the kernel at virtual address zero, as well as offset zero
 	   from the kernel segment.  This assumes the kernel segment starts at
@@ -280,7 +279,7 @@ void machine_real_restart(unsigned char *code, int length)
 EXPORT_SYMBOL(machine_real_restart);
 #endif
 
-void machine_shutdown(void)
+static void native_machine_shutdown(void)
 {
 #ifdef CONFIG_SMP
 	int reboot_cpu_id;
@@ -316,7 +315,11 @@ void machine_shutdown(void)
 #endif
 }
 
-void machine_emergency_restart(void)
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
+static void native_machine_emergency_restart(void)
 {
 	if (!reboot_thru_bios) {
 		if (efi_enabled) {
@@ -340,17 +343,17 @@ void machine_emergency_restart(void)
 	machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
 }
 
-void machine_restart(char * __unused)
+static void native_machine_restart(char * __unused)
 {
 	machine_shutdown();
 	machine_emergency_restart();
 }
 
-void machine_halt(void)
+static void native_machine_halt(void)
 {
 }
 
-void machine_power_off(void)
+static void native_machine_power_off(void)
 {
 	if (pm_power_off) {
 		machine_shutdown();
@@ -359,3 +362,35 @@ void machine_power_off(void)
 }
 
 
+struct machine_ops machine_ops = {
+	.power_off = native_machine_power_off,
+	.shutdown = native_machine_shutdown,
+	.emergency_restart = native_machine_emergency_restart,
+	.restart = native_machine_restart,
+	.halt = native_machine_halt,
+};
+
+void machine_power_off(void)
+{
+	machine_ops.power_off();
+}
+
+void machine_shutdown(void)
+{
+	machine_ops.shutdown();
+}
+
+void machine_emergency_restart(void)
+{
+	machine_ops.emergency_restart();
+}
+
+void machine_restart(char *cmd)
+{
+	machine_ops.restart(cmd);
+}
+
+void machine_halt(void)
+{
+	machine_ops.halt();
+}
diff --git a/arch/i386/kernel/reboot_fixups.c b/arch/i386/kernel/reboot_fixups.c
index 99aab41a05b0..2d78d918340f 100644
--- a/arch/i386/kernel/reboot_fixups.c
+++ b/arch/i386/kernel/reboot_fixups.c
@@ -10,7 +10,7 @@
 
 #include <asm/delay.h>
 #include <linux/pci.h>
-#include <linux/reboot_fixups.h>
+#include <asm/reboot_fixups.h>
 
 static void cs5530a_warm_reset(struct pci_dev *dev)
 {
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 0e8977871b1f..89a45a9ddcd4 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -165,20 +165,20 @@ void fastcall send_IPI_self(int vector)
 }
 
 /*
- * This is only used on smaller machines.
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
  */
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 {
-	unsigned long mask = cpus_addr(cpumask)[0];
 	unsigned long cfg;
-	unsigned long flags;
 
-	local_irq_save(flags);
-	WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
 	/*
 	 * Wait for idle.
 	 */
-	apic_wait_icr_idle();
+	if (unlikely(vector == NMI_VECTOR))
+		safe_apic_wait_icr_idle();
+	else
+		apic_wait_icr_idle();
 		
 	/*
 	 * prepare target chip field
@@ -195,13 +195,25 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
 	apic_write_around(APIC_ICR, cfg);
+}
+
+/*
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+	unsigned long mask = cpus_addr(cpumask)[0];
+	unsigned long flags;
 
+	local_irq_save(flags);
+	WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+	__send_IPI_dest_field(mask, vector);
 	local_irq_restore(flags);
 }
 
 void send_IPI_mask_sequence(cpumask_t mask, int vector)
 {
-	unsigned long cfg, flags;
+	unsigned long flags;
 	unsigned int query_cpu;
 
 	/*
@@ -211,30 +223,10 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
 	 */ 
 
 	local_irq_save(flags);
-
 	for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
 		if (cpu_isset(query_cpu, mask)) {
-		
-			/*
-			 * Wait for idle.
-			 */
-			apic_wait_icr_idle();
-		
-			/*
-			 * prepare target chip field
-			 */
-			cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
-			apic_write_around(APIC_ICR2, cfg);
-		
-			/*
-			 * program the ICR 
-			 */
-			cfg = __prepare_ICR(0, vector);
-			
-			/*
-			 * Send the IPI. The write to APIC_ICR fires this off.
-			 */
-			apic_write_around(APIC_ICR, cfg);
+			__send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
+					      vector);
 		}
 	}
 	local_irq_restore(flags);
@@ -256,7 +248,6 @@ static cpumask_t flush_cpumask;
 static struct mm_struct * flush_mm;
 static unsigned long flush_va;
 static DEFINE_SPINLOCK(tlbstate_lock);
-#define FLUSH_ALL	0xffffffff
 
 /*
  * We cannot call mmdrop() because we are in interrupt context, 
@@ -338,7 +329,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
 		 
 	if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
 		if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
-			if (flush_va == FLUSH_ALL)
+			if (flush_va == TLB_FLUSH_ALL)
 				local_flush_tlb();
 			else
 				__flush_tlb_one(flush_va);
@@ -353,9 +344,11 @@ out:
 	put_cpu_no_resched();
 }
 
-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
-						unsigned long va)
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+			     unsigned long va)
 {
+	cpumask_t cpumask = *cpumaskp;
+
 	/*
 	 * A couple of (to be removed) sanity checks:
 	 *
@@ -366,10 +359,12 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
 	BUG_ON(!mm);
 
+#ifdef CONFIG_HOTPLUG_CPU
 	/* If a CPU which we ran on has gone down, OK. */
 	cpus_and(cpumask, cpumask, cpu_online_map);
-	if (cpus_empty(cpumask))
+	if (unlikely(cpus_empty(cpumask)))
 		return;
+#endif
 
 	/*
 	 * i'm not happy about this global shared spinlock in the
@@ -380,17 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 	
 	flush_mm = mm;
 	flush_va = va;
-#if NR_CPUS <= BITS_PER_LONG
-	atomic_set_mask(cpumask, &flush_cpumask);
-#else
-	{
-		int k;
-		unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
-		unsigned long *cpu_mask = (unsigned long *)&cpumask;
-		for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
-			atomic_set_mask(cpu_mask[k], &flush_mask[k]);
-	}
-#endif
+	cpus_or(flush_cpumask, cpumask, flush_cpumask);
 	/*
 	 * We have to send the IPI only to
 	 * CPUs affected.
@@ -417,7 +402,7 @@ void flush_tlb_current_task(void)
 
 	local_flush_tlb();
 	if (!cpus_empty(cpu_mask))
-		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
@@ -436,7 +421,7 @@ void flush_tlb_mm (struct mm_struct * mm)
 			leave_mm(smp_processor_id());
 	}
 	if (!cpus_empty(cpu_mask))
-		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
+		flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
 
 	preempt_enable();
 }
@@ -483,7 +468,7 @@ void flush_tlb_all(void)
  * it goes straight through and wastes no time serializing
  * anything. Worst case is that we lose a reschedule ...
  */
-void smp_send_reschedule(int cpu)
+void native_smp_send_reschedule(int cpu)
 {
 	WARN_ON(cpu_is_offline(cpu));
 	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
@@ -515,36 +500,78 @@ void unlock_ipi_call_lock(void)
 
 static struct call_data_struct *call_data;
 
+static void __smp_call_function(void (*func) (void *info), void *info,
+				int nonatomic, int wait)
+{
+	struct call_data_struct data;
+	int cpus = num_online_cpus() - 1;
+
+	if (!cpus)
+		return;
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	call_data = &data;
+	mb();
+	
+	/* Send a message to all other CPUs and wait for them to respond */
+	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+	/* Wait for response */
+	while (atomic_read(&data.started) != cpus)
+		cpu_relax();
+
+	if (wait)
+		while (atomic_read(&data.finished) != cpus)
+			cpu_relax();
+}
+
+
 /**
- * smp_call_function(): Run a function on all other CPUs.
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on.  Must not include the current cpu.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: currently unused.
  * @wait: If true, wait (atomically) until function has completed on other CPUs.
  *
- * Returns 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute <<func>> or are or have executed.
+  * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
  *
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-			int wait)
+int native_smp_call_function_mask(cpumask_t mask,
+				  void (*func)(void *), void *info,
+				  int wait)
 {
 	struct call_data_struct data;
+	cpumask_t allbutself;
 	int cpus;
 
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
 	/* Holding any lock stops cpus from going down. */
 	spin_lock(&call_lock);
-	cpus = num_online_cpus() - 1;
+
+	allbutself = cpu_online_map;
+	cpu_clear(smp_processor_id(), allbutself);
+
+	cpus_and(mask, mask, allbutself);
+	cpus = cpus_weight(mask);
+
 	if (!cpus) {
 		spin_unlock(&call_lock);
 		return 0;
 	}
 
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
-
 	data.func = func;
 	data.info = info;
 	atomic_set(&data.started, 0);
@@ -554,9 +581,12 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 
 	call_data = &data;
 	mb();
-	
-	/* Send a message to all other CPUs and wait for them to respond */
-	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+	/* Send a message to other CPUs */
+	if (cpus_equal(mask, allbutself))
+		send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+	else
+		send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
 
 	/* Wait for response */
 	while (atomic_read(&data.started) != cpus)
@@ -569,15 +599,68 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 
 	return 0;
 }
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
+		      int wait)
+{
+	return smp_call_function_mask(cpu_online_map, func, info, wait);
+}
 EXPORT_SYMBOL(smp_call_function);
 
+/**
+ * smp_call_function_single - Run a function on another CPU
+ * @cpu: The target CPU.  Cannot be the calling CPU.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+			     int nonatomic, int wait)
+{
+	/* prevent preemption and reschedule on another processor */
+	int ret;
+	int me = get_cpu();
+	if (cpu == me) {
+		WARN_ON(1);
+		put_cpu();
+		return -EBUSY;
+	}
+
+	ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
+
+	put_cpu();
+	return ret;
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
 static void stop_this_cpu (void * dummy)
 {
+	local_irq_disable();
 	/*
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_disable();
 	disable_local_APIC();
 	if (cpu_data[smp_processor_id()].hlt_works_ok)
 		for(;;) halt();
@@ -588,13 +671,18 @@ static void stop_this_cpu (void * dummy)
  * this function calls the 'stop' function on all other CPUs in the system.
  */
 
-void smp_send_stop(void)
+void native_smp_send_stop(void)
 {
-	smp_call_function(stop_this_cpu, NULL, 1, 0);
+	/* Don't deadlock on the call lock in panic */
+	int nolock = !spin_trylock(&call_lock);
+	unsigned long flags;
 
-	local_irq_disable();
+	local_irq_save(flags);
+	__smp_call_function(stop_this_cpu, NULL, 0, 0);
+	if (!nolock)
+		spin_unlock(&call_lock);
 	disable_local_APIC();
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 /*
@@ -633,77 +721,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
 	}
 }
 
-/*
- * this function sends a 'generic call function' IPI to one other CPU
- * in the system.
- *
- * cpu is a standard Linux logical CPU number.
- */
-static void
-__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-				int nonatomic, int wait)
-{
-	struct call_data_struct data;
-	int cpus = 1;
-
-	data.func = func;
-	data.info = info;
-	atomic_set(&data.started, 0);
-	data.wait = wait;
-	if (wait)
-		atomic_set(&data.finished, 0);
-
-	call_data = &data;
-	wmb();
-	/* Send a message to all other CPUs and wait for them to respond */
-	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
-
-	/* Wait for response */
-	while (atomic_read(&data.started) != cpus)
-		cpu_relax();
-
-	if (!wait)
-		return;
-
-	while (atomic_read(&data.finished) != cpus)
-		cpu_relax();
-}
-
-/*
- * smp_call_function_single - Run a function on another CPU
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Currently unused.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Retrurns 0 on success, else a negative status code.
- *
- * Does not return until the remote CPU is nearly ready to execute <func>
- * or is or has executed.
- */
-
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-			int nonatomic, int wait)
-{
-	/* prevent preemption and reschedule on another processor */
-	int me = get_cpu();
-	if (cpu == me) {
-		WARN_ON(1);
-		put_cpu();
-		return -EBUSY;
-	}
-
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
-
-	spin_lock_bh(&call_lock);
-	__smp_call_function_single(cpu, func, info, nonatomic, wait);
-	spin_unlock_bh(&call_lock);
-	put_cpu();
-	return 0;
-}
-EXPORT_SYMBOL(smp_call_function_single);
-
 static int convert_apicid_to_cpu(int apic_id)
 {
 	int i;
@@ -730,3 +747,14 @@ int safe_smp_processor_id(void)
 
 	return cpuid >= 0 ? cpuid : 0;
 }
+
+struct smp_ops smp_ops = {
+	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+	.smp_prepare_cpus = native_smp_prepare_cpus,
+	.cpu_up = native_cpu_up,
+	.smp_cpus_done = native_smp_cpus_done,
+
+	.smp_send_stop = native_smp_send_stop,
+	.smp_send_reschedule = native_smp_send_reschedule,
+	.smp_call_function_mask = native_smp_call_function_mask,
+};
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4ff55e675576..a4b7ad283f49 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -53,13 +53,12 @@
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
 #include <asm/nmi.h>
-#include <asm/pda.h>
-#include <asm/genapic.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
 #include <asm/vmi.h>
+#include <asm/mtrr.h>
 
 /* Set if we find a B stepping CPU */
 static int __devinitdata smp_b_stepping;
@@ -100,6 +99,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
 
 u8 apicid_2_node[MAX_APICID];
 
+DEFINE_PER_CPU(unsigned long, this_cpu_off);
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
 /*
  * Trampoline 80x86 program as an array.
  */
@@ -156,7 +158,7 @@ static void __cpuinit smp_store_cpu_info(int id)
 
 	*c = boot_cpu_data;
 	if (id!=0)
-		identify_cpu(c);
+		identify_secondary_cpu(c);
 	/*
 	 * Mask B, Pentium, but not Pentium MMX
 	 */
@@ -379,14 +381,14 @@ set_cpu_sibling_map(int cpu)
 static void __cpuinit start_secondary(void *unused)
 {
 	/*
-	 * Don't put *anything* before secondary_cpu_init(), SMP
-	 * booting is too fragile that we want to limit the
-	 * things done here to the most necessary things.
+	 * Don't put *anything* before cpu_init(), SMP booting is too
+	 * fragile that we want to limit the things done here to the
+	 * most necessary things.
 	 */
 #ifdef CONFIG_VMI
 	vmi_bringup();
 #endif
-	secondary_cpu_init();
+	cpu_init();
 	preempt_disable();
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -441,12 +443,6 @@ static void __cpuinit start_secondary(void *unused)
 void __devinit initialize_secondary(void)
 {
 	/*
-	 * switch to the per CPU GDT we already set up
-	 * in do_boot_cpu()
-	 */
-	cpu_set_gdt(current_thread_info()->cpu);
-
-	/*
 	 * We don't actually need to load the full TSS,
 	 * basically just the stack pointer and the eip.
 	 */
@@ -463,7 +459,6 @@ extern struct {
 	void * esp;
 	unsigned short ss;
 } stack_start;
-extern struct i386_pda *start_pda;
 
 #ifdef CONFIG_NUMA
 
@@ -521,12 +516,12 @@ static void unmap_cpu_to_logical_apicid(int cpu)
 	unmap_cpu_to_node(cpu);
 }
 
-#if APIC_DEBUG
 static inline void __inquire_remote_apic(int apicid)
 {
 	int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
 	char *names[] = { "ID", "VERSION", "SPIV" };
-	int timeout, status;
+	int timeout;
+	unsigned long status;
 
 	printk("Inquiring remote APIC #%d...\n", apicid);
 
@@ -536,7 +531,9 @@ static inline void __inquire_remote_apic(int apicid)
 		/*
 		 * Wait for idle.
 		 */
-		apic_wait_icr_idle();
+		status = safe_apic_wait_icr_idle();
+		if (status)
+			printk("a previous APIC delivery may have failed\n");
 
 		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
 		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -550,14 +547,13 @@ static inline void __inquire_remote_apic(int apicid)
 		switch (status) {
 		case APIC_ICR_RR_VALID:
 			status = apic_read(APIC_RRR);
-			printk("%08x\n", status);
+			printk("%lx\n", status);
 			break;
 		default:
 			printk("failed\n");
 		}
 	}
 }
-#endif
 
 #ifdef WAKE_SECONDARY_VIA_NMI
 /* 
@@ -568,8 +564,8 @@ static inline void __inquire_remote_apic(int apicid)
 static int __devinit
 wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 {
-	unsigned long send_status = 0, accept_status = 0;
-	int timeout, maxlvt;
+	unsigned long send_status, accept_status = 0;
+	int maxlvt;
 
 	/* Target chip */
 	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
@@ -579,12 +575,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
 
 	Dprintk("Waiting for send to finish...\n");
-	timeout = 0;
-	do {
-		Dprintk("+");
-		udelay(100);
-		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-	} while (send_status && (timeout++ < 1000));
+	send_status = safe_apic_wait_icr_idle();
 
 	/*
 	 * Give the other CPU some time to accept the IPI.
@@ -614,8 +605,8 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 static int __devinit
 wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 {
-	unsigned long send_status = 0, accept_status = 0;
-	int maxlvt, timeout, num_starts, j;
+	unsigned long send_status, accept_status = 0;
+	int maxlvt, num_starts, j;
 
 	/*
 	 * Be paranoid about clearing APIC errors.
@@ -640,12 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 				| APIC_DM_INIT);
 
 	Dprintk("Waiting for send to finish...\n");
-	timeout = 0;
-	do {
-		Dprintk("+");
-		udelay(100);
-		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-	} while (send_status && (timeout++ < 1000));
+	send_status = safe_apic_wait_icr_idle();
 
 	mdelay(10);
 
@@ -658,12 +644,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
 
 	Dprintk("Waiting for send to finish...\n");
-	timeout = 0;
-	do {
-		Dprintk("+");
-		udelay(100);
-		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-	} while (send_status && (timeout++ < 1000));
+	send_status = safe_apic_wait_icr_idle();
 
 	atomic_set(&init_deasserted, 1);
 
@@ -719,12 +700,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 		Dprintk("Startup point 1.\n");
 
 		Dprintk("Waiting for send to finish...\n");
-		timeout = 0;
-		do {
-			Dprintk("+");
-			udelay(100);
-			send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-		} while (send_status && (timeout++ < 1000));
+		send_status = safe_apic_wait_icr_idle();
 
 		/*
 		 * Give the other CPU some time to accept the IPI.
@@ -788,6 +764,25 @@ static inline struct task_struct * alloc_idle_task(int cpu)
 #define alloc_idle_task(cpu) fork_idle(cpu)
 #endif
 
+/* Initialize the CPU's GDT.  This is either the boot CPU doing itself
+   (still using the master per-cpu area), or a CPU doing it for a
+   secondary which will soon come up. */
+static __cpuinit void init_gdt(int cpu)
+{
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a,
+			(u32 *)&gdt[GDT_ENTRY_PERCPU].b,
+			__per_cpu_offset[cpu], 0xFFFFF,
+			0x80 | DESCTYPE_S | 0x2, 0x8);
+
+	per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
+	per_cpu(cpu_number, cpu) = cpu;
+}
+
+/* Defined in head.S */
+extern struct Xgt_desc_struct early_gdt_descr;
+
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -802,6 +797,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	unsigned short nmi_high = 0, nmi_low = 0;
 
 	/*
+	 * Save current MTRR state in case it was changed since early boot
+	 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+	 */
+	mtrr_save_state();
+
+	/*
 	 * We can't use kernel_thread since we must avoid to
 	 * reschedule the child.
 	 */
@@ -809,13 +810,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	if (IS_ERR(idle))
 		panic("failed fork for CPU %d", cpu);
 
-	/* Pre-allocate and initialize the CPU's GDT and PDA so it
-	   doesn't have to do any memory allocation during the
-	   delicate CPU-bringup phase. */
-	if (!init_gdt(cpu, idle)) {
-		printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
-		return -1;	/* ? */
-	}
+	init_gdt(cpu);
+ 	per_cpu(current_task, cpu) = idle;
+	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 
 	idle->thread.eip = (unsigned long) start_secondary;
 	/* start_eip had better be page-aligned! */
@@ -941,7 +938,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct warm_boot_cpu_info info;
 	int	apicid, ret;
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
 	apicid = x86_cpu_to_apicid[cpu];
 	if (apicid == BAD_APICID) {
@@ -949,18 +945,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
 		goto exit;
 	}
 
-	/*
-	 * the CPU isn't initialized at boot time, allocate gdt table here.
-	 * cpu_init will initialize it
-	 */
-	if (!cpu_gdt_descr->address) {
-		cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
-		if (!cpu_gdt_descr->address)
-			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-			ret = -ENOMEM;
-			goto exit;
-	}
-
 	info.complete = &done;
 	info.apicid = apicid;
 	info.cpu = cpu;
@@ -1173,7 +1157,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 
 /* These are wrappers to interface to the new boot process.  Someone
    who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init smp_prepare_cpus(unsigned int max_cpus)
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
 {
 	smp_commenced_mask = cpumask_of_cpu(0);
 	cpu_callin_map = cpumask_of_cpu(0);
@@ -1181,13 +1165,18 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	smp_boot_cpus(max_cpus);
 }
 
-void __devinit smp_prepare_boot_cpu(void)
+void __init native_smp_prepare_boot_cpu(void)
 {
-	cpu_set(smp_processor_id(), cpu_online_map);
-	cpu_set(smp_processor_id(), cpu_callout_map);
-	cpu_set(smp_processor_id(), cpu_present_map);
-	cpu_set(smp_processor_id(), cpu_possible_map);
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+	unsigned int cpu = smp_processor_id();
+
+	init_gdt(cpu);
+	switch_to_new_gdt();
+
+	cpu_set(cpu, cpu_online_map);
+	cpu_set(cpu, cpu_callout_map);
+	cpu_set(cpu, cpu_present_map);
+	cpu_set(cpu, cpu_possible_map);
+	__get_cpu_var(cpu_state) = CPU_ONLINE;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1277,7 +1266,7 @@ void __cpu_die(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu)
 {
 	unsigned long flags;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1319,15 +1308,10 @@ int __cpuinit __cpu_up(unsigned int cpu)
 		touch_nmi_watchdog();
 	}
 
-#ifdef CONFIG_X86_GENERICARCH
-	if (num_online_cpus() > 8 && genapic == &apic_default)
-		panic("Default flat APIC routing can't be used with > 8 cpus\n");
-#endif
-
 	return 0;
 }
 
-void __init smp_cpus_done(unsigned int max_cpus)
+void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 13ca54a85a1c..ff4ee6f3326b 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -22,16 +22,26 @@
 #include <asm/msr.h>
 #include <asm/pgtable.h>
 #include <asm/unistd.h>
+#include <asm/elf.h>
+#include <asm/tlbflush.h>
+
+enum {
+	VDSO_DISABLED = 0,
+	VDSO_ENABLED = 1,
+	VDSO_COMPAT = 2,
+};
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT	VDSO_COMPAT
+#else
+#define VDSO_DEFAULT	VDSO_ENABLED
+#endif
 
 /*
  * Should the kernel map a VDSO page into processes and pass its
  * address down to glibc upon exec()?
  */
-#ifdef CONFIG_PARAVIRT
-unsigned int __read_mostly vdso_enabled = 0;
-#else
-unsigned int __read_mostly vdso_enabled = 1;
-#endif
+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
 
 EXPORT_SYMBOL_GPL(vdso_enabled);
 
@@ -46,6 +56,123 @@ __setup("vdso=", vdso_setup);
 
 extern asmlinkage void sysenter_entry(void);
 
+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
+				unsigned offset, unsigned size)
+{
+	Elf32_Sym *sym = (void *)ehdr + offset;
+	unsigned nsym = size / sizeof(*sym);
+	unsigned i;
+
+	for(i = 0; i < nsym; i++, sym++) {
+		if (sym->st_shndx == SHN_UNDEF ||
+		    sym->st_shndx == SHN_ABS)
+			continue;  /* skip */
+
+		if (sym->st_shndx > SHN_LORESERVE) {
+			printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
+			       sym->st_shndx);
+			continue;
+		}
+
+		switch(ELF_ST_TYPE(sym->st_info)) {
+		case STT_OBJECT:
+		case STT_FUNC:
+		case STT_SECTION:
+		case STT_FILE:
+			sym->st_value += VDSO_HIGH_BASE;
+		}
+	}
+}
+
+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
+{
+	Elf32_Dyn *dyn = (void *)ehdr + offset;
+
+	for(; dyn->d_tag != DT_NULL; dyn++)
+		switch(dyn->d_tag) {
+		case DT_PLTGOT:
+		case DT_HASH:
+		case DT_STRTAB:
+		case DT_SYMTAB:
+		case DT_RELA:
+		case DT_INIT:
+		case DT_FINI:
+		case DT_REL:
+		case DT_DEBUG:
+		case DT_JMPREL:
+		case DT_VERSYM:
+		case DT_VERDEF:
+		case DT_VERNEED:
+		case DT_ADDRRNGLO ... DT_ADDRRNGHI:
+			/* definitely pointers needing relocation */
+			dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+			break;
+
+		case DT_ENCODING ... OLD_DT_LOOS-1:
+		case DT_LOOS ... DT_HIOS-1:
+			/* Tags above DT_ENCODING are pointers if
+			   they're even */
+			if (dyn->d_tag >= DT_ENCODING &&
+			    (dyn->d_tag & 1) == 0)
+				dyn->d_un.d_ptr += VDSO_HIGH_BASE;
+			break;
+
+		case DT_VERDEFNUM:
+		case DT_VERNEEDNUM:
+		case DT_FLAGS_1:
+		case DT_RELACOUNT:
+		case DT_RELCOUNT:
+		case DT_VALRNGLO ... DT_VALRNGHI:
+			/* definitely not pointers */
+			break;
+
+		case OLD_DT_LOOS ... DT_LOOS-1:
+		case DT_HIOS ... DT_VALRNGLO-1:
+		default:
+			if (dyn->d_tag > DT_ENCODING)
+				printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
+				       dyn->d_tag);
+			break;
+		}
+}
+
+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+{
+	Elf32_Phdr *phdr;
+	Elf32_Shdr *shdr;
+	int i;
+
+	BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
+	       !elf_check_arch(ehdr) ||
+	       ehdr->e_type != ET_DYN);
+
+	ehdr->e_entry += VDSO_HIGH_BASE;
+
+	/* rebase phdrs */
+	phdr = (void *)ehdr + ehdr->e_phoff;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		phdr[i].p_vaddr += VDSO_HIGH_BASE;
+
+		/* relocate dynamic stuff */
+		if (phdr[i].p_type == PT_DYNAMIC)
+			reloc_dyn(ehdr, phdr[i].p_offset);
+	}
+
+	/* rebase sections */
+	shdr = (void *)ehdr + ehdr->e_shoff;
+	for(i = 0; i < ehdr->e_shnum; i++) {
+		if (!(shdr[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		shdr[i].sh_addr += VDSO_HIGH_BASE;
+
+		if (shdr[i].sh_type == SHT_SYMTAB ||
+		    shdr[i].sh_type == SHT_DYNSYM)
+			reloc_symtab(ehdr, shdr[i].sh_offset,
+				     shdr[i].sh_size);
+	}
+}
+
 void enable_sep_cpu(void)
 {
 	int cpu = get_cpu();
@@ -56,14 +183,33 @@ void enable_sep_cpu(void)
 		return;
 	}
 
-	tss->ss1 = __KERNEL_CS;
-	tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+	tss->x86_tss.ss1 = __KERNEL_CS;
+	tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
 	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-	wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
+	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
 	put_cpu();	
 }
 
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+	gate_vma.vm_mm = NULL;
+	gate_vma.vm_start = FIXADDR_USER_START;
+	gate_vma.vm_end = FIXADDR_USER_END;
+	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+	gate_vma.vm_page_prot = __P101;
+	/*
+	 * Make sure the vDSO gets into every core dump.
+	 * Dumping its contents makes post-mortem fully interpretable later
+	 * without matching up the same kernel and hardware config to see
+	 * what PC values meant.
+	 */
+	gate_vma.vm_flags |= VM_ALWAYSDUMP;
+	return 0;
+}
+
 /*
  * These symbols are defined by vsyscall.o to mark the bounds
  * of the ELF DSO images included therein.
@@ -72,31 +218,48 @@ extern const char vsyscall_int80_start, vsyscall_int80_end;
 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
 static struct page *syscall_pages[1];
 
+static void map_compat_vdso(int map)
+{
+	static int vdso_mapped;
+
+	if (map == vdso_mapped)
+		return;
+
+	vdso_mapped = map;
+
+	__set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
+		     map ? PAGE_READONLY_EXEC : PAGE_NONE);
+
+	/* flush stray tlbs */
+	flush_tlb_all();
+}
+
 int __init sysenter_setup(void)
 {
 	void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+	const void *vsyscall;
+	size_t vsyscall_len;
+
 	syscall_pages[0] = virt_to_page(syscall_page);
 
-#ifdef CONFIG_COMPAT_VDSO
-	__set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
+	gate_vma_init();
+
 	printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
-#endif
 
 	if (!boot_cpu_has(X86_FEATURE_SEP)) {
-		memcpy(syscall_page,
-		       &vsyscall_int80_start,
-		       &vsyscall_int80_end - &vsyscall_int80_start);
-		return 0;
+		vsyscall = &vsyscall_int80_start;
+		vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
+	} else {
+		vsyscall = &vsyscall_sysenter_start;
+		vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
 	}
 
-	memcpy(syscall_page,
-	       &vsyscall_sysenter_start,
-	       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
+	memcpy(syscall_page, vsyscall, vsyscall_len);
+	relocate_vdso(syscall_page);
 
 	return 0;
 }
 
-#ifndef CONFIG_COMPAT_VDSO
 /* Defined in vsyscall-sysenter.S */
 extern void SYSENTER_RETURN;
 
@@ -105,36 +268,52 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long addr;
-	int ret;
+	int ret = 0;
+	bool compat;
 
 	down_write(&mm->mmap_sem);
-	addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
-	if (IS_ERR_VALUE(addr)) {
-		ret = addr;
-		goto up_fail;
-	}
 
-	/*
-	 * MAYWRITE to allow gdb to COW and set breakpoints
-	 *
-	 * Make sure the vDSO gets into every core dump.
-	 * Dumping its contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to see
-	 * what PC values meant.
-	 */
-	ret = install_special_mapping(mm, addr, PAGE_SIZE,
-				      VM_READ|VM_EXEC|
-				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-				      VM_ALWAYSDUMP,
-				      syscall_pages);
-	if (ret)
-		goto up_fail;
+	/* Test compat mode once here, in case someone
+	   changes it via sysctl */
+	compat = (vdso_enabled == VDSO_COMPAT);
+
+	map_compat_vdso(compat);
+
+	if (compat)
+		addr = VDSO_HIGH_BASE;
+	else {
+		addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+		if (IS_ERR_VALUE(addr)) {
+			ret = addr;
+			goto up_fail;
+		}
+
+		/*
+		 * MAYWRITE to allow gdb to COW and set breakpoints
+		 *
+		 * Make sure the vDSO gets into every core dump.
+		 * Dumping its contents makes post-mortem fully
+		 * interpretable later without matching up the same
+		 * kernel and hardware config to see what PC values
+		 * meant.
+		 */
+		ret = install_special_mapping(mm, addr, PAGE_SIZE,
+					      VM_READ|VM_EXEC|
+					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
+					      VM_ALWAYSDUMP,
+					      syscall_pages);
+
+		if (ret)
+			goto up_fail;
+	}
 
 	current->mm->context.vdso = (void *)addr;
 	current_thread_info()->sysenter_return =
-				    (void *)VDSO_SYM(&SYSENTER_RETURN);
-up_fail:
+		(void *)VDSO_SYM(&SYSENTER_RETURN);
+
+  up_fail:
 	up_write(&mm->mmap_sem);
+
 	return ret;
 }
 
@@ -147,6 +326,11 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 
 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
 {
+	struct mm_struct *mm = tsk->mm;
+
+	/* Check to see if this task was created in compat vdso mode */
+	if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
+		return &gate_vma;
 	return NULL;
 }
 
@@ -159,4 +343,3 @@ int in_gate_area_no_task(unsigned long addr)
 {
 	return 0;
 }
-#endif
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 94e5cb091104..a665df61f08c 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -70,8 +70,6 @@
 
 #include <asm/i8259.h>
 
-int pit_latch_buggy;              /* extern */
-
 #include "do_timer.h"
 
 unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S
index 2f1814c5cfd7..f62815f8d06a 100644
--- a/arch/i386/kernel/trampoline.S
+++ b/arch/i386/kernel/trampoline.S
@@ -29,7 +29,7 @@
  *
  *	TYPE              VALUE
  *	R_386_32          startup_32_smp
- *	R_386_32          boot_gdt_table
+ *	R_386_32          boot_gdt
  */
 
 #include <linux/linkage.h>
@@ -62,8 +62,8 @@ r_base = .
 	 * to 32 bit.
 	 */
 
-	lidtl	boot_idt - r_base	# load idt with 0, 0
-	lgdtl	boot_gdt - r_base	# load gdt with whatever is appropriate
+	lidtl	boot_idt_descr - r_base	# load idt with 0, 0
+	lgdtl	boot_gdt_descr - r_base	# load gdt with whatever is appropriate
 
 	xor	%ax, %ax
 	inc	%ax		# protected mode (PE) bit
@@ -73,11 +73,11 @@ r_base = .
 
 	# These need to be in the same 64K segment as the above;
 	# hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt:
+boot_gdt_descr:
 	.word	__BOOT_DS + 7			# gdt limit
-	.long	boot_gdt_table-__PAGE_OFFSET	# gdt base
+	.long	boot_gdt - __PAGE_OFFSET	# gdt base
 
-boot_idt:
+boot_idt_descr:
 	.word	0				# idt limit = 0
 	.long	0				# idt base = 0L
 
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index af0d3f70a817..f21b41e7770c 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
 			      siginfo_t *info)
 {
 	struct task_struct *tsk = current;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = trapnr;
 
 	if (regs->eflags & VM_MASK) {
 		if (vm86)
@@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
 		goto kernel_trap;
 
 	trap_signal: {
+		/*
+		 * We want error_code and trap_no set for userspace faults and
+		 * kernelspace faults which result in die(), but not
+		 * kernelspace faults which are fixed up.  die() gives the
+		 * process no chance to handle the signal and notice the
+		 * kernel fault information, so that won't result in polluting
+		 * the information about previously queued, but not yet
+		 * delivered, faults.  See also do_general_protection below.
+		 */
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_no = trapnr;
+
 		if (info)
 			force_sig_info(signr, info, tsk);
 		else
@@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
 	}
 
 	kernel_trap: {
-		if (!fixup_exception(regs))
+		if (!fixup_exception(regs)) {
+			tsk->thread.error_code = error_code;
+			tsk->thread.trap_no = trapnr;
 			die(str, regs, error_code);
+		}
 		return;
 	}
 
@@ -583,7 +596,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
 	 * and we set the offset field correctly. Then we let the CPU to
 	 * restart the faulting instruction.
 	 */
-	if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
+	if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
 	    thread->io_bitmap_ptr) {
 		memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
 		       thread->io_bitmap_max);
@@ -596,16 +609,13 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
 				thread->io_bitmap_max, 0xff,
 				tss->io_bitmap_max - thread->io_bitmap_max);
 		tss->io_bitmap_max = thread->io_bitmap_max;
-		tss->io_bitmap_base = IO_BITMAP_OFFSET;
+		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 		tss->io_bitmap_owner = thread;
 		put_cpu();
 		return;
 	}
 	put_cpu();
 
-	current->thread.error_code = error_code;
-	current->thread.trap_no = 13;
-
 	if (regs->eflags & VM_MASK)
 		goto gp_in_vm86;
 
@@ -624,6 +634,8 @@ gp_in_vm86:
 
 gp_in_kernel:
 	if (!fixup_exception(regs)) {
+		current->thread.error_code = error_code;
+		current->thread.trap_no = 13;
 		if (notify_die(DIE_GPF, "general protection fault", regs,
 				error_code, 13, SIGSEGV) == NOTIFY_STOP)
 			return;
@@ -1018,9 +1030,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
 fastcall unsigned long patch_espfix_desc(unsigned long uesp,
 					  unsigned long kesp)
 {
-	int cpu = smp_processor_id();
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
 	unsigned long base = (kesp - uesp) & -THREAD_SIZE;
 	unsigned long new_kesp = kesp - base;
 	unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 6cb8f5336732..f64b81f3033b 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -200,13 +200,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
 {
 	struct cpufreq_freqs *freq = data;
 
-	if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
-		write_seqlock_irq(&xtime_lock);
-
 	if (!ref_freq) {
 		if (!freq->old){
 			ref_freq = freq->new;
-			goto end;
+			return 0;
 		}
 		ref_freq = freq->old;
 		loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
@@ -233,13 +230,10 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
 				 * TSC based sched_clock turns
 				 * to junk w/ cpufreq
 				 */
-				mark_tsc_unstable();
+				mark_tsc_unstable("cpufreq changes");
 			}
 		}
 	}
-end:
-	if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
-		write_sequnlock_irq(&xtime_lock);
 
 	return 0;
 }
@@ -281,11 +275,12 @@ static struct clocksource clocksource_tsc = {
 				  CLOCK_SOURCE_MUST_VERIFY,
 };
 
-void mark_tsc_unstable(void)
+void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;
+		printk("Marking TSC unstable due to: %s.\n", reason);
 		/* Can be called before registration */
 		if (clocksource_tsc.mult)
 			clocksource_change_rating(&clocksource_tsc, 0);
diff --git a/arch/i386/kernel/verify_cpu.S b/arch/i386/kernel/verify_cpu.S
new file mode 100644
index 000000000000..e51a8695d54e
--- /dev/null
+++ b/arch/i386/kernel/verify_cpu.S
@@ -0,0 +1,65 @@
+/* Check if CPU has some minimum CPUID bits
+   This runs in 16bit mode so that the caller can still use the BIOS
+   to output errors on the screen */
+#include <asm/cpufeature.h>
+
+verify_cpu:
+	pushfl				# Save caller passed flags
+	pushl	$0			# Kill any dangerous flags
+	popfl
+
+#if CONFIG_X86_MINIMUM_CPU_MODEL >= 4
+	pushfl
+	orl	$(1<<18),(%esp)		# try setting AC
+	popfl
+	pushfl
+	popl    %eax
+	testl	$(1<<18),%eax
+	jz	bad
+#endif
+#if REQUIRED_MASK1 != 0
+	pushfl				# standard way to check for cpuid
+	popl	%eax
+	movl	%eax,%ebx
+	xorl	$0x200000,%eax
+	pushl	%eax
+	popfl
+	pushfl
+	popl	%eax
+	cmpl	%eax,%ebx
+	pushfl				# standard way to check for cpuid
+	popl	%eax
+	movl	%eax,%ebx
+	xorl	$0x200000,%eax
+	pushl	%eax
+	popfl
+	pushfl
+	popl	%eax
+	cmpl	%eax,%ebx
+	jz	bad			# REQUIRED_MASK1 != 0 requires CPUID
+
+	movl	$0x0,%eax		# See if cpuid 1 is implemented
+	cpuid
+	cmpl	$0x1,%eax
+	jb	bad			# no cpuid 1
+
+	movl    $0x1,%eax		# Does the cpu have what it takes
+	cpuid
+
+#if CONFIG_X86_MINIMUM_CPU_MODEL > 4
+#error	add proper model checking here
+#endif
+
+	andl	$REQUIRED_MASK1,%edx
+	xorl	$REQUIRED_MASK1,%edx
+	jnz	bad
+#endif /* REQUIRED_MASK1 */
+
+	popfl
+	xor	%eax,%eax
+	ret
+
+bad:
+	popfl
+	movl	$1,%eax
+	ret
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index 697a70e8c0c9..c8726c424b35 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -26,6 +26,7 @@
 #include <linux/cpu.h>
 #include <linux/bootmem.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <asm/vmi.h>
 #include <asm/io.h>
 #include <asm/fixmap.h>
@@ -56,7 +57,7 @@ static int disable_noidle;
 static int disable_vmi_timer;
 
 /* Cached VMI operations */
-struct {
+static struct {
 	void (*cpuid)(void /* non-c */);
 	void (*_set_ldt)(u32 selector);
 	void (*set_tr)(u32 selector);
@@ -65,16 +66,15 @@ struct {
 	void (*release_page)(u32, u32);
 	void (*set_pte)(pte_t, pte_t *, unsigned);
 	void (*update_pte)(pte_t *, unsigned);
-	void (*set_linear_mapping)(int, u32, u32, u32);
-	void (*flush_tlb)(int);
+	void (*set_linear_mapping)(int, void *, u32, u32);
+	void (*_flush_tlb)(int);
 	void (*set_initial_ap_state)(int, int);
 	void (*halt)(void);
   	void (*set_lazy_mode)(int mode);
 } vmi_ops;
 
-/* XXX move this to alternative.h */
-extern struct paravirt_patch __start_parainstructions[],
-	__stop_parainstructions[];
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
 
 /*
  * VMI patching routines.
@@ -83,11 +83,6 @@ extern struct paravirt_patch __start_parainstructions[],
 #define MNEM_JMP  0xe9
 #define MNEM_RET  0xc3
 
-static char irq_save_disable_callout[] = {
-	MNEM_CALL, 0, 0, 0, 0,
-	MNEM_CALL, 0, 0, 0, 0,
-	MNEM_RET
-};
 #define IRQ_PATCH_INT_MASK 0
 #define IRQ_PATCH_DISABLE  5
 
@@ -135,33 +130,17 @@ static unsigned patch_internal(int call, unsigned len, void *insns)
 static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
 {
 	switch (type) {
-		case PARAVIRT_IRQ_DISABLE:
+		case PARAVIRT_PATCH(irq_disable):
 			return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
-		case PARAVIRT_IRQ_ENABLE:
+		case PARAVIRT_PATCH(irq_enable):
 			return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
-		case PARAVIRT_RESTORE_FLAGS:
+		case PARAVIRT_PATCH(restore_fl):
 			return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
-		case PARAVIRT_SAVE_FLAGS:
+		case PARAVIRT_PATCH(save_fl):
 			return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
-        	case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
-			if (len >= 10) {
-				patch_internal(VMI_CALL_GetInterruptMask, len, insns);
-				patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
-				return 10;
-			} else {
-				/*
-				 * You bastards didn't leave enough room to
-				 * patch save_flags_irq_disable inline.  Patch
-				 * to a helper
-				 */
-				BUG_ON(len < 5);
-				*(char *)insns = MNEM_CALL;
-				patch_offset(insns, irq_save_disable_callout);
-				return 5;
-			}
-		case PARAVIRT_INTERRUPT_RETURN:
+		case PARAVIRT_PATCH(iret):
 			return patch_internal(VMI_CALL_IRET, len, insns);
-		case PARAVIRT_STI_SYSEXIT:
+		case PARAVIRT_PATCH(irq_enable_sysexit):
 			return patch_internal(VMI_CALL_SYSEXIT, len, insns);
 		default:
 			break;
@@ -230,24 +209,24 @@ static void vmi_set_tr(void)
 static void vmi_load_esp0(struct tss_struct *tss,
 				   struct thread_struct *thread)
 {
-	tss->esp0 = thread->esp0;
+	tss->x86_tss.esp0 = thread->esp0;
 
 	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
-		tss->ss1 = thread->sysenter_cs;
+	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+		tss->x86_tss.ss1 = thread->sysenter_cs;
 		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 	}
-	vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
+	vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0);
 }
 
 static void vmi_flush_tlb_user(void)
 {
-	vmi_ops.flush_tlb(VMI_FLUSH_TLB);
+	vmi_ops._flush_tlb(VMI_FLUSH_TLB);
 }
 
 static void vmi_flush_tlb_kernel(void)
 {
-	vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
+	vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
 }
 
 /* Stub to do nothing at all; used for delays and unimplemented calls */
@@ -255,18 +234,6 @@ static void vmi_nop(void)
 {
 }
 
-/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
-static fastcall void vmi_safe_halt(void)
-{
-	int idle = vmi_stop_hz_timer();
-	vmi_ops.halt();
-	if (idle) {
-		local_irq_disable();
-		vmi_account_time_restart_hz_timer();
-		local_irq_enable();
-	}
-}
-
 #ifdef CONFIG_DEBUG_PAGE_TYPE
 
 #ifdef CONFIG_X86_PAE
@@ -370,8 +337,11 @@ static void vmi_check_page_type(u32 pfn, int type)
 #define vmi_check_page_type(p,t) do { } while (0)
 #endif
 
-static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn)
+#ifdef CONFIG_HIGHPTE
+static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 {
+	void *va = kmap_atomic(page, type);
+
 	/*
 	 * Internally, the VMI ROM must map virtual addresses to physical
 	 * addresses for processing MMU updates.  By the time MMU updates
@@ -385,8 +355,11 @@ static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn)
 	 *  args:                 SLOT                 VA    COUNT PFN
 	 */
 	BUG_ON(type != KM_PTE0 && type != KM_PTE1);
-	vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn);
+	vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
+
+	return va;
 }
+#endif
 
 static void vmi_allocate_pt(u32 pfn)
 {
@@ -443,13 +416,13 @@ static void vmi_release_pd(u32 pfn)
         ((level) | (is_current_as(mm, user) ?                           \
                 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
 
-static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
+static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
 	vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
 }
 
-static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
 	vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
@@ -462,7 +435,7 @@ static void vmi_set_pte(pte_t *ptep, pte_t pte)
 	vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
 }
 
-static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
 {
 	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
 	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
@@ -516,7 +489,7 @@ static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
 }
 
-void vmi_pmd_clear(pmd_t *pmd)
+static void vmi_pmd_clear(pmd_t *pmd)
 {
 	const pte_t pte = { 0 };
 	vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
@@ -525,8 +498,6 @@ void vmi_pmd_clear(pmd_t *pmd)
 #endif
 
 #ifdef CONFIG_SMP
-extern void setup_pda(void);
-
 static void __devinit
 vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 		     unsigned long start_esp)
@@ -551,13 +522,11 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 
 	ap.ds = __USER_DS;
 	ap.es = __USER_DS;
-	ap.fs = __KERNEL_PDA;
+	ap.fs = __KERNEL_PERCPU;
 	ap.gs = 0;
 
 	ap.eflags = 0;
 
-	setup_pda();
-
 #ifdef CONFIG_X86_PAE
 	/* efer should match BSP efer. */
 	if (cpu_has_nx) {
@@ -575,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 }
 #endif
 
-static void vmi_set_lazy_mode(int mode)
+static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
 {
-	static DEFINE_PER_CPU(int, lazy_mode);
+	static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
 
 	if (!vmi_ops.set_lazy_mode)
 		return;
@@ -685,7 +654,7 @@ void vmi_bringup(void)
 {
  	/* We must establish the lowmem mapping for MMU ops to work */
 	if (vmi_ops.set_linear_mapping)
-		vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
+		vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
 }
 
 /*
@@ -740,7 +709,6 @@ do {								\
 	}							\
 } while (0)
 
-
 /*
  * Activate the VMI interface and switch into paravirtualized mode
  */
@@ -796,12 +764,6 @@ static inline int __init activate_vmi(void)
 	para_fill(irq_disable, DisableInterrupts);
 	para_fill(irq_enable, EnableInterrupts);
 
-	/* irq_save_disable !!! sheer pain */
-	patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
-		     (char *)paravirt_ops.save_fl);
-	patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
-		     (char *)paravirt_ops.irq_disable);
-
 	para_fill(wbinvd, WBINVD);
 	para_fill(read_tsc, RDTSC);
 
@@ -831,8 +793,8 @@ static inline int __init activate_vmi(void)
 	para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
 
 	/* user and kernel flush are just handled with different flags to FlushTLB */
-	para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB);
-	para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB);
+	para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
+	para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
 	para_fill(flush_tlb_single, InvalPage);
 
 	/*
@@ -878,8 +840,13 @@ static inline int __init activate_vmi(void)
 		paravirt_ops.release_pt = vmi_release_pt;
 		paravirt_ops.release_pd = vmi_release_pd;
 	}
-	para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping,
-		  SetLinearMapping);
+
+	/* Set linear is needed in all cases */
+	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
+#ifdef CONFIG_HIGHPTE
+	if (vmi_ops.set_linear_mapping)
+		paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
+#endif
 
 	/*
 	 * These MUST always be patched.  Don't support indirect jumps
@@ -920,8 +887,8 @@ static inline int __init activate_vmi(void)
 		paravirt_ops.get_wallclock = vmi_get_wallclock;
 		paravirt_ops.set_wallclock = vmi_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
-		paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
+		paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
+		paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
 		paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
  		paravirt_ops.get_cpu_khz = vmi_cpu_khz;
@@ -933,11 +900,7 @@ static inline int __init activate_vmi(void)
 		disable_vmi_timer = 1;
 	}
 
-	/* No idle HZ mode only works if VMI timer and no idle is enabled */
-	if (disable_noidle || disable_vmi_timer)
-		para_fill(safe_halt, Halt);
-	else
-		para_wrap(safe_halt, vmi_safe_halt, halt, Halt);
+	para_fill(safe_halt, Halt);
 
 	/*
 	 * Alternative instruction rewriting doesn't happen soon enough
@@ -945,7 +908,7 @@ static inline int __init activate_vmi(void)
 	 * to do this before IRQs get reenabled.  Fortunately, it is
 	 * idempotent.
 	 */
-	apply_paravirt(__start_parainstructions, __stop_parainstructions);
+	apply_paravirt(__parainstructions, __parainstructions_end);
 
 	vmi_bringup();
 
diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c
new file mode 100644
index 000000000000..26a37f8a8762
--- /dev/null
+++ b/arch/i386/kernel/vmiclock.c
@@ -0,0 +1,318 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2007, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/cpumask.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+#include <asm/arch_hooks.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+
+#include <irq_vectors.h>
+#include "io_ports.h"
+
+#define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
+
+static DEFINE_PER_CPU(struct clock_event_device, local_events);
+
+static inline u32 vmi_counter(u32 flags)
+{
+	/* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
+	 * cycle counter. */
+	return flags & VMI_ALARM_COUNTER_MASK;
+}
+
+/* paravirt_ops.get_wallclock = vmi_get_wallclock */
+unsigned long vmi_get_wallclock(void)
+{
+	unsigned long long wallclock;
+	wallclock = vmi_timer_ops.get_wallclock(); // nsec
+	(void)do_div(wallclock, 1000000000);       // sec
+
+	return wallclock;
+}
+
+/* paravirt_ops.set_wallclock = vmi_set_wallclock */
+int vmi_set_wallclock(unsigned long now)
+{
+	return 0;
+}
+
+/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */
+unsigned long long vmi_get_sched_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+}
+
+/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */
+unsigned long vmi_cpu_khz(void)
+{
+	unsigned long long khz;
+	khz = vmi_timer_ops.get_cycle_frequency();
+	(void)do_div(khz, 1000);
+	return khz;
+}
+
+static inline unsigned int vmi_get_timer_vector(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+	return FIRST_DEVICE_VECTOR;
+#else
+	return FIRST_EXTERNAL_VECTOR;
+#endif
+}
+
+/** vmi clockchip */
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int startup_timer_irq(unsigned int irq)
+{
+	unsigned long val = apic_read(APIC_LVTT);
+	apic_write(APIC_LVTT, vmi_get_timer_vector());
+
+	return (val & APIC_SEND_PENDING);
+}
+
+static void mask_timer_irq(unsigned int irq)
+{
+	unsigned long val = apic_read(APIC_LVTT);
+	apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
+}
+
+static void unmask_timer_irq(unsigned int irq)
+{
+	unsigned long val = apic_read(APIC_LVTT);
+	apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
+}
+
+static void ack_timer_irq(unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+static struct irq_chip vmi_chip __read_mostly = {
+	.name 		= "VMI-LOCAL",
+	.startup 	= startup_timer_irq,
+	.mask	 	= mask_timer_irq,
+	.unmask	 	= unmask_timer_irq,
+	.ack 		= ack_timer_irq
+};
+#endif
+
+/** vmi clockevent */
+#define VMI_ALARM_WIRED_IRQ0    0x00000000
+#define VMI_ALARM_WIRED_LVTT    0x00010000
+static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
+
+static inline int vmi_get_alarm_wiring(void)
+{
+	return vmi_wiring;
+}
+
+static void vmi_timer_set_mode(enum clock_event_mode mode,
+			       struct clock_event_device *evt)
+{
+	cycle_t now, cycles_per_hz;
+	BUG_ON(!irqs_disabled());
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_ONESHOT:
+		break;
+	case CLOCK_EVT_MODE_PERIODIC:
+		cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
+		(void)do_div(cycles_per_hz, HZ);
+		now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
+		vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		switch (evt->mode) {
+		case CLOCK_EVT_MODE_ONESHOT:
+			vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
+			break;
+		case CLOCK_EVT_MODE_PERIODIC:
+			vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+static int vmi_timer_next_event(unsigned long delta,
+				struct clock_event_device *evt)
+{
+	/* Unfortunately, set_next_event interface only passes relative
+	 * expiry, but we want absolute expiry.  It'd be better if were
+	 * were passed an aboslute expiry, since a bunch of time may
+	 * have been stolen between the time the delta is computed and
+	 * when we set the alarm below. */
+	cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
+
+	BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+	vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
+	return 0;
+}
+
+static struct clock_event_device vmi_clockevent = {
+	.name		= "vmi-timer",
+	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+	.shift		= 22,
+	.set_mode	= vmi_timer_set_mode,
+	.set_next_event = vmi_timer_next_event,
+	.rating         = 1000,
+	.irq		= 0,
+};
+
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+	struct clock_event_device *evt = &__get_cpu_var(local_events);
+	evt->event_handler(evt);
+	return IRQ_HANDLED;
+}
+
+static struct irqaction vmi_clock_action  = {
+	.name 		= "vmi-timer",
+	.handler 	= vmi_timer_interrupt,
+	.flags 		= IRQF_DISABLED | IRQF_NOBALANCING,
+	.mask 		= CPU_MASK_ALL,
+};
+
+static void __devinit vmi_time_init_clockevent(void)
+{
+	cycle_t cycles_per_msec;
+	struct clock_event_device *evt;
+
+	int cpu = smp_processor_id();
+	evt = &__get_cpu_var(local_events);
+
+	/* Use cycles_per_msec since div_sc params are 32-bits. */
+	cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+	(void)do_div(cycles_per_msec, 1000);
+
+	memcpy(evt, &vmi_clockevent, sizeof(*evt));
+	/* Must pick .shift such that .mult fits in 32-bits.  Choosing
+	 * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
+	 * before overflow. */
+	evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
+	/* Upper bound is clockevent's use of ulong for cycle deltas. */
+	evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
+	evt->min_delta_ns = clockevent_delta2ns(1, evt);
+	evt->cpumask = cpumask_of_cpu(cpu);
+
+	printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
+	       evt->name, evt->mult, evt->shift);
+	clockevents_register_device(evt);
+}
+
+void __init vmi_time_init(void)
+{
+	/* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
+	outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+	vmi_time_init_clockevent();
+	setup_irq(0, &vmi_clock_action);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void __devinit vmi_time_bsp_init(void)
+{
+	/*
+	 * On APIC systems, we want local timers to fire on each cpu.  We do
+	 * this by programming LVTT to deliver timer events to the IRQ handler
+	 * for IRQ-0, since we can't re-use the APIC local timer handler
+	 * without interfering with that code.
+	 */
+	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+	local_irq_disable();
+#ifdef CONFIG_X86_SMP
+	/*
+	 * XXX handle_percpu_irq only defined for SMP; we need to switch over
+	 * to using it, since this is a local interrupt, which each CPU must
+	 * handle individually without locking out or dropping simultaneous
+	 * local timers on other CPUs.  We also don't want to trigger the
+	 * quirk workaround code for interrupts which gets invoked from
+	 * handle_percpu_irq via eoi, so we use our own IRQ chip.
+	 */
+	set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
+#else
+	set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
+#endif
+	vmi_wiring = VMI_ALARM_WIRED_LVTT;
+	apic_write(APIC_LVTT, vmi_get_timer_vector());
+	local_irq_enable();
+	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+}
+
+void __devinit vmi_time_ap_init(void)
+{
+	vmi_time_init_clockevent();
+	apic_write(APIC_LVTT, vmi_get_timer_vector());
+}
+#endif
+
+/** vmi clocksource */
+
+static cycle_t read_real_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static struct clocksource clocksource_vmi = {
+	.name			= "vmi-timer",
+	.rating			= 450,
+	.read			= read_real_cycles,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.flags			= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int __init init_vmi_clocksource(void)
+{
+	cycle_t cycles_per_msec;
+
+	if (!vmi_timer_ops.get_cycle_frequency)
+		return 0;
+	/* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
+	cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
+	(void)do_div(cycles_per_msec, 1000);
+
+	/* Note that clocksource.{mult, shift} converts in the opposite direction
+	 * as clockevents.  */
+	clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+						    clocksource_vmi.shift);
+
+	printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
+	return clocksource_register(&clocksource_vmi);
+
+}
+module_init(init_vmi_clocksource);
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
deleted file mode 100644
index 9dfb17739b67..000000000000
--- a/arch/i386/kernel/vmitime.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-/*
- * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
- * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
- * See comments there for proper credits.
- */
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/jiffies.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-#include <linux/rcupdate.h>
-#include <linux/clocksource.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/apic.h>
-#include <asm/div64.h>
-#include <asm/timer.h>
-#include <asm/desc.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-
-#include <mach_timer.h>
-#include <io_ports.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
-#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
-#else
-#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
-#endif
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-#ifdef CONFIG_NO_IDLE_HZ
-
-/* /proc/sys/kernel/hz_timer state. */
-int sysctl_hz_timer;
-
-/* Some stats */
-static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
-static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
-static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
-
-#endif /* CONFIG_NO_IDLE_HZ */
-
-/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
-static int alarm_hz = CONFIG_VMI_ALARM_HZ;
-
-/* Cache of the value get_cycle_frequency / HZ. */
-static signed long long cycles_per_jiffy;
-
-/* Cache of the value get_cycle_frequency / alarm_hz. */
-static signed long long cycles_per_alarm;
-
-/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
- * Protected by xtime_lock. */
-static unsigned long long real_cycles_accounted_system;
-
-/* The number of cycles accounted for by update_process_times(), per cpu. */
-static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
-
-/* The number of stolen cycles accounted, per cpu. */
-static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
-
-/* Clock source. */
-static cycle_t read_real_cycles(void)
-{
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
-}
-
-static cycle_t read_available_cycles(void)
-{
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
-}
-
-#if 0
-static cycle_t read_stolen_cycles(void)
-{
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
-}
-#endif  /*  0  */
-
-static struct clocksource clocksource_vmi = {
-	.name			= "vmi-timer",
-	.rating			= 450,
-	.read			= read_real_cycles,
-	.mask			= CLOCKSOURCE_MASK(64),
-	.mult			= 0, /* to be set */
-	.shift			= 22,
-	.flags			= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-
-/* Timer interrupt handler. */
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
-
-static struct irqaction vmi_timer_irq  = {
-	.handler = vmi_timer_interrupt,
-	.flags = IRQF_DISABLED,
-	.mask = CPU_MASK_NONE,
-	.name = "VMI-alarm",
-};
-
-/* Alarm rate */
-static int __init vmi_timer_alarm_rate_setup(char* str)
-{
-	int alarm_rate;
-	if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
-		alarm_hz = alarm_rate;
-		printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
-	}
-	return 1;
-}
-__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
-
-
-/* Initialization */
-static void vmi_get_wallclock_ts(struct timespec *ts)
-{
-	unsigned long long wallclock;
-	wallclock = vmi_timer_ops.get_wallclock(); // nsec units
-	ts->tv_nsec = do_div(wallclock, 1000000000);
-	ts->tv_sec = wallclock;
-}
-
-unsigned long vmi_get_wallclock(void)
-{
-	struct timespec ts;
-	vmi_get_wallclock_ts(&ts);
-	return ts.tv_sec;
-}
-
-int vmi_set_wallclock(unsigned long now)
-{
-	return -1;
-}
-
-unsigned long long vmi_get_sched_cycles(void)
-{
-	return read_available_cycles();
-}
-
-unsigned long vmi_cpu_khz(void)
-{
-	unsigned long long khz;
-
-	khz = vmi_timer_ops.get_cycle_frequency();
-	(void)do_div(khz, 1000);
-	return khz;
-}
-
-void __init vmi_time_init(void)
-{
-	unsigned long long cycles_per_sec, cycles_per_msec;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	setup_irq(0, &vmi_timer_irq);
-#ifdef CONFIG_X86_LOCAL_APIC
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
-#endif
-
-	real_cycles_accounted_system = read_real_cycles();
-	per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
-
-	cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
-	cycles_per_jiffy = cycles_per_sec;
-	(void)do_div(cycles_per_jiffy, HZ);
-	cycles_per_alarm = cycles_per_sec;
-	(void)do_div(cycles_per_alarm, alarm_hz);
-	cycles_per_msec = cycles_per_sec;
-	(void)do_div(cycles_per_msec, 1000);
-
-	printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
-	       "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
-	       cycles_per_alarm);
-
-	clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
-						    clocksource_vmi.shift);
-	if (clocksource_register(&clocksource_vmi))
-		printk(KERN_WARNING "Error registering VMITIME clocksource.");
-
-	/* Disable PIT. */
-	outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
-	/* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu
-	 * reduce the latency calling update_process_times. */
-	vmi_timer_ops.set_alarm(
-		      VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
-		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
-		      cycles_per_alarm);
-
-	local_irq_restore(flags);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-void __init vmi_timer_setup_boot_alarm(void)
-{
-	local_irq_disable();
-
-	/* Route the interrupt to the correct vector. */
-	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
-
-	/* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
-	vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
-	vmi_timer_ops.set_alarm(
-		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
-		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
-		      cycles_per_alarm);
-	local_irq_enable();
-}
-
-/* Initialize the time accounting variables for an AP on an SMP system.
- * Also, set the local alarm for the AP. */
-void __devinit vmi_timer_setup_secondary_alarm(void)
-{
-	int cpu = smp_processor_id();
-
-	/* Route the interrupt to the correct vector. */
-	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
-
-	per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
-
-	vmi_timer_ops.set_alarm(
-		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
-		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
-		      cycles_per_alarm);
-}
-
-#endif
-
-/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
-static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
-{
-	long long cycles_not_accounted;
-
-	write_seqlock(&xtime_lock);
-
-	cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
-	while (cycles_not_accounted >= cycles_per_jiffy) {
-		/* systems wide jiffies. */
-		do_timer(1);
-
-		cycles_not_accounted -= cycles_per_jiffy;
-		real_cycles_accounted_system += cycles_per_jiffy;
-	}
-
-	write_sequnlock(&xtime_lock);
-}
-
-/* Update per-cpu process times. */
-static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
-					     unsigned long long cur_process_times_cycles)
-{
-	long long cycles_not_accounted;
-	cycles_not_accounted = cur_process_times_cycles -
-		per_cpu(process_times_cycles_accounted_cpu, cpu);
-
-	while (cycles_not_accounted >= cycles_per_jiffy) {
-		/* Account time to the current process.  This includes
-		 * calling into the scheduler to decrement the timeslice
-		 * and possibly reschedule.*/
-		update_process_times(user_mode(regs));
-		/* XXX handle /proc/profile multiplier.  */
-		profile_tick(CPU_PROFILING);
-
-		cycles_not_accounted -= cycles_per_jiffy;
-		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
-	}
-}
-
-#ifdef CONFIG_NO_IDLE_HZ
-/* Update per-cpu idle times.  Used when a no-hz halt is ended. */
-static void vmi_account_no_hz_idle_cycles(int cpu,
-					  unsigned long long cur_process_times_cycles)
-{
-	long long cycles_not_accounted;
-	unsigned long no_idle_hz_jiffies = 0;
-
-	cycles_not_accounted = cur_process_times_cycles -
-		per_cpu(process_times_cycles_accounted_cpu, cpu);
-
-	while (cycles_not_accounted >= cycles_per_jiffy) {
-		no_idle_hz_jiffies++;
-		cycles_not_accounted -= cycles_per_jiffy;
-		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
-	}
-	/* Account time to the idle process. */
-	account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
-}
-#endif
-
-/* Update per-cpu stolen time. */
-static void vmi_account_stolen_cycles(int cpu,
-				      unsigned long long cur_real_cycles,
-				      unsigned long long cur_avail_cycles)
-{
-	long long stolen_cycles_not_accounted;
-	unsigned long stolen_jiffies = 0;
-
-	if (cur_real_cycles < cur_avail_cycles)
-		return;
-
-	stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
-		per_cpu(stolen_cycles_accounted_cpu, cpu);
-
-	while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
-		stolen_jiffies++;
-		stolen_cycles_not_accounted -= cycles_per_jiffy;
-		per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
-	}
-	/* HACK: pass NULL to force time onto cpustat->steal. */
-	account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
-}
-
-/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
- * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
-static void vmi_local_timer_interrupt(int cpu)
-{
-	unsigned long long cur_real_cycles, cur_process_times_cycles;
-
-	cur_real_cycles = read_real_cycles();
-	cur_process_times_cycles = read_available_cycles();
-	/* Update system wide (real) time state (xtime, jiffies). */
-	vmi_account_real_cycles(cur_real_cycles);
-	/* Update per-cpu process times. */
-	vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
-        /* Update time stolen from this cpu by the hypervisor. */
-	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
-}
-
-#ifdef CONFIG_NO_IDLE_HZ
-
-/* Must be called only from idle loop, with interrupts disabled. */
-int vmi_stop_hz_timer(void)
-{
-	/* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
-
-	unsigned long seq, next;
-	unsigned long long real_cycles_expiry;
-	int cpu = smp_processor_id();
-
-	BUG_ON(!irqs_disabled());
-	if (sysctl_hz_timer != 0)
-		return 0;
-
-	cpu_set(cpu, nohz_cpu_mask);
-	smp_mb();
-
-	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
-	    (next = next_timer_interrupt(),
-	     time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) {
-		cpu_clear(cpu, nohz_cpu_mask);
-		return 0;
-	}
-
-	/* Convert jiffies to the real cycle counter. */
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		real_cycles_expiry = real_cycles_accounted_system +
-			(long)(next - jiffies) * cycles_per_jiffy;
-	} while (read_seqretry(&xtime_lock, seq));
-
-	/* This cpu is going idle. Disable the periodic alarm. */
-	vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
-	per_cpu(idle_start_jiffies, cpu) = jiffies;
-	/* Set the real time alarm to expire at the next event. */
-	vmi_timer_ops.set_alarm(
-		VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
-		real_cycles_expiry, 0);
-	return 1;
-}
-
-static void vmi_reenable_hz_timer(int cpu)
-{
-	/* For /proc/vmi/info idle_hz stat. */
-	per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
-	per_cpu(vmi_idle_no_hz_irqs, cpu)++;
-
-	/* Don't bother explicitly cancelling the one-shot alarm -- at
-	 * worse we will receive a spurious timer interrupt. */
-	vmi_timer_ops.set_alarm(
-		      VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
-		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
-		      cycles_per_alarm);
-	/* Indicate this cpu is no longer nohz idle. */
-	cpu_clear(cpu, nohz_cpu_mask);
-}
-
-/* Called from interrupt handlers when (local) HZ timer is disabled. */
-void vmi_account_time_restart_hz_timer(void)
-{
-	unsigned long long cur_real_cycles, cur_process_times_cycles;
-	int cpu = smp_processor_id();
-
-	BUG_ON(!irqs_disabled());
-	/* Account the time during which the HZ timer was disabled. */
-	cur_real_cycles = read_real_cycles();
-	cur_process_times_cycles = read_available_cycles();
-	/* Update system wide (real) time state (xtime, jiffies). */
-	vmi_account_real_cycles(cur_real_cycles);
-	/* Update per-cpu idle times. */
-	vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
-        /* Update time stolen from this cpu by the hypervisor. */
-	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
-	/* Reenable the hz timer. */
-	vmi_reenable_hz_timer(cpu);
-}
-
-#endif /* CONFIG_NO_IDLE_HZ */
-
-/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
- * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after
- * APIC setup and setup_boot_vmi_alarm() is called.  */
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
-	vmi_local_timer_interrupt(smp_processor_id());
-	return IRQ_HANDLED;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
- * Also used in UP when CONFIG_X86_LOCAL_APIC.
- * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
-void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	int cpu = smp_processor_id();
-
-	/*
-	 * the NMI deadlock-detector uses this.
-	 */
-        per_cpu(irq_stat,cpu).apic_timer_irqs++;
-
-	/*
-	 * NOTE! We'd better ACK the irq immediately,
-	 * because timer handling can be slow.
-	 */
-	ack_APIC_irq();
-
-	/*
-	 * update_process_times() expects us to have done irq_enter().
-	 * Besides, if we don't timer interrupts ignore the global
-	 * interrupt lock, which is the WrongThing (tm) to do.
-	 */
-	irq_enter();
-	vmi_local_timer_interrupt(cpu);
-	irq_exit();
-	set_irq_regs(old_regs);
-}
-
-#endif  /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 6f38f818380b..23e8614edeee 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -26,12 +26,11 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
 OUTPUT_ARCH(i386)
 ENTRY(phys_startup_32)
 jiffies = jiffies_64;
-_proxy_pda = 1;
 
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
 	data PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(4);	/* R__ */
+	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
 {
@@ -61,8 +60,6 @@ SECTIONS
   	__stop___ex_table = .;
   }
 
-  RODATA
-
   BUG_TABLE
 
   . = ALIGN(4);
@@ -72,6 +69,8 @@ SECTIONS
   	__tracedata_end = .;
   }
 
+  RODATA
+
   /* writeable */
   . = ALIGN(4096);
   .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
@@ -117,22 +116,11 @@ SECTIONS
 
   /* might get freed after init */
   . = ALIGN(4096);
-  .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
-	__smp_alt_begin = .;
-	__smp_alt_instructions = .;
-	*(.smp_altinstructions)
-	__smp_alt_instructions_end = .;
-  }
-  . = ALIGN(4);
   .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
   	__smp_locks = .;
 	*(.smp_locks)
 	__smp_locks_end = .;
   }
-  .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
-	*(.smp_altinstr_replacement)
-  	__smp_alt_end = .;
-  }
   /* will be freed after init
    * Following ALIGN() is required to make sure no other data falls on the
    * same page where __smp_alt_end is pointing as that page might be freed
@@ -178,9 +166,9 @@ SECTIONS
   }
   . = ALIGN(4);
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-  	__start_parainstructions = .;
+  	__parainstructions = .;
 	*(.parainstructions)
-  	__stop_parainstructions = .;
+  	__parainstructions_end = .;
   }
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
@@ -194,7 +182,7 @@ SECTIONS
 	__initramfs_end = .;
   }
 #endif
-  . = ALIGN(L1_CACHE_BYTES);
+  . = ALIGN(4096);
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
 	__per_cpu_start = .;
 	*(.data.percpu)
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index f66cd11adb72..4a8b0ed9b8fb 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,7 +7,7 @@
 
 SECTIONS
 {
-  . = VDSO_PRELINK + SIZEOF_HEADERS;
+  . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
 
   .hash           : { *(.hash) }		:text
   .gnu.hash       : { *(.gnu.hash) }
@@ -21,7 +21,7 @@ SECTIONS
      For the layouts to match, we need to skip more than enough
      space for the dynamic symbol table et al.  If this amount
      is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VDSO_PRELINK + 0x400;
+  . = VDSO_PRELINK_asm + 0x400;
 
   .text           : { *(.text) }		:text =0x90909090
   .note		  : { *(.note.*) }		:text :note