49 files changed, 2584 insertions, 1676 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 5427a842e841..1a884b6e6e5c 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -4,7 +4,7 @@
 
 extra-y := head.o init_task.o vmlinux.lds
 
-obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
+obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
 		pci-dma.o i386_ksyms.o i387.o bootflag.o \
 		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
@@ -81,4 +81,5 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
 	$(call if_changed,syscall)
 
 k8-y                      += ../../x86_64/kernel/k8.o
+stacktrace-y		  += ../../x86_64/kernel/stacktrace.o
 
diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile
index 7e9ac99354f4..7f7be01f44e6 100644
--- a/arch/i386/kernel/acpi/Makefile
+++ b/arch/i386/kernel/acpi/Makefile
@@ -1,5 +1,7 @@
 obj-$(CONFIG_ACPI)		+= boot.o
+ifneq ($(CONFIG_PCI),)
 obj-$(CONFIG_X86_IO_APIC)	+= earlyquirk.o
+endif
 obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup.o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index ee003bc0e8b1..1aaea6ab8c46 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -26,9 +26,12 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <linux/efi.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
 
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
@@ -36,11 +39,17 @@
 #include <asm/io.h>
 #include <asm/mpspec.h>
 
-#ifdef	CONFIG_X86_64
+static int __initdata acpi_force = 0;
 
-extern void __init clustered_apic_check(void);
+#ifdef	CONFIG_ACPI
+int acpi_disabled = 0;
+#else
+int acpi_disabled = 1;
+#endif
+EXPORT_SYMBOL(acpi_disabled);
+
+#ifdef	CONFIG_X86_64
 
-extern int gsi_irq_sharing(int gsi);
 #include <asm/proto.h>
 
 static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
@@ -506,16 +515,76 @@ EXPORT_SYMBOL(acpi_register_gsi);
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 int acpi_map_lsapic(acpi_handle handle, int *pcpu)
 {
-	/* TBD */
-	return -EINVAL;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+	struct acpi_table_lapic *lapic;
+	cpumask_t tmp_map, new_map;
+	u8 physid;
+	int cpu;
+
+	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+		return -EINVAL;
+
+	if (!buffer.length || !buffer.pointer)
+		return -EINVAL;
+
+	obj = buffer.pointer;
+	if (obj->type != ACPI_TYPE_BUFFER ||
+	    obj->buffer.length < sizeof(*lapic)) {
+		kfree(buffer.pointer);
+		return -EINVAL;
+	}
+
+	lapic = (struct acpi_table_lapic *)obj->buffer.pointer;
+
+	if ((lapic->header.type != ACPI_MADT_LAPIC) ||
+	    (!lapic->flags.enabled)) {
+		kfree(buffer.pointer);
+		return -EINVAL;
+	}
+
+	physid = lapic->id;
+
+	kfree(buffer.pointer);
+	buffer.length = ACPI_ALLOCATE_BUFFER;
+	buffer.pointer = NULL;
+
+	tmp_map = cpu_present_map;
+	mp_register_lapic(physid, lapic->flags.enabled);
+
+	/*
+	 * If mp_register_lapic successfully generates a new logical cpu
+	 * number, then the following will get us exactly what was mapped
+	 */
+	cpus_andnot(new_map, cpu_present_map, tmp_map);
+	if (cpus_empty(new_map)) {
+		printk ("Unable to map lapic to logical cpu number\n");
+		return -EINVAL;
+	}
+
+	cpu = first_cpu(new_map);
+
+	*pcpu = cpu;
+	return 0;
 }
 
 EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
 {
-	/* TBD */
-	return -EINVAL;
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) {
+			x86_acpiid_to_apicid[i] = -1;
+			break;
+		}
+	}
+	x86_cpu_to_apicid[cpu] = -1;
+	cpu_clear(cpu, cpu_present_map);
+	num_processors--;
+
+	return (0);
 }
 
 EXPORT_SYMBOL(acpi_unmap_lsapic);
@@ -579,6 +648,8 @@ static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
 static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
 {
 	struct acpi_table_hpet *hpet_tbl;
+	struct resource *hpet_res;
+	resource_size_t res_start;
 
 	if (!phys || !size)
 		return -EINVAL;
@@ -594,12 +665,26 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
 		       "memory.\n");
 		return -1;
 	}
+
+#define HPET_RESOURCE_NAME_SIZE 9
+	hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+	if (hpet_res) {
+		memset(hpet_res, 0, sizeof(*hpet_res));
+		hpet_res->name = (void *)&hpet_res[1];
+		hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE,
+			 "HPET %u", hpet_tbl->number);
+		hpet_res->end = (1 * 1024) - 1;
+	}
+
 #ifdef	CONFIG_X86_64
 	vxtime.hpet_address = hpet_tbl->addr.addrl |
 	    ((long)hpet_tbl->addr.addrh << 32);
 
 	printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
 	       hpet_tbl->id, vxtime.hpet_address);
+
+	res_start = vxtime.hpet_address;
 #else				/* X86 */
 	{
 		extern unsigned long hpet_address;
@@ -607,9 +692,17 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
 		hpet_address = hpet_tbl->addr.addrl;
 		printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
 		       hpet_tbl->id, hpet_address);
+
+		res_start = hpet_address;
 	}
 #endif				/* X86 */
 
+	if (hpet_res) {
+		hpet_res->start = res_start;
+		hpet_res->end += res_start;
+		insert_resource(&iomem_resource, hpet_res);
+	}
+
 	return 0;
 }
 #else
@@ -860,8 +953,6 @@ static void __init acpi_process_madt(void)
 	return;
 }
 
-extern int acpi_force;
-
 #ifdef __i386__
 
 static int __init disable_acpi_irq(struct dmi_system_id *d)
@@ -1163,3 +1254,75 @@ int __init acpi_boot_init(void)
 
 	return 0;
 }
+
+static int __init parse_acpi(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	/* "acpi=off" disables both ACPI table parsing and interpreter */
+	if (strcmp(arg, "off") == 0) {
+		disable_acpi();
+	}
+	/* acpi=force to over-ride black-list */
+	else if (strcmp(arg, "force") == 0) {
+		acpi_force = 1;
+		acpi_ht = 1;
+		acpi_disabled = 0;
+	}
+	/* acpi=strict disables out-of-spec workarounds */
+	else if (strcmp(arg, "strict") == 0) {
+		acpi_strict = 1;
+	}
+	/* Limit ACPI just to boot-time to enable HT */
+	else if (strcmp(arg, "ht") == 0) {
+		if (!acpi_force)
+			disable_acpi();
+		acpi_ht = 1;
+	}
+	/* "acpi=noirq" disables ACPI interrupt routing */
+	else if (strcmp(arg, "noirq") == 0) {
+		acpi_noirq_set();
+	} else {
+		/* Core will printk when we return error. */
+		return -EINVAL;
+	}
+	return 0;
+}
+early_param("acpi", parse_acpi);
+
+/* FIXME: Using pci= for an ACPI parameter is a travesty. */
+static int __init parse_pci(char *arg)
+{
+	if (arg && strcmp(arg, "noacpi") == 0)
+		acpi_disable_pci();
+	return 0;
+}
+early_param("pci", parse_pci);
+
+#ifdef CONFIG_X86_IO_APIC
+static int __init parse_acpi_skip_timer_override(char *arg)
+{
+	acpi_skip_timer_override = 1;
+	return 0;
+}
+early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+#endif /* CONFIG_X86_IO_APIC */
+
+static int __init setup_acpi_sci(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "edge"))
+		acpi_sci_flags.trigger = 1;
+	else if (!strcmp(s, "level"))
+		acpi_sci_flags.trigger = 3;
+	else if (!strcmp(s, "high"))
+		acpi_sci_flags.polarity = 1;
+	else if (!strcmp(s, "low"))
+		acpi_sci_flags.polarity = 3;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("acpi_sci", setup_acpi_sci);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 1649a175a206..fe799b11ac0a 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -48,7 +48,11 @@ void __init check_acpi_pci(void)
 	int num, slot, func;
 
 	/* Assume the machine supports type 1. If not it will 
-	   always read ffffffff and should not have any side effect. */
+	   always read ffffffff and should not have any side effect.
+	   Actually a few buggy systems can machine check. Allow the user
+	   to disable it by command line option at least -AK */
+	if (!early_pci_allowed())
+		return;
 
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 8c844d07862f..90faae5c5d30 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -52,7 +52,18 @@ static cpumask_t timer_bcast_ipi;
 /*
  * Knob to control our willingness to enable the local APIC.
  */
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+
+static inline void lapic_disable(void)
+{
+	enable_local_apic = -1;
+	clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+}
+
+static inline void lapic_enable(void)
+{
+	enable_local_apic = 1;
+}
 
 /*
  * Debug level
@@ -586,8 +597,7 @@ void __devinit setup_local_APIC(void)
 			printk("No ESR for 82489DX.\n");
 	}
 
-	if (nmi_watchdog == NMI_LOCAL_APIC)
-		setup_apic_nmi_watchdog();
+	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }
 
@@ -1373,3 +1383,18 @@ int __init APIC_init_uniprocessor (void)
 
 	return 0;
 }
+
+static int __init parse_lapic(char *arg)
+{
+	lapic_enable();
+	return 0;
+}
+early_param("lapic", parse_lapic);
+
+static int __init parse_nolapic(char *arg)
+{
+	lapic_disable();
+	return 0;
+}
+early_param("nolapic", parse_nolapic);
+
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 8591f2fa920c..ff9ce4b5eaa8 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -1154,9 +1154,11 @@ out:
 
 static void set_time(void)
 {
+	struct timespec ts;
 	if (got_clock_diff) {	/* Must know time zone in order to set clock */
-		xtime.tv_sec = get_cmos_time() + clock_cmos_diff;
-		xtime.tv_nsec = 0; 
+		ts.tv_sec = get_cmos_time() + clock_cmos_diff;
+		ts.tv_nsec = 0;
+		do_settimeofday(&ts);
 	} 
 }
 
@@ -1232,13 +1234,8 @@ static int suspend(int vetoable)
 	restore_processor_state();
 
 	local_irq_disable();
-	write_seqlock(&xtime_lock);
-	spin_lock(&i8253_lock);
-	reinit_timer();
 	set_time();
-
-	spin_unlock(&i8253_lock);
-	write_sequnlock(&xtime_lock);
+	reinit_timer();
 
 	if (err == APM_NO_ERROR)
 		err = APM_SUCCESS;
@@ -1365,9 +1362,7 @@ static void check_events(void)
 			ignore_bounce = 1;
 			if ((event != APM_NORMAL_RESUME)
 			    || (ignore_normal_resume == 0)) {
-				write_seqlock_irq(&xtime_lock);
 				set_time();
-				write_sequnlock_irq(&xtime_lock);
 				device_resume();
 				pm_send_all(PM_RESUME, (void *)0);
 				queue_event(event, NULL);
@@ -1383,9 +1378,7 @@ static void check_events(void)
 			break;
 
 		case APM_UPDATE_TIME:
-			write_seqlock_irq(&xtime_lock);
 			set_time();
-			write_sequnlock_irq(&xtime_lock);
 			break;
 
 		case APM_CRITICAL_SUSPEND:
@@ -2339,6 +2332,7 @@ static int __init apm_init(void)
 	ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD);
 	if (ret < 0) {
 		printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n");
+		remove_proc_entry("apm", NULL);
 		return -ENOMEM;
 	}
 
@@ -2348,7 +2342,13 @@ static int __init apm_init(void)
 		return 0;
 	}
 
-	misc_register(&apm_device);
+	/*
+	 * Note we don't actually care if the misc_device cannot be registered.
+	 * this driver can do its job without it, even if userspace can't
+	 * control it.  just log the error
+	 */
+	if (misc_register(&apm_device))
+		printk(KERN_WARNING "apm: Could not register misc device.\n");
 
 	if (HZ != 100)
 		idle_period = (idle_period * HZ) / 100;
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e6a2d6b80cda..e4758095d87a 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -22,7 +22,7 @@
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
-static void __init init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int mbytes = num_physpages >> (20-PAGE_SHIFT);
@@ -246,7 +246,7 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 		num_cache_leaves = 3;
 }
 
-static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
 {
 	/* AMD errata T13 (order #21922) */
 	if ((c->x86 == 6)) {
@@ -259,7 +259,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
 	return size;
 }
 
-static struct cpu_dev amd_cpu_dev __initdata = {
+static struct cpu_dev amd_cpu_dev __cpuinitdata = {
 	.c_vendor	= "AMD",
 	.c_ident 	= { "AuthenticAMD" },
 	.c_models = {
@@ -275,7 +275,6 @@ static struct cpu_dev amd_cpu_dev __initdata = {
 		},
 	},
 	.c_init		= init_amd,
-	.c_identify	= generic_identify,
 	.c_size_cache	= amd_size_cache,
 };
 
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
index bd75629dd262..8c25047975c0 100644
--- a/arch/i386/kernel/cpu/centaur.c
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -9,7 +9,7 @@
 
 #ifdef CONFIG_X86_OOSTORE
 
-static u32 __init power2(u32 x)
+static u32 __cpuinit power2(u32 x)
 {
 	u32 s=1;
 	while(s<=x)
@@ -22,7 +22,7 @@ static u32 __init power2(u32 x)
  *	Set up an actual MCR
  */
  
-static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key)
+static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
 {
 	u32 lo, hi;
 	
@@ -40,7 +40,7 @@ static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key)
  *	Shortcut: We know you can't put 4Gig of RAM on a winchip
  */
 
-static u32 __init ramtop(void)		/* 16388 */
+static u32 __cpuinit ramtop(void)		/* 16388 */
 {
 	int i;
 	u32 top = 0;
@@ -91,7 +91,7 @@ static u32 __init ramtop(void)		/* 16388 */
  *	Compute a set of MCR's to give maximum coverage
  */
 
-static int __init centaur_mcr_compute(int nr, int key)
+static int __cpuinit centaur_mcr_compute(int nr, int key)
 {
 	u32 mem = ramtop();
 	u32 root = power2(mem);
@@ -166,7 +166,7 @@ static int __init centaur_mcr_compute(int nr, int key)
 	return ct;
 }
 
-static void __init centaur_create_optimal_mcr(void)
+static void __cpuinit centaur_create_optimal_mcr(void)
 {
 	int i;
 	/*
@@ -189,7 +189,7 @@ static void __init centaur_create_optimal_mcr(void)
 		wrmsr(MSR_IDT_MCR0+i, 0, 0);
 }
 
-static void __init winchip2_create_optimal_mcr(void)
+static void __cpuinit winchip2_create_optimal_mcr(void)
 {
 	u32 lo, hi;
 	int i;
@@ -227,7 +227,7 @@ static void __init winchip2_create_optimal_mcr(void)
  *	Handle the MCR key on the Winchip 2.
  */
 
-static void __init winchip2_unprotect_mcr(void)
+static void __cpuinit winchip2_unprotect_mcr(void)
 {
 	u32 lo, hi;
 	u32 key;
@@ -239,7 +239,7 @@ static void __init winchip2_unprotect_mcr(void)
 	wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
 }
 
-static void __init winchip2_protect_mcr(void)
+static void __cpuinit winchip2_protect_mcr(void)
 {
 	u32 lo, hi;
 	
@@ -257,7 +257,7 @@ static void __init winchip2_protect_mcr(void)
 #define RNG_ENABLED	(1 << 3)
 #define RNG_ENABLE	(1 << 6)	/* MSR_VIA_RNG */
 
-static void __init init_c3(struct cpuinfo_x86 *c)
+static void __cpuinit init_c3(struct cpuinfo_x86 *c)
 {
 	u32  lo, hi;
 
@@ -303,7 +303,7 @@ static void __init init_c3(struct cpuinfo_x86 *c)
 	display_cacheinfo(c);
 }
 
-static void __init init_centaur(struct cpuinfo_x86 *c)
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
 	enum {
 		ECX8=1<<1,
@@ -442,7 +442,7 @@ static void __init init_centaur(struct cpuinfo_x86 *c)
 	}
 }
 
-static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
 {
 	/* VIA C3 CPUs (670-68F) need further shifting. */
 	if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
@@ -457,7 +457,7 @@ static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size
 	return size;
 }
 
-static struct cpu_dev centaur_cpu_dev __initdata = {
+static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Centaur",
 	.c_ident	= { "CentaurHauls" },
 	.c_init		= init_centaur,
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 70c87de582c7..2799baaadf45 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -36,7 +36,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
 
 extern int disable_pse;
 
-static void default_init(struct cpuinfo_x86 * c)
+static void __cpuinit default_init(struct cpuinfo_x86 * c)
 {
 	/* Not much we can do here... */
 	/* Check if at least it has cpuid */
@@ -49,7 +49,7 @@ static void default_init(struct cpuinfo_x86 * c)
 	}
 }
 
-static struct cpu_dev default_cpu = {
+static struct cpu_dev __cpuinitdata default_cpu = {
 	.c_init	= default_init,
 	.c_vendor = "Unknown",
 };
@@ -265,7 +265,7 @@ static void __init early_cpu_detect(void)
 	}
 }
 
-void __cpuinit generic_identify(struct cpuinfo_x86 * c)
+static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
 	int ebx;
@@ -675,7 +675,7 @@ old_gdt:
 #endif
 
 	/* Clear %fs and %gs. */
-	asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
+	asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
 
 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h
index 5a1d4f163e84..2f6432cef6ff 100644
--- a/arch/i386/kernel/cpu/cpu.h
+++ b/arch/i386/kernel/cpu/cpu.h
@@ -24,7 +24,5 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
 extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
-extern void generic_identify(struct cpuinfo_x86 * c);
-
 extern void early_intel_workaround(struct cpuinfo_x86 *c);
 
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index f03b7f94c304..c0c3b59de32c 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -12,7 +12,7 @@
 /*
  * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
  */
-static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
 {
 	unsigned char ccr2, ccr3;
 	unsigned long flags;
@@ -52,25 +52,25 @@ static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
  * Actually since bugs.h doesn't even reference this perhaps someone should
  * fix the documentation ???
  */
-static unsigned char Cx86_dir0_msb __initdata = 0;
+static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
 
-static char Cx86_model[][9] __initdata = {
+static char Cx86_model[][9] __cpuinitdata = {
 	"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
 	"M II ", "Unknown"
 };
-static char Cx486_name[][5] __initdata = {
+static char Cx486_name[][5] __cpuinitdata = {
 	"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
 	"SRx2", "DRx2"
 };
-static char Cx486S_name[][4] __initdata = {
+static char Cx486S_name[][4] __cpuinitdata = {
 	"S", "S2", "Se", "S2e"
 };
-static char Cx486D_name[][4] __initdata = {
+static char Cx486D_name[][4] __cpuinitdata = {
 	"DX", "DX2", "?", "?", "?", "DX4"
 };
-static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __initdata = "12??43";
-static char cyrix_model_mult2[] __initdata = "12233445";
+static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
+static char cyrix_model_mult1[] __cpuinitdata = "12??43";
+static char cyrix_model_mult2[] __cpuinitdata = "12233445";
 
 /*
  * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -82,7 +82,7 @@ static char cyrix_model_mult2[] __initdata = "12233445";
 
 extern void calibrate_delay(void) __init;
 
-static void __init check_cx686_slop(struct cpuinfo_x86 *c)
+static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
 {
 	unsigned long flags;
 	
@@ -107,7 +107,7 @@ static void __init check_cx686_slop(struct cpuinfo_x86 *c)
 }
 
 
-static void __init set_cx86_reorder(void)
+static void __cpuinit set_cx86_reorder(void)
 {
 	u8 ccr3;
 
@@ -122,7 +122,7 @@ static void __init set_cx86_reorder(void)
 	setCx86(CX86_CCR3, ccr3);
 }
 
-static void __init set_cx86_memwb(void)
+static void __cpuinit set_cx86_memwb(void)
 {
 	u32 cr0;
 
@@ -137,7 +137,7 @@ static void __init set_cx86_memwb(void)
 	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
 }
 
-static void __init set_cx86_inc(void)
+static void __cpuinit set_cx86_inc(void)
 {
 	unsigned char ccr3;
 
@@ -158,7 +158,7 @@ static void __init set_cx86_inc(void)
  *	Configure later MediaGX and/or Geode processor.
  */
 
-static void __init geode_configure(void)
+static void __cpuinit geode_configure(void)
 {
 	unsigned long flags;
 	u8 ccr3, ccr4;
@@ -184,14 +184,14 @@ static void __init geode_configure(void)
 
 
 #ifdef CONFIG_PCI
-static struct pci_device_id __initdata cyrix_55x0[] = {
+static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
 	{ },
 };
 #endif
 
-static void __init init_cyrix(struct cpuinfo_x86 *c)
+static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 {
 	unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
 	char *buf = c->x86_model_id;
@@ -346,7 +346,7 @@ static void __init init_cyrix(struct cpuinfo_x86 *c)
 /*
  * Handle National Semiconductor branded processors
  */
-static void __init init_nsc(struct cpuinfo_x86 *c)
+static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
 {
 	/* There may be GX1 processors in the wild that are branded
 	 * NSC and not Cyrix.
@@ -394,7 +394,7 @@ static inline int test_cyrix_52div(void)
 	return (unsigned char) (test >> 8) == 0x02;
 }
 
-static void cyrix_identify(struct cpuinfo_x86 * c)
+static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
 {
 	/* Detect Cyrix with disabled CPUID */
 	if ( c->x86 == 4 && test_cyrix_52div() ) {
@@ -427,10 +427,9 @@ static void cyrix_identify(struct cpuinfo_x86 * c)
 			local_irq_restore(flags);
 		}
 	}
-	generic_identify(c);
 }
 
-static struct cpu_dev cyrix_cpu_dev __initdata = {
+static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Cyrix",
 	.c_ident 	= { "CyrixInstead" },
 	.c_init		= init_cyrix,
@@ -453,11 +452,10 @@ static int __init cyrix_exit_cpu(void)
 
 late_initcall(cyrix_exit_cpu);
 
-static struct cpu_dev nsc_cpu_dev __initdata = {
+static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
 	.c_vendor	= "NSC",
 	.c_ident 	= { "Geode by NSC" },
 	.c_init		= init_nsc,
-	.c_identify	= generic_identify,
 };
 
 int __init nsc_init_cpu(void)
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 5a2e270924b1..94a95aa5227e 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -198,7 +198,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 }
 
 
-static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
 {
 	/* Intel PIII Tualatin. This comes in two flavours.
 	 * One has 256kb of cache, the other 512. We have no way
@@ -263,7 +263,6 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
 		},
 	},
 	.c_init		= init_intel,
-	.c_identify	= generic_identify,
 	.c_size_cache	= intel_size_cache,
 };
 
diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile
index 30808f3d6715..f1ebe1c1c17a 100644
--- a/arch/i386/kernel/cpu/mcheck/Makefile
+++ b/arch/i386/kernel/cpu/mcheck/Makefile
@@ -1,2 +1,2 @@
-obj-y	=	mce.o k7.o p4.o p5.o p6.o winchip.o
+obj-y	=	mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+=	non-fatal.o
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index b95f1b3d53aa..504434a46011 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -13,6 +13,8 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 
+#include <asm/therm_throt.h>
+
 #include "mce.h"
 
 /* as supported by the P4/Xeon family */
@@ -44,25 +46,12 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
 /* P4/Xeon Thermal transition interrupt handler */
 static void intel_thermal_interrupt(struct pt_regs *regs)
 {
-	u32 l, h;
-	unsigned int cpu = smp_processor_id();
-	static unsigned long next[NR_CPUS];
+	__u64 msr_val;
 
 	ack_APIC_irq();
 
-	if (time_after(next[cpu], jiffies))
-		return;
-
-	next[cpu] = jiffies + HZ*5;
-	rdmsr(MSR_IA32_THERM_STATUS, l, h);
-	if (l & 0x1) {
-		printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
-		printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
-				cpu);
-		add_taint(TAINT_MACHINE_CHECK);
-	} else {
-		printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
-	}
+	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+	therm_throt_process(msr_val & 0x1);
 }
 
 /* Thermal interrupt handler for this CPU setup */
@@ -122,10 +111,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	
 	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
 	wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
-	
+
 	l = apic_read (APIC_LVTTHMR);
 	apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 	printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+
+	/* enable thermal throttle processing */
+	atomic_set(&therm_throt_en, 1);
 	return;
 }
 #endif /* CONFIG_X86_MCE_P4THERMAL */
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 000000000000..4f43047de406
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,180 @@
+/*
+ * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c
+ *
+ * Thermal throttle event support code (such as syslog messaging and rate
+ * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ * This allows consistent reporting of CPU thermal throttle events.
+ *
+ * Maintains a counter in /sys that keeps track of the number of thermal
+ * events, such that the user knows how bad the thermal problem might be
+ * (since the logging to syslog and mcelog is rate limited).
+ *
+ * Author: Dmitriy Zavin (dmitriyz@google.com)
+ *
+ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
+ *          Inspired by Ross Biro's and Al Borchers' counter code.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <asm/cpu.h>
+#include <linux/notifier.h>
+#include <asm/therm_throt.h>
+
+/* How long to wait between reporting thermal events */
+#define CHECK_INTERVAL              (300 * HZ)
+
+static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
+atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+#define define_therm_throt_sysdev_one_ro(_name)                              \
+        static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+
+#define define_therm_throt_sysdev_show_func(name)                            \
+static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,        \
+                                              char *buf)                     \
+{                                                                            \
+	unsigned int cpu = dev->id;                                          \
+	ssize_t ret;                                                         \
+                                                                             \
+	preempt_disable();              /* CPU hotplug */                    \
+	if (cpu_online(cpu))                                                 \
+		ret = sprintf(buf, "%lu\n",                                  \
+			      per_cpu(thermal_throttle_##name, cpu));        \
+	else                                                                 \
+		ret = 0;                                                     \
+	preempt_enable();                                                    \
+                                                                             \
+	return ret;                                                          \
+}
+
+define_therm_throt_sysdev_show_func(count);
+define_therm_throt_sysdev_one_ro(count);
+
+static struct attribute *thermal_throttle_attrs[] = {
+	&attr_count.attr,
+	NULL
+};
+
+static struct attribute_group thermal_throttle_attr_group = {
+	.attrs = thermal_throttle_attrs,
+	.name = "thermal_throttle"
+};
+#endif /* CONFIG_SYSFS */
+
+/***
+ * therm_throt_process - Process thermal throttling event from interrupt
+ * @curr: Whether the condition is current or not (boolean), since the
+ *        thermal interrupt normally gets called both when the thermal
+ *        event begins and once the event has ended.
+ *
+ * This function is called by the thermal interrupt after the
+ * IRQ has been acknowledged.
+ *
+ * It will take care of rate limiting and printing messages to the syslog.
+ *
+ * Returns: 0 : Event should NOT be further logged, i.e. still in
+ *              "timeout" from previous log message.
+ *          1 : Event should be logged further, and a message has been
+ *              printed to the syslog.
+ */
+int therm_throt_process(int curr)
+{
+	unsigned int cpu = smp_processor_id();
+	__u64 tmp_jiffs = get_jiffies_64();
+
+	if (curr)
+		__get_cpu_var(thermal_throttle_count)++;
+
+	if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+		return 0;
+
+	__get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
+
+	/* if we just entered the thermal event */
+	if (curr) {
+		printk(KERN_CRIT "CPU%d: Temperature above threshold, "
+		       "cpu clock throttled (total events = %lu)\n", cpu,
+		       __get_cpu_var(thermal_throttle_count));
+
+		add_taint(TAINT_MACHINE_CHECK);
+	} else {
+		printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+	}
+
+	return 1;
+}
+
+#ifdef CONFIG_SYSFS
+/* Add/Remove thermal_throttle interface for CPU device */
+static __cpuinit int thermal_throttle_add_dev(struct sys_device * sys_dev)
+{
+	sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit int thermal_throttle_remove_dev(struct sys_device * sys_dev)
+{
+	sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	return 0;
+}
+
+/* Mutex protecting device creation against CPU hotplug */
+static DEFINE_MUTEX(therm_cpu_lock);
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
+						   unsigned long action,
+						   void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct sys_device *sys_dev;
+
+	sys_dev = get_cpu_sysdev(cpu);
+	mutex_lock(&therm_cpu_lock);
+	switch (action) {
+	case CPU_ONLINE:
+		thermal_throttle_add_dev(sys_dev);
+		break;
+	case CPU_DEAD:
+		thermal_throttle_remove_dev(sys_dev);
+		break;
+	}
+	mutex_unlock(&therm_cpu_lock);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block thermal_throttle_cpu_notifier =
+{
+	.notifier_call = thermal_throttle_cpu_callback,
+};
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static __init int thermal_throttle_init_device(void)
+{
+	unsigned int cpu = 0;
+
+	if (!atomic_read(&therm_throt_en))
+		return 0;
+
+	register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+
+#ifdef CONFIG_HOTPLUG_CPU
+	mutex_lock(&therm_cpu_lock);
+#endif
+	/* connect live CPUs to sysfs */
+	for_each_online_cpu(cpu)
+		thermal_throttle_add_dev(get_cpu_sysdev(cpu));
+#ifdef CONFIG_HOTPLUG_CPU
+	mutex_unlock(&therm_cpu_lock);
+#endif
+
+	return 0;
+}
+
+device_initcall(thermal_throttle_init_device);
+#endif /* CONFIG_SYSFS */
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 169ac8e0db68..0b61eed8bbd8 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -243,7 +243,7 @@ static DEFINE_SPINLOCK(set_atomicity_lock);
  * has been called.
  */
 
-static void prepare_set(void)
+static void prepare_set(void) __acquires(set_atomicity_lock)
 {
 	unsigned long cr0;
 
@@ -274,7 +274,7 @@ static void prepare_set(void)
 	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi);
 }
 
-static void post_set(void)
+static void post_set(void) __releases(set_atomicity_lock)
 {
 	/*  Flush TLBs (no need to flush caches - they are disabled)  */
 	__flush_tlb();
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
index ad87fa58058d..8bf23cc80c63 100644
--- a/arch/i386/kernel/cpu/nexgen.c
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -10,7 +10,7 @@
  *	to have CPUID. (Thanks to Herbert Oppmann)
  */
  
-static int __init deep_magic_nexgen_probe(void)
+static int __cpuinit deep_magic_nexgen_probe(void)
 {
 	int ret;
 	
@@ -27,21 +27,20 @@ static int __init deep_magic_nexgen_probe(void)
 	return  ret;
 }
 
-static void __init init_nexgen(struct cpuinfo_x86 * c)
+static void __cpuinit init_nexgen(struct cpuinfo_x86 * c)
 {
 	c->x86_cache_size = 256; /* A few had 1 MB... */
 }
 
-static void __init nexgen_identify(struct cpuinfo_x86 * c)
+static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c)
 {
 	/* Detect NexGen with old hypercode */
 	if ( deep_magic_nexgen_probe() ) {
 		strcpy(c->x86_vendor_id, "NexGenDriven");
 	}
-	generic_identify(c);
 }
 
-static struct cpu_dev nexgen_cpu_dev __initdata = {
+static struct cpu_dev nexgen_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Nexgen",
 	.c_ident	= { "NexGenDriven" },
 	.c_models = {
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index f54a15268ed7..76aac088a323 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -46,8 +46,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-		"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+		NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
index d08d5a2811c8..9317f7414989 100644
--- a/arch/i386/kernel/cpu/rise.c
+++ b/arch/i386/kernel/cpu/rise.c
@@ -5,7 +5,7 @@
 
 #include "cpu.h"
 
-static void __init init_rise(struct cpuinfo_x86 *c)
+static void __cpuinit init_rise(struct cpuinfo_x86 *c)
 {
 	printk("CPU: Rise iDragon");
 	if (c->x86_model > 2)
@@ -28,7 +28,7 @@ static void __init init_rise(struct cpuinfo_x86 *c)
 	set_bit(X86_FEATURE_CX8, c->x86_capability);
 }
 
-static struct cpu_dev rise_cpu_dev __initdata = {
+static struct cpu_dev rise_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Rise",
 	.c_ident	= { "RiseRiseRise" },
 	.c_models = {
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 7214c9b577ab..4056fb7d2cdf 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -5,7 +5,7 @@
 #include <asm/msr.h>
 #include "cpu.h"
 
-static void __init init_transmeta(struct cpuinfo_x86 *c)
+static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 {
 	unsigned int cap_mask, uk, max, dummy;
 	unsigned int cms_rev1, cms_rev2;
@@ -85,10 +85,9 @@ static void __init init_transmeta(struct cpuinfo_x86 *c)
 #endif
 }
 
-static void __init transmeta_identify(struct cpuinfo_x86 * c)
+static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c)
 {
 	u32 xlvl;
-	generic_identify(c);
 
 	/* Transmeta-defined flags: level 0x80860001 */
 	xlvl = cpuid_eax(0x80860000);
@@ -98,7 +97,7 @@ static void __init transmeta_identify(struct cpuinfo_x86 * c)
 	}
 }
 
-static struct cpu_dev transmeta_cpu_dev __initdata = {
+static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Transmeta",
 	.c_ident	= { "GenuineTMx86", "TransmetaCPU" },
 	.c_init		= init_transmeta,
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
index 2cd988f6dc55..1bf3f87e9c5b 100644
--- a/arch/i386/kernel/cpu/umc.c
+++ b/arch/i386/kernel/cpu/umc.c
@@ -5,12 +5,8 @@
 
 /* UMC chips appear to be only either 386 or 486, so no special init takes place.
  */
-static void __init init_umc(struct cpuinfo_x86 * c)
-{
-
-}
 
-static struct cpu_dev umc_cpu_dev __initdata = {
+static struct cpu_dev umc_cpu_dev __cpuinitdata = {
 	.c_vendor	= "UMC",
 	.c_ident 	= { "UMC UMC UMC" },
 	.c_models = {
@@ -21,7 +17,6 @@ static struct cpu_dev umc_cpu_dev __initdata = {
 		  }
 		},
 	},
-	.c_init		= init_umc,
 };
 
 int __init umc_init_cpu(void)
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 5b96f038367f..67d297dc1003 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -22,6 +22,8 @@
 #include <asm/nmi.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
+#include <asm/kdebug.h>
+
 #include <mach_ipi.h>
 
 
@@ -93,16 +95,25 @@ static void crash_save_self(struct pt_regs *regs)
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 static atomic_t waiting_for_crash_ipi;
 
-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+			unsigned long val, void *data)
 {
+	struct pt_regs *regs;
 	struct pt_regs fixed_regs;
+	int cpu;
+
+	if (val != DIE_NMI_IPI)
+		return NOTIFY_OK;
+
+	regs = ((struct die_args *)data)->regs;
+	cpu = raw_smp_processor_id();
 
 	/* Don't do anything if this handler is invoked on crashing cpu.
 	 * Otherwise, system will completely hang. Crashing cpu can get
 	 * an NMI if system was initially booted with nmi_watchdog parameter.
 	 */
 	if (cpu == crashing_cpu)
-		return 1;
+		return NOTIFY_STOP;
 	local_irq_disable();
 
 	if (!user_mode_vm(regs)) {
@@ -125,13 +136,18 @@ static void smp_send_nmi_allbutself(void)
 	send_IPI_allbutself(NMI_VECTOR);
 }
 
+static struct notifier_block crash_nmi_nb = {
+	.notifier_call = crash_nmi_callback,
+};
+
 static void nmi_shootdown_cpus(void)
 {
 	unsigned long msecs;
 
 	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 	/* Would it be better to replace the trap vector here? */
-	set_nmi_callback(crash_nmi_callback);
+	if (register_die_notifier(&crash_nmi_nb))
+		return;		/* return what? */
 	/* Ensure the new callback function is set before sending
 	 * out the NMI
 	 */
diff --git a/arch/i386/kernel/efi_stub.S b/arch/i386/kernel/efi_stub.S
index d3ee73a3eee3..ef00bb77d7e4 100644
--- a/arch/i386/kernel/efi_stub.S
+++ b/arch/i386/kernel/efi_stub.S
@@ -7,7 +7,6 @@
 
 #include <linux/linkage.h>
 #include <asm/page.h>
-#include <asm/pgtable.h>
 
 /*
  * efi_call_phys(void *, ...) is a function with variable parameters.
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 87f9f60b803b..5a63d6fdb70e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -76,8 +76,15 @@ DF_MASK		= 0x00000400
 NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
+/* These are replaces for paravirtualization */
+#define DISABLE_INTERRUPTS		cli
+#define ENABLE_INTERRUPTS		sti
+#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
+#define INTERRUPT_RETURN		iret
+#define GET_CR0_INTO_EAX		movl %cr0, %eax
+
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		cli; TRACE_IRQS_OFF
+#define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
 #else
 #define preempt_stop
 #define resume_kernel		restore_nocheck
@@ -176,18 +183,21 @@ VM_MASK		= 0x00020000
 
 #define RING0_INT_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 3*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_EC_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 4*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_PTREGS_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, OLDESP-EBX;\
 	/*CFI_OFFSET cs, CS-OLDESP;*/\
 	CFI_OFFSET eip, EIP-OLDESP;\
@@ -233,10 +243,11 @@ ret_from_intr:
 check_userspace:
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
-	testl $(VM_MASK | 3), %eax
-	jz resume_kernel
+	andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+	cmpl $USER_RPL, %eax
+	jb resume_kernel		# not returning to v8086 or userspace
 ENTRY(resume_userspace)
- 	cli				# make sure we don't miss an interrupt
+ 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -247,7 +258,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	cli
+	DISABLE_INTERRUPTS
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -267,6 +278,7 @@ need_resched:
 	# sysenter call handler stub
 ENTRY(sysenter_entry)
 	CFI_STARTPROC simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA esp, 0
 	CFI_REGISTER esp, ebp
 	movl TSS_sysenter_esp0(%esp),%esp
@@ -275,7 +287,7 @@ sysenter_past_esp:
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
 	 */
-	sti
+	ENABLE_INTERRUPTS
 	pushl $(__USER_DS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ss, 0*/
@@ -320,7 +332,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
-	cli
+	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
@@ -330,8 +342,7 @@ sysenter_past_esp:
 	movl OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
-	sti
-	sysexit
+	ENABLE_INTERRUPTS_SYSEXIT
 	CFI_ENDPROC
 
 
@@ -356,7 +367,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)		# store the return value
 syscall_exit:
-	cli				# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -371,8 +382,8 @@ restore_all:
 	# See comments in process.c:copy_thread() for details.
 	movb OLDSS(%esp), %ah
 	movb CS(%esp), %al
-	andl $(VM_MASK | (4 << 8) | 3), %eax
-	cmpl $((4 << 8) | 3), %eax
+	andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 	CFI_REMEMBER_STATE
 	je ldt_ss			# returning to user-space with LDT SS
 restore_nocheck:
@@ -381,11 +392,11 @@ restore_nocheck_notrace:
 	RESTORE_REGS
 	addl $4, %esp
 	CFI_ADJUST_CFA_OFFSET -4
-1:	iret
+1:	INTERRUPT_RETURN
 .section .fixup,"ax"
 iret_exc:
 	TRACE_IRQS_ON
-	sti
+	ENABLE_INTERRUPTS
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
@@ -409,7 +420,7 @@ ldt_ss:
 	 * dosemu and wine happy. */
 	subl $8, %esp		# reserve space for switch16 pointer
 	CFI_ADJUST_CFA_OFFSET 8
-	cli
+	DISABLE_INTERRUPTS
 	TRACE_IRQS_OFF
 	movl %esp, %eax
 	/* Set up the 16bit stack frame with switch32 pointer on top,
@@ -419,7 +430,7 @@ ldt_ss:
 	TRACE_IRQS_IRET
 	RESTORE_REGS
 	lss 20+4(%esp), %esp	# switch to 16bit stack
-1:	iret
+1:	INTERRUPT_RETURN
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
@@ -434,7 +445,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	cli				# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -490,7 +501,7 @@ syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	sti				# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS		# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -591,11 +602,9 @@ ENTRY(name)				\
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
-ENTRY(divide_error)
-	RING0_INT_FRAME
-	pushl $0			# no error code
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl $do_divide_error
+KPROBE_ENTRY(page_fault)
+	RING0_EC_FRAME
+	pushl $do_page_fault
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
@@ -645,6 +654,7 @@ error_code:
 	call *%edi
 	jmp ret_from_exception
 	CFI_ENDPROC
+KPROBE_END(page_fault)
 
 ENTRY(coprocessor_error)
 	RING0_INT_FRAME
@@ -669,7 +679,7 @@ ENTRY(device_not_available)
 	pushl $-1			# mark this as an int
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
-	movl %cr0, %eax
+	GET_CR0_INTO_EAX
 	testl $0x4, %eax		# EM (math emulation bit)
 	jne device_not_available_emulate
 	preempt_stop
@@ -702,9 +712,15 @@ device_not_available_emulate:
 	jne ok;					\
 label:						\
 	movl TSS_sysenter_esp0+offset(%esp),%esp;	\
+	CFI_DEF_CFA esp, 0;			\
+	CFI_UNDEFINED eip;			\
 	pushfl;					\
+	CFI_ADJUST_CFA_OFFSET 4;		\
 	pushl $__KERNEL_CS;			\
-	pushl $sysenter_past_esp
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	pushl $sysenter_past_esp;		\
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	CFI_REL_OFFSET eip, 0
 
 KPROBE_ENTRY(debug)
 	RING0_INT_FRAME
@@ -720,7 +736,8 @@ debug_stack_correct:
 	call do_debug
 	jmp ret_from_exception
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(debug)
+
 /*
  * NMI is doubly nasty. It can happen _while_ we're handling
  * a debug fault, and the debug fault hasn't yet been able to
@@ -729,7 +746,7 @@ debug_stack_correct:
  * check whether we got an NMI on the debug path where the debug
  * fault happened on the sysenter path.
  */
-ENTRY(nmi)
+KPROBE_ENTRY(nmi)
 	RING0_INT_FRAME
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
@@ -754,6 +771,7 @@ ENTRY(nmi)
 	cmpl $sysenter_entry,12(%esp)
 	je nmi_debug_stack_check
 nmi_stack_correct:
+	/* We have a RING0_INT_FRAME here */
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
@@ -764,9 +782,12 @@ nmi_stack_correct:
 	CFI_ENDPROC
 
 nmi_stack_fixup:
+	RING0_INT_FRAME
 	FIX_STACK(12,nmi_stack_correct, 1)
 	jmp nmi_stack_correct
+
 nmi_debug_stack_check:
+	/* We have a RING0_INT_FRAME here */
 	cmpw $__KERNEL_CS,16(%esp)
 	jne nmi_stack_correct
 	cmpl $debug,(%esp)
@@ -777,8 +798,10 @@ nmi_debug_stack_check:
 	jmp nmi_stack_correct
 
 nmi_16bit_stack:
-	RING0_INT_FRAME
-	/* create the pointer to lss back */
+	/* We have a RING0_INT_FRAME here.
+	 *
+	 * create the pointer to lss back
+	 */
 	pushl %ss
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %esp
@@ -799,12 +822,13 @@ nmi_16bit_stack:
 	call do_nmi
 	RESTORE_REGS
 	lss 12+4(%esp), %esp		# back to 16bit stack
-1:	iret
+1:	INTERRUPT_RETURN
 	CFI_ENDPROC
 .section __ex_table,"a"
 	.align 4
 	.long 1b,iret_exc
 .previous
+KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
 	RING0_INT_FRAME
@@ -816,7 +840,7 @@ KPROBE_ENTRY(int3)
 	call do_int3
 	jmp ret_from_exception
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(int3)
 
 ENTRY(overflow)
 	RING0_INT_FRAME
@@ -881,7 +905,7 @@ KPROBE_ENTRY(general_protection)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
-	.previous .text
+KPROBE_END(general_protection)
 
 ENTRY(alignment_check)
 	RING0_EC_FRAME
@@ -890,13 +914,14 @@ ENTRY(alignment_check)
 	jmp error_code
 	CFI_ENDPROC
 
-KPROBE_ENTRY(page_fault)
-	RING0_EC_FRAME
-	pushl $do_page_fault
+ENTRY(divide_error)
+	RING0_INT_FRAME
+	pushl $0			# no error code
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl $do_divide_error
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
-	.previous .text
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
@@ -949,6 +974,19 @@ ENTRY(arch_unwind_init_running)
 ENDPROC(arch_unwind_init_running)
 #endif
 
+ENTRY(kernel_thread_helper)
+	pushl $0		# fake return address for unwinder
+	CFI_STARTPROC
+	movl %edx,%eax
+	push %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	call *%ebx
+	push %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	call do_exit
+	CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
 .section .rodata,"a"
 #include "syscall_table.S"
 
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index a6b8bd89aa27..be9d883c62ce 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -371,8 +371,65 @@ rp_sidt:
 	addl $8,%edi
 	dec %ecx
 	jne rp_sidt
+
+.macro	set_early_handler handler,trapno
+	lea \handler,%edx
+	movl $(__KERNEL_CS << 16),%eax
+	movw %dx,%ax
+	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */
+	lea idt_table,%edi
+	movl %eax,8*\trapno(%edi)
+	movl %edx,8*\trapno+4(%edi)
+.endm
+
+	set_early_handler handler=early_divide_err,trapno=0
+	set_early_handler handler=early_illegal_opcode,trapno=6
+	set_early_handler handler=early_protection_fault,trapno=13
+	set_early_handler handler=early_page_fault,trapno=14
+
 	ret
 
+early_divide_err:
+	xor %edx,%edx
+	pushl $0	/* fake errcode */
+	jmp early_fault
+
+early_illegal_opcode:
+	movl $6,%edx
+	pushl $0	/* fake errcode */
+	jmp early_fault
+
+early_protection_fault:
+	movl $13,%edx
+	jmp early_fault
+
+early_page_fault:
+	movl $14,%edx
+	jmp early_fault
+
+early_fault:
+	cld
+#ifdef CONFIG_PRINTK
+	movl $(__KERNEL_DS),%eax
+	movl %eax,%ds
+	movl %eax,%es
+	cmpl $2,early_recursion_flag
+	je hlt_loop
+	incl early_recursion_flag
+	movl %cr2,%eax
+	pushl %eax
+	pushl %edx		/* trapno */
+	pushl $fault_msg
+#ifdef CONFIG_EARLY_PRINTK
+	call early_printk
+#else
+	call printk
+#endif
+#endif
+hlt_loop:
+	hlt
+	jmp hlt_loop
+
 /* This is the default interrupt "handler" :-) */
 	ALIGN
 ignore_int:
@@ -386,6 +443,9 @@ ignore_int:
 	movl $(__KERNEL_DS),%eax
 	movl %eax,%ds
 	movl %eax,%es
+	cmpl $2,early_recursion_flag
+	je hlt_loop
+	incl early_recursion_flag
 	pushl 16(%esp)
 	pushl 24(%esp)
 	pushl 32(%esp)
@@ -431,9 +491,16 @@ ENTRY(stack_start)
 
 ready:	.byte 0
 
+early_recursion_flag:
+	.long 0
+
 int_msg:
 	.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
 
+fault_msg:
+	.ascii "Int %d: CR2 %p  err %p  EIP %p  CS %p  flags %p\n"
+	.asciz "Stack: %p %p %p %p %p %p %p %p\n"
+
 /*
  * The IDT and GDT 'descriptors' are a strange 48-bit object
  * only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index d4756d154f47..ea5f4e7958d8 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -45,6 +45,8 @@ static void end_8259A_irq (unsigned int irq)
 
 #define shutdown_8259A_irq	disable_8259A_irq
 
+static int i8259A_auto_eoi;
+
 static void mask_and_ack_8259A(unsigned int);
 
 unsigned int startup_8259A_irq(unsigned int irq)
@@ -253,7 +255,7 @@ static void save_ELCR(char *trigger)
 
 static int i8259A_resume(struct sys_device *dev)
 {
-	init_8259A(0);
+	init_8259A(i8259A_auto_eoi);
 	restore_ELCR(irq_trigger);
 	return 0;
 }
@@ -301,6 +303,8 @@ void init_8259A(int auto_eoi)
 {
 	unsigned long flags;
 
+	i8259A_auto_eoi = auto_eoi;
+
 	spin_lock_irqsave(&i8259A_lock, flags);
 
 	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 4fb32c551fe0..fd0df75cfbda 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -40,6 +40,7 @@
 #include <asm/nmi.h>
 
 #include <mach_apic.h>
+#include <mach_apicdef.h>
 
 #include "io_ports.h"
 
@@ -65,7 +66,7 @@ int sis_apic_bug = -1;
  */
 int nr_ioapic_registers[MAX_IO_APICS];
 
-int disable_timer_pin_1 __initdata;
+static int disable_timer_pin_1 __initdata;
 
 /*
  * Rough estimation of how many shared IRQs there are, can
@@ -93,6 +94,34 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
 #define vector_to_irq(vector)	(vector)
 #endif
 
+
+union entry_union {
+	struct { u32 w1, w2; };
+	struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+	union entry_union eu;
+	unsigned long flags;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+	eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	return eu.entry;
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+	unsigned long flags;
+	union entry_union eu;
+	eu.entry = e;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -200,13 +229,9 @@ static void unmask_IO_APIC_irq (unsigned int irq)
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
-	unsigned long flags;
 	
 	/* Check delivery_mode to be sure we're not clearing an SMI pin */
-	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-	*(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	entry = ioapic_read_entry(apic, pin);
 	if (entry.delivery_mode == dest_SMI)
 		return;
 
@@ -215,10 +240,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 	 */
 	memset(&entry, 0, sizeof(entry));
 	entry.mask = 1;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_write_entry(apic, pin, entry);
 }
 
 static void clear_IO_APIC (void)
@@ -1283,9 +1305,8 @@ static void __init setup_IO_APIC_irqs(void)
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
 		}
+		ioapic_write_entry(apic, pin, entry);
 		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
-		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
 		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
@@ -1301,7 +1322,6 @@ static void __init setup_IO_APIC_irqs(void)
 static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
 {
 	struct IO_APIC_route_entry entry;
-	unsigned long flags;
 
 	memset(&entry,0,sizeof(entry));
 
@@ -1331,10 +1351,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	/*
 	 * Add it to the IO-APIC irq-routing table:
 	 */
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
-	io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_write_entry(apic, pin, entry);
 
 	enable_8259A_irq(0);
 }
@@ -1444,10 +1461,7 @@ void __init print_IO_APIC(void)
 	for (i = 0; i <= reg_01.bits.entries; i++) {
 		struct IO_APIC_route_entry entry;
 
-		spin_lock_irqsave(&ioapic_lock, flags);
-		*(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
-		*(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
+		entry = ioapic_read_entry(apic, i);
 
 		printk(KERN_DEBUG " %02x %03X %02X  ",
 			i,
@@ -1666,10 +1680,7 @@ static void __init enable_IO_APIC(void)
 		/* See if any of the pins is in ExtINT mode */
 		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 			struct IO_APIC_route_entry entry;
-			spin_lock_irqsave(&ioapic_lock, flags);
-			*(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-			*(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-			spin_unlock_irqrestore(&ioapic_lock, flags);
+			entry = ioapic_read_entry(apic, pin);
 
 
 			/* If the interrupt line is enabled and in ExtInt mode
@@ -1726,7 +1737,6 @@ void disable_IO_APIC(void)
 	 */
 	if (ioapic_i8259.pin != -1) {
 		struct IO_APIC_route_entry entry;
-		unsigned long flags;
 
 		memset(&entry, 0, sizeof(entry));
 		entry.mask            = 0; /* Enabled */
@@ -1743,12 +1753,7 @@ void disable_IO_APIC(void)
 		/*
 		 * Add it to the IO-APIC irq-routing table:
 		 */
-		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
-			*(((int *)&entry)+1));
-		io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
-			*(((int *)&entry)+0));
-		spin_unlock_irqrestore(&ioapic_lock, flags);
+		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
 	}
 	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
 }
@@ -2213,17 +2218,13 @@ static inline void unlock_ExtINT_logic(void)
 	int apic, pin, i;
 	struct IO_APIC_route_entry entry0, entry1;
 	unsigned char save_control, save_freq_select;
-	unsigned long flags;
 
 	pin  = find_isa_irq_pin(8, mp_INT);
 	apic = find_isa_irq_apic(8, mp_INT);
 	if (pin == -1)
 		return;
 
-	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
-	*(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	entry0 = ioapic_read_entry(apic, pin);
 	clear_IO_APIC_pin(apic, pin);
 
 	memset(&entry1, 0, sizeof(entry1));
@@ -2236,10 +2237,7 @@ static inline void unlock_ExtINT_logic(void)
 	entry1.trigger = 0;
 	entry1.vector = 0;
 
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_write_entry(apic, pin, entry1);
 
 	save_control = CMOS_READ(RTC_CONTROL);
 	save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
@@ -2258,10 +2256,7 @@ static inline void unlock_ExtINT_logic(void)
 	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
 	clear_IO_APIC_pin(apic, pin);
 
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	ioapic_write_entry(apic, pin, entry0);
 }
 
 int timer_uses_ioapic_pin_0;
@@ -2461,17 +2456,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	struct IO_APIC_route_entry *entry;
 	struct sysfs_ioapic_data *data;
-	unsigned long flags;
 	int i;
 	
 	data = container_of(dev, struct sysfs_ioapic_data, dev);
 	entry = data->entry;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		*(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
-		*(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+		entry[i] = ioapic_read_entry(dev->id, i);
 
 	return 0;
 }
@@ -2493,11 +2483,9 @@ static int ioapic_resume(struct sys_device *dev)
 		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
-		io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
-	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+		ioapic_write_entry(dev->id, i, entry[i]);
 
 	return 0;
 }
@@ -2694,9 +2682,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	if (!ioapic && (irq < 16))
 		disable_8259A_irq(irq);
 
+	ioapic_write_entry(ioapic, pin, entry);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
-	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
 	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
@@ -2704,3 +2691,25 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 }
 
 #endif /* CONFIG_ACPI */
+
+static int __init parse_disable_timer_pin_1(char *arg)
+{
+	disable_timer_pin_1 = 1;
+	return 0;
+}
+early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
+
+static int __init parse_enable_timer_pin_1(char *arg)
+{
+	disable_timer_pin_1 = -1;
+	return 0;
+}
+early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
+
+static int __init parse_noapic(char *arg)
+{
+	/* disable IO-APIC */
+	disable_ioapic_setup();
+	return 0;
+}
+early_param("noapic", parse_noapic);
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index 6b1ae6ba76f0..91966bafb3dc 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/kexec.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
@@ -20,70 +21,13 @@
 #include <asm/system.h>
 
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-
-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L2_ATTR (_PAGE_PRESENT)
-
-#define LEVEL0_SIZE (1UL << 12UL)
-
-#ifndef CONFIG_X86_PAE
-#define LEVEL1_SIZE (1UL << 22UL)
-static u32 pgtable_level1[1024] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
-	unsigned long level1_index, level2_index;
-	u32 *pgtable_level2;
-
-	/* Find the current page table */
-	pgtable_level2 = __va(read_cr3());
-
-	/* Find the indexes of the physical address to identity map */
-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
-	level2_index = address / LEVEL1_SIZE;
-
-	/* Identity map the page table entry */
-	pgtable_level1[level1_index] = address | L0_ATTR;
-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-
-	/* Flush the tlb so the new mapping takes effect.
-	 * Global tlb entries are not flushed but that is not an issue.
-	 */
-	load_cr3(pgtable_level2);
-}
-
-#else
-#define LEVEL1_SIZE (1UL << 21UL)
-#define LEVEL2_SIZE (1UL << 30UL)
-static u64 pgtable_level1[512] PAGE_ALIGNED;
-static u64 pgtable_level2[512] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
-	unsigned long level1_index, level2_index, level3_index;
-	u64 *pgtable_level3;
-
-	/* Find the current page table */
-	pgtable_level3 = __va(read_cr3());
-
-	/* Find the indexes of the physical address to identity map */
-	level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
-	level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
-	level3_index = address / LEVEL2_SIZE;
-
-	/* Identity map the page table entry */
-	pgtable_level1[level1_index] = address | L0_ATTR;
-	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-	set_64bit(&pgtable_level3[level3_index],
-					       __pa(pgtable_level2) | L2_ATTR);
-
-	/* Flush the tlb so the new mapping takes effect.
-	 * Global tlb entries are not flushed but that is not an issue.
-	 */
-	load_cr3(pgtable_level3);
-}
+static u32 kexec_pgd[1024] PAGE_ALIGNED;
+#ifdef CONFIG_X86_PAE
+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
 #endif
+static u32 kexec_pte0[1024] PAGE_ALIGNED;
+static u32 kexec_pte1[1024] PAGE_ALIGNED;
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -127,16 +71,6 @@ static void load_segments(void)
 #undef __STR
 }
 
-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
-					unsigned long indirection_page,
-					unsigned long reboot_code_buffer,
-					unsigned long start_address,
-					unsigned int has_pae) ATTRIB_NORET;
-
-extern const unsigned char relocate_new_kernel[];
-extern void relocate_new_kernel_end(void);
-extern const unsigned int relocate_new_kernel_size;
-
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -169,25 +103,29 @@ void machine_kexec_cleanup(struct kimage *image)
  */
 NORET_TYPE void machine_kexec(struct kimage *image)
 {
-	unsigned long page_list;
-	unsigned long reboot_code_buffer;
-
-	relocate_new_kernel_t rnk;
+	unsigned long page_list[PAGES_NR];
+	void *control_page;
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
-	/* Compute some offsets */
-	reboot_code_buffer = page_to_pfn(image->control_code_page)
-								<< PAGE_SHIFT;
-	page_list = image->head;
-
-	/* Set up an identity mapping for the reboot_code_buffer */
-	identity_map_page(reboot_code_buffer);
-
-	/* copy it out */
-	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
-						relocate_new_kernel_size);
+	control_page = page_address(image->control_code_page);
+	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+	page_list[PA_CONTROL_PAGE] = __pa(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+	page_list[PA_PGD] = __pa(kexec_pgd);
+	page_list[VA_PGD] = (unsigned long)kexec_pgd;
+#ifdef CONFIG_X86_PAE
+	page_list[PA_PMD_0] = __pa(kexec_pmd0);
+	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+	page_list[PA_PMD_1] = __pa(kexec_pmd1);
+	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+#endif
+	page_list[PA_PTE_0] = __pa(kexec_pte0);
+	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+	page_list[PA_PTE_1] = __pa(kexec_pte1);
+	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
 
 	/* The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -206,6 +144,28 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	set_idt(phys_to_virt(0),0);
 
 	/* now call it */
-	rnk = (relocate_new_kernel_t) reboot_code_buffer;
-	(*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
+	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+			image->start, cpu_has_pae);
+}
+
+/* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel.  By reserving this memory we guarantee
+ * that linux never sets it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+static int __init parse_crashkernel(char *arg)
+{
+	unsigned long size, base;
+	size = memparse(arg, &arg);
+	if (*arg == '@') {
+		base = memparse(arg+1, &arg);
+		/* FIXME: Do I want a sanity check
+		 * to validate the memory range?
+		 */
+		crashk_res.start = base;
+		crashk_res.end   = base + size - 1;
+	}
+	return 0;
 }
+early_param("crashkernel", parse_crashkernel);
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index cd5456f14af4..eb57a851789d 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -42,6 +42,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/mca.h>
+#include <linux/kprobes.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <linux/proc_fs.h>
@@ -414,7 +415,8 @@ subsys_initcall(mca_init);
 
 /*--------------------------------------------------------------------*/
 
-static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
+static __kprobes void
+mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
 {
 	int slot = mca_dev->slot;
 
@@ -444,7 +446,7 @@ static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
 
 /*--------------------------------------------------------------------*/
 
-static int mca_handle_nmi_callback(struct device *dev, void *data)
+static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
 {
 	struct mca_device *mca_dev = to_mca_device(dev);
 	unsigned char pos5;
@@ -462,7 +464,7 @@ static int mca_handle_nmi_callback(struct device *dev, void *data)
 	return 0;
 }
 
-void mca_handle_nmi(void)
+void __kprobes mca_handle_nmi(void)
 {
 	/* First try - scan the various adapters and see if a specific
 	 * adapter was responsible for the error.
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 40b44cc0d14b..9b9479768d5e 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -2,6 +2,7 @@
  *	Intel CPU Microcode Update Driver for Linux
  *
  *	Copyright (C) 2000-2004 Tigran Aivazian
+ *		      2006	Shaohua Li <shaohua.li@intel.com>
  *
  *	This driver allows to upgrade microcode on Intel processors
  *	belonging to IA-32 family - PentiumPro, Pentium II, 
@@ -82,6 +83,9 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
 
 #include <asm/msr.h>
 #include <asm/uaccess.h>
@@ -91,9 +95,6 @@ MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
 MODULE_LICENSE("GPL");
 
-static int verbose;
-module_param(verbose, int, 0644);
-
 #define MICROCODE_VERSION 	"1.14a"
 
 #define DEFAULT_UCODE_DATASIZE 	(2000) 	  /* 2000 bytes */
@@ -120,55 +121,40 @@ static DEFINE_SPINLOCK(microcode_update_lock);
 /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
 static DEFINE_MUTEX(microcode_mutex);
 
-static void __user *user_buffer;	/* user area microcode data buffer */
-static unsigned int user_buffer_size;	/* it's size */
-
-typedef enum mc_error_code {
-	MC_SUCCESS 	= 0,
-	MC_IGNORED 	= 1,
-	MC_NOTFOUND 	= 2,
-	MC_MARKED 	= 3,
-	MC_ALLOCATED 	= 4,
-} mc_error_code_t;
-
 static struct ucode_cpu_info {
+	int valid;
 	unsigned int sig;
-	unsigned int pf, orig_pf;
+	unsigned int pf;
 	unsigned int rev;
-	unsigned int cksum;
-	mc_error_code_t err;
 	microcode_t *mc;
 } ucode_cpu_info[NR_CPUS];
-				
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
 
-static void collect_cpu_info (void *unused)
+static void collect_cpu_info(int cpu_num)
 {
-	int cpu_num = smp_processor_id();
 	struct cpuinfo_x86 *c = cpu_data + cpu_num;
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
 	unsigned int val[2];
 
-	uci->sig = uci->pf = uci->rev = uci->cksum = 0;
-	uci->err = MC_NOTFOUND; 
+	/* We should bind the task to the CPU */
+	BUG_ON(raw_smp_processor_id() != cpu_num);
+	uci->pf = uci->rev = 0;
 	uci->mc = NULL;
+	uci->valid = 1;
 
 	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
 	    	cpu_has(c, X86_FEATURE_IA64)) {
-		printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num);
+		printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+			"processor\n", cpu_num);
+		uci->valid = 0;
 		return;
-	} else {
-		uci->sig = cpuid_eax(0x00000001);
+	}
 
-		if ((c->x86_model >= 5) || (c->x86 > 6)) {
-			/* get processor flags from MSR 0x17 */
-			rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-			uci->pf = 1 << ((val[1] >> 18) & 7);
-		}
-		uci->orig_pf = uci->pf;
+	uci->sig = cpuid_eax(0x00000001);
+
+	if ((c->x86_model >= 5) || (c->x86 > 6)) {
+		/* get processor flags from MSR 0x17 */
+		rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+		uci->pf = 1 << ((val[1] >> 18) & 7);
 	}
 
 	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
@@ -180,218 +166,159 @@ static void collect_cpu_info (void *unused)
 			uci->sig, uci->pf, uci->rev);
 }
 
-static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum)
+static inline int microcode_update_match(int cpu_num,
+	microcode_header_t *mc_header, int sig, int pf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
 
-	pr_debug("Microcode Found.\n");
-	pr_debug("   Header Revision 0x%x\n", mc_header->hdrver);
-	pr_debug("   Loader Revision 0x%x\n", mc_header->ldrver);
-	pr_debug("   Revision 0x%x \n", mc_header->rev);
-	pr_debug("   Date %x/%x/%x\n",
-		((mc_header->date >> 24 ) & 0xff),
-		((mc_header->date >> 16 ) & 0xff),
-		(mc_header->date & 0xFFFF));
-	pr_debug("   Signature 0x%x\n", sig);
-	pr_debug("   Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n",
-		((sig >> 12) & 0x3),
-		((sig >> 8) & 0xf),
-		((sig >> 4) & 0xf),
-		((sig & 0xf)));
-	pr_debug("   Processor Flags 0x%x\n", pf);
-	pr_debug("   Checksum 0x%x\n", cksum);
-
-	if (mc_header->rev < uci->rev) {
-		if (uci->err == MC_NOTFOUND) {
-			uci->err = MC_IGNORED;
-			uci->cksum = mc_header->rev;
-		} else if (uci->err == MC_IGNORED && uci->cksum < mc_header->rev)
-			uci->cksum = mc_header->rev;
-	} else if (mc_header->rev == uci->rev) {
-		if (uci->err < MC_MARKED) {
-			/* notify the caller of success on this cpu */
-			uci->err = MC_SUCCESS;
-		}
-	} else if (uci->err != MC_ALLOCATED || mc_header->rev > uci->mc->hdr.rev) {
-		pr_debug("microcode: CPU%d found a matching microcode update with "
-			" revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev);
-		uci->cksum = cksum;
-		uci->pf = pf; /* keep the original mc pf for cksum calculation */
-		uci->err = MC_MARKED; /* found the match */
-		for_each_online_cpu(cpu_num) {
-			if (ucode_cpu_info + cpu_num != uci
-			    && ucode_cpu_info[cpu_num].mc == uci->mc) {
-				uci->mc = NULL;
-				break;
-			}
-		}
-		if (uci->mc != NULL) {
-			vfree(uci->mc);
-			uci->mc = NULL;
-		}
-	}
-	return;
+	if (!sigmatch(sig, uci->sig, pf, uci->pf)
+		|| mc_header->rev <= uci->rev)
+		return 0;
+	return 1;
 }
 
-static int find_matching_ucodes (void) 
+static int microcode_sanity_check(void *mc)
 {
-	int cursor = 0;
-	int error = 0;
-
-	while (cursor + MC_HEADER_SIZE < user_buffer_size) {
-		microcode_header_t mc_header;
-		void *newmc = NULL;
-		int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size;
+	microcode_header_t *mc_header = mc;
+	struct extended_sigtable *ext_header = NULL;
+	struct extended_signature *ext_sig;
+	unsigned long total_size, data_size, ext_table_size;
+	int sum, orig_sum, ext_sigcount = 0, i;
+
+	total_size = get_totalsize(mc_header);
+	data_size = get_datasize(mc_header);
+	if (data_size + MC_HEADER_SIZE > total_size) {
+		printk(KERN_ERR "microcode: error! "
+			"Bad data size in microcode data file\n");
+		return -EINVAL;
+	}
 
-		if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) {
-			printk(KERN_ERR "microcode: error! Can not read user data\n");
-			error = -EFAULT;
-			goto out;
+	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+		printk(KERN_ERR "microcode: error! "
+			"Unknown microcode update format\n");
+		return -EINVAL;
+	}
+	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+	if (ext_table_size) {
+		if ((ext_table_size < EXT_HEADER_SIZE)
+		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+			printk(KERN_ERR "microcode: error! "
+				"Small exttable size in microcode data file\n");
+			return -EINVAL;
 		}
-
-		total_size = get_totalsize(&mc_header);
-		if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) {
-			printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
-			error = -EINVAL;
-			goto out;
+		ext_header = mc + MC_HEADER_SIZE + data_size;
+		if (ext_table_size != exttable_size(ext_header)) {
+			printk(KERN_ERR "microcode: error! "
+				"Bad exttable size in microcode data file\n");
+			return -EFAULT;
 		}
+		ext_sigcount = ext_header->count;
+	}
 
-		data_size = get_datasize(&mc_header);
-		if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) {
-			printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
-			error = -EINVAL;
-			goto out;
+	/* check extended table checksum */
+	if (ext_table_size) {
+		int ext_table_sum = 0;
+		int *ext_tablep = (int *)ext_header;
+
+		i = ext_table_size / DWSIZE;
+		while (i--)
+			ext_table_sum += ext_tablep[i];
+		if (ext_table_sum) {
+			printk(KERN_WARNING "microcode: aborting, "
+				"bad extended signature table checksum\n");
+			return -EINVAL;
 		}
+	}
 
-		if (mc_header.ldrver != 1 || mc_header.hdrver != 1) {
-			printk(KERN_ERR "microcode: error! Unknown microcode update format\n");
-			error = -EINVAL;
-			goto out;
+	/* calculate the checksum */
+	orig_sum = 0;
+	i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+	while (i--)
+		orig_sum += ((int *)mc)[i];
+	if (orig_sum) {
+		printk(KERN_ERR "microcode: aborting, bad checksum\n");
+		return -EINVAL;
+	}
+	if (!ext_table_size)
+		return 0;
+	/* check extended signature checksum */
+	for (i = 0; i < ext_sigcount; i++) {
+		ext_sig = (struct extended_signature *)((void *)ext_header
+			+ EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
+		sum = orig_sum
+			- (mc_header->sig + mc_header->pf + mc_header->cksum)
+			+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+		if (sum) {
+			printk(KERN_ERR "microcode: aborting, bad checksum\n");
+			return -EINVAL;
 		}
+	}
+	return 0;
+}
 
-		for_each_online_cpu(cpu_num) {
-			struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-			if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->orig_pf))
-				mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum);
-		}
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ * return < 0 - error
+ */
+static int get_maching_microcode(void *mc, int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	microcode_header_t *mc_header = mc;
+	struct extended_sigtable *ext_header;
+	unsigned long total_size = get_totalsize(mc_header);
+	int ext_sigcount, i;
+	struct extended_signature *ext_sig;
+	void *new_mc;
+
+	if (microcode_update_match(cpu, mc_header,
+			mc_header->sig, mc_header->pf))
+		goto find;
+
+	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+		return 0;
+
+	ext_header = (struct extended_sigtable *)(mc +
+			get_datasize(mc_header) + MC_HEADER_SIZE);
+	ext_sigcount = ext_header->count;
+	ext_sig = (struct extended_signature *)((void *)ext_header
+			+ EXT_HEADER_SIZE);
+	for (i = 0; i < ext_sigcount; i++) {
+		if (microcode_update_match(cpu, mc_header,
+				ext_sig->sig, ext_sig->pf))
+			goto find;
+		ext_sig++;
+	}
+	return 0;
+find:
+	pr_debug("microcode: CPU %d found a matching microcode update with"
+		" version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
+	new_mc = vmalloc(total_size);
+	if (!new_mc) {
+		printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+		return -ENOMEM;
+	}
 
-		ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-		if (ext_table_size) {
-			struct extended_sigtable ext_header;
-			struct extended_signature ext_sig;
-			int ext_sigcount;
+	/* free previous update file */
+	vfree(uci->mc);
 
-			if ((ext_table_size < EXT_HEADER_SIZE) 
-					|| ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-				printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
-				error = -EINVAL;
-				goto out;
-			}
-			if (copy_from_user(&ext_header, user_buffer + cursor 
-					+ MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) {
-				printk(KERN_ERR "microcode: error! Can not read user data\n");
-				error = -EFAULT;
-				goto out;
-			}
-			if (ext_table_size != exttable_size(&ext_header)) {
-				printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
-				error = -EFAULT;
-				goto out;
-			}
-
-			ext_sigcount = ext_header.count;
-			
-			for (i = 0; i < ext_sigcount; i++) {
-				if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE 
-						+ EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) {
-					printk(KERN_ERR "microcode: error! Can not read user data\n");
-					error = -EFAULT;
-					goto out;
-				}
-				for_each_online_cpu(cpu_num) {
-					struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-					if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->orig_pf)) {
-						mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum);
-					}
-				}
-			}
-		}
-		/* now check if any cpu has matched */
-		allocated_flag = 0;
-		sum = 0;
-		for_each_online_cpu(cpu_num) {
-			if (ucode_cpu_info[cpu_num].err == MC_MARKED) { 
-				struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-				if (!allocated_flag) {
-					allocated_flag = 1;
-					newmc = vmalloc(total_size);
-					if (!newmc) {
-						printk(KERN_ERR "microcode: error! Can not allocate memory\n");
-						error = -ENOMEM;
-						goto out;
-					}
-					if (copy_from_user(newmc + MC_HEADER_SIZE, 
-								user_buffer + cursor + MC_HEADER_SIZE, 
-								total_size - MC_HEADER_SIZE)) {
-						printk(KERN_ERR "microcode: error! Can not read user data\n");
-						vfree(newmc);
-						error = -EFAULT;
-						goto out;
-					}
-					memcpy(newmc, &mc_header, MC_HEADER_SIZE);
-					/* check extended table checksum */
-					if (ext_table_size) {
-						int ext_table_sum = 0;
-						int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size);
-						i = ext_table_size / DWSIZE;
-						while (i--) ext_table_sum += ext_tablep[i];
-						if (ext_table_sum) {
-							printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n");
-							vfree(newmc);
-							error = -EINVAL;
-							goto out;
-						}
-					}
-
-					/* calculate the checksum */
-					i = (MC_HEADER_SIZE + data_size) / DWSIZE;
-					while (i--) sum += ((int *)newmc)[i];
-					sum -= (mc_header.sig + mc_header.pf + mc_header.cksum);
-				}
-				ucode_cpu_info[cpu_num].mc = newmc;
-				ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */
-				if (sum + uci->sig + uci->pf + uci->cksum != 0) {
-					printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num);
-					error = -EINVAL;
-					goto out;
-				}
-			}
-		}
-		cursor += total_size; /* goto the next update patch */
-	} /* end of while */
-out:
-	return error;
+	memcpy(new_mc, mc, total_size);
+	uci->mc = new_mc;
+	return 1;
 }
 
-static void do_update_one (void * unused)
+static void apply_microcode(int cpu)
 {
 	unsigned long flags;
 	unsigned int val[2];
-	int cpu_num = smp_processor_id();
+	int cpu_num = raw_smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
 
-	if (uci->mc == NULL) {
-		if (verbose) {
-			if (uci->err == MC_SUCCESS)
-				printk(KERN_INFO "microcode: CPU%d already at revision 0x%x\n",
-					cpu_num, uci->rev);
-			else
-				printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num);
-		}
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu_num != cpu);
+
+	if (uci->mc == NULL)
 		return;
-	}
 
 	/* serialize access to the physical write to MSR 0x79 */
 	spin_lock_irqsave(&microcode_update_lock, flags);          
@@ -408,68 +335,107 @@ static void do_update_one (void * unused)
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
 
-	/* notify the caller of success on this cpu */
-	uci->err = MC_SUCCESS;
 	spin_unlock_irqrestore(&microcode_update_lock, flags);
-	printk(KERN_INFO "microcode: CPU%d updated from revision "
+	if (val[1] != uci->mc->hdr.rev) {
+		printk(KERN_ERR "microcode: CPU%d updated from revision "
+			"0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
+		return;
+	}
+	pr_debug("microcode: CPU%d updated from revision "
 	       "0x%x to 0x%x, date = %08x \n", 
 	       cpu_num, uci->rev, val[1], uci->mc->hdr.date);
-	return;
+	uci->rev = val[1];
 }
 
-static int do_microcode_update (void)
-{
-	int i, error;
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static void __user *user_buffer;	/* user area microcode data buffer */
+static unsigned int user_buffer_size;	/* it's size */
 
-	if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) {
-		printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
-		error = -EIO;
-		goto out;
+static long get_next_ucode(void **mc, long offset)
+{
+	microcode_header_t mc_header;
+	unsigned long total_size;
+
+	/* No more data */
+	if (offset >= user_buffer_size)
+		return 0;
+	if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
+		printk(KERN_ERR "microcode: error! Can not read user data\n");
+		return -EFAULT;
 	}
-
-	if ((error = find_matching_ucodes())) {
-		printk(KERN_ERR "microcode: Error in the microcode data\n");
-		goto out_free;
+	total_size = get_totalsize(&mc_header);
+	if (offset + total_size > user_buffer_size) {
+		printk(KERN_ERR "microcode: error! Bad total size in microcode "
+				"data file\n");
+		return -EINVAL;
 	}
-
-	if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) {
-		printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
-		error = -EIO;
+	*mc = vmalloc(total_size);
+	if (!*mc)
+		return -ENOMEM;
+	if (copy_from_user(*mc, user_buffer + offset, total_size)) {
+		printk(KERN_ERR "microcode: error! Can not read user data\n");
+		vfree(*mc);
+		return -EFAULT;
 	}
+	return offset + total_size;
+}
+
+static int do_microcode_update (void)
+{
+	long cursor = 0;
+	int error = 0;
+	void *new_mc;
+	int cpu;
+	cpumask_t old;
+
+	old = current->cpus_allowed;
 
-out_free:
-	for_each_online_cpu(i) {
-		if (ucode_cpu_info[i].mc) {
-			int j;
-			void *tmp = ucode_cpu_info[i].mc;
-			vfree(tmp);
-			for_each_online_cpu(j) {
-				if (ucode_cpu_info[j].mc == tmp)
-					ucode_cpu_info[j].mc = NULL;
-			}
+	while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
+		error = microcode_sanity_check(new_mc);
+		if (error)
+			goto out;
+		/*
+		 * It's possible the data file has multiple matching ucode,
+		 * lets keep searching till the latest version
+		 */
+		for_each_online_cpu(cpu) {
+			struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+			if (!uci->valid)
+				continue;
+			set_cpus_allowed(current, cpumask_of_cpu(cpu));
+			error = get_maching_microcode(new_mc, cpu);
+			if (error < 0)
+				goto out;
+			if (error == 1)
+				apply_microcode(cpu);
 		}
-		if (ucode_cpu_info[i].err == MC_IGNORED && verbose)
-			printk(KERN_WARNING "microcode: CPU%d not 'upgrading' to earlier revision"
-			       " 0x%x (current=0x%x)\n", i, ucode_cpu_info[i].cksum, ucode_cpu_info[i].rev);
+		vfree(new_mc);
 	}
 out:
+	if (cursor > 0)
+		vfree(new_mc);
+	if (cursor < 0)
+		error = cursor;
+	set_cpus_allowed(current, old);
 	return error;
 }
 
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
 static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
 {
 	ssize_t ret;
 
-	if (len < DEFAULT_UCODE_TOTALSIZE) {
-		printk(KERN_ERR "microcode: not enough data\n"); 
-		return -EINVAL;
-	}
-
 	if ((len >> PAGE_SHIFT) > num_physpages) {
 		printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
 		return -EINVAL;
 	}
 
+	lock_cpu_hotplug();
 	mutex_lock(&microcode_mutex);
 
 	user_buffer = (void __user *) buf;
@@ -480,6 +446,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
 		ret = (ssize_t)len;
 
 	mutex_unlock(&microcode_mutex);
+	unlock_cpu_hotplug();
 
 	return ret;
 }
@@ -496,7 +463,7 @@ static struct miscdevice microcode_dev = {
 	.fops		= &microcode_fops,
 };
 
-static int __init microcode_init (void)
+static int __init microcode_dev_init (void)
 {
 	int error;
 
@@ -508,6 +475,280 @@ static int __init microcode_init (void)
 		return error;
 	}
 
+	return 0;
+}
+
+static void __exit microcode_dev_exit (void)
+{
+	misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while(0)
+#endif
+
+static long get_next_ucode_from_buffer(void **mc, void *buf,
+	unsigned long size, long offset)
+{
+	microcode_header_t *mc_header;
+	unsigned long total_size;
+
+	/* No more data */
+	if (offset >= size)
+		return 0;
+	mc_header = (microcode_header_t *)(buf + offset);
+	total_size = get_totalsize(mc_header);
+
+	if (offset + total_size > size) {
+		printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+		return -EINVAL;
+	}
+
+	*mc = vmalloc(total_size);
+	if (!*mc) {
+		printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+		return -ENOMEM;
+	}
+	memcpy(*mc, buf + offset, total_size);
+	return offset + total_size;
+}
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int cpu_request_microcode(int cpu)
+{
+	char name[30];
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	const struct firmware *firmware;
+	void *buf;
+	unsigned long size;
+	long offset = 0;
+	int error;
+	void *mc;
+
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu != raw_smp_processor_id());
+	sprintf(name,"intel-ucode/%02x-%02x-%02x",
+		c->x86, c->x86_model, c->x86_mask);
+	error = request_firmware(&firmware, name, &microcode_pdev->dev);
+	if (error) {
+		pr_debug("ucode data file %s load failed\n", name);
+		return error;
+	}
+	buf = (void *)firmware->data;
+	size = firmware->size;
+	while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
+			> 0) {
+		error = microcode_sanity_check(mc);
+		if (error)
+			break;
+		error = get_maching_microcode(mc, cpu);
+		if (error < 0)
+			break;
+		/*
+		 * It's possible the data file has multiple matching ucode,
+		 * lets keep searching till the latest version
+		 */
+		if (error == 1) {
+			apply_microcode(cpu);
+			error = 0;
+		}
+		vfree(mc);
+	}
+	if (offset > 0)
+		vfree(mc);
+	if (offset < 0)
+		error = offset;
+	release_firmware(firmware);
+
+	return error;
+}
+
+static void microcode_init_cpu(int cpu)
+{
+	cpumask_t old;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	old = current->cpus_allowed;
+
+	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	mutex_lock(&microcode_mutex);
+	collect_cpu_info(cpu);
+	if (uci->valid)
+		cpu_request_microcode(cpu);
+	mutex_unlock(&microcode_mutex);
+	set_cpus_allowed(current, old);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	mutex_lock(&microcode_mutex);
+	uci->valid = 0;
+	vfree(uci->mc);
+	uci->mc = NULL;
+	mutex_unlock(&microcode_mutex);
+}
+
+static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+	char *end;
+	unsigned long val = simple_strtoul(buf, &end, 0);
+	int err = 0;
+	int cpu = dev->id;
+
+	if (end == buf)
+		return -EINVAL;
+	if (val == 1) {
+		cpumask_t old;
+
+		old = current->cpus_allowed;
+
+		lock_cpu_hotplug();
+		set_cpus_allowed(current, cpumask_of_cpu(cpu));
+
+		mutex_lock(&microcode_mutex);
+		if (uci->valid)
+			err = cpu_request_microcode(cpu);
+		mutex_unlock(&microcode_mutex);
+		unlock_cpu_hotplug();
+		set_cpus_allowed(current, old);
+	}
+	if (err)
+		return err;
+	return sz;
+}
+
+static ssize_t version_show(struct sys_device *dev, char *buf)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+	return sprintf(buf, "0x%x\n", uci->rev);
+}
+
+static ssize_t pf_show(struct sys_device *dev, char *buf)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+	return sprintf(buf, "0x%x\n", uci->pf);
+}
+
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+	&attr_reload.attr,
+	&attr_version.attr,
+	&attr_processor_flags.attr,
+	NULL
+};
+
+static struct attribute_group mc_attr_group = {
+	.attrs = mc_default_attrs,
+	.name = "microcode",
+};
+
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+	int cpu = sys_dev->id;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	if (!cpu_online(cpu))
+		return 0;
+	pr_debug("Microcode:CPU %d added\n", cpu);
+	memset(uci, 0, sizeof(*uci));
+	sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+
+	microcode_init_cpu(cpu);
+	return 0;
+}
+
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+	int cpu = sys_dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+	pr_debug("Microcode:CPU %d removed\n", cpu);
+	microcode_fini_cpu(cpu);
+	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+	return 0;
+}
+
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+	int cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+	pr_debug("Microcode:CPU %d resumed\n", cpu);
+	/* only CPU 0 will apply ucode here */
+	apply_microcode(0);
+	return 0;
+}
+
+static struct sysdev_driver mc_sysdev_driver = {
+	.add = mc_sysdev_add,
+	.remove = mc_sysdev_remove,
+	.resume = mc_sysdev_resume,
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct sys_device *sys_dev;
+
+	sys_dev = get_cpu_sysdev(cpu);
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		mc_sysdev_add(sys_dev);
+		break;
+	case CPU_DOWN_PREPARE:
+		mc_sysdev_remove(sys_dev);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block mc_cpu_notifier = {
+	.notifier_call = mc_cpu_callback,
+};
+#endif
+
+static int __init microcode_init (void)
+{
+	int error;
+
+	error = microcode_dev_init();
+	if (error)
+		return error;
+	microcode_pdev = platform_device_register_simple("microcode", -1,
+							 NULL, 0);
+	if (IS_ERR(microcode_pdev)) {
+		microcode_dev_exit();
+		return PTR_ERR(microcode_pdev);
+	}
+
+	lock_cpu_hotplug();
+	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+	unlock_cpu_hotplug();
+	if (error) {
+		microcode_dev_exit();
+		platform_device_unregister(microcode_pdev);
+		return error;
+	}
+
+	register_hotcpu_notifier(&mc_cpu_notifier);
+
 	printk(KERN_INFO 
 		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
 	return 0;
@@ -515,9 +756,16 @@ static int __init microcode_init (void)
 
 static void __exit microcode_exit (void)
 {
-	misc_deregister(&microcode_dev);
+	microcode_dev_exit();
+
+	unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+	lock_cpu_hotplug();
+	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+	unlock_cpu_hotplug();
+
+	platform_device_unregister(microcode_pdev);
 }
 
 module_init(microcode_init)
 module_exit(microcode_exit)
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index a70b5fa0ef06..442aaf8c77eb 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -30,6 +30,7 @@
 #include <asm/io_apic.h>
 
 #include <mach_apic.h>
+#include <mach_apicdef.h>
 #include <mach_mpparse.h>
 #include <bios_ebda.h>
 
@@ -68,7 +69,7 @@ unsigned int def_to_bigsmp = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
-static unsigned int __devinitdata num_processors;
+unsigned int __cpuinitdata num_processors;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
@@ -228,12 +229,14 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
 
 	mpc_oem_bus_info(m, str, translation_table[mpc_record]);
 
+#if MAX_MP_BUSSES < 256
 	if (m->mpc_busid >= MAX_MP_BUSSES) {
 		printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
 			" is too large, max. supported is %d\n",
 			m->mpc_busid, str, MAX_MP_BUSSES - 1);
 		return;
 	}
+#endif
 
 	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
@@ -293,19 +296,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
 			m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-	/*
-	 * Well it seems all SMP boards in existence
-	 * use ExtINT/LVT1 == LINT0 and
-	 * NMI/LVT2 == LINT1 - the following check
-	 * will show us if this assumptions is false.
-	 * Until then we do not have to add baggage.
-	 */
-	if ((m->mpc_irqtype == mp_ExtINT) &&
-		(m->mpc_destapiclint != 0))
-			BUG();
-	if ((m->mpc_irqtype == mp_NMI) &&
-		(m->mpc_destapiclint != 1))
-			BUG();
 }
 
 #ifdef CONFIG_X86_NUMAQ
@@ -822,8 +812,7 @@ int es7000_plat;
 
 #ifdef CONFIG_ACPI
 
-void __init mp_register_lapic_address (
-	u64			address)
+void __init mp_register_lapic_address(u64 address)
 {
 	mp_lapic_addr = (unsigned long) address;
 
@@ -835,13 +824,10 @@ void __init mp_register_lapic_address (
 	Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
 }
 
-
-void __devinit mp_register_lapic (
-	u8			id, 
-	u8			enabled)
+void __devinit mp_register_lapic (u8 id, u8 enabled)
 {
 	struct mpc_config_processor processor;
-	int			boot_cpu = 0;
+	int boot_cpu = 0;
 	
 	if (MAX_APICS - id <= 0) {
 		printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
@@ -878,11 +864,9 @@ static struct mp_ioapic_routing {
 	u32			pin_programmed[4];
 } mp_ioapic_routing[MAX_IO_APICS];
 
-
-static int mp_find_ioapic (
-	int			gsi)
+static int mp_find_ioapic (int gsi)
 {
-	int			i = 0;
+	int i = 0;
 
 	/* Find the IOAPIC that manages this GSI. */
 	for (i = 0; i < nr_ioapics; i++) {
@@ -895,15 +879,11 @@ static int mp_find_ioapic (
 
 	return -1;
 }
-	
 
-void __init mp_register_ioapic (
-	u8			id, 
-	u32			address,
-	u32			gsi_base)
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
 {
-	int			idx = 0;
-	int			tmpid;
+	int idx = 0;
+	int tmpid;
 
 	if (nr_ioapics >= MAX_IO_APICS) {
 		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -949,16 +929,10 @@ void __init mp_register_ioapic (
 		mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
 		mp_ioapic_routing[idx].gsi_base,
 		mp_ioapic_routing[idx].gsi_end);
-
-	return;
 }
 
-
-void __init mp_override_legacy_irq (
-	u8			bus_irq,
-	u8			polarity, 
-	u8			trigger, 
-	u32			gsi)
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
 	struct mpc_config_intsrc intsrc;
 	int			ioapic = -1;
@@ -996,15 +970,13 @@ void __init mp_override_legacy_irq (
 	mp_irqs[mp_irq_entries] = intsrc;
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!\n");
-
-	return;
 }
 
 void __init mp_config_acpi_legacy_irqs (void)
 {
 	struct mpc_config_intsrc intsrc;
-	int			i = 0;
-	int			ioapic = -1;
+	int i = 0;
+	int ioapic = -1;
 
 	/* 
 	 * Fabricate the legacy ISA bus (bus #31).
@@ -1073,12 +1045,12 @@ void __init mp_config_acpi_legacy_irqs (void)
 
 #define MAX_GSI_NUM	4096
 
-int mp_register_gsi (u32 gsi, int triggering, int polarity)
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
 {
-	int			ioapic = -1;
-	int			ioapic_pin = 0;
-	int			idx, bit = 0;
-	static int		pci_irq = 16;
+	int ioapic = -1;
+	int ioapic_pin = 0;
+	int idx, bit = 0;
+	static int pci_irq = 16;
 	/*
 	 * Mapping between Global System Interrups, which
 	 * represent all possible interrupts, and IRQs
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index acb351478e42..dbda706fdd14 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -21,83 +21,174 @@
 #include <linux/sysdev.h>
 #include <linux/sysctl.h>
 #include <linux/percpu.h>
+#include <linux/dmi.h>
+#include <linux/kprobes.h>
 
 #include <asm/smp.h>
 #include <asm/nmi.h>
+#include <asm/kdebug.h>
 #include <asm/intel_arch_perfmon.h>
 
 #include "mach_traps.h"
 
-unsigned int nmi_watchdog = NMI_NONE;
-extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
-static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
-static unsigned int nmi_p4_cccr_val;
-extern void show_registers(struct pt_regs *regs);
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ *   different subsystems this reservation system just tries to coordinate
+ *   things a little
+ */
+static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
+static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
 
-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- *   the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0.  It will be the max for all platforms (for now)
  */
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG	(1<<0)
-#define LAPIC_NMI_RESERVED	(1<<1)
+#define NMI_MAX_COUNTER_BITS 66
 
 /* nmi_active:
- * +1: the lapic NMI watchdog is active, but can be disabled
- *  0: the lapic NMI watchdog has not been set up, and cannot
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
  *     be enabled
- * -1: the lapic NMI watchdog is disabled, but can be enabled
+ *  0: the lapic NMI watchdog is disabled, but can be enabled
  */
-int nmi_active;
+atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
 
-#define K7_EVNTSEL_ENABLE	(1 << 22)
-#define K7_EVNTSEL_INT		(1 << 20)
-#define K7_EVNTSEL_OS		(1 << 17)
-#define K7_EVNTSEL_USR		(1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
-#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+unsigned int nmi_watchdog = NMI_DEFAULT;
+static unsigned int nmi_hz = HZ;
 
-#define P6_EVNTSEL0_ENABLE	(1 << 22)
-#define P6_EVNTSEL_INT		(1 << 20)
-#define P6_EVNTSEL_OS		(1 << 17)
-#define P6_EVNTSEL_USR		(1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
-#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+struct nmi_watchdog_ctlblk {
+	int enabled;
+	u64 check_bit;
+	unsigned int cccr_msr;
+	unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
+	unsigned int evntsel_msr;  /* the MSR to select the events to handle */
+};
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
 
-#define MSR_P4_MISC_ENABLE	0x1A0
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
-#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL	(1<<12)
-#define MSR_P4_PERFCTR0		0x300
-#define MSR_P4_CCCR0		0x360
-#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
-#define P4_ESCR_OS		(1<<3)
-#define P4_ESCR_USR		(1<<2)
-#define P4_CCCR_OVF_PMI0	(1<<26)
-#define P4_CCCR_OVF_PMI1	(1<<27)
-#define P4_CCCR_THRESHOLD(N)	((N)<<20)
-#define P4_CCCR_COMPLEMENT	(1<<19)
-#define P4_CCCR_COMPARE		(1<<18)
-#define P4_CCCR_REQUIRED	(3<<16)
-#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
-#define P4_CCCR_ENABLE		(1<<12)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
-   CRU_ESCR0 (with any non-null event selector) through a complemented
-   max threshold. [IA32-Vol3, Section 14.9.9] */
-#define MSR_P4_IQ_COUNTER0	0x30C
-#define P4_NMI_CRU_ESCR0	(P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
-#define P4_NMI_IQ_CCCR0	\
-	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
-	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
 
-#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+extern void show_registers(struct pt_regs *regs);
+extern int unknown_nmi_panic;
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+	/* returns the bit offset of the performance counter register */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return (msr - MSR_K7_PERFCTR0);
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return (msr - MSR_ARCH_PERFMON_PERFCTR0);
+
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return (msr - MSR_P6_PERFCTR0);
+		case 15:
+			return (msr - MSR_P4_BPU_PERFCTR0);
+		}
+	}
+	return 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+	/* returns the bit offset of the event selection register */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return (msr - MSR_K7_EVNTSEL0);
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
+
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return (msr - MSR_P6_EVNTSEL0);
+		case 15:
+			return (msr - MSR_P4_BSU_ESCR0);
+		}
+	}
+	return 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+/* checks the an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
+		return 1;
+	return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
+		return 1;
+	return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
+}
+
+static __cpuinit inline int nmi_known_cpu(void)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return 1;
+		else
+			return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+	}
+	return 0;
+}
 
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
@@ -125,7 +216,18 @@ static int __init check_nmi_watchdog(void)
 	unsigned int *prev_nmi_count;
 	int cpu;
 
-	if (nmi_watchdog == NMI_NONE)
+	/* Enable NMI watchdog for newer systems.
+           Actually it should be safe for most systems before 2004 too except
+	   for some IBM systems that corrupt registers when NMI happens
+	   during SMM. Unfortunately we don't have more exact information
+ 	   on these and use this coarse check. */
+	if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004)
+		nmi_watchdog = NMI_LOCAL_APIC;
+
+	if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
+		return 0;
+
+	if (!atomic_read(&nmi_active))
 		return 0;
 
 	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
@@ -149,25 +251,45 @@ static int __init check_nmi_watchdog(void)
 		if (!cpu_isset(cpu, cpu_callin_map))
 			continue;
 #endif
+		if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+			continue;
 		if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-			endflag = 1;
 			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
 				cpu,
 				prev_nmi_count[cpu],
 				nmi_count(cpu));
-			nmi_active = 0;
-			lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
-			kfree(prev_nmi_count);
-			return -1;
+			per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+			atomic_dec(&nmi_active);
 		}
 	}
+	if (!atomic_read(&nmi_active)) {
+		kfree(prev_nmi_count);
+		atomic_set(&nmi_active, -1);
+		return -1;
+	}
 	endflag = 1;
 	printk("OK.\n");
 
 	/* now that we know it works we can reduce NMI frequency to
 	   something more reasonable; makes a difference in some configs */
-	if (nmi_watchdog == NMI_LOCAL_APIC)
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
+		struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
 		nmi_hz = 1;
+		/*
+		 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+		 * are writable, with higher bits sign extending from bit 31.
+		 * So, we can only program the counter with 31 bit values and
+		 * 32nd bit should be 1, for 33.. to be 1.
+		 * Find the appropriate nmi_hz
+		 */
+	 	if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
+			((u64)cpu_khz * 1000) > 0x7fffffffULL) {
+			u64 count = (u64)cpu_khz * 1000;
+			do_div(count, 0x7fffffffUL);
+			nmi_hz = count + 1;
+		}
+	}
 
 	kfree(prev_nmi_count);
 	return 0;
@@ -181,124 +303,70 @@ static int __init setup_nmi_watchdog(char *str)
 
 	get_option(&str, &nmi);
 
-	if (nmi >= NMI_INVALID)
+	if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
 		return 0;
-	if (nmi == NMI_NONE)
-		nmi_watchdog = nmi;
 	/*
 	 * If any other x86 CPU has a local APIC, then
 	 * please test the NMI stuff there and send me the
 	 * missing bits. Right now Intel P6/P4 and AMD K7 only.
 	 */
-	if ((nmi == NMI_LOCAL_APIC) &&
-			(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-			(boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
-		nmi_watchdog = nmi;
-	if ((nmi == NMI_LOCAL_APIC) &&
-			(boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
-	  		(boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
-		nmi_watchdog = nmi;
-	/*
-	 * We can enable the IO-APIC watchdog
-	 * unconditionally.
-	 */
-	if (nmi == NMI_IO_APIC) {
-		nmi_active = 1;
-		nmi_watchdog = nmi;
-	}
+	if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
+		return 0;  /* no lapic support */
+	nmi_watchdog = nmi;
 	return 1;
 }
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
-static void disable_intel_arch_watchdog(void);
-
 static void disable_lapic_nmi_watchdog(void)
 {
-	if (nmi_active <= 0)
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+	if (atomic_read(&nmi_active) <= 0)
 		return;
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
-		break;
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-			disable_intel_arch_watchdog();
-			break;
-		}
-		switch (boot_cpu_data.x86) {
-		case 6:
-			if (boot_cpu_data.x86_model > 0xd)
-				break;
 
-			wrmsr(MSR_P6_EVNTSEL0, 0, 0);
-			break;
-		case 15:
-			if (boot_cpu_data.x86_model > 0x4)
-				break;
+	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
 
-			wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
-			wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
-			break;
-		}
-		break;
-	}
-	nmi_active = -1;
-	/* tell do_nmi() and others that we're not active any more */
-	nmi_watchdog = 0;
+	BUG_ON(atomic_read(&nmi_active) != 0);
 }
 
 static void enable_lapic_nmi_watchdog(void)
 {
-	if (nmi_active < 0) {
-		nmi_watchdog = NMI_LOCAL_APIC;
-		setup_apic_nmi_watchdog();
-	}
-}
+	BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
 
-int reserve_lapic_nmi(void)
-{
-	unsigned int old_owner;
-
-	spin_lock(&lapic_nmi_owner_lock);
-	old_owner = lapic_nmi_owner;
-	lapic_nmi_owner |= LAPIC_NMI_RESERVED;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (old_owner & LAPIC_NMI_RESERVED)
-		return -EBUSY;
-	if (old_owner & LAPIC_NMI_WATCHDOG)
-		disable_lapic_nmi_watchdog();
-	return 0;
-}
+	/* are we already enabled */
+	if (atomic_read(&nmi_active) != 0)
+		return;
 
-void release_lapic_nmi(void)
-{
-	unsigned int new_owner;
+	/* are we lapic aware */
+	if (nmi_known_cpu() <= 0)
+		return;
 
-	spin_lock(&lapic_nmi_owner_lock);
-	new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
-	lapic_nmi_owner = new_owner;
-	spin_unlock(&lapic_nmi_owner_lock);
-	if (new_owner & LAPIC_NMI_WATCHDOG)
-		enable_lapic_nmi_watchdog();
+	on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+	touch_nmi_watchdog();
 }
 
 void disable_timer_nmi_watchdog(void)
 {
-	if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+	BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+	if (atomic_read(&nmi_active) <= 0)
 		return;
 
-	unset_nmi_callback();
-	nmi_active = -1;
-	nmi_watchdog = NMI_NONE;
+	disable_irq(0);
+	on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+	BUG_ON(atomic_read(&nmi_active) != 0);
 }
 
 void enable_timer_nmi_watchdog(void)
 {
-	if (nmi_active < 0) {
-		nmi_watchdog = NMI_IO_APIC;
+	BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+	if (atomic_read(&nmi_active) == 0) {
 		touch_nmi_watchdog();
-		nmi_active = 1;
+		on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+		enable_irq(0);
 	}
 }
 
@@ -308,15 +376,20 @@ static int nmi_pm_active; /* nmi_active before suspend */
 
 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
 {
-	nmi_pm_active = nmi_active;
-	disable_lapic_nmi_watchdog();
+	/* only CPU0 goes here, other CPUs should be offline */
+	nmi_pm_active = atomic_read(&nmi_active);
+	stop_apic_nmi_watchdog(NULL);
+	BUG_ON(atomic_read(&nmi_active) != 0);
 	return 0;
 }
 
 static int lapic_nmi_resume(struct sys_device *dev)
 {
-	if (nmi_pm_active > 0)
-		enable_lapic_nmi_watchdog();
+	/* only CPU0 goes here, other CPUs should be offline */
+	if (nmi_pm_active > 0) {
+		setup_apic_nmi_watchdog(NULL);
+		touch_nmi_watchdog();
+	}
 	return 0;
 }
 
@@ -336,7 +409,13 @@ static int __init init_lapic_nmi_sysfs(void)
 {
 	int error;
 
-	if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
+	/* should really be a BUG_ON but b/c this is an
+	 * init call, it just doesn't work.  -dcz
+	 */
+	if (nmi_watchdog != NMI_LOCAL_APIC)
+		return 0;
+
+	if ( atomic_read(&nmi_active) < 0 )
 		return 0;
 
 	error = sysdev_class_register(&nmi_sysclass);
@@ -354,138 +433,269 @@ late_initcall(init_lapic_nmi_sysfs);
  * Original code written by Keith Owens.
  */
 
-static void clear_msr_range(unsigned int base, unsigned int n)
-{
-	unsigned int i;
-
-	for(i = 0; i < n; ++i)
-		wrmsr(base+i, 0, 0);
-}
-
-static void write_watchdog_counter(const char *descr)
+static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
 {
 	u64 count = (u64)cpu_khz * 1000;
 
 	do_div(count, nmi_hz);
 	if(descr)
 		Dprintk("setting %s to -0x%08Lx\n", descr, count);
-	wrmsrl(nmi_perfctr_msr, 0 - count);
+	wrmsrl(perfctr_msr, 0 - count);
 }
 
-static void setup_k7_watchdog(void)
+/* Note that these events don't tick when the CPU idles. This means
+   the frequency varies with CPU load. */
+
+#define K7_EVNTSEL_ENABLE	(1 << 22)
+#define K7_EVNTSEL_INT		(1 << 20)
+#define K7_EVNTSEL_OS		(1 << 17)
+#define K7_EVNTSEL_USR		(1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
+#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+static int setup_k7_watchdog(void)
 {
+	unsigned int perfctr_msr, evntsel_msr;
 	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	perfctr_msr = MSR_K7_PERFCTR0;
+	evntsel_msr = MSR_K7_EVNTSEL0;
+	if (!reserve_perfctr_nmi(perfctr_msr))
+		goto fail;
 
-	nmi_perfctr_msr = MSR_K7_PERFCTR0;
+	if (!reserve_evntsel_nmi(evntsel_msr))
+		goto fail1;
 
-	clear_msr_range(MSR_K7_EVNTSEL0, 4);
-	clear_msr_range(MSR_K7_PERFCTR0, 4);
+	wrmsrl(perfctr_msr, 0UL);
 
 	evntsel = K7_EVNTSEL_INT
 		| K7_EVNTSEL_OS
 		| K7_EVNTSEL_USR
 		| K7_NMI_EVENT;
 
-	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
-	write_watchdog_counter("K7_PERFCTR0");
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= K7_EVNTSEL_ENABLE;
-	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	wd->check_bit = 1ULL<<63;
+	return 1;
+fail1:
+	release_perfctr_nmi(perfctr_msr);
+fail:
+	return 0;
+}
+
+static void stop_k7_watchdog(void)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	wrmsr(wd->evntsel_msr, 0, 0);
+
+	release_evntsel_nmi(wd->evntsel_msr);
+	release_perfctr_nmi(wd->perfctr_msr);
 }
 
-static void setup_p6_watchdog(void)
+#define P6_EVNTSEL0_ENABLE	(1 << 22)
+#define P6_EVNTSEL_INT		(1 << 20)
+#define P6_EVNTSEL_OS		(1 << 17)
+#define P6_EVNTSEL_USR		(1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	0x79
+#define P6_NMI_EVENT		P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+static int setup_p6_watchdog(void)
 {
+	unsigned int perfctr_msr, evntsel_msr;
 	unsigned int evntsel;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	perfctr_msr = MSR_P6_PERFCTR0;
+	evntsel_msr = MSR_P6_EVNTSEL0;
+	if (!reserve_perfctr_nmi(perfctr_msr))
+		goto fail;
 
-	nmi_perfctr_msr = MSR_P6_PERFCTR0;
+	if (!reserve_evntsel_nmi(evntsel_msr))
+		goto fail1;
 
-	clear_msr_range(MSR_P6_EVNTSEL0, 2);
-	clear_msr_range(MSR_P6_PERFCTR0, 2);
+	wrmsrl(perfctr_msr, 0UL);
 
 	evntsel = P6_EVNTSEL_INT
 		| P6_EVNTSEL_OS
 		| P6_EVNTSEL_USR
 		| P6_NMI_EVENT;
 
-	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
-	write_watchdog_counter("P6_PERFCTR0");
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= P6_EVNTSEL0_ENABLE;
-	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	wd->check_bit = 1ULL<<39;
+	return 1;
+fail1:
+	release_perfctr_nmi(perfctr_msr);
+fail:
+	return 0;
+}
+
+static void stop_p6_watchdog(void)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	wrmsr(wd->evntsel_msr, 0, 0);
+
+	release_evntsel_nmi(wd->evntsel_msr);
+	release_perfctr_nmi(wd->perfctr_msr);
 }
 
+/* Note that these events don't tick when the CPU idles. This means
+   the frequency varies with CPU load. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
+#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
+#define P4_ESCR_OS		(1<<3)
+#define P4_ESCR_USR		(1<<2)
+#define P4_CCCR_OVF_PMI0	(1<<26)
+#define P4_CCCR_OVF_PMI1	(1<<27)
+#define P4_CCCR_THRESHOLD(N)	((N)<<20)
+#define P4_CCCR_COMPLEMENT	(1<<19)
+#define P4_CCCR_COMPARE		(1<<18)
+#define P4_CCCR_REQUIRED	(3<<16)
+#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
+#define P4_CCCR_ENABLE		(1<<12)
+#define P4_CCCR_OVF 		(1<<31)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+   CRU_ESCR0 (with any non-null event selector) through a complemented
+   max threshold. [IA32-Vol3, Section 14.9.9] */
+
 static int setup_p4_watchdog(void)
 {
+	unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+	unsigned int evntsel, cccr_val;
 	unsigned int misc_enable, dummy;
+	unsigned int ht_num;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 
-	rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+	rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
 	if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
 		return 0;
 
-	nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
-	nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
 #ifdef CONFIG_SMP
-	if (smp_num_siblings == 2)
-		nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+	/* detect which hyperthread we are on */
+	if (smp_num_siblings == 2) {
+		unsigned int ebx, apicid;
+
+        	ebx = cpuid_ebx(1);
+	        apicid = (ebx >> 24) & 0xff;
+        	ht_num = apicid & 1;
+	} else
 #endif
+		ht_num = 0;
 
-	if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
-		clear_msr_range(0x3F1, 2);
-	/* MSR 0x3F0 seems to have a default value of 0xFC00, but current
-	   docs doesn't fully define it, so leave it alone for now. */
-	if (boot_cpu_data.x86_model >= 0x3) {
-		/* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
-		clear_msr_range(0x3A0, 26);
-		clear_msr_range(0x3BC, 3);
+	/* performance counters are shared resources
+	 * assign each hyperthread its own set
+	 * (re-use the ESCR0 register, seems safe
+	 * and keeps the cccr_val the same)
+	 */
+	if (!ht_num) {
+		/* logical cpu 0 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR0;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR0;
+		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
 	} else {
-		clear_msr_range(0x3A0, 31);
+		/* logical cpu 1 */
+		perfctr_msr = MSR_P4_IQ_PERFCTR1;
+		evntsel_msr = MSR_P4_CRU_ESCR0;
+		cccr_msr = MSR_P4_IQ_CCCR1;
+		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
 	}
-	clear_msr_range(0x3C0, 6);
-	clear_msr_range(0x3C8, 6);
-	clear_msr_range(0x3E0, 2);
-	clear_msr_range(MSR_P4_CCCR0, 18);
-	clear_msr_range(MSR_P4_PERFCTR0, 18);
-
-	wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
-	wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
-	write_watchdog_counter("P4_IQ_COUNTER0");
+
+	if (!reserve_perfctr_nmi(perfctr_msr))
+		goto fail;
+
+	if (!reserve_evntsel_nmi(evntsel_msr))
+		goto fail1;
+
+	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+	 	| P4_ESCR_OS
+		| P4_ESCR_USR;
+
+	cccr_val |= P4_CCCR_THRESHOLD(15)
+		 | P4_CCCR_COMPLEMENT
+		 | P4_CCCR_COMPARE
+		 | P4_CCCR_REQUIRED;
+
+	wrmsr(evntsel_msr, evntsel, 0);
+	wrmsr(cccr_msr, cccr_val, 0);
+	write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+	cccr_val |= P4_CCCR_ENABLE;
+	wrmsr(cccr_msr, cccr_val, 0);
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = cccr_msr;
+	wd->check_bit = 1ULL<<39;
 	return 1;
+fail1:
+	release_perfctr_nmi(perfctr_msr);
+fail:
+	return 0;
 }
 
-static void disable_intel_arch_watchdog(void)
+static void stop_p4_watchdog(void)
 {
-	unsigned ebx;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 
-	/*
-	 * Check whether the Architectural PerfMon supports
-	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
-	 */
-	ebx = cpuid_ebx(10);
-	if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
+	wrmsr(wd->cccr_msr, 0, 0);
+	wrmsr(wd->evntsel_msr, 0, 0);
+
+	release_evntsel_nmi(wd->evntsel_msr);
+	release_perfctr_nmi(wd->perfctr_msr);
 }
 
+#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
 static int setup_intel_arch_watchdog(void)
 {
+	unsigned int ebx;
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int perfctr_msr, evntsel_msr;
 	unsigned int evntsel;
-	unsigned ebx;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 
 	/*
 	 * Check whether the Architectural PerfMon supports
 	 * Unhalted Core Cycles Event or not.
-	 * NOTE: Corresponding bit = 0 in ebp indicates event present.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
 	 */
-	ebx = cpuid_ebx(10);
-	if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-		return 0;
+	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		goto fail;
+
+	perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+	evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
 
-	nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+	if (!reserve_perfctr_nmi(perfctr_msr))
+		goto fail;
 
-	clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2);
-	clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2);
+	if (!reserve_evntsel_nmi(evntsel_msr))
+		goto fail1;
+
+	wrmsrl(perfctr_msr, 0UL);
 
 	evntsel = ARCH_PERFMON_EVENTSEL_INT
 		| ARCH_PERFMON_EVENTSEL_OS
@@ -493,51 +703,145 @@ static int setup_intel_arch_watchdog(void)
 		| ARCH_PERFMON_NMI_EVENT_SEL
 		| ARCH_PERFMON_NMI_EVENT_UMASK;
 
-	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
-	write_watchdog_counter("INTEL_ARCH_PERFCTR0");
+	/* setup the timer */
+	wrmsr(evntsel_msr, evntsel, 0);
+	write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);
+	wrmsr(evntsel_msr, evntsel, 0);
+
+	wd->perfctr_msr = perfctr_msr;
+	wd->evntsel_msr = evntsel_msr;
+	wd->cccr_msr = 0;  //unused
+	wd->check_bit = 1ULL << (eax.split.bit_width - 1);
 	return 1;
+fail1:
+	release_perfctr_nmi(perfctr_msr);
+fail:
+	return 0;
 }
 
-void setup_apic_nmi_watchdog (void)
+static void stop_intel_arch_watchdog(void)
 {
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
-			return;
-		setup_k7_watchdog();
-		break;
-	case X86_VENDOR_INTEL:
-		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-			if (!setup_intel_arch_watchdog())
+	unsigned int ebx;
+	union cpuid10_eax eax;
+	unsigned int unused;
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
+	 */
+	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+	    (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		return;
+
+	wrmsr(wd->evntsel_msr, 0, 0);
+	release_evntsel_nmi(wd->evntsel_msr);
+	release_perfctr_nmi(wd->perfctr_msr);
+}
+
+void setup_apic_nmi_watchdog (void *unused)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	/* only support LOCAL and IO APICs for now */
+	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+	    (nmi_watchdog != NMI_IO_APIC))
+	    	return;
+
+	if (wd->enabled == 1)
+		return;
+
+	/* cheap hack to support suspend/resume */
+	/* if cpu0 is not active neither should the other cpus */
+	if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+		return;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_AMD:
+			if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
 				return;
-			break;
-		}
-		switch (boot_cpu_data.x86) {
-		case 6:
-			if (boot_cpu_data.x86_model > 0xd)
+			if (!setup_k7_watchdog())
 				return;
-
-			setup_p6_watchdog();
 			break;
-		case 15:
-			if (boot_cpu_data.x86_model > 0x4)
-				return;
+		case X86_VENDOR_INTEL:
+			if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+				if (!setup_intel_arch_watchdog())
+					return;
+				break;
+			}
+			switch (boot_cpu_data.x86) {
+			case 6:
+				if (boot_cpu_data.x86_model > 0xd)
+					return;
+
+				if (!setup_p6_watchdog())
+					return;
+				break;
+			case 15:
+				if (boot_cpu_data.x86_model > 0x4)
+					return;
 
-			if (!setup_p4_watchdog())
+				if (!setup_p4_watchdog())
+					return;
+				break;
+			default:
 				return;
+			}
 			break;
 		default:
 			return;
 		}
-		break;
-	default:
+	}
+	wd->enabled = 1;
+	atomic_inc(&nmi_active);
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+	/* only support LOCAL and IO APICs for now */
+	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+	    (nmi_watchdog != NMI_IO_APIC))
+	    	return;
+
+	if (wd->enabled == 0)
 		return;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_AMD:
+			stop_k7_watchdog();
+			break;
+		case X86_VENDOR_INTEL:
+			if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+				stop_intel_arch_watchdog();
+				break;
+			}
+			switch (boot_cpu_data.x86) {
+			case 6:
+				if (boot_cpu_data.x86_model > 0xd)
+					break;
+				stop_p6_watchdog();
+				break;
+			case 15:
+				if (boot_cpu_data.x86_model > 0x4)
+					break;
+				stop_p4_watchdog();
+				break;
+			}
+			break;
+		default:
+			return;
+		}
 	}
-	lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
-	nmi_active = 1;
+	wd->enabled = 0;
+	atomic_dec(&nmi_active);
 }
 
 /*
@@ -579,7 +883,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
 
 extern void die_nmi(struct pt_regs *, const char *msg);
 
-void nmi_watchdog_tick (struct pt_regs * regs)
+__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
 {
 
 	/*
@@ -588,11 +892,23 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 	 * smp_processor_id().
 	 */
 	unsigned int sum;
+	int touched = 0;
 	int cpu = smp_processor_id();
+	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+	u64 dummy;
+	int rc=0;
+
+	/* check for other users first */
+	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+			== NOTIFY_STOP) {
+		rc = 1;
+		touched = 1;
+	}
 
 	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
 
-	if (last_irq_sums[cpu] == sum) {
+	/* if the apic timer isn't firing, this cpu isn't doing much */
+	if (!touched && last_irq_sums[cpu] == sum) {
 		/*
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
@@ -607,27 +923,59 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 		last_irq_sums[cpu] = sum;
 		alert_counter[cpu] = 0;
 	}
-	if (nmi_perfctr_msr) {
-		if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
-			/*
-			 * P4 quirks:
-			 * - An overflown perfctr will assert its interrupt
-			 *   until the OVF flag in its CCCR is cleared.
-			 * - LVTPC is masked on interrupt and must be
-			 *   unmasked by the LVTPC handler.
+	/* see if the nmi watchdog went off */
+	if (wd->enabled) {
+		if (nmi_watchdog == NMI_LOCAL_APIC) {
+			rdmsrl(wd->perfctr_msr, dummy);
+			if (dummy & wd->check_bit){
+				/* this wasn't a watchdog timer interrupt */
+				goto done;
+			}
+
+			/* only Intel P4 uses the cccr msr */
+	 		if (wd->cccr_msr != 0) {
+	 			/*
+	 			 * P4 quirks:
+	 			 * - An overflown perfctr will assert its interrupt
+	 			 *   until the OVF flag in its CCCR is cleared.
+	 			 * - LVTPC is masked on interrupt and must be
+	 			 *   unmasked by the LVTPC handler.
+	 			 */
+				rdmsrl(wd->cccr_msr, dummy);
+				dummy &= ~P4_CCCR_OVF;
+	 			wrmsrl(wd->cccr_msr, dummy);
+	 			apic_write(APIC_LVTPC, APIC_DM_NMI);
+	 		}
+			else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+				 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+				/* P6 based Pentium M need to re-unmask
+				 * the apic vector but it doesn't hurt
+				 * other P6 variant.
+				 * ArchPerfom/Core Duo also needs this */
+				apic_write(APIC_LVTPC, APIC_DM_NMI);
+			}
+			/* start the cycle over again */
+			write_watchdog_counter(wd->perfctr_msr, NULL);
+			rc = 1;
+		} else if (nmi_watchdog == NMI_IO_APIC) {
+			/* don't know how to accurately check for this.
+			 * just assume it was a watchdog timer interrupt
+			 * This matches the old behaviour.
 			 */
-			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
-			apic_write(APIC_LVTPC, APIC_DM_NMI);
+			rc = 1;
 		}
-		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 ||
-		         nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
-			/* Only P6 based Pentium M need to re-unmask
-			 * the apic vector but it doesn't hurt
-			 * other P6 variant */
-			apic_write(APIC_LVTPC, APIC_DM_NMI);
-		}
-		write_watchdog_counter(NULL);
 	}
+done:
+	return rc;
+}
+
+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+	if (unknown_nmi_panic)
+		return unknown_nmi_panic_callback(regs, cpu);
+#endif
+	return 0;
 }
 
 #ifdef CONFIG_SYSCTL
@@ -637,36 +985,46 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 	unsigned char reason = get_nmi_reason();
 	char buf[64];
 
-	if (!(reason & 0xc0)) {
-		sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-		die_nmi(regs, buf);
-	}
+	sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+	die_nmi(regs, buf);
 	return 0;
 }
 
 /*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
+ * proc handler for /proc/sys/kernel/nmi
  */
-int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
 			void __user *buffer, size_t *length, loff_t *ppos)
 {
 	int old_state;
 
-	old_state = unknown_nmi_panic;
+	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+	old_state = nmi_watchdog_enabled;
 	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (!!old_state == !!unknown_nmi_panic)
+	if (!!old_state == !!nmi_watchdog_enabled)
 		return 0;
 
-	if (unknown_nmi_panic) {
-		if (reserve_lapic_nmi() < 0) {
-			unknown_nmi_panic = 0;
-			return -EBUSY;
-		} else {
-			set_nmi_callback(unknown_nmi_panic_callback);
-		}
+	if (atomic_read(&nmi_active) < 0) {
+		printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+		return -EIO;
+	}
+
+	if (nmi_watchdog == NMI_DEFAULT) {
+		if (nmi_known_cpu() > 0)
+			nmi_watchdog = NMI_LOCAL_APIC;
+		else
+			nmi_watchdog = NMI_IO_APIC;
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC) {
+		if (nmi_watchdog_enabled)
+			enable_lapic_nmi_watchdog();
+		else
+			disable_lapic_nmi_watchdog();
 	} else {
-		release_lapic_nmi();
-		unset_nmi_callback();
+		printk( KERN_WARNING
+			"NMI watchdog doesn't know what hardware to touch\n");
+		return -EIO;
 	}
 	return 0;
 }
@@ -675,7 +1033,11 @@ int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
 
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
 EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 8657c739656a..8c190ca7ae44 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -37,6 +37,7 @@
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
 #include <linux/random.h>
+#include <linux/personality.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -320,15 +321,6 @@ void show_regs(struct pt_regs * regs)
  * the "args".
  */
 extern void kernel_thread_helper(void);
-__asm__(".section .text\n"
-	".align 4\n"
-	"kernel_thread_helper:\n\t"
-	"movl %edx,%eax\n\t"
-	"pushl %edx\n\t"
-	"call *%ebx\n\t"
-	"pushl %eax\n\t"
-	"call do_exit\n"
-	".previous");
 
 /*
  * Create a kernel thread
@@ -346,7 +338,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 	regs.xes = __USER_DS;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
-	regs.xcs = __KERNEL_CS;
+	regs.xcs = __KERNEL_CS | get_kernel_rpl();
 	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
 	/* Ok, create the new process.. */
@@ -905,7 +897,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
 
 unsigned long arch_align_stack(unsigned long sp)
 {
-	if (randomize_va_space)
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		sp -= get_random_int() % 8192;
 	return sp & ~0xf;
 }
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index d3db03f4085d..775f50e9395b 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -185,17 +185,17 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_
 	return addr;
 }
 
-static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs)
+static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
 {
 	int i, copied;
-	unsigned char opcode[16];
+	unsigned char opcode[15];
 	unsigned long addr = convert_eip_to_linear(child, regs);
 
 	copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
 	for (i = 0; i < copied; i++) {
 		switch (opcode[i]) {
-		/* popf */
-		case 0x9d:
+		/* popf and iret */
+		case 0x9d: case 0xcf:
 			return 1;
 		/* opcode and address size prefixes */
 		case 0x66: case 0x67:
@@ -247,7 +247,7 @@ static void set_singlestep(struct task_struct *child)
 	 * don't mark it as being "us" that set it, so that we
 	 * won't clear it by hand later.
 	 */
-	if (is_at_popf(child, regs))
+	if (is_setting_trap_flag(child, regs))
 		return;
 	
 	child->ptrace |= PT_DTRACE;
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 54cfeabbc5e4..84278e0093a2 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -145,14 +145,10 @@ real_mode_gdt_entries [3] =
 	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
 };
 
-static struct
-{
-	unsigned short       size __attribute__ ((packed));
-	unsigned long long * base __attribute__ ((packed));
-}
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, NULL },
-no_idt = { 0, NULL };
+static struct Xgt_desc_struct
+real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
+real_mode_idt = { 0x3ff, 0 },
+no_idt = { 0, 0 };
 
 
 /* This is 16-bit protected mode code to disable paging and the cache,
diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S
index d312616effa1..f151d6fae462 100644
--- a/arch/i386/kernel/relocate_kernel.S
+++ b/arch/i386/kernel/relocate_kernel.S
@@ -7,16 +7,138 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 2)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
+
+	.text
+	.align PAGE_ALIGNED
+	.globl relocate_kernel
+relocate_kernel:
+	movl	8(%esp), %ebp /* list of pages */
+
+#ifdef CONFIG_X86_PAE
+	/* map the control page at its virtual address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xc0000000, %eax
+	shrl	$27, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PMD_0)(%ebp), %edx
+	orl	$PAE_PGD_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PMD_0)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x3fe00000, %eax
+	shrl	$18, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_0)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_0)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x001ff000, %eax
+	shrl	$9, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	/* identity map the control page at its physical address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xc0000000, %eax
+	shrl	$27, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PMD_1)(%ebp), %edx
+	orl	$PAE_PGD_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PMD_1)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x3fe00000, %eax
+	shrl	$18, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_1)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_1)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x001ff000, %eax
+	shrl	$9, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+#else
+	/* map the control page at its virtual address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xffc00000, %eax
+	shrl	$20, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_0)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_0)(%ebp), %edi
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x003ff000, %eax
+	shrl	$10, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	/* identity map the control page at its physical address */
+
+	movl	PTR(VA_PGD)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0xffc00000, %eax
+	shrl	$20, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_PTE_1)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+
+	movl	PTR(VA_PTE_1)(%ebp), %edi
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %eax
+	andl	$0x003ff000, %eax
+	shrl	$10, %eax
+	addl	%edi, %eax
+
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edx
+	orl	$PAGE_ATTR, %edx
+	movl	%edx, (%eax)
+#endif
 
-	/*
-	 * Must be relocatable PIC code callable as a C function, that once
-	 * it starts can not use the previous processes stack.
-	 */
-	.globl relocate_new_kernel
 relocate_new_kernel:
 	/* read the arguments and say goodbye to the stack */
 	movl  4(%esp), %ebx /* page_list */
-	movl  8(%esp), %ebp /* reboot_code_buffer */
+	movl  8(%esp), %ebp /* list of pages */
 	movl  12(%esp), %edx /* start address */
 	movl  16(%esp), %ecx /* cpu_has_pae */
 
@@ -24,11 +146,26 @@ relocate_new_kernel:
 	pushl $0
 	popfl
 
-	/* set a new stack at the bottom of our page... */
-	lea   4096(%ebp), %esp
+	/* get physical address of control page now */
+	/* this is impossible after page table switch */
+	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
 
-	/* store the parameters back on the stack */
-	pushl   %edx /* store the start address */
+	/* switch to new set of page tables */
+	movl	PTR(PA_PGD)(%ebp), %eax
+	movl	%eax, %cr3
+
+	/* setup a new stack at the end of the physical control page */
+	lea	4096(%edi), %esp
+
+	/* jump to identity mapped page */
+	movl    %edi, %eax
+	addl    $(identity_mapped - relocate_kernel), %eax
+	pushl   %eax
+	ret
+
+identity_mapped:
+	/* store the start address on the stack */
+	pushl   %edx
 
 	/* Set cr0 to a known state:
 	 * 31 0 == Paging disabled
@@ -113,8 +250,3 @@ relocate_new_kernel:
 	xorl    %edi, %edi
 	xorl    %ebp, %ebp
 	ret
-relocate_new_kernel_end:
-
-	.globl relocate_new_kernel_size
-relocate_new_kernel_size:
-	.long relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
deleted file mode 100644
index 98352c374c76..000000000000
--- a/arch/i386/kernel/semaphore.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-#include <asm/semaphore.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed\n"
-"__down_failed:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
-	"pushl %ebp\n\t"
-	"movl  %esp,%ebp\n\t"
-#endif
-	"pushl %edx\n\t"
-	"pushl %ecx\n\t"
-	"call __down\n\t"
-	"popl %ecx\n\t"
-	"popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
-	"movl %ebp,%esp\n\t"
-	"popl %ebp\n\t"
-#endif
-	"ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed_interruptible\n"
-"__down_failed_interruptible:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
-	"pushl %ebp\n\t"
-	"movl  %esp,%ebp\n\t"
-#endif
-	"pushl %edx\n\t"
-	"pushl %ecx\n\t"
-	"call __down_interruptible\n\t"
-	"popl %ecx\n\t"
-	"popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
-	"movl %ebp,%esp\n\t"
-	"popl %ebp\n\t"
-#endif
-	"ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed_trylock\n"
-"__down_failed_trylock:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
-	"pushl %ebp\n\t"
-	"movl  %esp,%ebp\n\t"
-#endif
-	"pushl %edx\n\t"
-	"pushl %ecx\n\t"
-	"call __down_trylock\n\t"
-	"popl %ecx\n\t"
-	"popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
-	"movl %ebp,%esp\n\t"
-	"popl %ebp\n\t"
-#endif
-	"ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __up_wakeup\n"
-"__up_wakeup:\n\t"
-	"pushl %edx\n\t"
-	"pushl %ecx\n\t"
-	"call __up\n\t"
-	"popl %ecx\n\t"
-	"popl %edx\n\t"
-	"ret"
-);
-
-/*
- * rw spinlock fallbacks
- */
-#if defined(CONFIG_SMP)
-asm(
-".section .sched.text\n"
-".align	4\n"
-".globl	__write_lock_failed\n"
-"__write_lock_failed:\n\t"
-	LOCK_PREFIX "addl	$" RW_LOCK_BIAS_STR ",(%eax)\n"
-"1:	rep; nop\n\t"
-	"cmpl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
-	"jne	1b\n\t"
-	LOCK_PREFIX "subl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
-	"jnz	__write_lock_failed\n\t"
-	"ret"
-);
-
-asm(
-".section .sched.text\n"
-".align	4\n"
-".globl	__read_lock_failed\n"
-"__read_lock_failed:\n\t"
-	LOCK_PREFIX "incl	(%eax)\n"
-"1:	rep; nop\n\t"
-	"cmpl	$1,(%eax)\n\t"
-	"js	1b\n\t"
-	LOCK_PREFIX "decl	(%eax)\n\t"
-	"js	__read_lock_failed\n\t"
-	"ret"
-);
-#endif
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index f1682206d304..814cdebf7377 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -53,6 +53,7 @@
 #include <asm/apic.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
+#include <asm/mmzone.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/sections.h>
@@ -89,18 +90,6 @@ EXPORT_SYMBOL(boot_cpu_data);
 
 unsigned long mmu_cr4_features;
 
-#ifdef	CONFIG_ACPI
-	int acpi_disabled = 0;
-#else
-	int acpi_disabled = 1;
-#endif
-EXPORT_SYMBOL(acpi_disabled);
-
-#ifdef	CONFIG_ACPI
-int __initdata acpi_force = 0;
-extern acpi_interrupt_flags	acpi_sci_flags;
-#endif
-
 /* for MCA, but anyone else can use it if they want */
 unsigned int machine_id;
 #ifdef CONFIG_MCA
@@ -148,7 +137,6 @@ EXPORT_SYMBOL(ist_info);
 struct e820map e820;
 
 extern void early_cpu_init(void);
-extern void generic_apic_probe(char *);
 extern int root_mountflags;
 
 unsigned long saved_videomode;
@@ -700,238 +688,150 @@ static inline void copy_edd(void)
 }
 #endif
 
-static void __init parse_cmdline_early (char ** cmdline_p)
-{
-	char c = ' ', *to = command_line, *from = saved_command_line;
-	int len = 0;
-	int userdef = 0;
+static int __initdata user_defined_memmap = 0;
 
-	/* Save unparsed command line copy for /proc/cmdline */
-	saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
+/*
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem=  [also see Documentation/i386/boot.txt]
+ */
+static int __init parse_mem(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-	for (;;) {
-		if (c != ' ')
-			goto next_char;
-		/*
-		 * "mem=nopentium" disables the 4MB page tables.
-		 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
-		 * to <mem>, overriding the bios size.
-		 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
-		 * <start> to <start>+<mem>, overriding the bios size.
-		 *
-		 * HPA tells me bootloaders need to parse mem=, so no new
-		 * option should be mem=  [also see Documentation/i386/boot.txt]
+	if (strcmp(arg, "nopentium") == 0) {
+		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+		disable_pse = 1;
+	} else {
+		/* If the user specifies memory size, we
+		 * limit the BIOS-provided memory map to
+		 * that size. exactmap can be used to specify
+		 * the exact map. mem=number can be used to
+		 * trim the existing memory map.
 		 */
-		if (!memcmp(from, "mem=", 4)) {
-			if (to != command_line)
-				to--;
-			if (!memcmp(from+4, "nopentium", 9)) {
-				from += 9+4;
-				clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
-				disable_pse = 1;
-			} else {
-				/* If the user specifies memory size, we
-				 * limit the BIOS-provided memory map to
-				 * that size. exactmap can be used to specify
-				 * the exact map. mem=number can be used to
-				 * trim the existing memory map.
-				 */
-				unsigned long long mem_size;
- 
-				mem_size = memparse(from+4, &from);
-				limit_regions(mem_size);
-				userdef=1;
-			}
-		}
-
-		else if (!memcmp(from, "memmap=", 7)) {
-			if (to != command_line)
-				to--;
-			if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
-				/* If we are doing a crash dump, we
-				 * still need to know the real mem
-				 * size before original memory map is
-				 * reset.
-				 */
-				find_max_pfn();
-				saved_max_pfn = max_pfn;
-#endif
-				from += 8+7;
-				e820.nr_map = 0;
-				userdef = 1;
-			} else {
-				/* If the user specifies memory size, we
-				 * limit the BIOS-provided memory map to
-				 * that size. exactmap can be used to specify
-				 * the exact map. mem=number can be used to
-				 * trim the existing memory map.
-				 */
-				unsigned long long start_at, mem_size;
+		unsigned long long mem_size;
  
-				mem_size = memparse(from+7, &from);
-				if (*from == '@') {
-					start_at = memparse(from+1, &from);
-					add_memory_region(start_at, mem_size, E820_RAM);
-				} else if (*from == '#') {
-					start_at = memparse(from+1, &from);
-					add_memory_region(start_at, mem_size, E820_ACPI);
-				} else if (*from == '$') {
-					start_at = memparse(from+1, &from);
-					add_memory_region(start_at, mem_size, E820_RESERVED);
-				} else {
-					limit_regions(mem_size);
-					userdef=1;
-				}
-			}
-		}
-
-		else if (!memcmp(from, "noexec=", 7))
-			noexec_setup(from + 7);
+		mem_size = memparse(arg, &arg);
+		limit_regions(mem_size);
+		user_defined_memmap = 1;
+	}
+	return 0;
+}
+early_param("mem", parse_mem);
 
+static int __init parse_memmap(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-#ifdef  CONFIG_X86_SMP
-		/*
-		 * If the BIOS enumerates physical processors before logical,
-		 * maxcpus=N at enumeration-time can be used to disable HT.
+	if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+		/* If we are doing a crash dump, we
+		 * still need to know the real mem
+		 * size before original memory map is
+		 * reset.
 		 */
-		else if (!memcmp(from, "maxcpus=", 8)) {
-			extern unsigned int maxcpus;
-
-			maxcpus = simple_strtoul(from + 8, NULL, 0);
-		}
+		find_max_pfn();
+		saved_max_pfn = max_pfn;
 #endif
-
-#ifdef CONFIG_ACPI
-		/* "acpi=off" disables both ACPI table parsing and interpreter */
-		else if (!memcmp(from, "acpi=off", 8)) {
-			disable_acpi();
-		}
-
-		/* acpi=force to over-ride black-list */
-		else if (!memcmp(from, "acpi=force", 10)) {
-			acpi_force = 1;
-			acpi_ht = 1;
-			acpi_disabled = 0;
-		}
-
-		/* acpi=strict disables out-of-spec workarounds */
-		else if (!memcmp(from, "acpi=strict", 11)) {
-			acpi_strict = 1;
-		}
-
-		/* Limit ACPI just to boot-time to enable HT */
-		else if (!memcmp(from, "acpi=ht", 7)) {
-			if (!acpi_force)
-				disable_acpi();
-			acpi_ht = 1;
-		}
-		
-		/* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
-		else if (!memcmp(from, "pci=noacpi", 10)) {
-			acpi_disable_pci();
-		}
-		/* "acpi=noirq" disables ACPI interrupt routing */
-		else if (!memcmp(from, "acpi=noirq", 10)) {
-			acpi_noirq_set();
+		e820.nr_map = 0;
+		user_defined_memmap = 1;
+	} else {
+		/* If the user specifies memory size, we
+		 * limit the BIOS-provided memory map to
+		 * that size. exactmap can be used to specify
+		 * the exact map. mem=number can be used to
+		 * trim the existing memory map.
+		 */
+		unsigned long long start_at, mem_size;
+
+		mem_size = memparse(arg, &arg);
+		if (*arg == '@') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_RAM);
+		} else if (*arg == '#') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_ACPI);
+		} else if (*arg == '$') {
+			start_at = memparse(arg+1, &arg);
+			add_memory_region(start_at, mem_size, E820_RESERVED);
+		} else {
+			limit_regions(mem_size);
+			user_defined_memmap = 1;
 		}
+	}
+	return 0;
+}
+early_param("memmap", parse_memmap);
 
-		else if (!memcmp(from, "acpi_sci=edge", 13))
-			acpi_sci_flags.trigger =  1;
-
-		else if (!memcmp(from, "acpi_sci=level", 14))
-			acpi_sci_flags.trigger = 3;
-
-		else if (!memcmp(from, "acpi_sci=high", 13))
-			acpi_sci_flags.polarity = 1;
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel.
+ */
+static int __init parse_elfcorehdr(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-		else if (!memcmp(from, "acpi_sci=low", 12))
-			acpi_sci_flags.polarity = 3;
+	elfcorehdr_addr = memparse(arg, &arg);
+	return 0;
+}
+early_param("elfcorehdr", parse_elfcorehdr);
+#endif /* CONFIG_PROC_VMCORE */
 
-#ifdef CONFIG_X86_IO_APIC
-		else if (!memcmp(from, "acpi_skip_timer_override", 24))
-			acpi_skip_timer_override = 1;
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-		if (!memcmp(from, "disable_timer_pin_1", 19))
-			disable_timer_pin_1 = 1;
-		if (!memcmp(from, "enable_timer_pin_1", 18))
-			disable_timer_pin_1 = -1;
+	highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+	return 0;
+}
+early_param("highmem", parse_highmem);
 
-		/* disable IO-APIC */
-		else if (!memcmp(from, "noapic", 6))
-			disable_ioapic_setup();
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
 
-#ifdef CONFIG_X86_LOCAL_APIC
-		/* enable local APIC */
-		else if (!memcmp(from, "lapic", 5))
-			lapic_enable();
+	__VMALLOC_RESERVE = memparse(arg, &arg);
+	return 0;
+}
+early_param("vmalloc", parse_vmalloc);
 
-		/* disable local APIC */
-		else if (!memcmp(from, "nolapic", 6))
-			lapic_disable();
-#endif /* CONFIG_X86_LOCAL_APIC */
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+	unsigned long address;
 
-#ifdef CONFIG_KEXEC
-		/* crashkernel=size@addr specifies the location to reserve for
-		 * a crash kernel.  By reserving this memory we guarantee
-		 * that linux never set's it up as a DMA target.
-		 * Useful for holding code to do something appropriate
-		 * after a kernel panic.
-		 */
-		else if (!memcmp(from, "crashkernel=", 12)) {
-			unsigned long size, base;
-			size = memparse(from+12, &from);
-			if (*from == '@') {
-				base = memparse(from+1, &from);
-				/* FIXME: Do I want a sanity check
-				 * to validate the memory range?
-				 */
-				crashk_res.start = base;
-				crashk_res.end   = base + size - 1;
-			}
-		}
-#endif
-#ifdef CONFIG_PROC_VMCORE
-		/* elfcorehdr= specifies the location of elf core header
-		 * stored by the crashed kernel.
-		 */
-		else if (!memcmp(from, "elfcorehdr=", 11))
-			elfcorehdr_addr = memparse(from+11, &from);
-#endif
+	if (!arg)
+		return -EINVAL;
 
-		/*
-		 * highmem=size forces highmem to be exactly 'size' bytes.
-		 * This works even on boxes that have no highmem otherwise.
-		 * This also works to reduce highmem size on bigger boxes.
-		 */
-		else if (!memcmp(from, "highmem=", 8))
-			highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
-	
-		/*
-		 * vmalloc=size forces the vmalloc area to be exactly 'size'
-		 * bytes. This can be used to increase (or decrease) the
-		 * vmalloc area - the default is 128m.
-		 */
-		else if (!memcmp(from, "vmalloc=", 8))
-			__VMALLOC_RESERVE = memparse(from+8, &from);
-
-	next_char:
-		c = *(from++);
-		if (!c)
-			break;
-		if (COMMAND_LINE_SIZE <= ++len)
-			break;
-		*(to++) = c;
-	}
-	*to = '\0';
-	*cmdline_p = command_line;
-	if (userdef) {
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		print_memory_map("user");
-	}
+	address = memparse(arg, &arg);
+	reserve_top_address(address);
+	return 0;
 }
+early_param("reservetop", parse_reservetop);
 
 /*
  * Callback for efi_memory_walk.
@@ -1170,6 +1070,14 @@ static unsigned long __init setup_memory(void)
 	}
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
+	num_physpages = highend_pfn;
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+	num_physpages = max_low_pfn;
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+	max_mapnr = num_physpages;
 #endif
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(max_low_pfn));
@@ -1181,22 +1089,20 @@ static unsigned long __init setup_memory(void)
 
 void __init zone_sizes_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-	unsigned int max_dma, low;
-
-	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	low = max_low_pfn;
-
-	if (low < max_dma)
-		zones_size[ZONE_DMA] = low;
-	else {
-		zones_size[ZONE_DMA] = max_dma;
-		zones_size[ZONE_NORMAL] = low - max_dma;
 #ifdef CONFIG_HIGHMEM
-		zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+	unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+			virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
+			max_low_pfn,
+			highend_pfn};
+	add_active_range(0, 0, highend_pfn);
+#else
+	unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+			virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
+			max_low_pfn};
+	add_active_range(0, 0, max_low_pfn);
 #endif
-	}
-	free_area_init(zones_size);
+
+	free_area_init_nodes(max_zone_pfns);
 }
 #else
 extern unsigned long __init setup_memory(void);
@@ -1258,7 +1164,7 @@ void __init setup_bootmem_allocator(void)
 	 */
 	find_smp_config();
 #endif
-
+	numa_kva_reserve();
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
@@ -1499,17 +1405,15 @@ void __init setup_arch(char **cmdline_p)
 	data_resource.start = virt_to_phys(_etext);
 	data_resource.end = virt_to_phys(_edata)-1;
 
-	parse_cmdline_early(cmdline_p);
+	parse_early_param();
 
-#ifdef CONFIG_EARLY_PRINTK
-	{
-		char *s = strstr(*cmdline_p, "earlyprintk=");
-		if (s) {
-			setup_early_printk(strchr(s, '=') + 1);
-			printk("early console enabled\n");
-		}
+	if (user_defined_memmap) {
+		printk(KERN_INFO "user-defined physical RAM map:\n");
+		print_memory_map("user");
 	}
-#endif
+
+	strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+	*cmdline_p = command_line;
 
 	max_low_pfn = setup_memory();
 
@@ -1538,7 +1442,7 @@ void __init setup_arch(char **cmdline_p)
 	dmi_scan_machine();
 
 #ifdef CONFIG_X86_GENERICARCH
-	generic_apic_probe(*cmdline_p);
+	generic_apic_probe();
 #endif	
 	if (efi_enabled)
 		efi_map_memmap();
@@ -1550,9 +1454,11 @@ void __init setup_arch(char **cmdline_p)
 	acpi_boot_table_init();
 #endif
 
+#ifdef CONFIG_PCI
 #ifdef CONFIG_X86_IO_APIC
 	check_acpi_pci();	/* Checks more than just ACPI actually */
 #endif
+#endif
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index c10789d7a9d3..465188e2d701 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -634,3 +634,69 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
 	}
 }
 
+/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ *
+ * cpu is a standard Linux logical CPU number.
+ */
+static void
+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+				int nonatomic, int wait)
+{
+	struct call_data_struct data;
+	int cpus = 1;
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	call_data = &data;
+	wmb();
+	/* Send a message to all other CPUs and wait for them to respond */
+	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+	/* Wait for response */
+	while (atomic_read(&data.started) != cpus)
+		cpu_relax();
+
+	if (!wait)
+		return;
+
+	while (atomic_read(&data.finished) != cpus)
+		cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Retrurns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>
+ * or is or has executed.
+ */
+
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+			int nonatomic, int wait)
+{
+	/* prevent preemption and reschedule on another processor */
+	int me = get_cpu();
+	if (cpu == me) {
+		WARN_ON(1);
+		put_cpu();
+		return -EBUSY;
+	}
+	spin_lock_bh(&call_lock);
+	__smp_call_function_single(cpu, func, info, nonatomic, wait);
+	spin_unlock_bh(&call_lock);
+	put_cpu();
+	return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index f948419c888a..020d873b7d21 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -177,6 +177,9 @@ static void __devinit smp_store_cpu_info(int id)
 	 */
 	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
 
+		if (num_possible_cpus() == 1)
+			goto valid_k7;
+
 		/* Athlon 660/661 is valid. */	
 		if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
 			goto valid_k7;
@@ -642,9 +645,13 @@ static void map_cpu_to_logical_apicid(void)
 {
 	int cpu = smp_processor_id();
 	int apicid = logical_smp_processor_id();
+	int node = apicid_to_node(apicid);
+
+	if (!node_online(node))
+		node = first_online_node;
 
 	cpu_2_logical_apicid[cpu] = apicid;
-	map_cpu_to_node(cpu, apicid_to_node(apicid));
+	map_cpu_to_node(cpu, node);
 }
 
 static void unmap_cpu_to_logical_apicid(int cpu)
@@ -1372,7 +1379,8 @@ int __cpu_disable(void)
 	 */
 	if (cpu == 0)
 		return -EBUSY;
-
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		stop_apic_nmi_watchdog(NULL);
 	clear_local_APIC();
 	/* Allow any queued timer interrupts to get serviced */
 	local_irq_enable();
@@ -1486,3 +1494,16 @@ void __init smp_intr_init(void)
 	/* IPI for generic function call */
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 }
+
+/*
+ * If the BIOS enumerates physical processors before logical,
+ * maxcpus=N at enumeration-time can be used to disable HT.
+ */
+static int __init parse_maxcpus(char *arg)
+{
+	extern unsigned int maxcpus;
+
+	maxcpus = simple_strtoul(arg, NULL, 0);
+	return 0;
+}
+early_param("maxcpus", parse_maxcpus);
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
index b1809c9a0899..32413122c4c2 100644
--- a/arch/i386/kernel/srat.c
+++ b/arch/i386/kernel/srat.c
@@ -42,7 +42,7 @@
 #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
 static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
 
-#define MAX_CHUNKS_PER_NODE	4
+#define MAX_CHUNKS_PER_NODE	3
 #define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
 struct node_memory_chunk_s {
 	unsigned long	start_pfn;
@@ -54,8 +54,6 @@ struct node_memory_chunk_s {
 static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
 
 static int num_memory_chunks;		/* total number of memory chunks */
-static int zholes_size_init;
-static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
 
 extern void * boot_ioremap(unsigned long, unsigned long);
 
@@ -135,50 +133,6 @@ static void __init parse_memory_affinity_structure (char *sratp)
 		 "enabled and removable" : "enabled" ) );
 }
 
-#if MAX_NR_ZONES != 4
-#error "MAX_NR_ZONES != 4, chunk_to_zone requires review"
-#endif
-/* Take a chunk of pages from page frame cstart to cend and count the number
- * of pages in each zone, returned via zones[].
- */
-static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, 
-		unsigned long *zones)
-{
-	unsigned long max_dma;
-	extern unsigned long max_low_pfn;
-
-	int z;
-	unsigned long rend;
-
-	/* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
-	 * similarly scoped information and should be handled in a consistant
-	 * manner.
-	 */
-	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-	/* Split the hole into the zones in which it falls.  Repeatedly
-	 * take the segment in which the remaining hole starts, round it
-	 * to the end of that zone.
-	 */
-	memset(zones, 0, MAX_NR_ZONES * sizeof(long));
-	while (cstart < cend) {
-		if (cstart < max_dma) {
-			z = ZONE_DMA;
-			rend = (cend < max_dma)? cend : max_dma;
-
-		} else if (cstart < max_low_pfn) {
-			z = ZONE_NORMAL;
-			rend = (cend < max_low_pfn)? cend : max_low_pfn;
-
-		} else {
-			z = ZONE_HIGHMEM;
-			rend = cend;
-		}
-		zones[z] += rend - cstart;
-		cstart = rend;
-	}
-}
-
 /*
  * The SRAT table always lists ascending addresses, so can always
  * assume that the first "start" address that you see is the real
@@ -223,7 +177,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 
 	memset(pxm_bitmap, 0, sizeof(pxm_bitmap));	/* init proximity domain bitmap */
 	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
-	memset(zholes_size, 0, sizeof(zholes_size));
 
 	num_memory_chunks = 0;
 	while (p < end) {
@@ -287,6 +240,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
 		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
 		node_read_chunk(chunk->nid, chunk);
+		add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
 	}
  
 	for_each_online_node(nid) {
@@ -395,57 +349,7 @@ int __init get_memcfg_from_srat(void)
 		return acpi20_parse_srat((struct acpi_table_srat *)header);
 	}
 out_err:
+	remove_all_active_ranges();
 	printk("failed to get NUMA memory information from SRAT table\n");
 	return 0;
 }
-
-/* For each node run the memory list to determine whether there are
- * any memory holes.  For each hole determine which ZONE they fall
- * into.
- *
- * NOTE#1: this requires knowledge of the zone boundries and so
- * _cannot_ be performed before those are calculated in setup_memory.
- * 
- * NOTE#2: we rely on the fact that the memory chunks are ordered by
- * start pfn number during setup.
- */
-static void __init get_zholes_init(void)
-{
-	int nid;
-	int c;
-	int first;
-	unsigned long end = 0;
-
-	for_each_online_node(nid) {
-		first = 1;
-		for (c = 0; c < num_memory_chunks; c++){
-			if (node_memory_chunk[c].nid == nid) {
-				if (first) {
-					end = node_memory_chunk[c].end_pfn;
-					first = 0;
-
-				} else {
-					/* Record any gap between this chunk
-					 * and the previous chunk on this node
-					 * against the zones it spans.
-					 */
-					chunk_to_zones(end,
-						node_memory_chunk[c].start_pfn,
-						&zholes_size[nid * MAX_NR_ZONES]);
-				}
-			}
-		}
-	}
-}
-
-unsigned long * __init get_zholes_size(int nid)
-{
-	if (!zholes_size_init) {
-		zholes_size_init++;
-		get_zholes_init();
-	}
-	if (nid >= MAX_NUMNODES || !node_online(nid))
-		printk("%s: nid = %d is invalid/offline. num_online_nodes = %d",
-		       __FUNCTION__, nid, num_online_nodes());
-	return &zholes_size[nid * MAX_NR_ZONES];
-}
diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c
deleted file mode 100644
index e62a037ab399..000000000000
--- a/arch/i386/kernel/stacktrace.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * arch/i386/kernel/stacktrace.c
- *
- * Stack trace management functions
- *
- *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- */
-#include <linux/sched.h>
-#include <linux/stacktrace.h>
-
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
-{
-	return	p > (void *)tinfo &&
-		p < (void *)tinfo + THREAD_SIZE - 3;
-}
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer:
- */
-static inline unsigned long
-save_context_stack(struct stack_trace *trace, unsigned int skip,
-		   struct thread_info *tinfo, unsigned long *stack,
-		   unsigned long ebp)
-{
-	unsigned long addr;
-
-#ifdef CONFIG_FRAME_POINTER
-	while (valid_stack_ptr(tinfo, (void *)ebp)) {
-		addr = *(unsigned long *)(ebp + 4);
-		if (!skip)
-			trace->entries[trace->nr_entries++] = addr;
-		else
-			skip--;
-		if (trace->nr_entries >= trace->max_entries)
-			break;
-		/*
-		 * break out of recursive entries (such as
-		 * end_of_stack_stop_unwind_function):
-	 	 */
-		if (ebp == *(unsigned long *)ebp)
-			break;
-
-		ebp = *(unsigned long *)ebp;
-	}
-#else
-	while (valid_stack_ptr(tinfo, stack)) {
-		addr = *stack++;
-		if (__kernel_text_address(addr)) {
-			if (!skip)
-				trace->entries[trace->nr_entries++] = addr;
-			else
-				skip--;
-			if (trace->nr_entries >= trace->max_entries)
-				break;
-		}
-	}
-#endif
-
-	return ebp;
-}
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- * If all_contexts is set, all contexts (hardirq, softirq and process)
- * are saved. If not set then only the current context is saved.
- */
-void save_stack_trace(struct stack_trace *trace,
-		      struct task_struct *task, int all_contexts,
-		      unsigned int skip)
-{
-	unsigned long ebp;
-	unsigned long *stack = &ebp;
-
-	WARN_ON(trace->nr_entries || !trace->max_entries);
-
-	if (!task || task == current) {
-		/* Grab ebp right from our regs: */
-		asm ("movl %%ebp, %0" : "=r" (ebp));
-	} else {
-		/* ebp is the last reg pushed by switch_to(): */
-		ebp = *(unsigned long *) task->thread.esp;
-	}
-
-	while (1) {
-		struct thread_info *context = (struct thread_info *)
-				((unsigned long)stack & (~(THREAD_SIZE - 1)));
-
-		ebp = save_context_stack(trace, skip, context, stack, ebp);
-		stack = (unsigned long *)context->previous_esp;
-		if (!all_contexts || !stack ||
-				trace->nr_entries >= trace->max_entries)
-			break;
-		trace->entries[trace->nr_entries++] = ULONG_MAX;
-		if (trace->nr_entries >= trace->max_entries)
-			break;
-	}
-}
-
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d4775398..7e639f78b0b9 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,4 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_getcpu
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index edd00f6cee37..86944acfb647 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -130,18 +130,33 @@ static int set_rtc_mmss(unsigned long nowtime)
 
 int timer_ack;
 
-#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
 unsigned long profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (!user_mode_vm(regs) && in_lock_functions(pc))
+#ifdef CONFIG_SMP
+	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
 		return *(unsigned long *)(regs->ebp + 4);
-
+#else
+		unsigned long *sp;
+		if ((regs->xcs & 3) == 0)
+			sp = (unsigned long *)&regs->esp;
+		else
+			sp = (unsigned long *)regs->esp;
+		/* Return address is either directly at stack pointer
+		   or above a saved eflags. Eflags has bits 22-31 zero,
+		   kernel addresses don't. */
+ 		if (sp[0] >> 22)
+			return sp[0];
+		if (sp[1] >> 22)
+			return sp[1];
+#endif
+	}
+#endif
 	return pc;
 }
 EXPORT_SYMBOL(profile_pc);
-#endif
 
 /*
  * This is the same as the above, except we _also_ save the current
@@ -270,16 +285,19 @@ void notify_arch_cmos_timer(void)
 	mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
-static long clock_cmos_diff, sleep_start;
+static long clock_cmos_diff;
+static unsigned long sleep_start;
 
 static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
 	/*
 	 * Estimate time zone so that set_time can update the clock
 	 */
-	clock_cmos_diff = -get_cmos_time();
+	unsigned long ctime =  get_cmos_time();
+
+	clock_cmos_diff = -ctime;
 	clock_cmos_diff += get_seconds();
-	sleep_start = get_cmos_time();
+	sleep_start = ctime;
 	return 0;
 }
 
@@ -287,18 +305,29 @@ static int timer_resume(struct sys_device *dev)
 {
 	unsigned long flags;
 	unsigned long sec;
-	unsigned long sleep_length;
-
+	unsigned long ctime = get_cmos_time();
+	long sleep_length = (ctime - sleep_start) * HZ;
+	struct timespec ts;
+
+	if (sleep_length < 0) {
+		printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
+		/* The time after the resume must not be earlier than the time
+		 * before the suspend or some nasty things will happen
+		 */
+		sleep_length = 0;
+		ctime = sleep_start;
+	}
 #ifdef CONFIG_HPET_TIMER
 	if (is_hpet_enabled())
 		hpet_reenable();
 #endif
 	setup_pit_timer();
-	sec = get_cmos_time() + clock_cmos_diff;
-	sleep_length = (get_cmos_time() - sleep_start) * HZ;
+
+	sec = ctime + clock_cmos_diff;
+	ts.tv_sec = sec;
+	ts.tv_nsec = 0;
+	do_settimeofday(&ts);
 	write_seqlock_irqsave(&xtime_lock, flags);
-	xtime.tv_sec = sec;
-	xtime.tv_nsec = 0;
 	jiffies_64 += sleep_length;
 	wall_jiffies += sleep_length;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -334,10 +363,11 @@ extern void (*late_time_init)(void);
 /* Duplicate of time_init() below, with hpet_enable part added */
 static void __init hpet_time_init(void)
 {
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
+	struct timespec ts;
+	ts.tv_sec = get_cmos_time();
+	ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+	do_settimeofday(&ts);
 
 	if ((hpet_enable() >= 0) && hpet_use_timer) {
 		printk("Using HPET for base-timer\n");
@@ -349,6 +379,7 @@ static void __init hpet_time_init(void)
 
 void __init time_init(void)
 {
+	struct timespec ts;
 #ifdef CONFIG_HPET_TIMER
 	if (is_hpet_capable()) {
 		/*
@@ -359,10 +390,10 @@ void __init time_init(void)
 		return;
 	}
 #endif
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
+	ts.tv_sec = get_cmos_time();
+	ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+	do_settimeofday(&ts);
 
 	time_init_hook();
 }
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 14a1376fedd1..6bf14a4e995e 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -301,23 +301,25 @@ int hpet_rtc_timer_init(void)
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
 	local_irq_save(flags);
+
 	cnt = hpet_readl(HPET_COUNTER);
 	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
 	hpet_writel(cnt, HPET_T1_CMP);
 	hpet_t1_cmp = cnt;
-	local_irq_restore(flags);
 
 	cfg = hpet_readl(HPET_T1_CFG);
 	cfg &= ~HPET_TN_PERIODIC;
 	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
 	hpet_writel(cfg, HPET_T1_CFG);
 
+	local_irq_restore(flags);
+
 	return 1;
 }
 
 static void hpet_rtc_timer_reinit(void)
 {
-	unsigned int cfg, cnt;
+	unsigned int cfg, cnt, ticks_per_int, lost_ints;
 
 	if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
 		cfg = hpet_readl(HPET_T1_CFG);
@@ -332,10 +334,33 @@ static void hpet_rtc_timer_reinit(void)
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
 	/* It is more accurate to use the comparator value than current count.*/
-	cnt = hpet_t1_cmp;
-	cnt += hpet_tick*HZ/hpet_rtc_int_freq;
-	hpet_writel(cnt, HPET_T1_CMP);
-	hpet_t1_cmp = cnt;
+	ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+	hpet_t1_cmp += ticks_per_int;
+	hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+	/*
+	 * If the interrupt handler was delayed too long, the write above tries
+	 * to schedule the next interrupt in the past and the hardware would
+	 * not interrupt until the counter had wrapped around.
+	 * So we have to check that the comparator wasn't set to a past time.
+	 */
+	cnt = hpet_readl(HPET_COUNTER);
+	if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+		lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+		/* Make sure that, even with the time needed to execute
+		 * this code, the next scheduled interrupt has been moved
+		 * back to the future: */
+		lost_ints++;
+
+		hpet_t1_cmp += lost_ints * ticks_per_int;
+		hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+		if (PIE_on)
+			PIE_count += lost_ints;
+
+		printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+		       hpet_rtc_int_freq);
+	}
 }
 
 /*
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index e2e281d4bcc8..07d6da36a825 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/nodemask.h>
+#include <linux/mmzone.h>
 #include <asm/cpu.h>
 
 static struct i386_cpu cpu_devices[NR_CPUS];
@@ -55,34 +56,18 @@ EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
-
-
-#ifdef CONFIG_NUMA
-#include <linux/mmzone.h>
-
 static int __init topology_init(void)
 {
 	int i;
 
+#ifdef CONFIG_NUMA
 	for_each_online_node(i)
 		register_one_node(i);
+#endif /* CONFIG_NUMA */
 
 	for_each_present_cpu(i)
 		arch_register_cpu(i);
 	return 0;
 }
 
-#else /* !CONFIG_NUMA */
-
-static int __init topology_init(void)
-{
-	int i;
-
-	for_each_present_cpu(i)
-		arch_register_cpu(i);
-	return 0;
-}
-
-#endif /* CONFIG_NUMA */
-
 subsys_initcall(topology_init);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7e9edafffd8a..a13037fe0ee3 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -28,6 +28,7 @@
 #include <linux/kprobes.h>
 #include <linux/kexec.h>
 #include <linux/unwind.h>
+#include <linux/uaccess.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -40,7 +41,6 @@
 
 #include <asm/processor.h>
 #include <asm/system.h>
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
 #include <asm/debugreg.h>
@@ -51,6 +51,7 @@
 #include <asm/smp.h>
 #include <asm/arch_hooks.h>
 #include <asm/kdebug.h>
+#include <asm/stacktrace.h>
 
 #include <linux/module.h>
 
@@ -118,26 +119,16 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 		p < (void *)tinfo + THREAD_SIZE - 3;
 }
 
-/*
- * Print one address/symbol entries per line.
- */
-static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
-{
-	printk(" [<%08lx>] ", addr);
-
-	print_symbol("%s\n", addr);
-}
-
 static inline unsigned long print_context_stack(struct thread_info *tinfo,
 				unsigned long *stack, unsigned long ebp,
-				char *log_lvl)
+				struct stacktrace_ops *ops, void *data)
 {
 	unsigned long addr;
 
 #ifdef	CONFIG_FRAME_POINTER
 	while (valid_stack_ptr(tinfo, (void *)ebp)) {
 		addr = *(unsigned long *)(ebp + 4);
-		print_addr_and_symbol(addr, log_lvl);
+		ops->address(data, addr);
 		/*
 		 * break out of recursive entries (such as
 		 * end_of_stack_stop_unwind_function):
@@ -150,30 +141,37 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
 	while (valid_stack_ptr(tinfo, stack)) {
 		addr = *stack++;
 		if (__kernel_text_address(addr))
-			print_addr_and_symbol(addr, log_lvl);
+			ops->address(data, addr);
 	}
 #endif
 	return ebp;
 }
 
+struct ops_and_data {
+	struct stacktrace_ops *ops;
+	void *data;
+};
+
 static asmlinkage int
-show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
+dump_trace_unwind(struct unwind_frame_info *info, void *data)
 {
+	struct ops_and_data *oad = (struct ops_and_data *)data;
 	int n = 0;
 
 	while (unwind(info) == 0 && UNW_PC(info)) {
 		n++;
-		print_addr_and_symbol(UNW_PC(info), log_lvl);
+		oad->ops->address(oad->data, UNW_PC(info));
 		if (arch_unw_user_mode(info))
 			break;
 	}
 	return n;
 }
 
-static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-			       unsigned long *stack, char *log_lvl)
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+	        unsigned long *stack,
+		struct stacktrace_ops *ops, void *data)
 {
-	unsigned long ebp;
+	unsigned long ebp = 0;
 
 	if (!task)
 		task = current;
@@ -181,54 +179,116 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	if (call_trace >= 0) {
 		int unw_ret = 0;
 		struct unwind_frame_info info;
+		struct ops_and_data oad = { .ops = ops, .data = data };
 
 		if (regs) {
 			if (unwind_init_frame_info(&info, task, regs) == 0)
-				unw_ret = show_trace_unwind(&info, log_lvl);
+				unw_ret = dump_trace_unwind(&info, &oad);
 		} else if (task == current)
-			unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
+			unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
 		else {
 			if (unwind_init_blocked(&info, task) == 0)
-				unw_ret = show_trace_unwind(&info, log_lvl);
+				unw_ret = dump_trace_unwind(&info, &oad);
 		}
 		if (unw_ret > 0) {
 			if (call_trace == 1 && !arch_unw_user_mode(&info)) {
-				print_symbol("DWARF2 unwinder stuck at %s\n",
+				ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
 					     UNW_PC(&info));
 				if (UNW_SP(&info) >= PAGE_OFFSET) {
-					printk("Leftover inexact backtrace:\n");
+					ops->warning(data, "Leftover inexact backtrace:\n");
 					stack = (void *)UNW_SP(&info);
+					if (!stack)
+						return;
+					ebp = UNW_FP(&info);
 				} else
-					printk("Full inexact backtrace again:\n");
+					ops->warning(data, "Full inexact backtrace again:\n");
 			} else if (call_trace >= 1)
 				return;
 			else
-				printk("Full inexact backtrace again:\n");
+				ops->warning(data, "Full inexact backtrace again:\n");
 		} else
-			printk("Inexact backtrace:\n");
+			ops->warning(data, "Inexact backtrace:\n");
+	}
+	if (!stack) {
+		unsigned long dummy;
+		stack = &dummy;
+		if (task && task != current)
+			stack = (unsigned long *)task->thread.esp;
 	}
 
-	if (task == current) {
-		/* Grab ebp right from our regs */
-		asm ("movl %%ebp, %0" : "=r" (ebp) : );
-	} else {
-		/* ebp is the last reg pushed by switch_to */
-		ebp = *(unsigned long *) task->thread.esp;
+#ifdef CONFIG_FRAME_POINTER
+	if (!ebp) {
+		if (task == current) {
+			/* Grab ebp right from our regs */
+			asm ("movl %%ebp, %0" : "=r" (ebp) : );
+		} else {
+			/* ebp is the last reg pushed by switch_to */
+			ebp = *(unsigned long *) task->thread.esp;
+		}
 	}
+#endif
 
 	while (1) {
 		struct thread_info *context;
 		context = (struct thread_info *)
 			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		ebp = print_context_stack(context, stack, ebp, log_lvl);
+		ebp = print_context_stack(context, stack, ebp, ops, data);
+		/* Should be after the line below, but somewhere
+		   in early boot context comes out corrupted and we
+		   can't reference it -AK */
+		if (ops->stack(data, "IRQ") < 0)
+			break;
 		stack = (unsigned long*)context->previous_esp;
 		if (!stack)
 			break;
-		printk("%s =======================\n", log_lvl);
 	}
 }
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	printk(data);
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+	return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+	printk("%s [<%08lx>] ", (char *)data, addr);
+	print_symbol("%s\n", addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		   unsigned long * stack, char *log_lvl)
+{
+	dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+	printk("%s =======================\n", log_lvl);
+}
 
-void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long * stack)
 {
 	show_trace_log_lvl(task, regs, stack, "");
 }
@@ -291,8 +351,9 @@ void show_registers(struct pt_regs *regs)
 		ss = regs->xss & 0xffff;
 	}
 	print_modules();
-	printk(KERN_EMERG "CPU:    %d\nEIP:    %04x:[<%08lx>]    %s VLI\n"
-			"EFLAGS: %08lx   (%s %.*s) \n",
+	printk(KERN_EMERG "CPU:    %d\n"
+		KERN_EMERG "EIP:    %04x:[<%08lx>]    %s VLI\n"
+		KERN_EMERG "EFLAGS: %08lx   (%s %.*s)\n",
 		smp_processor_id(), 0xffff & regs->xcs, regs->eip,
 		print_tainted(), regs->eflags, system_utsname.release,
 		(int)strcspn(system_utsname.version, " "),
@@ -313,6 +374,8 @@ void show_registers(struct pt_regs *regs)
 	 */
 	if (in_kernel) {
 		u8 __user *eip;
+		int code_bytes = 64;
+		unsigned char c;
 
 		printk("\n" KERN_EMERG "Stack: ");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
@@ -320,9 +383,12 @@ void show_registers(struct pt_regs *regs)
 		printk(KERN_EMERG "Code: ");
 
 		eip = (u8 __user *)regs->eip - 43;
-		for (i = 0; i < 64; i++, eip++) {
-			unsigned char c;
-
+		if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+			/* try starting at EIP */
+			eip = (u8 __user *)regs->eip;
+			code_bytes = 32;
+		}
+		for (i = 0; i < code_bytes; i++, eip++) {
 			if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
 				printk(" Bad EIP value.");
 				break;
@@ -343,7 +409,7 @@ static void handle_BUG(struct pt_regs *regs)
 
 	if (eip < PAGE_OFFSET)
 		return;
-	if (__get_user(ud2, (unsigned short __user *)eip))
+	if (probe_kernel_address((unsigned short __user *)eip, ud2))
 		return;
 	if (ud2 != 0x0b0f)
 		return;
@@ -356,7 +422,8 @@ static void handle_BUG(struct pt_regs *regs)
 		char *file;
 		char c;
 
-		if (__get_user(line, (unsigned short __user *)(eip + 2)))
+		if (probe_kernel_address((unsigned short __user *)(eip + 2),
+					line))
 			break;
 		if (__get_user(file, (char * __user *)(eip + 4)) ||
 		    (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
@@ -629,18 +696,24 @@ gp_in_kernel:
 	}
 }
 
-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
 {
-	printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
-			"to continue\n");
+	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+		"CPU %d.\n", reason, smp_processor_id());
 	printk(KERN_EMERG "You probably have a hardware problem with your RAM "
 			"chips\n");
+	if (panic_on_unrecovered_nmi)
+                panic("NMI: Not continuing");
+
+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 
 	/* Clear and disable the memory parity error line. */
 	clear_mem_error(reason);
 }
 
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
 {
 	unsigned long i;
 
@@ -656,7 +729,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs)
 	outb(reason, 0x61);
 }
 
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 {
 #ifdef CONFIG_MCA
 	/* Might actually be able to figure out what the guilty party
@@ -666,15 +740,18 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
 		return;
 	}
 #endif
-	printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-		reason, smp_processor_id());
-	printk("Dazed and confused, but trying to continue\n");
-	printk("Do you have a strange power saving mode enabled?\n");
+	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+		"CPU %d.\n", reason, smp_processor_id());
+	printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+	if (panic_on_unrecovered_nmi)
+                panic("NMI: Not continuing");
+
+	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 }
 
 static DEFINE_SPINLOCK(nmi_print_lock);
 
-void die_nmi (struct pt_regs *regs, const char *msg)
+void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
 {
 	if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
 	    NOTIFY_STOP)
@@ -706,7 +783,7 @@ void die_nmi (struct pt_regs *regs, const char *msg)
 	do_exit(SIGSEGV);
 }
 
-static void default_do_nmi(struct pt_regs * regs)
+static __kprobes void default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = 0;
 
@@ -723,12 +800,12 @@ static void default_do_nmi(struct pt_regs * regs)
 		 * Ok, so this is none of the documented NMI sources,
 		 * so it must be the NMI watchdog.
 		 */
-		if (nmi_watchdog) {
-			nmi_watchdog_tick(regs);
+		if (nmi_watchdog_tick(regs, reason))
 			return;
-		}
+		if (!do_nmi_callback(regs, smp_processor_id()))
 #endif
-		unknown_nmi_error(reason, regs);
+			unknown_nmi_error(reason, regs);
+
 		return;
 	}
 	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -744,14 +821,7 @@ static void default_do_nmi(struct pt_regs * regs)
 	reassert_nmi();
 }
 
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
-	return 0;
-}
- 
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
- 
-fastcall void do_nmi(struct pt_regs * regs, long error_code)
+fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
 {
 	int cpu;
 
@@ -761,25 +831,11 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 
 	++nmi_count(cpu);
 
-	if (!rcu_dereference(nmi_callback)(regs, cpu))
-		default_do_nmi(regs);
+	default_do_nmi(regs);
 
 	nmi_exit();
 }
 
-void set_nmi_callback(nmi_callback_t callback)
-{
-	vmalloc_sync_all();
-	rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
-	nmi_callback = dummy_nmi_callback;
-}
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
-
 #ifdef CONFIG_KPROBES
 fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
@@ -1119,20 +1175,6 @@ void __init trap_init_f00f_bug(void)
 }
 #endif
 
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
-  int __d0, __d1; \
-  __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
-	"movw %4,%%dx\n\t" \
-	"movl %%eax,%0\n\t" \
-	"movl %%edx,%1" \
-	:"=m" (*((long *) (gate_addr))), \
-	 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
-	:"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
-	 "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
 /*
  * This needs to use 'idt_table' rather than 'idt', and
  * thus use the _nonmapped_ version of the IDT, as the
@@ -1141,7 +1183,7 @@ do { \
  */
 void set_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
 }
 
 /*
@@ -1149,22 +1191,22 @@ void set_intr_gate(unsigned int n, void *addr)
  */
 static inline void set_system_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+	_set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
 }
 
 static void __init set_trap_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
 }
 
 static void __init set_system_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+	_set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
 }
 
 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
 {
-	_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+	_set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
 }
 
 
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 7e0d8dab2075..b8fa0a8b2e47 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -192,7 +192,7 @@ int recalibrate_cpu_khz(void)
 
 EXPORT_SYMBOL(recalibrate_cpu_khz);
 
-void tsc_init(void)
+void __init tsc_init(void)
 {
 	if (!cpu_has_tsc || tsc_disable)
 		return;
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 2d4f1386e2b1..1e7ac1c44ddc 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
 OUTPUT_ARCH(i386)
 ENTRY(phys_startup_32)
 jiffies = jiffies_64;
+
+PHDRS {
+	text PT_LOAD FLAGS(5);	/* R_E */
+	data PT_LOAD FLAGS(7);	/* RWE */
+	note PT_NOTE FLAGS(4);	/* R__ */
+}
 SECTIONS
 {
   . = __KERNEL_START;
@@ -26,7 +32,7 @@ SECTIONS
 	KPROBES_TEXT
 	*(.fixup)
 	*(.gnu.warning)
-	} = 0x9090
+	} :text = 0x9090
 
   _etext = .;			/* End of text section */
 
@@ -48,7 +54,7 @@ SECTIONS
   .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
 	*(.data)
 	CONSTRUCTORS
-	}
+	} :data
 
   . = ALIGN(4096);
   __nosave_begin = .;
@@ -184,4 +190,6 @@ SECTIONS
   STABS_DEBUG
 
   DWARF_DEBUG
+
+  NOTES
 }